Computing TF-IDF with TfidfVectorizer: IV. Implementation (Part 2)


Complete code
# -*- coding: utf-8 -*-
from os import listdir
from re import sub                    # regex filtering of strings
from collections import Counter      # word counting
import numpy as np
import warnings
from jieba import cut                 # word segmentation
from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes model
from sklearn.metrics import accuracy_score     # evaluation

warnings.filterwarnings('ignore')


def getWordsFromFile(file_path):
    """
    Segment one text file into words.
    :param file_path: path to the txt file
    :return: list of segmented words
    """
    words = []
    with open(file_path, encoding='utf-8') as fp:
        for line in fp:
            line = line.strip()
            # Filter out noise / invalid characters
            line = sub(r'[.【】0-9、一 。,!~\*]', '', line)
            # Segment the line with jieba's cut function
            line = cut(line)
            # Drop words of length 1
            line = filter(lambda word: len(word) > 1, line)
            words.extend(line)
    return words


def getWords(file_dir):
    """
    Load every file under a directory.
    :param file_dir: directory that holds the txt files
    :return: list of segmented documents
    """
    words = []
    file_list = listdir(file_dir)
    for file in file_list:
        file_path = file_dir + '/' + file
        words.append(getWordsFromFile(file_path))
    return words


def getTopNWords(words, topN):
    """
    Get the topN most frequent words.
    :param words: sequences to count over
    :param topN: how many words to keep
    :return: the topN most frequent words
    """
    # We need to count words across all documents, so the sublists in
    # words are flattened with a list comprehension.
    freq = Counter([x for l in words for x in l])
    # freq.most_common(topN) returns pairs like [('blue', 3), ('red', 2)];
    # keep only the first element of each pair.
    return [w[0] for w in freq.most_common(topN)]


def get_feature(words, topWords):
    """
    Build feature vectors: how often each of the topN words occurs in each mail.
    :param words: segmented documents
    :param topWords: the topN words
    :return: feature matrix
    """
    features = []
    for doc in words:
        temp = list(map(lambda x: doc.count(x), topWords))
        features.append(temp)
    features = np.array(features)
    return features


# Training data: topN words and feature vectors
train_words_list = getWords('data/train')
topWords = getTopNWords(train_words_list, 800)
train_features = get_feature(train_words_list, topWords)

# Test data and its feature vectors
test_words_list = getWords('data/test')
test_features = get_feature(test_words_list, topWords)

# Mail labels: 1 = spam, 0 = ham
train_labels = np.array([1] * 127 + [0] * 24)
test_labels = np.array([1, 1, 1, 1, 1, 0, 0])

# Create a naive Bayes model and train it on the existing data
clf = MultinomialNB(fit_prior=False, alpha=0.01).fit(train_features, train_labels)
predicted_labels = clf.predict(test_features)

# Training accuracy
print('Training accuracy:', clf.score(train_features, train_labels))
# Test accuracy
print('Test accuracy:', accuracy_score(test_labels, predicted_labels))
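To make the feature construction concrete, here is a tiny standalone demo of the same top-N counting logic in getTopNWords and get_feature, on made-up English words (the data is hypothetical, not from the mail corpus):

from collections import Counter

docs = [['free', 'money', 'free'], ['meeting', 'tomorrow'], ['free', 'meeting']]
# Flatten the sublists and count every word across all documents
freq = Counter([w for doc in docs for w in doc])
top_words = [w for w, _ in freq.most_common(2)]
print(top_words)  # ['free', 'meeting']
# Each document becomes a vector of counts of the top words
features = [[doc.count(w) for w in top_words] for doc in docs]
print(features)   # [[2, 0], [0, 1], [1, 1]]

This count matrix, at toy scale, is exactly the kind of input MultinomialNB consumes above.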
IV. Implementation (computing TF-IDF)

Imports
# -*- coding: utf-8 -*-
from os import listdir
import numpy as np
import warnings
from jieba import cut                                         # word segmentation
from sklearn.feature_extraction.text import TfidfVectorizer   # computes the TF-IDF value of each word
from sklearn.naive_bayes import MultinomialNB                 # multinomial naive Bayes model
from sklearn.metrics import accuracy_score                    # evaluation
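Before applying it to the mail data, here is a minimal sketch of what TfidfVectorizer produces, on toy documents (get_feature_names_out assumes scikit-learn 1.0 or newer; older versions expose get_feature_names instead):

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ['free money free', 'meeting tomorrow', 'free meeting']  # toy documents
tfidf = TfidfVectorizer()
matrix = tfidf.fit_transform(corpus)  # sparse matrix of shape (n_docs, n_terms)
print(tfidf.get_feature_names_out())  # vocabulary learned from the corpus
print(matrix.toarray())               # TF-IDF weight of each term in each document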
Word segmentation
def cut_words(file_path):
    """
    Segment one text file into words.
    :param file_path: path to the txt file
    :return: space-separated string of words
    """
    text_with_spaces = ''
    text = open(file_path, 'r', encoding='UTF-8-sig').read()
    textcut = cut(text)
    # Drop words of length 1
    textcut = filter(lambda word: len(word) > 1, textcut)
    for word in textcut:
        text_with_spaces += word + ' '
    return text_with_spaces
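As a quick check of what cut_words produces, the same segment-then-filter logic can be run on a single in-memory sentence (a hypothetical example; the exact segmentation depends on jieba's dictionary and version):

from jieba import cut

sample = '我爱自然语言处理'
words = filter(lambda w: len(w) > 1, cut(sample))
print(' '.join(words))  # likely '自然语言 处理': single-character tokens such as '我' and '爱' are dropped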
Load the dataset
def getWordsFromFile(file_dir):
    """
    Load every file under a directory.
    :param file_dir: directory that holds the txt files
    :return: list of segmented documents
    """
    file_list = listdir(file_dir)
    words_list = []
    for file in file_list:
        file_path = file_dir + '/' + file
        words_list.append(cut_words(file_path))
    return words_list


train_words_list = getWordsFromFile('data/train')
test_words_list = getWordsFromFile('data/test')

# Mail labels: 1 = spam, 0 = ham
train_labels = np.array([1] * 127 + [0] * 24)
test_labels = np.array([1, 1, 1, 1, 1, 0, 0])

# Load stop words
stop_words = open('data/stop/stopword.txt', 'r', encoding='UTF-8-sig').read()
stop_words = stop_words.split('\n')  # split on the newline separator
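The code above stops just before the classifier is built. As a sketch of how the imports are typically combined with these variables (assuming the same alpha=0.01 and fit_prior=False settings as the word-count version; the original tutorial's continuation may differ):

# Learn the vocabulary and IDF weights on the training mails only
tfidf = TfidfVectorizer(stop_words=stop_words)
train_features = tfidf.fit_transform(train_words_list)
# Reuse the same vocabulary and IDF weights for the test mails
test_features = tfidf.transform(test_words_list)

clf = MultinomialNB(fit_prior=False, alpha=0.01).fit(train_features, train_labels)
predicted_labels = clf.predict(test_features)
print('Test accuracy:', accuracy_score(test_labels, predicted_labels))

Note that transform (not fit_transform) is used on the test set, so the test mails are projected onto the vocabulary learned from the training mails.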