Computing TF-IDF with TfidfVectorizer: IV. Implementation (Part 3)


Compute TF-IDF and generate feature vectors
# Compute word weights with TF-IDF
tf = TfidfVectorizer(stop_words=stop_words, max_df=0.5)
train_features = tf.fit_transform(train_words_list)
# The vectorizer was already fitted above, so only transform here
test_features = tf.transform(test_words_list)
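As a quick sanity check (not part of the original snippet), the fitted vectorizer can be inspected to see how many terms survived the stop-word and max_df filtering, and which terms carry the most weight in a given document. A minimal sketch, assuming the variables from the snippet above and scikit-learn >= 1.0:

# Sketch: inspect the fitted vectorizer and the resulting sparse matrix
import numpy as np

print('vocabulary size:', len(tf.vocabulary_))       # number of distinct terms kept
print('train matrix shape:', train_features.shape)   # (n_documents, n_terms)

# Terms with the highest TF-IDF weight in the first training document
row = train_features[0].toarray().ravel()
terms = tf.get_feature_names_out()                   # requires scikit-learn >= 1.0
top_idx = np.argsort(row)[::-1][:5]
for i in top_idx:
    print(terms[i], round(row[i], 4))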
Train the model and predict
# Multinomial naive Bayes classifier
clf = MultinomialNB(fit_prior=False, alpha=0.01).fit(train_features, train_labels)
predicted_labels = clf.predict(test_features)
# Accuracy on the training set
print('Training accuracy:', clf.score(train_features, train_labels))
# Accuracy on the test set
print('Test accuracy:', accuracy_score(test_labels, predicted_labels))
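Accuracy alone can hide class imbalance (the test set here has 5 spam vs. 2 normal mails), so a per-class breakdown is often worth printing as well. A small sketch using sklearn.metrics, not part of the original code:

# Sketch: per-class evaluation of the predictions above
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(test_labels, predicted_labels))        # rows: true 0/1, columns: predicted 0/1
print(classification_report(test_labels, predicted_labels,
                            target_names=['normal', 'spam'])) # precision / recall / F1 per class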
Full code
# -*- coding: utf-8 -*-
from os import listdir
import numpy as np
import warnings
from jieba import cut  # word segmentation
from sklearn.feature_extraction.text import TfidfVectorizer  # computes TF-IDF values for words
from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes model
from sklearn.metrics import accuracy_score  # evaluation

warnings.filterwarnings('ignore')


def cut_words(file_path):
    """
    Segment a text file into words.
    :param file_path: path to the txt file
    :return: a space-separated string of words
    """
    text_with_spaces = ''
    text = open(file_path, 'r', encoding='UTF-8-sig').read()
    textcut = cut(text)
    # Filter out words of length 1
    textcut = filter(lambda word: len(word) > 1, textcut)
    for word in textcut:
        text_with_spaces += word + ' '
    return text_with_spaces


def getWordsFromFile(file_dir):
    """
    Load all files under a directory.
    :param file_dir: directory containing the txt files
    :return: list of segmented documents
    """
    file_list = listdir(file_dir)
    words_list = []
    for file in file_list:
        file_path = file_dir + '/' + file
        words_list.append(cut_words(file_path))
    return words_list


train_words_list = getWordsFromFile('data/train')
test_words_list = getWordsFromFile('data/test')

# Email labels: 1 means spam, 0 means normal mail
train_labels = np.array([1]*127 + [0]*24)
test_labels = np.array([1, 1, 1, 1, 1, 0, 0])

# Read in the stop words
stop_words = open('data/stop/stopword.txt', 'r', encoding='UTF-8-sig').read()
stop_words = stop_words.split('\n')  # split on the line separator

# Compute word weights with TF-IDF
tf = TfidfVectorizer(stop_words=stop_words, max_df=0.5)
train_features = tf.fit_transform(train_words_list)
# The vectorizer was already fitted above, so only transform here
test_features = tf.transform(test_words_list)

# Multinomial naive Bayes classifier
clf = MultinomialNB(fit_prior=False, alpha=0.01).fit(train_features, train_labels)
predicted_labels = clf.predict(test_features)
# Accuracy on the training set
print('Training accuracy:', clf.score(train_features, train_labels))
# Accuracy on the test set
print('Test accuracy:', accuracy_score(test_labels, predicted_labels))
V. Filling in the Knowledge Gaps
The re.sub function
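re.sub(pattern, repl, string) replaces every match of pattern in string with repl, which is handy for cleaning raw email text (stripping digits, punctuation, etc.) before segmentation. A minimal illustrative sketch; the sample string and patterns below are made up for the example:

import re

text = 'order now!!! only $9.99, offer code 12345'
# Remove the digits
no_digits = re.sub(r'\d+', '', text)
# Collapse any run of punctuation into a single space
cleaned = re.sub(r'[^\w\s]+', ' ', no_digits)
print(cleaned)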
The difference between list.append() and list.extend()
list.append() adds a single object to the end of the list
list.extend() appends the contents of a sequence seq to the list, element by element
Example:
music_media = ['compact disc', '8-track tape', 'long playing record']
new_media = ['DVD Audio disc', 'Super Audio CD']

music_media.append(new_media)   # the whole list is appended as a single element
print(music_media)
# ['compact disc', '8-track tape', 'long playing record', ['DVD Audio disc', 'Super Audio CD']]

music_media = ['compact disc', '8-track tape', 'long playing record']  # reset before comparing
music_media.extend(new_media)   # the elements of new_media are appended one by one
print(music_media)
# ['compact disc', '8-track tape', 'long playing record', 'DVD Audio disc', 'Super Audio CD']
The collections.Counter class
Counter is a dict-based counter provided by the collections package.
A simple example:
# Count word frequencies by hand
colors = ['red', 'blue', 'red', 'green', 'blue', 'blue']
result = {}
for color in colors:
    if result.get(color) is None:
        result[color] = 1
    else:
        result[color] += 1
print(result)
# {'red': 2, 'blue': 3, 'green': 1}

# Count word frequencies with Counter
from collections import Counter
colors = ['red', 'blue', 'red', 'green', 'blue', 'blue']
c = Counter(colors)
print(dict(c))
print(c.most_common(2))