# Persist the tokenized (word-segmented) text to disk so the topic-extraction
# step below can read it back. `x` is the segmented corpus built earlier.
with open('cutjokes.txt', 'w', encoding='utf8') as f:
    f.write(str(x))
# Next, we use this word-segmented file to perform topic extraction.
# 导入TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# 导入LDA模型
from sklearn.decomposition import LatentDirichletAllocation
# 用来打印提取后的话题和高频词
def print_topics(model, feature_names, n_top_words):
    """Print each topic of a fitted LDA model with its top words.

    Parameters
    ----------
    model : fitted decomposition model exposing `components_`
        (e.g. LatentDirichletAllocation); each row is a topic's
        per-word weights.
    feature_names : sequence of str
        Vocabulary terms, indexed the same way as the columns of
        `model.components_` (typically `get_feature_names_out()`).
    n_top_words : int
        Number of highest-weighted words to show per topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        message = 'topic #%d:' % topic_idx
        # argsort()[:-n_top_words-1:-1] walks the indices of the
        # n_top_words largest weights in descending order.
        # BUG FIX: the original joined `feature_names` (the whole
        # sequence) instead of indexing with `feature_names[i]`.
        message += ' '.join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
        print()