# Search-engine mode: word-segment every document in the dataset
import os

import jieba
import pandas as pd


def data_segment(df):
    seg_list = []                           # segmented documents (avoid shadowing the built-in list)
    for item in df:
        seg = jieba.cut_for_search(item)    # fine-grained cut, as used for search-engine indexing
        seg_str = " ".join(seg)             # join the tokens with spaces
        seg_list.append(seg_str)
    dff = pd.DataFrame(seg_list, columns=["context"])
    return dff
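# Usage sketch for data_segment (the sample sentences below are illustrative
# only; the real dataset is loaded elsewhere in the tutorial):
# sample = ["我来到北京清华大学", "小明硕士毕业于中国科学院计算所"]
# print(data_segment(sample)["context"].tolist())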
def text_segment(df, seg_path):
    """
    df holds the unsegmented corpus, one document per element;
    seg_path is the directory where the segmented corpus is written.
    """
    i = 1
    if not os.path.exists(seg_path):
        os.makedirs(seg_path)
    for item in df:
        seg = jieba.cut(item)                       # default (accurate) mode segmentation
        seg_str = ",".join(seg)                     # join the tokens with commas
        class_path = str(i) + ".txt"                # one output file per document: 1.txt, 2.txt, ...
        savefile(os.path.join(seg_path, class_path), seg_str)
        i += 1
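# text_segment relies on a savefile helper that this snippet does not define
# (in the original tutorial it presumably lives in a shared utilities module).
# A minimal sketch so the code above runs standalone:
def savefile(save_path, content):
    """Write a text string to save_path using UTF-8 encoding."""
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(content)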
from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes classifier
from sklearn import metrics
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
# Read a pickled Bunch object from disk
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch
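# The imports above set up the training step of the tutorial: load the pickled
# TF-IDF Bunch objects (presumably built earlier with TfidfVectorizer), fit a
# multinomial naive Bayes model, and score it with sklearn.metrics. The Bunch
# attribute names (tdm, label), the alpha value, and the paths are assumptions
# for illustration, not part of the code shown here.
def train_and_evaluate(train_bunch_path, test_bunch_path):
    train_set = _readbunchobj(train_bunch_path)   # assumed: Bunch with .tdm (TF-IDF matrix) and .label
    test_set = _readbunchobj(test_bunch_path)

    clf = MultinomialNB(alpha=0.001)              # small alpha: light Laplace smoothing
    clf.fit(train_set.tdm, train_set.label)

    predicted = clf.predict(test_set.tdm)
    print(metrics.classification_report(test_set.label, predicted))
    return clf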