from numpy import *
import codecs,re
from sklearn.naive_bayes import MultinomialNB
def createVocabList(dadaSet):
    """Create a list of the distinct tokens that appear in any document.

    Args:
        dadaSet: Iterable of documents, where each document is an iterable
            of hashable tokens (typically a list of word strings).

    Returns:
        A list holding every distinct token exactly once. The order is
        unspecified because it comes from iterating a set.
    """
    vocabset = set()
    for document in dadaSet:
        # Update in place: `vocabset | set(document)` would copy the whole
        # accumulated vocabulary for every document.
        vocabset.update(document)
    return list(vocabset)
def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a binary presence vector (set-of-words model).

    Args:
        vocabList: Sequence of vocabulary tokens; position i of the result
            corresponds to vocabList[i]. Tokens must be hashable.
        inputSet: Iterable of tokens from the document to encode.

    Returns:
        A list of len(vocabList) integers where entry i is 1 if vocabList[i]
        occurs in inputSet and 0 otherwise. Tokens missing from the
        vocabulary are reported on stdout and otherwise ignored.
    """
    # Build the token -> index map once instead of scanning the list twice
    # per word. setdefault keeps the FIRST position of a duplicated token,
    # which is what list.index() would have returned.
    indexOf = {}
    for position, token in enumerate(vocabList):
        indexOf.setdefault(token, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = indexOf.get(word)
        if position is not None:
            returnVec[position] = 1
        else:
            # Wrap in a 1-tuple so a tuple-valued token is formatted as a
            # single value instead of raising TypeError.
            print("the word:%s is not in my Vocabulary" % (word,))
    return returnVec
def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a document into a token-count vector (bag-of-words model).

    Args:
        vocabList: Sequence of vocabulary tokens; position i of the result
            corresponds to vocabList[i]. Tokens must be hashable.
        inputSet: Iterable of tokens from the document to encode; repeated
            tokens are counted each time they occur.

    Returns:
        A list of len(vocabList) integers where entry i is the number of
        times vocabList[i] occurs in inputSet. Tokens missing from the
        vocabulary are silently ignored.
    """
    # Build the token -> index map once instead of scanning the list twice
    # per word. setdefault keeps the FIRST position of a duplicated token,
    # which is what list.index() would have returned.
    indexOf = {}
    for position, token in enumerate(vocabList):
        indexOf.setdefault(token, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = indexOf.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec
def textParse(bigString):  # input is big string, output is word list
    """Split raw text into lowercase word tokens longer than two characters.

    Args:
        bigString: The text to tokenize.

    Returns:
        A list of lowercased tokens, in order of appearance, keeping only
        those with more than two characters.
    """
    # Use \W+ rather than \W*: a pattern that can match the empty string makes
    # re.split (Python 3.7+) cut between every character, so every token would
    # be at most one character long and the length filter would drop them all.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]