def create_vocab_list(self, data_set):
    """Build the vocabulary: every unique word across all documents."""
    vocab_set = set()
    for item in data_set:
        # union with each document's word set
        vocab_set = vocab_set | set(item)
    # list of words with no duplicate elements
    return list(vocab_set)
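# Example (hypothetical input): with data_set = [['my', 'dog'], ['my', 'cat']],
# create_vocab_list returns the three unique words, e.g. ['cat', 'dog', 'my'];
# note that set iteration order is arbitrary, so the list order is not stable.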
def set_of_words2vec(self, vocab_list, input_set):
    """Set-of-words model: binary vector marking which vocabulary words appear."""
    result = [0] * len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            # mark 1 if the word appears in the input document, otherwise 0
            result[vocab_list.index(word)] = 1
    return result
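# Example (hypothetical input): with vocab_list = ['cat', 'dog', 'my'],
# set_of_words2vec(vocab_list, ['my', 'dog', 'my']) returns [0, 1, 1] --
# presence only, so the repeated 'my' is still marked as 1, not counted.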
def bag_words_to_vec(self, vocab_list, input_set):
    """Bag-of-words model: count how many times each vocabulary word occurs."""
    result = [0] * len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            result[vocab_list.index(word)] += 1
        else:
            print('the word: {} is not in my vocabulary'.format(word))
    return result
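# NOTE: train_naive_bayes and classify_naive_bayes are called in
# testing_naive_bayes below but are defined outside this section. The sketch
# that follows is a minimal stand-in assuming the usual formulation (Laplace
# smoothing plus log probabilities to avoid underflow); it matches the call
# sites below in signature and return order, but the actual implementation
# may differ.
def train_naive_bayes(self, train_mat, train_category):
    """Estimate per-word log-likelihoods and the abusive-class prior."""
    num_docs = len(train_mat)
    num_words = len(train_mat[0])
    # P(class = 1): fraction of training documents labeled abusive
    p_abusive = np.sum(train_category) / float(num_docs)
    # Laplace smoothing: counts start at 1 and denominators at 2,
    # so no word ever gets a zero probability
    p0_num = np.ones(num_words)
    p1_num = np.ones(num_words)
    p0_denom = 2.0
    p1_denom = 2.0
    for i in range(num_docs):
        if train_category[i] == 1:
            p1_num += train_mat[i]
            p1_denom += np.sum(train_mat[i])
        else:
            p0_num += train_mat[i]
            p0_denom += np.sum(train_mat[i])
    # work in log space: sums of logs replace products of tiny numbers
    p1_vec = np.log(p1_num / p1_denom)
    p0_vec = np.log(p0_num / p0_denom)
    return p0_vec, p1_vec, p_abusive

def classify_naive_bayes(self, vec_to_classify, p0_vec, p1_vec, p_class1):
    """Pick the class with the larger log-posterior."""
    p1 = np.sum(vec_to_classify * p1_vec) + np.log(p_class1)
    p0 = np.sum(vec_to_classify * p0_vec) + np.log(1.0 - p_class1)
    return 1 if p1 > p0 else 0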
def testing_naive_bayes(self):
    """End-to-end check: train on the toy posts, then classify two test posts."""
    # load_data_set (defined elsewhere) returns tokenized posts and their labels
    list_post, list_classes = self.load_data_set()
    vocab_list = self.create_vocab_list(list_post)
    train_mat = []
    for post_in in list_post:
        train_mat.append(
            self.set_of_words2vec(vocab_list, post_in)
        )
    # assumes numpy is imported at module level as np
    p0v, p1v, p_abusive = self.train_naive_bayes(np.array(train_mat), np.array(list_classes))
    test_one = ['love', 'my', 'dalmation']
    test_one_doc = np.array(self.set_of_words2vec(vocab_list, test_one))
    print('the result is: {}'.format(self.classify_naive_bayes(test_one_doc, p0v, p1v, p_abusive)))
    test_two = ['stupid', 'garbage']
    test_two_doc = np.array(self.set_of_words2vec(vocab_list, test_two))
    print('the result is: {}'.format(self.classify_naive_bayes(test_two_doc, p0v, p1v, p_abusive)))
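# Usage sketch: the enclosing class is not shown in this section, so
# 'NaiveBayes' below is a hypothetical placeholder name.
# if __name__ == '__main__':
#     nb = NaiveBayes()
#     nb.testing_naive_bayes()
# Each print above reports the predicted class (0 or 1) for one test post.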