# -*- coding:utf-8 -*-
import numpy as np
import random
import re
__author__ = 'yangxin'
"""
过滤垃圾邮件
"""
class FilterSpam(object):
    """Bag-of-words text helpers used for spam filtering.

    The methods do not read any instance state; they are kept as instance
    methods so existing callers (``FilterSpam().text_parse(...)``) keep
    working unchanged.
    """

    def text_parse(self, big_str):
        """Split text into lowercase tokens longer than two characters.

        The text is split on runs of non-word characters (regex ``\\W+``).
        Tokens of length 0, 1 or 2 are dropped, which also discards the
        empty strings ``re.split`` produces at the edges of the input.

        Args:
            big_str: The raw text to tokenize.

        Returns:
            A list of lowercase tokens, in order of appearance.
        """
        # re.split always returns at least one element (possibly ''), so no
        # emptiness check is needed; the length filter handles '' tokens.
        token_list = re.split(r'\W+', big_str)
        return [tok.lower() for tok in token_list if len(tok) > 2]

    def create_vocab_list(self, data_set):
        """Build the list of unique words found across all documents.

        Args:
            data_set: An iterable of documents, each an iterable of words.

        Returns:
            A list holding every distinct word exactly once. The order is
            the iteration order of a set and should not be relied upon.
        """
        vocab_set = set()
        for document in data_set:
            # In-place update avoids building a new set per document.
            vocab_set.update(document)
        return list(vocab_set)

    def set_of_words_to_vec(self, vocab_list, input_set):
        """Encode a document as a 0/1 presence vector over the vocabulary.

        Args:
            vocab_list: The vocabulary; position ``i`` of the result
                corresponds to ``vocab_list[i]``. Words must be hashable.
            input_set: An iterable of words from the document to encode.

        Returns:
            A list of ``len(vocab_list)`` ints where an entry is 1 if the
            corresponding vocabulary word occurs in ``input_set`` and 0
            otherwise. Words absent from the vocabulary are ignored. If a
            word appears more than once in ``vocab_list``, only its first
            position is marked.
        """
        # Map each word to its first position once, instead of scanning the
        # vocabulary list (``in`` + ``.index``) for every input word.
        first_index = {}
        for position, word in enumerate(vocab_list):
            first_index.setdefault(word, position)

        result = [0] * len(vocab_list)
        for word in input_set:
            position = first_index.get(word)
            if position is not None:
                # The word occurs in the input document: mark it as present.
                result[position] = 1
        return result