网上搜的过滤敏感词的一段代码,测试后发现在过滤不干净。各位大神有空帮看下哪里的问题吧。
[Python] 纯文本查看 复制代码 import time
time1=time.time()
# AC自动机算法
class node(object):
def __init__(self):
self.next = {}
self.fail = None
self.isWord = False
self.word = ""
class ac_automation(object):
def __init__(self):
self.root = node()
# 添加敏感词函数
def addword(self, word):
temp_root = self.root
for char in word:
if char not in temp_root.next:
temp_root.next[char] = node()
temp_root = temp_root.next[char]
temp_root.isWord = True
temp_root.word = word
# 失败指针函数
def make_fail(self):
temp_que = []
temp_que.append(self.root)
while len(temp_que) != 0:
temp = temp_que.pop(0)
p = None
for key,value in temp.next.item():
if temp == self.root:
temp.next[key].fail = self.root
else:
p = temp.fail
while p is not None:
if key in p.next:
temp.next[key].fail = p.fail
break
p = p.fail
if p is None:
temp.next[key].fail = self.root
temp_que.append(temp.next[key])
# 查找敏感词函数
def search(self, content):
p = self.root
result = []
currentposition = 0
while currentposition < len(content):
word = content[currentposition]
while word in p.next == False and p != self.root:
p = p.fail
if word in p.next:
p = p.next[word]
else:
p = self.root
if p.isWord:
result.append(p.word)
p = self.root
currentposition += 1
return result
# 加载敏感词库函数
def parse(self, path):
with open(path,encoding='utf-8') as f:
for keyword in f:
self.addword(str(keyword).strip())
# 敏感词替换函数
def words_replace(self, text):
"""
:param ah: AC自动机
:param text: 文本
:return: 过滤敏感词之后的文本
"""
result = list(set(self.search(text)))
for x in result:
m = text.replace(x, '*' * len(x))
text = m
return text
if __name__ == '__main__':
ah = ac_automation()
path='e:/baidu_filter.txt'
ah.parse(path)
filename="e:/lbj.txt"
fp=open(filename,'r')
data=fp.read()
text1=data
text2=ah.words_replace(text1)
rs=open("e:/rs.txt","w")
rs.write(text2)
rs.close()
print(text1)
print(text2)
time2 = time.time()
下面是词库 [词库](链接:pan.baidu.com/s/1QYviLOHIkxKiDUYCf... 提取码:tf5q 复制这段内容后打开百度网盘手机 App,操作更方便哦–来自百度网盘超级会员 V4 的分享 “词库”)
这里是原文链接:pan.baidu.com/s/193rF_C7fg5W_hFR6T...
提取码:xojo
原文里的 非典 两个词不知道为什么 过滤不掉
————————————————
原文作者:70h_org
转自链接:https://learnku.com/python/t/51555
|