Python 实现自动抓取某站点内所有超链接
(仅供学习使用)
代码部分
#!/usr/bin/python
import requests
import time
import re
import sys, getopt #命令行选项
from bs4 import BeautifulSoup
# Timestamp taken once when the module is loaded; main() prints it next to
# the link count. strftime() formats the current local time by default.
localtime = time.strftime("%Y-%m-%d %H:%M:%S")
# Raw href values collected from the page's <a> elements.
z = list()
# Cleaned-up links derived from z (redundant parts removed).
x = list()
def _collect_links(base_url, hrefs):
    """Turn raw ``href`` values into the list of links to report.

    Args:
        base_url: URL of the page the anchors came from; relative links are
            resolved against it.
        hrefs: Iterable of raw ``href`` attribute values. ``None`` entries
            (anchors that have no ``href``) are allowed and skipped.

    Returns:
        A list of link strings, in input order.
    """
    # Function-scope import keeps this block independent of the file header.
    from urllib.parse import urljoin

    links = []
    for href in hrefs:
        if href is None:
            # An <a> without href used to be reported as base_url + "None".
            continue
        link = str(href).strip()
        lowered = link.lower()
        if lowered.startswith('javascript:'):
            # Script pseudo-links are not navigable hyperlinks.
            continue
        if link.startswith('//'):
            # Protocol-relative link: reported in scheme-less host form.
            links.append(link[2:])
        elif link.startswith('www') or lowered.startswith(('http://', 'https://')):
            # Already host-qualified or absolute: report verbatim.
            links.append(link)
        else:
            # Root-relative or relative path: resolve against the page URL
            # (plain concatenation produced e.g. "http://a.comabout").
            links.append(urljoin(base_url, link))
    return links


def main(argv):
    """Fetch a page, list every hyperlink on it, and optionally save the list.

    Command line: ``allsite.py -u <url> -f <file>``

    Args:
        argv: Command-line arguments without the program name. Recognised
            options are ``-h``, ``-u/--url <url>`` and ``-f/--file <path>``.
            Options may be given in any order.

    Raises:
        SystemExit: with status 2 on an unknown option or when no URL is
            given; with no status after printing the help text for ``-h``.

    Side effects:
        Performs an HTTP GET, prints the links to stdout, appends the raw
        hrefs to the module-level ``z`` and the cleaned links to ``x``, and
        appends one link per line to the ``-f`` file when it is given.
    """
    usage = 'allsite.py -u <url> -f <file>'
    url = ''  # page to scan
    file_path = ''  # optional output file
    try:
        opts, _unused_args = getopt.getopt(argv, "hu:f:", ["url=", "file="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    # Parse every option before doing any work so that "-f out -u url" and
    # "-u url -f out" behave the same.
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-u", "--url"):
            url = arg
        elif opt in ("-f", "--file"):
            file_path = arg

    if not url:
        # Without a URL there is nothing to fetch.
        print(usage)
        sys.exit(2)

    # A timeout prevents the script from hanging forever on a dead server.
    response = requests.get(url, timeout=30)
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, 'html.parser')
    hrefs = [anchor.get('href') for anchor in soup.find_all('a')]
    links = _collect_links(url, hrefs)

    # Keep the module-level lists populated for code that reads them.
    z.extend(hrefs)
    x.extend(links)

    print(localtime + " 总共:" + str(len(links)) + "个网址")
    for link in links:
        print(link)

    if file_path:
        # Open once (not once per link); UTF-8 so non-ASCII links are safe.
        with open(file_path, 'a', encoding='utf-8') as file_object:
            file_object.writelines(link + '\n' for link in links)
# Run the scraper only when this file is executed as a script (not when it
# is imported); the program name in sys.argv[0] is not passed on.
if __name__ == "__main__":
    main(argv=sys.argv[1:])
|
|