At work today I ran into a rather fiddly website that involves AJAX and the `params` argument of pyspider's self.crawl. The AJAX analysis is the same as in Record 1 (my earlier post), except that this site issues GET requests instead of POST, so instead of the request-body `data` argument we pass `params` to self.crawl; for details see the official documentation or the Chinese docs (http://www.pyspider.cn/book/pyspider/self.crawl-16.html). The code is fairly simple, just somewhat tedious, so rather than walking through it step by step I'll post it in full.
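For reference, the whole difference comes down to one argument of self.crawl. A minimal sketch of both calls as they would appear inside a handler callback (the URL and the `page` parameter are placeholders, not the names used below):

# POST: parameters travel in the request body via `data`
self.crawl('http://example.com/api', method='POST',
           data={'page': 1}, callback=self.parse)

# GET (this site's case): parameters go into the URL query string via `params`
self.crawl('http://example.com/api', params={'page': 1},
           callback=self.parse)

The full script follows.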
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-06-06 14:10:21
# Project: zhejiang

from pyspider.libs.base_handler import *
from pymongo import MongoClient
import datetime
import re

DB_IP = '127.0.0.1'
DB_PORT = 27017
DB_NAME = 'research'
DB_COL = 'zhejiang'

client = MongoClient(host=DB_IP, port=DB_PORT)
db = client[DB_NAME]
col = db[DB_COL]
class Handler(BaseHandler):
    url = 'http://zfxxgk.zj.gov.cn/web1/site/col/col62/index.html'

    crawl_config = {
        "headers": {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
        }
    }
    def format_date(self, date):
        return datetime.datetime.strptime(date, '%Y-%m-%d')
    @every(minutes=24 * 60)
    def on_start(self):
        # the index page is rendered by JavaScript, so fetch it with fetch_type='js'
        self.crawl(self.url, fetch_type='js', callback=self.index_page)
    @config(age=60)
    def index_page(self, response):
        page = response.etree
        # collect department names and links; the leading entries are site
        # navigation, hence the hard-coded offsets
        department_list = page.xpath("//td//a/text()")[8:]
        department_url = page.xpath("//td//a/@href")[12:]
        print len(department_list), len(department_url)
        for title, address in zip(department_list, department_url):
            print title, address
            save = {"depart_url": address,
                    "depart_title": title
                    }
            # print save["depart_title"]
            address = 'http://zfxxgk.zj.gov.cn' + address
            self.crawl(address, callback=self.parse_department, fetch_type='js', save=save)
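    # `save` is attached to the queued task and comes back in the next
    # callback as response.save (used by parse_department below).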
    def parse_department(self, response):
        page = response.etree
        # pagination: parse the total page count out of the "共N页" header text
        page_num_str = page.xpath("//table[@class='tb_title']/tbody/tr/td[1]/text()")[0].encode('utf-8')
        page_num = int(re.findall('共(\d+)页', page_num_str)[0])
        base_url = 'http://zfxxgk.zj.gov.cn/xxgk/jcms_files/jcms1' + response.save["depart_url"] + 'zfxxgk/search.jsp?'
        # read cid from the hidden search form
        cid = int(page.xpath("//form[@id='searchform']/input[@name='cid']/@value")[0])
        print cid
        # read jdid
        jdid = int(page.xpath("//form[@id='searchform']/input[@name='jdid']/@value")[0])
        print jdid
        # read divid
        divid = page.xpath("//form[@id='searchform']/input[@name='divid']/@value")[0]
        print divid
        # query-string parameters for the GET request
        params = {"showsub": 0,
                  "orderbysub": 0,
                  "cid": cid,
                  "vc_title": "",
                  "vc_number": "",
                  "binlay": "",
                  "c_issuetime": "",
                  "jdid": jdid,
                  "divid": divid,
                  "vc_keyword": "",
                  "vc_abs": "",
                  "vc_ztfl": "",
                  "vc_service": "",
                  "c_createtime": ""
                  }
        save = {"categories": response.save["depart_title"]}
        for each in range(1, page_num + 1):
            page_url = base_url + 'currpage={}&'.format(each)
            # print page_url
            self.crawl(page_url, callback=self.parse_page, params=params, save=save)
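    # Note: pyspider url-encodes the `params` dict and appends it to each GET
    # URL, so page 1 is fetched as something like (dict key order not
    # guaranteed):
    #   .../zfxxgk/search.jsp?currpage=1&showsub=0&cid=<cid>&jdid=<jdid>&divid=<divid>&...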
    def parse_page(self, response):
        page = response.etree
        categories = [response.save["categories"]]
        content_list = page.xpath("//tr[@class='tr_main_value_odd' or @class='tr_main_value_even']")
        for each in content_list:
            content_title = each.xpath("./td[1]/a/text()")[0].encode('utf-8')
            content_url = each.xpath("./td[1]/a/@href")[0]
            content_date = each.xpath("./td[2]/text()")[0]
            print content_title, content_url, content_date
            save = {"title": content_title,
                    "url": content_url,
                    # don't format the date here: `save` is serialized in
                    # transit, so a datetime would arrive at the next
                    # callback as a plain string
                    "date": content_date,
                    "categories": categories
                    }
            self.crawl(content_url, callback=self.parse_body, save=save)
    def parse_body(self, response):
        page = response.etree
        # concatenate every text node on the page into one body string
        body_list = page.xpath("//text()")
        body = ''
        for each in body_list:
            body += each.strip().encode('utf-8')
        result = {"title": response.save["title"],
                  "categories": response.save["categories"],
                  "date": self.format_date(response.save["date"]),
                  "url": response.save["url"],
                  "body": body,
                  "update_time": datetime.datetime.now(),
                  "source": "浙江省人民政府"
                  }
        yield result
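    # Every result yielded by a callback is handed to on_result below, where
    # it is upserted into MongoDB keyed on (date, title).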
    def on_result(self, result):
        if result is None:
            return
        # print result
        update_key = {
            'date': result['date'],
            'title': result['title']
        }
        col.update(update_key, {'$set': result}, upsert=True)
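A side note on storage: pymongo deprecated `Collection.update` in 3.0, so on a newer driver the last line of on_result would become:

    # pymongo 3.x replacement for the deprecated Collection.update
    col.update_one(update_key, {'$set': result}, upsert=True)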
---------------------
Author: Never-Giveup
Source: CSDN
Original: https://blog.csdn.net/qq_36653505/article/details/80600497
Copyright: this is the blogger's original article; please include a link to the original when reposting.