class ProxyExampleSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def process_spider_input(self, response, spider):
if 200 <= response.status < 300: # common case
return
meta = response.meta
if 'handle_httpstatus_all' in meta:
return
if 'handle_httpstatus_list' in meta:
allowed_statuses = meta['handle_httpstatus_list']
elif self.handle_httpstatus_all:
return
else:
allowed_statuses = getattr(spider, 'handle_httpstatus_list', self.handle_httpstatus_list)
if response.status in allowed_statuses:
return
raise HttpError(response, 'Ignoring non-200 response')
def process_spider_exception(self, response, exception, spider):
if isinstance(exception, HttpError):
spider.crawler.stats.inc_value('httperror/response_ignored_count')
spider.crawler.stats.inc_value(
'httperror/response_ignored_status_count/%s' % response.status
)
logger.info(
"Ignoring response %(response)r: HTTP status code is not handled or not allowed",
{'response': response}, extra={'spider': spider},
)
return []
开启自定义spider中间件方式,在配置文件setting.py中添加命名为SPIDER_MIDDLEWARES的字典,其中key为下载器路径,value为优先级,数字越小越靠近引擎,process_spider_input()优先处理,数字越大越靠近spider,process_spider_output()优先处理
class ProxyExampleDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('COOKIES_ENABLED'):
raise NotConfigured
return cls(crawler.settings.getbool('COOKIES_DEBUG'))
def process_request(self, request, spider):
if request.meta.get('dont_merge_cookies', False):
return
cookiejarkey = request.meta.get("cookiejar")
jar = self.jars[cookiejarkey]
cookies = self._get_request_cookies(jar, request)
for cookie in cookies:
jar.set_cookie_if_ok(cookie, request)
# set Cookie header
request.headers.pop('Cookie', None)
jar.add_cookie_header(request)
self._debug_cookie(request, spider)
def process_response(self, request, response, spider):
if request.meta.get('dont_merge_cookies', False):
return response
# extract cookies from Set-Cookie and drop invalid/expired cookies
cookiejarkey = request.meta.get("cookiejar")
jar = self.jars[cookiejarkey]
jar.extract_cookies(response, request)
self._debug_set_cookie(response, spider)
return response
def _debug_cookie(self, request, spider):
if self.debug:
cl = [to_native_str(c, errors='replace')
for c in request.headers.getlist('Cookie')]
if cl:
cookies = "\n".join("Cookie: {}\n".format(c) for c in cl)
msg = "Sending cookies to: {}\n{}".format(request, cookies)
logger.debug(msg, extra={'spider': spider})
def _debug_set_cookie(self, response, spider):
if self.debug:
cl = [to_native_str(c, errors='replace')
for c in response.headers.getlist('Set-Cookie')]
if cl:
cookies = "\n".join("Set-Cookie: {}\n".format(c) for c in cl)
msg = "Received cookies from: {}\n{}".format(response, cookies)
logger.debug(msg, extra={'spider': spider})