Python Scrapy spider middleware configuration
May 30, 2022
Enable a custom spider middleware
>>> vim ArticleSpider/settings.py
# Uncomment this block
SPIDER_MIDDLEWARES = {
    'ArticleSpider.middlewares.ArticlespiderSpiderMiddleware': 543,
}
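The integer value (543 here) sets the middleware's position in the chain: lower values run closer to the engine, higher values closer to the spider. A built-in spider middleware can also be switched off by mapping it to None. A minimal sketch of both ideas, assuming you also want to disable Scrapy's built-in OffsiteMiddleware (that choice is an example, not part of this project):

# ArticleSpider/settings.py (sketch)
SPIDER_MIDDLEWARES = {
    # Custom middleware, placed at order 543
    'ArticleSpider.middlewares.ArticlespiderSpiderMiddleware': 543,
    # Mapping a built-in middleware to None disables it
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': None,
}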
>>> vim ArticleSpider/middlewares.py
class ArticlespiderSpiderMiddleware(object):

    @classmethod
    def from_crawler(cls, crawler):
        # Called by the middleware manager to create the middleware instance.
        return cls()

    def process_spider_input(self, response, spider):
        # Called for each Response the Engine sends to the Spider
        # (Engine -> Spider).
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results (Items/Requests) the Spider returns to
        # the Engine after processing the current Response; the Response
        # the Spider just handled is passed in as well.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or a process_spider_input() method
        # (from another spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider; works like
        # process_spider_output(), except that it has no associated
        # Response.
        # Must return only requests (not items).
        for r in start_requests:
            yield r
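To see where these hooks are useful, here is a minimal sketch of a spider middleware whose process_spider_output drops Requests to unwanted domains while letting items and other requests through. The class name, blocklist, and logging are illustrative assumptions, not part of the project above:

import logging
from urllib.parse import urlparse

from scrapy import Request

logger = logging.getLogger(__name__)


class DomainFilterSpiderMiddleware(object):
    # Hypothetical blocklist: hosts we never want to schedule
    BLOCKED_HOSTS = {'ads.example.com'}

    def process_spider_output(self, response, result, spider):
        for obj in result:
            # Drop Requests to blocked hosts; everything else
            # (items and other requests) passes through unchanged
            if isinstance(obj, Request) and urlparse(obj.url).netloc in self.BLOCKED_HOSTS:
                logger.debug('Dropping request to %s', obj.url)
                continue
            yield obj

Register it in SPIDER_MIDDLEWARES the same way as above, e.g. 'ArticleSpider.middlewares.DomainFilterSpiderMiddleware': 544, so it runs right after the first middleware.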