python scrapy爬虫数据保存详解
2022年5月30日大约 1 分钟约 447 字
把item数据导出到json文件中
vim ArticleSpider/pipelines.py
import codecs # 文件操作模块
import json
# 把item保存到json文件
class JsonWithEncodingPipeline(object):
    """Item pipeline that writes each item to ``article.json``, one JSON object per line."""

    def __init__(self):
        # Opened once for the whole crawl; every item is written to this handle.
        self.file = codecs.open("article.json", 'w', encoding="utf-8")

    def process_item(self, item, spider):
        """Serialize ``item`` as a single JSON line and return it unchanged.

        Args:
            item: The scraped item; anything ``dict()`` can convert.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so that later pipelines can keep processing it.
        """
        # ensure_ascii=False keeps non-ASCII text (e.g. Chinese) human-readable
        # instead of emitting \uXXXX escapes.
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes.

        ``close_spider`` is the hook name Scrapy invokes on item pipelines;
        the previous ``spider_closed`` name was never called automatically,
        so the file was left open until interpreter shutdown.
        """
        self.file.close()

    def spider_closed(self, spider):
        """Backward-compatible alias for :meth:`close_spider`."""
        self.close_spider(spider)
注册到item管道配置中
vim ArticleSpider/settings.py
ITEM_PIPELINES = {
    # ... keep any pipelines that are already registered here ...
    'ArticleSpider.pipelines.JsonWithEncodingPipeline': 2,
}
使用自带的模块导出json文件
>>> vim ArticleSpider/pipelines.py
from scrapy.exporters import JsonItemExporter
class JsonExporterPipeline(object):
    """Item pipeline that exports items to ``articleexport.json`` via Scrapy's ``JsonItemExporter``."""

    def __init__(self):
        # The exporter is handed a file opened in binary mode ("wb").
        self.file = open("articleexport.json", "wb")
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        """Hand ``item`` to the exporter and return it unchanged.

        Args:
            item: The scraped item to export.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so that later pipelines can keep processing it.
        """
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the export and close the output file.

        The file is closed in a ``finally`` clause so the handle is released
        even if ``finish_exporting()`` raises.
        """
        try:
            self.exporter.finish_exporting()
        finally:
            self.file.close()
注册到item管道配置中
vim ArticleSpider/settings.py
ITEM_PIPELINES = {
    # ... keep any pipelines that are already registered here ...
    'ArticleSpider.pipelines.JsonExporterPipeline': 2,
}
使用mysql保存
# 安装mysql驱动
>>> pip install mysqlclient
# CentOS 上需要先安装下面的开发库(编译 mysqlclient 时依赖),然后再执行上面的 pip install
>>> sudo yum install python-devel mysql-devel
使用同步的机制写入mysql
import MySQLdb
class MysqlPipeline(object):
    """Item pipeline that inserts each item into the ``article`` table synchronously."""

    def __init__(self):
        # NOTE(review): connection parameters are hard-coded here; consider
        # moving them into settings.py rather than keeping credentials in code.
        self.conn = MySQLdb.connect('dongfe.com', 'root', 'Xiong123!@#', 'article_spider', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert ``item`` into ``article``, commit, and return the item.

        Args:
            item: Mapping providing ``title``, ``url``, ``add_time`` and ``star``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``. Returning it is required so that pipelines
            registered after this one still receive the item.
        """
        insert_sql = """
            insert into article(title, url, add_time, star)
            values (%s, %s, %s, %s)
        """
        # Values are passed separately so the driver performs the escaping.
        self.cursor.execute(insert_sql, (item['title'], item['url'], item['add_time'], item['star']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        try:
            self.cursor.close()
        finally:
            self.conn.close()
使用异步的机制写入mysql
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
class MysqlTwistedPipline(object):
    """Item pipeline that inserts items into MySQL through a Twisted ``adbapi`` connection pool."""

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the project settings.

        Connection values are read from ``MYSQL_HOST``, ``MYSQL_DBNAME``,
        ``MYSQL_USER`` and ``MYSQL_PASSWORD``. When a key is absent the
        previously hard-coded value is used, so existing projects keep
        working without any settings change.

        Args:
            settings: Object exposing ``get(name, default)`` (the settings
                passed in by Scrapy).

        Returns:
            A pipeline instance wrapping a new ``adbapi.ConnectionPool``.
        """
        # NOTE(review): the fallback credentials are kept only for backward
        # compatibility; prefer defining the MYSQL_* keys in settings.py.
        dbparms = dict(
            host=settings.get("MYSQL_HOST", "dongfe.com"),
            db=settings.get("MYSQL_DBNAME", "article_spider"),
            user=settings.get("MYSQL_USER", "root"),
            passwd=settings.get("MYSQL_PASSWORD", "Xiong123!@#"),
            charset="utf8",
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor that delegates to :meth:`from_settings`."""
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        """Schedule the insert on the connection pool and return the item.

        Args:
            item: Mapping providing ``title``, ``url``, ``add_time`` and ``star``.
            spider: The spider that produced the item; forwarded to the
                error handler.

        Returns:
            The same ``item``. Returning it is required so that pipelines
            registered after this one still receive the item.
        """
        query = self.dbpool.runInteraction(self.do_insert, item)
        # Report insert failures instead of letting them go unnoticed.
        query.addErrback(self.handle_error, item, spider)
        return item

    def do_insert(self, cursor, item):
        """Run the actual INSERT using the cursor supplied by ``runInteraction``."""
        insert_sql = """
            insert into article(title, url, add_time, star)
            values (%s, %s, %s, %s)
        """
        cursor.execute(insert_sql, (item['title'], item['url'], item['add_time'], item['star']))

    def handle_error(self, failure, item, spider):
        """Log a failed insert through the spider's logger."""
        spider.logger.error("MySQL insert failed: %s", failure)

    def close_spider(self, spider):
        """Close the connection pool when the spider finishes."""
        self.dbpool.close()