Python Scrapy crawler: simulating user login
May 30, 2022
requests: simulating login
from http import cookiejar
import requests

session = requests.session()  # persistent session, reuses the underlying connection
session.cookies = cookiejar.LWPCookieJar(filename="cookies.txt")  # persist cookies to a local file
try:
    session.cookies.load(ignore_discard=True)  # load previously saved cookies
except FileNotFoundError:
    print("cookies could not be loaded")

header = {  # mimic a browser's request headers
    "HOST": "tai.yunhongkg.com",
    "Referer": "http://tai.yunhongkg.com/Home/User/login.html",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}

def taijiayuan_login(account, password):
    # Log in to taijiayuan (肽家园).
    # First check whether we are already logged in (on later runs the cookies
    # come from the file, so no fresh login is needed).
    inbox_url = "http://tai.yunhongkg.com/Home/User/index.html"  # user-profile page
    response = session.get(inbox_url, headers=header, allow_redirects=False)  # do not follow redirects
    if response.status_code != 200:  # a 302 to the login page means we are not logged in; log in now
        post_url = "http://tai.yunhongkg.com/index.php?m=Home&c=User&a=do_login"
        post_data = {
            "username": account,
            "password": password
        }
        response = session.post(post_url, data=post_data, headers=header)
        session.cookies.save(ignore_discard=True)  # ignore_discard keeps session cookies in the file
    else:  # a 200 means the profile page is reachable, so we are already logged in
        return True
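A minimal usage sketch; the credentials are placeholders, and on a second run the saved cookies.txt makes the login POST unnecessary:

if taijiayuan_login("my_account", "my_password"):  # hypothetical credentials
    print("already logged in via saved cookies")
else:
    print("performed a fresh login and saved cookies.txt")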
scrapy: simulating login
import scrapy
import re
import json

class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']
    header = {  # mimic a browser's request headers
        "HOST": "www.zhihu.com",
        "Referer": "https://www.zhihu.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }

    def parse(self, response):
        pass

    def start_requests(self):  # spider entry point: issue the first request
        # Fetch the login page first.
        return [scrapy.Request("https://www.zhihu.com/#signin", headers=self.header, callback=self.login)]  # without a callback, parse() would be called by default

    def login(self, response):
        response_text = response.text
        xsrf = ''
        match_obj = re.match('.*name="_xsrf" value="(.*?)"', response_text, re.DOTALL)
        if match_obj:
            xsrf = match_obj.group(1)
        post_url = "https://www.zhihu.com/login/phone_num"  # the login POST endpoint
        post_data = {
            "phone_num": "addin",
            "password": "pas123",
            "_xsrf": xsrf
        }
        return [scrapy.FormRequest(  # log in via POST
            url=post_url,
            formdata=post_data,
            headers=self.header,
            callback=self.check_login
        )]

    def check_login(self, response):
        # Login callback: verify that the login succeeded.
        text_json = json.loads(response.text)
        if "msg" in text_json and text_json["msg"] == "登陆成功":  # "登陆成功" means "login successful"
            for url in self.start_urls:  # start_requests() is overridden, so kick off the real crawl here
                yield scrapy.Request(url, dont_filter=True, headers=self.header)  # no callback, so parse() handles the response
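The regular expression above also has a selector-based equivalent; a small sketch using Scrapy's own css selector API on the same login page:

# Equivalent _xsrf extraction with Scrapy's css selectors instead of re:
xsrf = response.css('input[name="_xsrf"]::attr(value)').extract_first(default='')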
selenium: simulating login on dynamic pages
Chrome is the recommended browser.
A similar framework: splinter
selenium: basic usage
from selenium import webdriver
# Use the 360 Speed Browser (a Chromium-based browser) as the binary
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.binary_location = r"C:\Users\xiong\AppData\Local\360Chrome\Chrome\Application\360chrome.exe"
browser = webdriver.Chrome(executable_path="C:/chromedriver.exe", chrome_options=chrome_options)
browser.get("https://www.zhihu.com/signup")  # open the page in the browser
print(browser.page_source)  # the HTML after JavaScript rendering
# Run scrapy's css selectors over the rendered page (faster than selenium's own element lookups)
from scrapy.selector import Selector
selector = Selector(text=browser.page_source)
browser.quit()
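Once the rendered HTML is inside a Selector, extraction works exactly as in a normal Scrapy spider; a small sketch (the page title and links are just illustrative targets):

title = selector.css("title::text").extract_first()
links = selector.css("a::attr(href)").extract()
print(title, len(links))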
Running Chrome without a visible window
This only works on Linux; it does not run on Windows.
Install: pip install pyvirtualdisplay
from selenium import webdriver
from pyvirtualdisplay import Display
display = Display(visible=0, size=(800, 600))
display.start()
browser = webdriver.Chrome(executable_path="/usr/local/bin/chromedriver")  # a Linux path, since pyvirtualdisplay is Linux-only; adjust to your chromedriver location
Note: if this fails at run time with an error that the Xvfb executable cannot be found, install the following packages:
apt-get install xvfb
pip install xvfbwrapper
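Chrome 59 and later also ships a built-in headless mode that needs no virtual display and works on Windows as well; a sketch using the standard --headless flag:

from selenium import webdriver
chrome_opt = webdriver.ChromeOptions()
chrome_opt.add_argument("--headless")     # Chrome's built-in headless mode (Chrome 59+)
chrome_opt.add_argument("--disable-gpu")  # commonly paired with --headless on older builds
browser = webdriver.Chrome(executable_path="C:/chromedriver.exe", chrome_options=chrome_opt)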
selenium: simulating login
browser.get("https://www.zhihu.com/signup")
# 点击登陆按钮
browser.find_element_by_css_selector(".SignContainer-switch > span").click()
# 等待网页加载完毕(如果不等待加载完毕可能会找不到元素)
import time
time.sleep(5)
# 输入用户名
browser.find_element_by_css_selector(".SignFlow-account input").send_keys("15802734189")
# 输入密码
browser.find_element_by_css_selector(".SignFlow-password input").send_keys("Xiong123!@#")
# 点击登陆
browser.find_element_by_css_selector(".Login-content button[type=submit]").click()
selenium: other operations
# Run native JavaScript to scroll the page to the bottom
browser.execute_script("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;")
Configuring chromedriver not to load images
# Standard recipe for blocking image downloads
from selenium import webdriver
chrome_opt = webdriver.ChromeOptions()
prefs = {"profile.managed_default_content_settings.images": 2}  # 2 = block, 1 = allow
chrome_opt.add_experimental_option("prefs", prefs)
browser = webdriver.Chrome(executable_path="c:/chromedriver.exe", chrome_options=chrome_opt)
scrapy combined with selenium: simulating zhihu login
>>> vim ArticleSpider/spiders/zhihu.py
import scrapy
import time
import pickle
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com/']
    header = {  # mimic a browser's request headers
        "HOST": "www.zhihu.com",
        "Referer": "https://www.zhihu.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }

    def parse(self, response):
        pass

    def start_requests(self):  # spider entry point: log in with selenium before crawling
        # Use the 360 Speed Browser binary (same placeholder paths as earlier)
        chrome_options = Options()
        chrome_options.binary_location = r"C:\Users\xiong\AppData\Local\360Chrome\Chrome\Application\360chrome.exe"
        browser = webdriver.Chrome(executable_path="C:/chromedriver.exe", chrome_options=chrome_options)
        browser.get("https://www.zhihu.com/signup")  # open the page in the browser
        # Click the "log in" tab
        browser.find_element_by_css_selector(".SignContainer-switch > span").click()
        # Enter the username
        browser.find_element_by_css_selector(".SignFlow-account input").send_keys("15802734189")
        # Enter the password
        browser.find_element_by_css_selector(".SignFlow-password input").send_keys("Xiong123!@#")
        # Click the login button
        browser.find_element_by_css_selector(".Login-content button[type=submit]").click()
        time.sleep(10)  # wait for the login to complete
        cookies = browser.get_cookies()  # the cookies set by the login
        # Write the cookies to files and collect them into a dict
        cookie_dict = {}
        for cookie in cookies:
            # Save each cookie to its own file
            file = open("C:/cookies/" + cookie["name"] + ".zhihu", 'wb')
            pickle.dump(cookie, file)
            file.close()
            cookie_dict[cookie['name']] = cookie['value']
        browser.close()
        # With COOKIES_ENABLED = True in settings.py, these cookies are sent on all later requests
        return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookie_dict)]
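On later runs the pickled cookie files can be loaded back instead of driving the browser again; a sketch, assuming the same C:/cookies/ directory and .zhihu naming used above (load_saved_cookies is a hypothetical helper):

import os
import pickle

def load_saved_cookies(cookie_dir="C:/cookies/"):  # hypothetical helper, not part of the original spider
    # Rebuild the cookie dict from the per-cookie pickle files.
    cookie_dict = {}
    for name in os.listdir(cookie_dir):
        if name.endswith(".zhihu"):
            with open(os.path.join(cookie_dir, name), 'rb') as f:
                cookie = pickle.load(f)
                cookie_dict[cookie['name']] = cookie['value']
    return cookie_dict

# Inside start_requests(), this could replace the selenium login:
# return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=load_saved_cookies())]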