12-Python爬虫高阶:模拟登录与验证码处理

很多数据需要登录后才能访问,这篇教你搞定各种登录验证和验证码!

一、Requests模拟登录

1. POST表单登录(直接提交账号密码)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import requests

# A Session keeps cookies between requests, so the login carries over to later calls.
session = requests.Session()

# Login endpoint
login_url = 'https://example.com/login'

# Form fields submitted with the login request
data = {
    'username': 'your_account',
    'password': 'your_password',
    'remember': 'on',
}

# Submit the credentials; the timeout stops a stalled server from hanging the script.
response = session.post(login_url, data=data, timeout=10)

# NOTE(review): HTTP 200 only shows that the request was served; many sites also
# answer a rejected login with 200, so confirm against the target site's response.
if response.status_code == 200:
    print('✅ 登录成功!')
else:
    print(f'❌ 登录失败,状态码: {response.status_code}')

# Later requests on the same session send the cookies set during login.
profile = session.get('https://example.com/profile', timeout=10)
print(profile.text)

2. 带请求头的登录

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import requests

session = requests.Session()

# Browser-like request headers sent with the login request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Referer': 'https://example.com/login',
}

login_url = 'https://example.com/login'
data = {
    'username': 'your_account',
    'password': 'your_password',
}

# The timeout stops a stalled server from hanging the script.
response = session.post(login_url, data=data, headers=headers, timeout=10)

# Success is judged by marker text that appears in the returned page.
if '退出' in response.text or '欢迎您' in response.text:
    print('✅ 登录成功!')
else:
    print('❌ 登录失败')

3. 先获取Cookie再登录

有些网站需要先访问首页获取Cookie:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import requests

session = requests.Session()

# 1. Visit the home page first so the session picks up the initial cookies.
session.get('https://example.com/', timeout=10)

# 2. Submit the credentials on the same session.
login_url = 'https://example.com/doLogin'
data = {
    'username': 'your_account',
    'password': 'your_password',
}

# The timeout stops a stalled server from hanging the script.
response = session.post(login_url, data=data, timeout=10)

print(response.text)

4. 使用已保存的Cookie

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
import requests
import json

# Shared session; the cookie helpers below read from and write to its cookie jar.
session = requests.Session()

def save_cookies(session, filename):
    """Write the session's cookies to *filename* as a JSON object.

    Args:
        session: Object whose ``cookies.get_dict()`` returns the cookies as a
            name -> value mapping (a ``requests.Session`` in this tutorial).
        filename: Path of the JSON file to create or overwrite.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(session.cookies.get_dict(), f)

def load_cookies(session, filename):
    """Read cookies from the JSON file *filename* and merge them into the session.

    Args:
        session: Object whose ``cookies`` attribute supports ``update(mapping)``.
        filename: Path of a JSON file holding a name -> value mapping.

    Raises:
        FileNotFoundError: If *filename* does not exist.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        cookies = json.load(f)
    session.cookies.update(cookies)

# Restore the cookies stored in cookies.json, then request a page with them attached.
load_cookies(session, 'cookies.json')
response = session.get('https://example.com/profile')
print(response.text)

二、Selenium模拟登录

1. 基础版

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

def login_website(url, username, password):
    """Log in to a site through headless Chrome.

    Args:
        url: Address of the login page.
        username: Text typed into the field named ``username``.
        password: Text typed into the field named ``password``.

    Returns:
        The logged-in driver when the page source contains the logout marker,
        otherwise ``None``. The browser is closed before ``None`` is returned.
    """
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )

    try:
        driver.get(url)
        time.sleep(2)

        # Fill in the credentials and submit the form.
        driver.find_element(By.NAME, 'username').send_keys(username)
        driver.find_element(By.NAME, 'password').send_keys(password)
        driver.find_element(By.CLASS_NAME, 'login-btn').click()

        # Give the site time to process the login.
        time.sleep(3)

        # The logout link text only appears for a signed-in user.
        if '退出' in driver.page_source:
            print('✅ 登录成功!')
            return driver
        print('❌ 登录失败')
    except Exception as e:
        print(f'❌ 登录出错: {e}')

    # Every failure path ends here: close the browser so the process is not leaked.
    driver.quit()
    return None

# Example usage: returns a driver on success, None on failure.
driver = login_website('https://example.com/login', 'user', 'pass')

2. 登录后保持Session

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pickle # 用于保存cookies

def login_and_save_cookies(url, username, password, cookie_file):
    """Open the login page in headless Chrome and pickle the browser's cookies.

    Args:
        url: Address of the login page.
        username: Account name (used by the login steps left as a placeholder).
        password: Account password (used by the login steps left as a placeholder).
        cookie_file: Path of the pickle file to write.

    Returns:
        The still-open driver, so the caller can keep using the session.
    """
    # Imported here because the snippet's import list does not bring Options into scope.
    from selenium.webdriver.chrome.options import Options

    # add_argument() returns None, so the options object must be built first
    # and passed separately instead of chaining the call.
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )

    try:
        driver.get(url)
        # ... type the credentials and click the login button here ...

        # Persist the cookies; the context manager closes the file handle.
        with open(cookie_file, 'wb') as f:
            pickle.dump(driver.get_cookies(), f)
        print('✅ Cookies已保存!')
    except Exception:
        # Do not leave a browser process behind when the login flow fails.
        driver.quit()
        raise

    # Deliberately left open so the caller can continue with the session.
    return driver

def load_cookies_and_visit(url, cookie_file):
    """Open *url* in Chrome with previously pickled cookies applied.

    Args:
        url: Page to open; it is loaded first so cookies can be added to it.
        cookie_file: Path of the pickle file holding a list of cookie dicts.

    Returns:
        The open driver after the page has been refreshed with the cookies.
    """
    driver = webdriver.Chrome()
    driver.get(url)

    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # cookie files this program wrote itself.
    with open(cookie_file, 'rb') as f:
        cookies = pickle.load(f)
    for cookie in cookies:
        driver.add_cookie(cookie)

    # Reload so the page is requested again with the cookies attached.
    driver.refresh()
    return driver

三、验证码处理

1. 图片验证码(Tesseract识别)

安装Tesseract

1
2
3
4
5
# Windows: 下载安装包
# https://github.com/UB-Mannheim/tesseract/wiki

# 安装Python库
pip install pytesseract pillow

识别验证码

1
2
3
4
5
6
7
8
9
10
11
12
from PIL import Image
import pytesseract

# Point pytesseract at the Tesseract executable (set explicitly for a Windows install).
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Open the captcha image.
image = Image.open('captcha.png')

# Run OCR on the image and print the recognised text.
text = pytesseract.image_to_string(image)
print(f'识别结果: {text}')

预处理提高识别率

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from PIL import Image, ImageEnhance, ImageFilter
import pytesseract

def recognize_captcha(image_path):
    """Clean up a captcha image and run Tesseract OCR on it.

    The processed image is also written to ``processed_captcha.png`` so the
    effect of the preprocessing can be inspected.

    Args:
        image_path: Path of the captcha image file.

    Returns:
        The recognised text with surrounding whitespace removed.
    """
    image = Image.open(image_path)

    # Convert to greyscale.
    image = image.convert('L')

    # Binarise: pixels brighter than the threshold become white, the rest black.
    threshold = 128
    image = image.point(lambda p: 255 if p > threshold else 0)

    # Median filter to remove isolated noise pixels.
    image = image.filter(ImageFilter.MedianFilter())

    # Double the contrast.
    image = ImageEnhance.Contrast(image).enhance(2)

    # Keep a copy of the processed image for inspection.
    image.save('processed_captcha.png')

    # --psm 6 is passed to Tesseract as its page segmentation mode.
    text = pytesseract.image_to_string(image, config='--psm 6')

    return text.strip()

# Example usage: recognise captcha.png and print the result.
result = recognize_captcha('captcha.png')
print(f'验证码: {result}')

简单验证码识别实战

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import requests
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pytesseract

def login_with_captcha(url, username, password):
    """Log in through Chrome on a page protected by an image captcha.

    Args:
        url: Address of the login page.
        username: Text typed into the field named ``username``.
        password: Text typed into the field named ``password``.

    Returns:
        The open driver after the form was submitted, or ``None`` when any
        step raised. The browser is closed before ``None`` is returned.
    """
    driver = webdriver.Chrome()

    try:
        driver.get(url)

        # Screenshot the captcha element as the browser rendered it. Fetching
        # the image URL with a separate HTTP client would run outside the
        # browser's session and would not be the image shown on the page.
        captcha_element = driver.find_element(By.ID, 'captcha-img')
        captcha_element.screenshot('captcha.png')

        # OCR the saved image.
        captcha_text = recognize_captcha('captcha.png')
        print(f'识别到验证码: {captcha_text}')

        # Fill in credentials and captcha, then submit.
        driver.find_element(By.NAME, 'username').send_keys(username)
        driver.find_element(By.NAME, 'password').send_keys(password)
        driver.find_element(By.NAME, 'captcha').send_keys(captcha_text)
        driver.find_element(By.CLASS_NAME, 'login-btn').click()

        time.sleep(2)
        return driver

    except Exception as e:
        print(f'出错: {e}')
        # Close the browser so a failed attempt does not leak a process.
        driver.quit()
        return None

2. 第三方打码平台(推荐)

对于复杂验证码,使用第三方平台识别率更高:

超级鹰

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# 注册超级鹰:http://www.chaojiying.com/

import requests
import base64

def recognize_with_chaojiying(image_path, user, password, soft_id, codetype=1902):
    """Send a captcha image to the Chaojiying service and return the answer.

    Args:
        image_path: Path of the captcha image file.
        user: Chaojiying account name.
        password: Chaojiying account password.
        soft_id: Software ID generated in the Chaojiying user centre.
        codetype: Captcha type code from the vendor's price table.
            NOTE(review): 1902 is the value used in the vendor's demo —
            confirm it matches the captcha being solved.

    Returns:
        The recognised text, or ``None`` when the service reports an error.
    """
    # The API takes the image as a base64 string.
    with open(image_path, 'rb') as f:
        img_data = base64.b64encode(f.read()).decode('utf-8')

    # The vendor API expects the image under 'file_base64' together with a
    # 'codetype' describing the captcha type.
    data = {
        'user': user,
        'pass': password,
        'softid': soft_id,
        'codetype': codetype,
        'file_base64': img_data,
    }

    # The timeout stops a stalled service from hanging the crawler.
    response = requests.post(
        'http://upload.chaojiying.net/Upload/Processing.php',
        data=data,
        timeout=30,
    )
    result = response.json()

    if result.get('err_no') == 0:
        return result.get('pic_str')

    print(f'识别失败: {result}')
    return None

# Example usage: the string arguments are placeholders for account name, password and software ID.
code = recognize_with_chaojiying('captcha.png', '用户名', '密码', '软件ID')
print(f'验证码: {code}')

阿里云验证码服务

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# 阿里云验证码OCR
from alibabacloud_cr20220701.client import Client
from alibabacloud_tea_openapi.models import Config

# Client configuration: the credentials are placeholders to be replaced.
# NOTE(review): the client module name, endpoint and predict_body call below
# could not be verified from this file — confirm against the Alibaba Cloud SDK docs.
config = Config(
access_key_id='你的ID',
access_key_secret='你的密钥',
endpoint='ocr.cn-shanghai.aliyuncs.com'
)

client = Client(config)

# Read the captcha image as raw bytes.
# NOTE(review): the file object is never closed explicitly.
body = bytes(open('captcha.png', 'rb').read())

# Send the image bytes and print the response body.
response = client.predict_body({
'image': body
})
print(response.body)

四、滑块验证码处理

滑块验证码需要模拟人类滑动轨迹:

1. 基本思路

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import random

def move_to_slider(driver, slider_element, distance):
    """Drag the slider handle to the right along a generated track.

    Args:
        driver: Selenium WebDriver controlling the page.
        slider_element: The slider handle element to drag.
        distance: Horizontal distance to slide, passed to ``get_slide_tracks``.
    """
    # Imported here because the snippet's import list does not bring ActionChains into scope.
    from selenium.webdriver.common.action_chains import ActionChains

    # Click the handle first.
    slider_element.click()

    # Build the list of per-step horizontal moves.
    tracks = get_slide_tracks(distance)

    # Press and hold the handle, then replay the track step by step.
    ActionChains(driver).click_and_hold(slider_element).perform()

    for step in tracks:
        # move_by_offset takes (xoffset, yoffset); it has no x=/y= keywords,
        # so the offsets are passed positionally. The small random vertical
        # offset adds jitter to each step.
        ActionChains(driver).move_by_offset(step, random.randint(-2, 2)).perform()
        time.sleep(random.randint(10, 30) / 1000)

    # Let go of the handle.
    ActionChains(driver).release().perform()

def get_slide_tracks(distance):
    """Generate per-step horizontal moves whose total is exactly *distance*.

    Steps are larger (5-10) during the first 70% of the distance and smaller
    (2-5) afterwards.

    Args:
        distance: Total horizontal distance to cover.

    Returns:
        A list of positive step sizes summing to *distance*; an empty list
        when *distance* is zero or negative.
    """
    tracks = []
    current = 0
    mid = distance * 0.7  # larger steps are used until 70% of the distance

    while current < distance:
        if current < mid:
            # Fast phase
            move = random.randint(5, 10)
        else:
            # Slow phase
            move = random.randint(2, 5)

        # Clamp the final step so the track never overshoots the target.
        move = min(move, distance - current)

        current += move
        tracks.append(move)

    return tracks

2. 完整滑块验证处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from PIL import Image
import time
import random
import requests
import numpy as np

def get_distance(bg_image_url, slider_image_url):
    """Estimate the horizontal offset of the gap in a slider captcha.

    Both images are downloaded, saved as ``bg.jpg`` and ``slider.png``,
    converted to greyscale, and the slider image is compared against every
    horizontal position of the background. The position with the smallest
    summed absolute pixel difference is returned.

    Both images must have the same height; numpy raises ``ValueError`` on
    the subtraction otherwise.

    Args:
        bg_image_url: URL of the background image.
        slider_image_url: URL of the slider piece image.

    Returns:
        The x offset in image pixels of the best match, or 0 when the slider
        is wider than the background.
    """
    # Download both images; the timeout stops a stalled server from hanging the crawler.
    bg_data = requests.get(bg_image_url, timeout=10).content
    slider_data = requests.get(slider_image_url, timeout=10).content

    # Save them to disk.
    with open('bg.jpg', 'wb') as f:
        f.write(bg_data)
    with open('slider.png', 'wb') as f:
        f.write(slider_data)

    # Load as greyscale arrays. A signed dtype is required: subtracting
    # uint8 arrays wraps around instead of going negative, which corrupts
    # the absolute difference.
    bg_array = np.array(Image.open('bg.jpg').convert('L'), dtype=np.int32)
    slider_array = np.array(Image.open('slider.png').convert('L'), dtype=np.int32)

    slider_width = slider_array.shape[1]
    min_diff = float('inf')
    best_x = 0

    # +1 so the right-most position, where the slider touches the edge, is also tried.
    for x in range(bg_array.shape[1] - slider_width + 1):
        window = bg_array[:, x:x + slider_width]
        diff = np.abs(window - slider_array).sum()
        if diff < min_diff:
            min_diff = diff
            best_x = x

    return best_x

def slide_verify(driver, url):
    """Open *url* and attempt to solve its slider captcha.

    Args:
        driver: Selenium WebDriver used to load and control the page.
        url: Address of the page showing the slider captcha.

    Returns:
        ``True`` when the page source contains the success marker after the
        slide; ``False`` otherwise, including when any step raised.
    """
    driver.get(url)
    time.sleep(2)

    try:
        # Wait up to 10 seconds for the slider handle to become clickable.
        slider = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'slider-btn'))
        )

        # Read the URLs of the background and slider images.
        bg_url = driver.find_element(By.CLASS_NAME, 'bg-image').get_attribute('src')
        slider_url = driver.find_element(By.CLASS_NAME, 'slider-image').get_attribute('src')

        # NOTE(review): the distance is measured in downloaded-image pixels;
        # if the page renders the image at another size it may need scaling.
        distance = get_distance(bg_url, slider_url)
        print(f'需要滑动的距离: {distance}')

        # Drag the handle.
        move_to_slider(driver, slider, distance)

        # Give the page time to show the verification result.
        time.sleep(2)

        if '验证通过' in driver.page_source:
            print('✅ 滑块验证通过!')
            return True

        print('❌ 滑块验证失败')
        return False

    except Exception as e:
        print(f'滑块验证出错: {e}')
        return False

五、点选验证码

点选验证码需要识别图片中的特定元素:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

def click_points(driver, xpath_template, points):
    """Click a sequence of points inside captcha elements.

    Args:
        driver: Selenium WebDriver controlling the page.
        xpath_template: XPath with one ``{}`` placeholder; it is formatted
            with the index of each point to locate that point's element.
        points: ``[(x1, y1), (x2, y2), ...]``. Each coordinate is multiplied
            by the element's width/height, so values are fractions of the
            element size measured from its top-left corner.
    """
    # Imported here because the snippet's import list does not bring ActionChains into scope.
    from selenium.webdriver.common.action_chains import ActionChains

    for i, (x, y) in enumerate(points):
        canvas = driver.find_element(By.XPATH, xpath_template.format(i))
        size = canvas.size

        # move_by_offset is relative to the current pointer position, so the
        # pointer is first anchored on the element's centre and then shifted
        # by the offset from that centre. This keeps every click independent
        # of where the previous one ended.
        offset_x = int((x - 0.5) * size['width'])
        offset_y = int((y - 0.5) * size['height'])

        (
            ActionChains(driver)
            .move_to_element(canvas)
            .move_by_offset(offset_x, offset_y)
            .click()
            .perform()
        )
        time.sleep(0.5)

def recognize_click_captcha(image_path):
    """Ask a third-party service where to click in a click-type captcha.

    Args:
        image_path: Path of the captcha image file.

    Returns:
        The value of the ``points`` field in the service's JSON reply, or an
        empty list when the field is absent.
    """
    import base64

    # Raw bytes cannot be serialised to JSON, so the image is sent as base64 text.
    with open(image_path, 'rb') as f:
        image_b64 = base64.b64encode(f.read()).decode('ascii')

    # NOTE(review): the URL is a placeholder; confirm the field names and the
    # image encoding against the chosen provider's API.
    response = requests.post(
        'http://xxx.com/api',
        json={
            'image': image_b64,
            'type': 'click',  # click-type captcha
        },
        timeout=30,
    )

    result = response.json()
    return result.get('points', [])

六、实战:微博登录

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

def login_weibo(username, password):
    """Log in to Weibo through headless Chrome.

    Args:
        username: Weibo account name.
        password: Weibo account password.

    Returns:
        The logged-in driver when a success marker is found, otherwise
        ``None``. The browser is closed before ``None`` is returned.
    """
    # Imported here because the snippet's import list does not bring these names into scope.
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-blink-features=AutomationControlled')

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )

    try:
        # Open the login page.
        driver.get('https://weibo.com/login.php')
        time.sleep(3)

        # Switch to password login when that tab exists; a missing tab is
        # the only error ignored here.
        try:
            pwd_login = driver.find_element(By.XPATH, '//a[@action-type="btn_pwd_login"]')
        except NoSuchElementException:
            pass
        else:
            pwd_login.click()
            time.sleep(2)

        # Account name
        username_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, 'username'))
        )
        username_input.clear()
        username_input.send_keys(username)

        # Password
        password_input = driver.find_element(By.NAME, 'password')
        password_input.clear()
        password_input.send_keys(password)

        # Submit
        driver.find_element(By.XPATH, '//button[@type="submit"]').click()

        # Give the site time to process the login.
        time.sleep(5)

        if '我的首页' in driver.title or '个人中心' in driver.page_source:
            print('✅ 微博登录成功!')
            return driver
        print('❌ 微博登录失败')

    except Exception as e:
        print(f'出错: {e}')

    # Every failure path ends here: close the browser so the process is not leaked.
    driver.quit()
    return None

# Example usage: the arguments are placeholders for the account name and password.
driver = login_weibo('你的微博账号', '你的微博密码')

七、实战:知乎登录

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
def login_zhihu(email, password):
    """Log in to Zhihu through Chrome using e-mail and password.

    Args:
        email: Account e-mail typed into the field named ``email``.
        password: Account password typed into the field named ``password``.

    Returns:
        The logged-in driver when the page title contains the home-page
        marker, otherwise ``None`` (the browser is closed first).

    Raises:
        Exception: Any Selenium error is propagated after the browser is closed.
    """
    driver = webdriver.Chrome()

    try:
        driver.get('https://www.zhihu.com/signin')
        time.sleep(2)

        # Switch to the password-login tab.
        driver.find_element(
            By.XPATH, '//button[@class="SignFlow-tab SignFlow-tab--password"]'
        ).click()
        time.sleep(1)

        # Fill in the credentials and submit.
        driver.find_element(By.NAME, 'email').send_keys(email)
        driver.find_element(By.NAME, 'password').send_keys(password)
        driver.find_element(By.XPATH, '//button[@type="submit"]').click()

        # Give the site time to process the login.
        time.sleep(5)

        logged_in = '首页' in driver.title
    except Exception:
        # Close the browser before propagating so the process is not leaked.
        driver.quit()
        raise

    if logged_in:
        print('✅ 知乎登录成功!')
        return driver

    print('❌ 知乎登录失败')
    driver.quit()
    return None

八、完整登录管理器

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import requests
import pickle
import os

class LoginManager:
    """Log in with a requests session and reuse cookies cached in a pickle file."""

    def __init__(self, cookie_file='cookies.pkl'):
        """Create a fresh session.

        Args:
            cookie_file: Path of the pickle file used to cache cookies.
        """
        self.session = requests.Session()
        self.cookie_file = cookie_file

    def save_cookies(self):
        """Pickle the session's cookies to ``self.cookie_file``."""
        with open(self.cookie_file, 'wb') as f:
            pickle.dump(self.session.cookies, f)
        print(f'✅ Cookies已保存到 {self.cookie_file}')

    def load_cookies(self):
        """Merge cookies from ``self.cookie_file`` into the session.

        Returns:
            ``True`` when cookies were loaded; ``False`` when the file is
            missing or cannot be unpickled.
        """
        if not os.path.exists(self.cookie_file):
            return False

        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load cookie files this program wrote itself.
        try:
            with open(self.cookie_file, 'rb') as f:
                saved = pickle.load(f)
        except (pickle.UnpicklingError, EOFError):
            # A truncated or corrupt cache is treated like a missing one.
            return False

        self.session.cookies.update(saved)
        print('✅ Cookies已加载')
        return True

    def is_logged_in(self, check_url):
        """Return whether a GET to *check_url* answers with HTTP 200.

        Network errors count as "not logged in".
        """
        try:
            response = self.session.get(check_url, timeout=5)
        except requests.RequestException:
            return False
        return response.status_code == 200

    def login(self, login_url, data, check_url):
        """Log in, preferring cached cookies over a new login request.

        Args:
            login_url: URL the login form is POSTed to.
            data: Form fields for the login request.
            check_url: URL that returns HTTP 200 only for a logged-in session.

        Returns:
            ``True`` when the session is logged in afterwards, else ``False``.
        """
        # Try the cached cookies first.
        if self.load_cookies() and self.is_logged_in(check_url):
            print('✅ 使用保存的Cookie登录成功!')
            return True

        # Cached cookies are missing or expired: log in again.
        self.session.post(login_url, data=data, timeout=10)
        if self.is_logged_in(check_url):
            self.save_cookies()
            print('✅ 登录成功并保存Cookie!')
            return True

        print('❌ 登录失败')
        return False

# Example usage
manager = LoginManager('zhihu_cookies.pkl')

# NOTE(review): login_url points at a path ending in /oauth/captcha — confirm
# this is the endpoint that actually accepts the credentials.
if manager.login(
    login_url='https://www.zhihu.com/api/v3/oauth/captcha',
    data={'email': 'xxx', 'password': 'xxx'},
    check_url='https://www.zhihu.com/api/v4/me'
):
    # Logged in: the session can now be used to fetch data.
    response = manager.session.get('https://www.zhihu.com/api/v4/me')
    print(response.json())

九、常见问题

| 问题 | 解决方法 |
| --- | --- |
| 验证码识别率低 | 用第三方打码平台 |
| 滑块验证失败 | 优化滑动轨迹、加减速 |
| 登录后马上掉线 | 检查Token过期时间 |
| 账号被封 | 控制访问频率 |
| 多次滑动失败 | 尝试无痕模式 |

十、写在最后

这篇学会了:

1
2
3
4
5
6
7
✅ POST表单登录
✅ Cookie保存与复用
✅ Selenium模拟登录
✅ 图片验证码识别
✅ 滑块验证码处理
✅ 第三方打码平台使用
✅ 登录管理器封装

下篇预告:Scrapy框架入门、高效分布式爬虫、IP代理池构建。

如果这篇文章对你有帮助,欢迎点赞+在看👍

有问题欢迎留言,我们一起进步!

#AI学习 #Python爬虫 #模拟登录 #验证码 #Selenium


12-Python爬虫高阶:模拟登录与验证码处理
https://yourname.github.io/2026/02/09/12-模拟登录与验证码/
作者
JA
发布于
2026年2月9日
许可协议