12-Python爬虫高阶：模拟登录与验证码处理

很多数据需要登录后才能访问，这篇教你搞定各种登录验证和验证码！

一、Requests模拟登录

1. POST表单登录（直接提交账号密码）

import requests

# One Session keeps cookies across requests, so pages fetched after the
# login POST are sent with the cookies the server set during login.
session = requests.Session()

# Login endpoint (placeholder URL).
login_url = 'https://example.com/login'

# Form fields submitted with the login request.
data = {
    'username': 'your_account',
    'password': 'your_password',
    'remember': 'on',
}

# Submit the credentials. A timeout is passed because requests waits
# indefinitely by default when the server never answers.
response = session.post(login_url, data=data, timeout=10)

# NOTE: HTTP 200 only means the request was served; a site may also answer
# 200 with an error page for bad credentials, so this is a coarse check.
if response.status_code == 200:
    print('✅ 登录成功！')
else:
    print(f'❌ 登录失败，状态码: {response.status_code}')

# Fetch a page behind the login; the session attaches its cookies itself.
profile = session.get('https://example.com/profile', timeout=10)
print(profile.text)

2. 带请求头的登录

import requests

session = requests.Session()

# Request headers that make the POST look like it came from a browser
# sitting on the login page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Referer': 'https://example.com/login',
}

login_url = 'https://example.com/login'
data = {
    'username': 'your_account',
    'password': 'your_password',
}

# A timeout is passed because requests waits indefinitely by default.
response = session.post(login_url, data=data, headers=headers, timeout=10)

# Success is detected from text that only appears on a logged-in page
# (a "log out" link or a welcome banner).
if '退出' in response.text or '欢迎您' in response.text:
    print('✅ 登录成功！')
else:
    print('❌ 登录失败')

3. 先获取Cookie再登录

有些网站需要先访问首页获取Cookie：

import requests

session = requests.Session()

# 1. Visit the home page first so the session picks up the initial cookies.
#    Timeouts are passed because requests waits indefinitely by default.
session.get('https://example.com/', timeout=10)

# 2. Submit the credentials with those cookies already in the session.
login_url = 'https://example.com/doLogin'
data = {
    'username': 'your_account',
    'password': 'your_password',
}

response = session.post(login_url, data=data, timeout=10)

print(response.text)

4. 使用已保存的Cookie

import requests
import json

# Module-level Session; it is passed to load_cookies() and used for the
# request at the end of this snippet.
session = requests.Session()

# 保存Cookie到文件
def save_cookies(session, filename):
    """Write the session's cookies to ``filename`` as a JSON object.

    Only the mapping returned by ``session.cookies.get_dict()`` is stored.
    The file is written as UTF-8 and overwritten if it already exists.
    """
    cookie_map = session.cookies.get_dict()
    with open(filename, 'w', encoding='utf-8') as handle:
        handle.write(json.dumps(cookie_map))

# 加载Cookie
def load_cookies(session, filename):
    """Load cookies stored as JSON into the session's cookie jar.

    Args:
        session: Object whose ``cookies`` attribute supports ``update``.
        filename: Path of a UTF-8 JSON file holding the cookie mapping.

    Returns:
        True when the file was read and applied; False when the file does
        not exist (for example on a first run, before anything was saved).
    """
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            cookies = json.load(f)
    except FileNotFoundError:
        # Nothing saved yet: report it instead of crashing the caller.
        return False
    session.cookies.update(cookies)
    return True

# Restore the saved cookies, then request a page that requires login.
load_cookies(session, 'cookies.json')
# A timeout is passed because requests waits indefinitely by default.
response = session.get('https://example.com/profile', timeout=10)
print(response.text)

二、Selenium模拟登录

1. 基础版

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

def login_website(url, username, password):
    """Log in to a site through a headless Chrome driven by Selenium.

    Args:
        url: Address of the login page.
        username: Text typed into the input named ``username``.
        password: Text typed into the input named ``password``.

    Returns:
        The live WebDriver when the page source contains '退出' after the
        form is submitted, otherwise None. A returned driver is owned by the
        caller, who should ``quit()`` it when finished.
    """
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )

    try:
        driver.get(url)
        time.sleep(2)  # fixed pause to let the form render

        # Fill in the account and password fields.
        driver.find_element(By.NAME, 'username').send_keys(username)
        driver.find_element(By.NAME, 'password').send_keys(password)

        # Submit the form.
        driver.find_element(By.CLASS_NAME, 'login-btn').click()

        # Fixed pause to let the post-login page load.
        time.sleep(3)

        # A "log out" link in the page is taken as proof of success.
        if '退出' in driver.page_source:
            print('✅ 登录成功！')
            return driver
        print('❌ 登录失败')
    except Exception as e:
        print(f'❌ 登录出错: {e}')

    # Only reached on failure: close the browser so no Chrome process leaks.
    driver.quit()
    return None

# Usage: the result is the driver on success, or None when login fails.
driver = login_website('https://example.com/login', 'user', 'pass')

2. 登录后保持Session

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pickle  # 用于保存cookies

def login_and_save_cookies(url, username, password, cookie_file):
    """Open the login page in headless Chrome and pickle the cookies.

    Args:
        url: Address of the login page.
        username: Account name (used by the elided form-filling step).
        password: Password (used by the elided form-filling step).
        cookie_file: Path the browser cookies are pickled to.

    Returns:
        The WebDriver, left open so the caller can keep using the session.

    Raises:
        Any exception from Selenium or file I/O; the browser is closed
        before the exception propagates.
    """
    # Imported here because this snippet's import block does not include it.
    from selenium.webdriver.chrome.options import Options

    # add_argument() returns None, so the options object must be built
    # first rather than chained inside the Chrome(...) call.
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )

    try:
        # Log in.
        driver.get(url)
        # ... type the account and password, then click login ...

        # Persist the cookies; the with-block closes the file handle.
        with open(cookie_file, 'wb') as f:
            pickle.dump(driver.get_cookies(), f)
        print('✅ Cookies已保存！')
    except Exception:
        # Do not leave a browser process behind when something fails.
        driver.quit()
        raise

    return driver  # intentionally not closed: the caller keeps using it

# 加载Cookies
def load_cookies_and_visit(url, cookie_file):
    """Open ``url`` in Chrome with previously pickled cookies applied.

    Args:
        url: Page to open; it is loaded before the cookies are added and
            refreshed afterwards.
        cookie_file: Path of a pickle file holding a list of cookie dicts.

    Returns:
        The WebDriver after the refresh; the caller should ``quit()`` it.
    """
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install())
    )
    driver.get(url)

    # SECURITY: pickle.load can execute arbitrary code; only load cookie
    # files that this program wrote itself.
    with open(cookie_file, 'rb') as f:
        cookies = pickle.load(f)
    for cookie in cookies:
        driver.add_cookie(cookie)

    driver.refresh()  # reload so the page is requested with the cookies
    return driver

三、验证码处理

1. 图片验证码（Tesseract识别）

安装Tesseract

# Windows: 下载安装包
# https://github.com/UB-Mannheim/tesseract/wiki

# 安装Python库
pip install pytesseract pillow

识别验证码

from PIL import Image
import pytesseract

import sys

# Point pytesseract at the Tesseract binary. Only done on Windows: on other
# systems this Windows path does not exist and overriding the default
# command would break the lookup.
if sys.platform == 'win32':
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Open the captcha image and run OCR on it; the with-block closes the file.
with Image.open('captcha.png') as image:
    text = pytesseract.image_to_string(image)
print(f'识别结果: {text}')

预处理提高识别率

from PIL import Image, ImageEnhance, ImageFilter
import pytesseract

def recognize_captcha(image_path, threshold=128,
                      processed_path='processed_captcha.png'):
    """Preprocess a captcha image and return the OCR text.

    Args:
        image_path: Path of the captcha image.
        threshold: Grayscale cut-off for binarisation; pixels above it
            become white (255), the rest black (0).
        processed_path: Where the preprocessed image is saved for
            inspection; pass None to skip saving.

    Returns:
        The recognised text with surrounding whitespace stripped.
    """
    # convert() returns a new image, so the source file can be closed here.
    with Image.open(image_path) as source:
        image = source.convert('L')  # grayscale

    # Binarise.
    image = image.point(lambda p: 255 if p > threshold else 0)

    # Remove speckle noise.
    image = image.filter(ImageFilter.MedianFilter())

    # Boost contrast.
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(2)

    # Keep a copy of the preprocessed image so the result can be checked.
    if processed_path is not None:
        image.save(processed_path)

    # OCR the preprocessed image with page segmentation mode 6.
    text = pytesseract.image_to_string(image, config='--psm 6')

    return text.strip()

# Usage: preprocess and OCR a saved captcha image, then print the text.
result = recognize_captcha('captcha.png')
print(f'验证码: {result}')

简单验证码识别实战

import requests
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pytesseract

def login_with_captcha(url, username, password):
    """Log in to a page protected by an image captcha.

    Args:
        url: Address of the login page.
        username: Text typed into the input named ``username``.
        password: Text typed into the input named ``password``.

    Returns:
        The WebDriver after the form is submitted, or None when any step
        raises (the browser is closed in that case).
    """
    # A placeholder Ellipsis argument is not a valid driver option, so the
    # driver is created with Selenium's defaults.
    driver = webdriver.Chrome()

    try:
        driver.get(url)

        # Capture the captcha exactly as the browser rendered it. Fetching
        # the image URL again with a separate HTTP client would not carry
        # the browser's cookies and may return a different captcha than the
        # one this login form expects.
        captcha_element = driver.find_element(By.ID, 'captcha-img')
        captcha_element.screenshot('captcha.png')

        # OCR the saved image.
        captcha_text = recognize_captcha('captcha.png')
        print(f'识别到验证码: {captcha_text}')

        # Fill in the account, password and captcha fields.
        driver.find_element(By.NAME, 'username').send_keys(username)
        driver.find_element(By.NAME, 'password').send_keys(password)
        driver.find_element(By.NAME, 'captcha').send_keys(captcha_text)

        # Submit the form.
        driver.find_element(By.CLASS_NAME, 'login-btn').click()

        time.sleep(2)
        return driver

    except Exception as e:
        print(f'出错: {e}')
        driver.quit()  # do not leak the browser on failure
        return None

2. 第三方打码平台（推荐）

对于复杂验证码，使用第三方平台识别率更高：

超级鹰

# 注册超级鹰：http://www.chaojiying.com/

import requests
import base64

def recognize_with_chaojiying(image_path, user, password, soft_id,
                              codetype=1902):
    """Send a captcha image to the Chaojiying service and return its text.

    Args:
        image_path: Path of the captcha image.
        user: Chaojiying account name.
        password: Chaojiying account password.
        soft_id: Software ID from the Chaojiying user centre.
        codetype: Captcha type code from the service's price list
            (presumably 1902 is the common alphanumeric type; confirm
            against the current list).

    Returns:
        The recognised string when the service reports ``err_no == 0``,
        otherwise None (the raw response is printed).
    """
    # Read the image and base64-encode it.
    with open(image_path, 'rb') as f:
        img_data = base64.b64encode(f.read()).decode('utf-8')

    # Request parameters. NOTE(review): the service's upload API is
    # understood to take a base64 image as ``file_base64`` and to require
    # ``codetype``; confirm against the official API page.
    data = {
        'user': user,
        'pass': password,
        'softid': soft_id,
        'codetype': codetype,
        'file_base64': img_data,
    }

    # NOTE: this endpoint is plain HTTP, so the credentials travel
    # unencrypted. A timeout keeps an unresponsive server from hanging us.
    response = requests.post(
        'http://upload.chaojiying.net/Upload/Processing.php',
        data=data,
        timeout=30,
    )
    result = response.json()

    if result.get('err_no') == 0:
        return result.get('pic_str')
    print(f'识别失败: {result}')
    return None

# Usage: the three string arguments are placeholders for the real account
# name, password and software ID.
code = recognize_with_chaojiying('captcha.png', '用户名', '密码', '软件ID')
print(f'验证码: {code}')

阿里云验证码服务

# 阿里云验证码OCR
from alibabacloud_cr20220701.client import Client
from alibabacloud_tea_openapi.models import Config

# NOTE(review): the client module imported above and the predict_body()
# call below could not be checked against the SDK from here; confirm both
# against the Alibaba Cloud documentation before use.
config = Config(
    access_key_id='你的ID',
    access_key_secret='你的密钥',
    endpoint='ocr.cn-shanghai.aliyuncs.com'
)

client = Client(config)

# Read the captcha image; the with-block closes the file handle.
with open('captcha.png', 'rb') as f:
    body = f.read()

response = client.predict_body({
    'image': body
})
print(response.body)

四、滑块验证码处理

滑块验证码需要模拟人类滑动轨迹：

1. 基本思路

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import random

def move_to_slider(driver, slider_element, distance):
    """Drag the slider handle to the right along a generated track.

    Args:
        driver: Selenium WebDriver controlling the page.
        slider_element: The slider handle element.
        distance: Horizontal distance to drag, passed to
            ``get_slide_tracks`` to produce the individual steps.
    """
    # Imported here because this snippet's import block does not include it.
    from selenium.webdriver.common.action_chains import ActionChains

    # Build the list of per-step horizontal moves.
    tracks = get_slide_tracks(distance)

    # Press and hold the handle. No separate click() is issued first:
    # click_and_hold already targets the element, and an extra press and
    # release on the handle is a drag of zero length.
    ActionChains(driver).click_and_hold(slider_element).perform()

    for x in tracks:
        # Small random vertical jitter and a 10-30 ms pause per step.
        ActionChains(driver).move_by_offset(x, random.randint(-2, 2)).perform()
        time.sleep(random.randint(10, 30) / 1000)

    # Let go of the handle.
    ActionChains(driver).release().perform()

def get_slide_tracks(distance):
    """Split ``distance`` into random steps: larger first, smaller later.

    Steps are 5-10 while less than 70% of the distance is covered and 2-5
    afterwards. The last step is shortened so the steps add up to exactly
    ``distance`` instead of overshooting the target.

    Args:
        distance: Total distance to cover.

    Returns:
        A list of positive step sizes whose sum equals ``distance``; an
        empty list when ``distance`` is zero or negative.
    """
    tracks = []
    current = 0
    mid = distance * 0.7  # switch from large to small steps at 70%

    while current < distance:
        if current < mid:
            # Fast phase.
            move = random.randint(5, 10)
        else:
            # Slow phase.
            move = random.randint(2, 5)

        # Never step past the target.
        move = min(move, distance - current)
        current += move
        tracks.append(move)

    return tracks

2. 完整滑块验证处理

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from PIL import Image
import time
import random
import requests
import numpy as np

def _best_offset(bg_array, slider_array):
    """Return the column where the slider patch differs least from the background."""
    patch_width = slider_array.shape[1]
    min_diff = float('inf')
    best_x = 0

    # "+ 1" so the right-most possible position is also tried.
    for x in range(bg_array.shape[1] - patch_width + 1):
        window = bg_array[:, x:x + patch_width]
        diff = np.sum(np.abs(window - slider_array))
        if diff < min_diff:
            min_diff = diff
            best_x = x

    return best_x


def get_distance(bg_image_url, slider_image_url):
    """Estimate the x offset of the gap in a slider-captcha background.

    Both images are downloaded, saved as ``bg.jpg`` / ``slider.png`` in the
    working directory, converted to grayscale and compared column window by
    column window.

    Args:
        bg_image_url: URL of the background image.
        slider_image_url: URL of the slider-piece image.

    Returns:
        The x position (in pixels) with the smallest summed absolute
        difference between the background window and the slider image.

    Note:
        The comparison uses every row of both images, so it assumes they
        have the same height; numpy raises otherwise.
    """
    # Download the images; timeouts keep a stalled server from hanging us.
    bg_data = requests.get(bg_image_url, timeout=10).content
    slider_data = requests.get(slider_image_url, timeout=10).content

    # Save them to disk.
    with open('bg.jpg', 'wb') as f:
        f.write(bg_data)
    with open('slider.png', 'wb') as f:
        f.write(slider_data)

    # Load as grayscale arrays. A signed dtype is required: grayscale pixels
    # are uint8, and subtracting uint8 arrays wraps around instead of going
    # negative, which corrupts the absolute difference.
    with Image.open('bg.jpg') as bg:
        bg_array = np.array(bg.convert('L'), dtype=np.int32)
    with Image.open('slider.png') as slider:
        slider_array = np.array(slider.convert('L'), dtype=np.int32)

    return _best_offset(bg_array, slider_array)

def slide_verify(driver, url):
    """Open ``url`` and try to pass its slider captcha.

    Returns True when the page source contains '验证通过' after the drag,
    and False otherwise, including when any step raises.
    """
    driver.get(url)
    time.sleep(2)

    try:
        # Wait up to 10 seconds for the slider handle to become clickable.
        handle_locator = (By.CLASS_NAME, 'slider-btn')
        handle = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(handle_locator)
        )

        # Image sources for the background and the slider piece.
        background = driver.find_element(By.CLASS_NAME, 'bg-image')
        background_src = background.get_attribute('src')
        piece = driver.find_element(By.CLASS_NAME, 'slider-image')
        piece_src = piece.get_attribute('src')

        # Work out how far the handle has to travel, then drag it.
        gap = get_distance(background_src, piece_src)
        print(f'需要滑动的距离: {gap}')
        move_to_slider(driver, handle, gap)

        # Give the page time to report the outcome.
        time.sleep(2)

        passed = '验证通过' in driver.page_source
        print('✅ 滑块验证通过！' if passed else '❌ 滑块验证失败')
        return passed

    except Exception as error:
        print(f'滑块验证出错: {error}')
        return False

五、点选验证码

点选验证码需要识别图片中的特定元素：

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

def click_points(driver, xpath_template, points):
    """Click a series of points on the captcha element(s).

    Args:
        driver: Selenium WebDriver controlling the page.
        xpath_template: Format string; ``xpath_template.format(i)`` is the
            XPath of the element used for the i-th point.
        points: ``[(x1, y1), (x2, y2), ...]``. Each value is multiplied by
            the element's width/height, so values are presumably fractions
            in the range 0-1 (confirm against the recognition service).

    Note:
        Offsets are applied from the pointer's current position, which is
        assumed to start at the viewport origin; the element location is
        assumed to equal its viewport position (page not scrolled).
    """
    # Imported here because this snippet's import block does not include it.
    from selenium.webdriver.common.action_chains import ActionChains

    for i, (x, y) in enumerate(points):
        canvas = driver.find_element(By.XPATH, xpath_template.format(i))

        # Element position and size.
        location = canvas.location
        size = canvas.size

        # Absolute coordinates of the point, as whole pixels.
        actual_x = int(location['x'] + x * size['width'])
        actual_y = int(location['y'] + y * size['height'])

        # move_by_offset is relative to where the pointer already is, so
        # the pointer is moved back after the click; otherwise every later
        # point would be shifted by all the previous offsets.
        (
            ActionChains(driver)
            .move_by_offset(actual_x, actual_y)
            .click()
            .move_by_offset(-actual_x, -actual_y)
            .perform()
        )
        time.sleep(0.5)

# 使用第三方服务识别点选验证码
def recognize_click_captcha(image_path):
    """Ask a third-party service where to click on a click-captcha image.

    Args:
        image_path: Path of the captcha image.

    Returns:
        The ``points`` list from the service's JSON reply, or an empty list
        when the reply has no such key.
    """
    import base64

    # Raw bytes cannot be placed in a JSON body (json serialisation raises
    # TypeError), so the image is sent as base64 text.
    # NOTE(review): the endpoint is a placeholder; confirm the encoding and
    # field names the real service expects.
    with open(image_path, 'rb') as f:
        image_b64 = base64.b64encode(f.read()).decode('ascii')

    response = requests.post(
        'http://xxx.com/api',
        json={
            'image': image_b64,
            'type': 'click',  # click-captcha type
        },
        timeout=30,
    )

    result = response.json()
    return result.get('points', [])  # list of click coordinates

六、实战：微博登录

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

def login_weibo(username, password):
    """Log in to Weibo through a headless Chrome driven by Selenium.

    Args:
        username: Weibo account.
        password: Weibo password.

    Returns:
        The live WebDriver when the title contains '我的首页' or the page
        source contains '个人中心' after submitting, otherwise None (the
        browser is closed in that case).
    """
    # Imported here because this snippet's import block does not include them.
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-blink-features=AutomationControlled')

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )

    try:
        # Open the login page.
        driver.get('https://weibo.com/login.php')
        time.sleep(3)

        # Best effort: switch to password login when that link exists.
        # Only Selenium errors are ignored, so Ctrl-C still interrupts.
        try:
            pwd_login = driver.find_element(By.XPATH, '//a[@action-type="btn_pwd_login"]')
            pwd_login.click()
            time.sleep(2)
        except WebDriverException:
            pass

        # Account field: wait up to 10 seconds for it to appear.
        username_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, 'username'))
        )
        username_input.clear()
        username_input.send_keys(username)

        # Password field.
        password_input = driver.find_element(By.NAME, 'password')
        password_input.clear()
        password_input.send_keys(password)

        # Submit.
        login_btn = driver.find_element(By.XPATH, '//button[@type="submit"]')
        login_btn.click()

        # Fixed pause for the login to complete.
        time.sleep(5)

        if '我的首页' in driver.title or '个人中心' in driver.page_source:
            print('✅ 微博登录成功！')
            return driver
        print('❌ 微博登录失败')

    except Exception as e:
        print(f'出错: {e}')

    # Only reached on failure: close the browser so no Chrome process leaks.
    driver.quit()
    return None

# Usage: the two arguments are placeholders for real Weibo credentials.
driver = login_weibo('你的微博账号', '你的微博密码')

七、实战：知乎登录

def login_zhihu(email, password):
    """Log in to Zhihu with an email and password through Selenium.

    Args:
        email: Account email.
        password: Account password.

    Returns:
        The live WebDriver when the page title contains '首页' after
        submitting, otherwise None.

    Raises:
        Any Selenium exception raised along the way. The browser is closed
        whenever the function does not return a driver.
    """
    # A placeholder Ellipsis argument is not a valid driver option, so the
    # driver is created with Selenium's defaults.
    driver = webdriver.Chrome()
    logged_in = False

    try:
        driver.get('https://www.zhihu.com/signin')
        time.sleep(2)

        # Switch to the password-login tab.
        driver.find_element(By.XPATH, '//button[@class="SignFlow-tab SignFlow-tab--password"]').click()
        time.sleep(1)

        # Fill in the email and password.
        driver.find_element(By.NAME, 'email').send_keys(email)
        driver.find_element(By.NAME, 'password').send_keys(password)

        # Submit.
        driver.find_element(By.XPATH, '//button[@type="submit"]').click()

        # Fixed pause for the login to complete.
        time.sleep(5)

        if '首页' in driver.title:
            print('✅ 知乎登录成功！')
            logged_in = True
            return driver
        print('❌ 知乎登录失败')
        return None

    finally:
        # Close the browser on failure or error so it does not leak.
        if not logged_in:
            driver.quit()

八、完整登录管理器

import requests
import pickle
import os

class LoginManager:
    """Log in with a requests Session and reuse its cookies across runs.

    Cookies are persisted with pickle. SECURITY: unpickling can execute
    arbitrary code, so only load cookie files this program wrote itself.
    """

    # Seconds to wait for any HTTP request made by this class.
    REQUEST_TIMEOUT = 5

    def __init__(self, cookie_file='cookies.pkl'):
        self.session = requests.Session()
        self.cookie_file = cookie_file

    def save_cookies(self):
        """Pickle the session's cookie jar to ``self.cookie_file``."""
        with open(self.cookie_file, 'wb') as f:
            pickle.dump(self.session.cookies, f)
        print(f'✅ Cookies已保存到 {self.cookie_file}')

    def load_cookies(self):
        """Load pickled cookies into the session.

        Returns:
            True when cookies were loaded; False when the file is missing
            or cannot be read, so the caller falls back to a fresh login.
        """
        if not os.path.exists(self.cookie_file):
            return False
        try:
            with open(self.cookie_file, 'rb') as f:
                saved = pickle.load(f)
        except (pickle.UnpicklingError, EOFError, OSError):
            # Truncated or corrupted file: treat it as "no saved cookies".
            return False
        self.session.cookies.update(saved)
        print(f'✅ Cookies已加载')
        return True

    def is_logged_in(self, check_url):
        """Return True when ``check_url`` answers with HTTP 200.

        Network errors count as "not logged in". Note that a site which
        serves its login page with status 200 would pass this check.
        """
        try:
            response = self.session.get(check_url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            return False
        return response.status_code == 200

    def login(self, login_url, data, check_url):
        """Log in, preferring saved cookies over a fresh POST.

        Args:
            login_url: URL the credentials are POSTed to.
            data: Form fields sent with the login POST.
            check_url: URL used by ``is_logged_in`` to verify the session.

        Returns:
            True when the session is logged in afterwards, else False.

        Raises:
            requests.RequestException: if the login POST itself fails.
        """
        # Try the saved cookies first.
        if self.load_cookies() and self.is_logged_in(check_url):
            print('✅ 使用保存的Cookie登录成功！')
            return True

        # Fresh login; the timeout keeps a stalled server from hanging us.
        self.session.post(login_url, data=data, timeout=self.REQUEST_TIMEOUT)
        if self.is_logged_in(check_url):
            self.save_cookies()
            print('✅ 登录成功并保存Cookie！')
            return True

        print('❌ 登录失败')
        return False

# Usage example
# NOTE(review): login_url ends in /oauth/captcha, which does not look like a
# sign-in endpoint; confirm the intended URL before relying on this.
manager = LoginManager('zhihu_cookies.pkl')

if manager.login(
    login_url='https://www.zhihu.com/api/v3/oauth/captcha',
    data={'email': 'xxx', 'password': 'xxx'},
    check_url='https://www.zhihu.com/api/v4/me'
):
    # Login succeeded; authenticated requests can be made from here on.
    response = manager.session.get('https://www.zhihu.com/api/v4/me')
    print(response.json())

九、常见问题

问题	解决方法
验证码识别率低	用第三方打码平台
滑块验证失败	优化滑动轨迹、加减速
登录后马上掉线	检查Token过期时间
账号被封	控制访问频率
多次滑动失败	尝试无痕模式

十、写在最后

这篇学会了：

✅ POST表单登录
✅ Cookie保存与复用
✅ Selenium模拟登录
✅ 图片验证码识别
✅ 滑块验证码处理
✅ 第三方打码平台使用
✅ 登录管理器封装

下篇预告：Scrapy框架入门、高效分布式爬虫、IP代理池构建。

如果这篇文章对你有帮助，欢迎点赞+在看👍

有问题欢迎留言，我们一起进步！

#AI学习 #Python爬虫 #模拟登录 #验证码 #Selenium

爬虫教程

#Python #爬虫 #模拟登录 #验证码

12-Python爬虫高阶：模拟登录与验证码处理

https://yourname.github.io/2026/02/09/12-模拟登录与验证码/

作者

发布于

2026年2月9日

许可协议

上一篇：13-Python爬虫：Scrapy框架入门

下一篇：11-Python爬虫高阶：Selenium处理动态网页