10-Python Scraping, Level Up: Batch Downloads and Paginated Crawling

The previous post covered the basics. This one gets practical: batch-downloading images, crawling across multiple pages, and saving the data locally.

I. Batch Downloading Images

1. Downloading a Single Image

import requests

url = 'https://example.com/image.jpg'

# Method 1: download and save directly
response = requests.get(url)
with open('image.jpg', 'wb') as f:
    f.write(response.content)
print('Download complete!')

# Method 2: send request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
response = requests.get(url, headers=headers)

# Save to disk
with open('image.jpg', 'wb') as f:
    f.write(response.content)
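
If the file is large, response.content holds the whole thing in memory before anything hits disk. Below is a minimal sketch of chunked downloading with requests' stream=True; the URL and file name are placeholders.

import requests

url = 'https://example.com/large_video.mp4'  # placeholder URL

# stream=True defers downloading the body until we iterate over it
response = requests.get(url, stream=True, timeout=30)
with open('large_video.mp4', 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):
        if chunk:            # skip keep-alive chunks
            f.write(chunk)
print('Download complete!')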

2. Batch Downloading Multiple Images

import requests
import os

# Create the output folder
os.makedirs('images', exist_ok=True)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

# List of image URLs (example)
image_urls = [
    'https://example.com/img1.jpg',
    'https://example.com/img2.jpg',
    'https://example.com/img3.jpg',
]

for i, url in enumerate(image_urls, 1):
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            # Save the image
            filename = f'images/image_{i}.jpg'
            with open(filename, 'wb') as f:
                f.write(response.content)
            print(f'✅ Image {i} downloaded: {filename}')
        else:
            print(f'❌ Image {i} failed, status code: {response.status_code}')
    except Exception as e:
        print(f'❌ Image {i} error: {e}')

print('\n🎉 All downloads finished!')
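
Downloading one file at a time is fine for a handful of images. If you need more throughput and the target server tolerates a few parallel connections, a thread pool is one option. This is a sketch, not part of the original example; the URLs are placeholders and max_workers is deliberately kept small.

import os
import requests
from concurrent.futures import ThreadPoolExecutor

os.makedirs('images', exist_ok=True)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

image_urls = [
    'https://example.com/img1.jpg',  # placeholder URLs
    'https://example.com/img2.jpg',
]

def download(args):
    i, url = args
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        if resp.status_code == 200:
            with open(f'images/image_{i}.jpg', 'wb') as f:
                f.write(resp.content)
            return f'✅ image_{i}.jpg'
        return f'❌ {url} -> status {resp.status_code}'
    except Exception as e:
        return f'❌ {url} -> {e}'

# max_workers=5 keeps the request rate modest
with ThreadPoolExecutor(max_workers=5) as pool:
    for result in pool.map(download, enumerate(image_urls, 1)):
        print(result)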

3. Hands-On: Extracting Image URLs from a Page and Downloading Them

import requests
from bs4 import BeautifulSoup
import os
import time

os.makedirs('wallpapers', exist_ok=True)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

# Target page
url = 'https://example.com/wallpapers'

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'lxml')

# Find every <img> tag
images = soup.find_all('img')

for i, img in enumerate(images, 1):
    src = img.get('src')  # image URL
    if src and src.startswith('http'):
        try:
            # Download the image
            img_response = requests.get(src, headers=headers, timeout=10)
            if img_response.status_code == 200:
                # Build the file name
                ext = src.split('.')[-1].split('?')[0][:4]  # file extension
                filename = f'wallpapers/img_{i}.{ext}'
                with open(filename, 'wb') as f:
                    f.write(img_response.content)
                print(f'✅ {filename}')
            time.sleep(0.5)  # throttle to avoid getting blocked
        except Exception as e:
            print(f'❌ Download failed: {src}, error: {e}')

print('\n🎉 Batch image download finished!')
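
One caveat with the snippet above: src.startswith('http') silently skips relative paths such as /static/a.jpg, and many sites lazy-load images via data-src. A small sketch that handles both, assuming the same placeholder page URL:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

page_url = 'https://example.com/wallpapers'  # placeholder page URL
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

soup = BeautifulSoup(requests.get(page_url, headers=headers, timeout=10).text, 'lxml')

for img in soup.find_all('img'):
    src = img.get('src') or img.get('data-src')  # some sites lazy-load via data-src
    if not src:
        continue
    # urljoin resolves relative paths against the page URL
    print(urljoin(page_url, src))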

II. Multi-Page (Paginated) Crawling

1. Basic Pagination

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

# Base URL (assuming the page parameter is ?page=1, ?page=2, ...)
base_url = 'https://example.com/articles?page='

for page in range(1, 11):  # crawl the first 10 pages
    url = base_url + str(page)
    print(f'Crawling page {page}: {url}')

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')

    # Extract this page's content
    articles = soup.find_all('article')
    for article in articles:
        title = article.find('h2').text.strip()
        print(f'  - {title}')
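
Hard-coding range(1, 11) works when you know the page count. When you don't, a common pattern is to follow the "next page" link until it disappears. The a.next selector below is an assumption about the target site's markup; adjust it to whatever the real pagination link looks like.

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
url = 'https://example.com/articles?page=1'  # placeholder starting page

while url:
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')

    for article in soup.find_all('article'):
        h2 = article.find('h2')
        if h2:
            print(' -', h2.text.strip())

    # Stop when the "next page" link disappears (selector is an assumption)
    next_link = soup.select_one('a.next')
    url = urljoin(url, next_link['href']) if next_link else None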

2. Hands-On: Scraping Douban Movie Top 250

import requests
from bs4 import BeautifulSoup
import csv
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

# Save to CSV
with open('douban_top250.csv', 'w', encoding='utf-8-sig', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['rank', 'title', 'rating', 'votes', 'link'])

    # The Top 250 spans 10 pages, 25 movies each; the start parameter steps by 25
    for page in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={page}&filter='
        print(f'Crawling: {url}')

        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'lxml')

        # Extract the movie entries
        movies = soup.find_all('div', class_='item')
        for movie in movies:
            # Rank
            rank = movie.find('em').text

            # Title (append the alternative titles if present)
            title = movie.find('span', class_='title').text
            title_en = movie.find('span', class_='other')
            if title_en:
                title += title_en.text[2:]  # strip the leading " / "

            # Rating
            rating = movie.find('span', class_='rating_num').text

            # Number of ratings (the page text is "xxx人评价")
            votes = movie.find('span', text=lambda t: t and '人评价' in t).text
            votes = votes.replace('人评价', '')

            # Detail-page link
            link = movie.find('a')['href']

            # Write one row to the CSV
            writer.writerow([rank, title, rating, votes, link])
            print(f'  ✅ {rank}. {title} - {rating}')

        time.sleep(1)  # throttle to avoid getting blocked

print('\n🎉 Douban Top 250 scraped! Data saved to douban_top250.csv')

3. URL Parameter Encoding

from urllib.parse import urlencode

params = {
    'wd': 'Python教程',  # search keyword
    'pn': 5              # page number
}

base_url = 'https://www.baidu.com/s?'

# urlencode percent-encodes the parameters automatically
url = base_url + urlencode(params)
print(url)
# Output: https://www.baidu.com/s?wd=Python%E6%95%99%E7%A8%8B&pn=5
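
requests can also build and encode the query string for you through the params argument, which saves the manual urlencode step:

import requests

# requests assembles and percent-encodes the query string from params=
response = requests.get(
    'https://www.baidu.com/s',
    params={'wd': 'Python教程', 'pn': 5},
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'},
    timeout=10,
)
print(response.url)  # the fully encoded URL that was actually requested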

III. Data Storage

1. Saving as JSON

import json

data = [
    {'name': '张三', 'age': 25, 'city': '北京'},
    {'name': '李四', 'age': 30, 'city': '上海'},
    {'name': '王五', 'age': 28, 'city': '广州'},
]

# Write JSON (ensure_ascii=False keeps non-ASCII text readable in the file)
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

# Read JSON
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
print(data)
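
A variation worth knowing for long crawls: JSON Lines (one JSON object per line). Records can be appended as they are scraped, so a crash doesn't lose the whole file. A minimal sketch with made-up data:

import json

# Append each scraped record as one JSON object per line
record = {'name': '张三', 'age': 25, 'city': '北京'}
with open('data.jsonl', 'a', encoding='utf-8') as f:
    f.write(json.dumps(record, ensure_ascii=False) + '\n')

# Read it back line by line
with open('data.jsonl', 'r', encoding='utf-8') as f:
    rows = [json.loads(line) for line in f if line.strip()]
print(rows)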

2. Saving as CSV

import csv

# Write CSV (utf-8-sig adds a BOM so Excel displays non-ASCII text correctly)
with open('data.csv', 'w', encoding='utf-8-sig', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'age', 'city'])  # header row
    writer.writerow(['张三', 25, '北京'])
    writer.writerow(['李四', 30, '上海'])

# Read CSV
with open('data.csv', 'r', encoding='utf-8-sig') as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)
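
When your scraped records are already dicts (as in the job example later), csv.DictWriter maps keys to columns directly. A short sketch with made-up rows:

import csv

rows = [
    {'name': '张三', 'age': 25, 'city': '北京'},
    {'name': '李四', 'age': 30, 'city': '上海'},
]

# DictWriter writes one column per dict key, which suits scraped records well
with open('people.csv', 'w', encoding='utf-8-sig', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['name', 'age', 'city'])
    writer.writeheader()
    writer.writerows(rows)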

3. Saving as TXT

# Append to a file
with open('log.txt', 'a', encoding='utf-8') as f:
    f.write('This is a log entry\n')

# Overwrite the file
with open('article.txt', 'w', encoding='utf-8') as f:
    f.write('This is the article body...\n')

4. Saving to Excel

import pandas as pd

data = {
    'name': ['张三', '李四', '王五'],
    'age': [25, 30, 28],
    'city': ['北京', '上海', '广州']
}

df = pd.DataFrame(data)
df.to_excel('data.xlsx', index=False)  # needs openpyxl: pip install openpyxl

# Read it back
df = pd.read_excel('data.xlsx')
print(df)
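
If you want several result sets in one workbook, pandas' ExcelWriter can write each DataFrame to its own sheet. The sample rows below are made up for illustration:

import pandas as pd

movies = pd.DataFrame({'title': ['肖申克的救赎'], 'rating': [9.7]})
jobs = pd.DataFrame({'title': ['Python工程师'], 'salary': ['20k-35k']})

# One workbook, one sheet per DataFrame
with pd.ExcelWriter('scraped_data.xlsx') as writer:
    movies.to_excel(writer, sheet_name='movies', index=False)
    jobs.to_excel(writer, sheet_name='jobs', index=False)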

IV. Putting It All Together: Scraping Job Listings

import requests
from bs4 import BeautifulSoup
import json
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

def parse_job(job_url):
    """Parse a single job detail page."""
    try:
        response = requests.get(job_url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'lxml')

        title = soup.find('h1').text.strip() if soup.find('h1') else ''
        salary = soup.find('span', class_='salary')
        salary = salary.text if salary else ''
        company = soup.find('a', class_='company')
        company = company.text if company else ''

        return {'title': title, 'salary': salary, 'company': company}
    except Exception:
        return None

# Job listing pages
base_url = 'https://www.liepin.com/zhaopin?pn='

all_jobs = []

for page in range(1, 6):  # first 5 pages
    url = base_url + str(page)
    print(f'Crawling page {page}...')

    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')

    # Find the job cards
    jobs = soup.find_all('div', class_='job-card')
    for job in jobs:
        title = job.find('h3').text.strip()
        salary = job.find('span', class_='salary').text.strip()
        link = job.find('a')['href']
        company = job.find('a', class_='company').text.strip()

        job_info = {
            'title': title,
            'salary': salary,
            'company': company,
            'link': link
        }
        all_jobs.append(job_info)
        print(f'  ✅ {title} - {salary}')

    time.sleep(1)

# Save the results
with open('jobs.json', 'w', encoding='utf-8') as f:
    json.dump(all_jobs, f, ensure_ascii=False, indent=2)

print(f'\n🎉 Scraped {len(all_jobs)} jobs in total!')
print('Data saved to jobs.json')
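
To tie this back to the storage section: since all_jobs is a list of dicts, pandas can turn it into a table and export CSV or Excel in one call each. This sketch assumes the script above has already populated all_jobs; to_excel additionally needs openpyxl installed.

import pandas as pd

# Assumes all_jobs was filled by the loop above
df = pd.DataFrame(all_jobs)
df.to_csv('jobs.csv', index=False, encoding='utf-8-sig')  # Excel-friendly CSV
df.to_excel('jobs.xlsx', index=False)                     # needs: pip install openpyxl
print(df.head())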

V. Anti-Scraping Considerations

Anti-scraping measure        Countermeasure
User-Agent checks            Rotate the User-Agent on each request
IP bans / rate limits        Use proxy IPs
Login requirements           Keep cookies (a session)
CAPTCHAs                     Solve manually or via a third-party API
Request-frequency limits     Add delays with time.sleep()
AJAX / dynamic loading       Use Selenium (covered in a later post)

Using Proxy IPs

import requests

# Proxy addresses (example values; a working proxy usually comes from a paid service)
proxies = {
    'http': 'http://123.45.67.89:8080',
    'https': 'http://123.45.67.89:8080',
}

url = 'https://example.com'  # target URL
response = requests.get(url, proxies=proxies, timeout=10)
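
The table above also mentions rotating the User-Agent per request. A minimal sketch using random.choice over a small UA pool (the strings are shortened examples; in practice you would keep a longer, up-to-date list):

import random
import requests

# A small pool of desktop User-Agent strings to rotate through
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
]

url = 'https://example.com'  # placeholder URL
# Pick a different UA for each request
headers = {'User-Agent': random.choice(user_agents)}
response = requests.get(url, headers=headers, timeout=10)
print(response.status_code)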

VI. Wrapping Up

What this post covered:

✅ Batch downloading images/files
✅ Crawling across multiple pages
✅ Data storage (JSON/CSV/Excel)
✅ Basic anti-scraping countermeasures

Next up: Selenium for dynamic pages (JS-rendered content), simulated login, and handling CAPTCHAs.

If this post helped you, a like and a share are much appreciated 👍

Questions are welcome in the comments; let's learn together!

#AILearning #PythonScraping #BatchDownload #Pagination #DataStorage

