10-Python Scraping, Level Up: Batch Downloads and Paginated Crawling

The previous post covered the basics. This one gets practical: batch-downloading images, crawling across multiple pages, and saving the data locally.

I. Batch Downloading Images

1. Downloading a Single Image

import requests

url = 'https://example.com/image.jpg'

# Method 1: download and save directly
response = requests.get(url)
with open('image.jpg', 'wb') as f:
    f.write(response.content)
print('Download complete!')

# Method 2: send request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
response = requests.get(url, headers=headers)

# Save to disk
with open('image.jpg', 'wb') as f:
    f.write(response.content)
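
If the file is large, response.content holds the whole thing in memory before anything hits disk. Below is a minimal sketch of chunked downloading with requests' stream=True; the URL and file name are placeholders.

import requests

url = 'https://example.com/large_video.mp4'  # placeholder URL

# stream=True defers downloading the body until we iterate over it
response = requests.get(url, stream=True, timeout=30)
with open('large_video.mp4', 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):
        if chunk:            # skip keep-alive chunks
            f.write(chunk)
print('Download complete!')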

2. Batch Downloading Multiple Images

import requests
import os

# Create the output folder
os.makedirs('images', exist_ok=True)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

# List of image URLs (example)
image_urls = [
    'https://example.com/img1.jpg',
    'https://example.com/img2.jpg',
    'https://example.com/img3.jpg',
]

for i, url in enumerate(image_urls, 1):
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            # Save the image
            filename = f'images/image_{i}.jpg'
            with open(filename, 'wb') as f:
                f.write(response.content)
            print(f'✅ Image {i} downloaded: {filename}')
        else:
            print(f'❌ Image {i} failed, status code: {response.status_code}')
    except Exception as e:
        print(f'❌ Image {i} error: {e}')

print('\n🎉 All downloads finished!')
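
Downloading one file at a time is fine for a handful of images. If you need more throughput and the target server tolerates a few parallel connections, a thread pool is one option. This is a sketch, not part of the original example; the URLs are placeholders and max_workers is deliberately kept small.

import os
import requests
from concurrent.futures import ThreadPoolExecutor

os.makedirs('images', exist_ok=True)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

image_urls = [
    'https://example.com/img1.jpg',  # placeholder URLs
    'https://example.com/img2.jpg',
]

def download(args):
    i, url = args
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        if resp.status_code == 200:
            with open(f'images/image_{i}.jpg', 'wb') as f:
                f.write(resp.content)
            return f'✅ image_{i}.jpg'
        return f'❌ {url} -> status {resp.status_code}'
    except Exception as e:
        return f'❌ {url} -> {e}'

# max_workers=5 keeps the request rate modest
with ThreadPoolExecutor(max_workers=5) as pool:
    for result in pool.map(download, enumerate(image_urls, 1)):
        print(result)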

3. Hands-On: Extracting Image URLs from a Page and Downloading Them

import requests
from bs4 import BeautifulSoup
import os
import time

os.makedirs('wallpapers', exist_ok=True)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

# Target page
url = 'https://example.com/wallpapers'

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'lxml')

# Find every <img> tag
images = soup.find_all('img')

for i, img in enumerate(images, 1):
    src = img.get('src')  # image URL
    if src and src.startswith('http'):
        try:
            # Download the image
            img_response = requests.get(src, headers=headers, timeout=10)
            if img_response.status_code == 200:
                # Build the file name
                ext = src.split('.')[-1].split('?')[0][:4]  # file extension
                filename = f'wallpapers/img_{i}.{ext}'
                with open(filename, 'wb') as f:
                    f.write(img_response.content)
                print(f'✅ {filename}')
            time.sleep(0.5)  # throttle to avoid getting blocked
        except Exception as e:
            print(f'❌ Download failed: {src}, error: {e}')

print('\n🎉 Batch image download finished!')
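
One caveat with the snippet above: src.startswith('http') silently skips relative paths such as /static/a.jpg, and many sites lazy-load images via data-src. A small sketch that handles both, assuming the same placeholder page URL:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

page_url = 'https://example.com/wallpapers'  # placeholder page URL
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

soup = BeautifulSoup(requests.get(page_url, headers=headers, timeout=10).text, 'lxml')

for img in soup.find_all('img'):
    src = img.get('src') or img.get('data-src')  # some sites lazy-load via data-src
    if not src:
        continue
    # urljoin resolves relative paths against the page URL
    print(urljoin(page_url, src))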

II. Multi-Page (Paginated) Crawling

1. Basic Pagination

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

# Base URL (assuming the page parameter is ?page=1, ?page=2, ...)
base_url = 'https://example.com/articles?page='

for page in range(1, 11):  # crawl the first 10 pages
    url = base_url + str(page)
    print(f'Crawling page {page}: {url}')

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')

    # Extract this page's content
    articles = soup.find_all('article')
    for article in articles:
        title = article.find('h2').text.strip()
        print(f'  - {title}')
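
Hard-coding range(1, 11) works when you know the page count. When you don't, a common pattern is to follow the "next page" link until it disappears. The a.next selector below is an assumption about the target site's markup; adjust it to whatever the real pagination link looks like.

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
url = 'https://example.com/articles?page=1'  # placeholder starting page

while url:
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')

    for article in soup.find_all('article'):
        h2 = article.find('h2')
        if h2:
            print(' -', h2.text.strip())

    # Stop when the "next page" link disappears (selector is an assumption)
    next_link = soup.select_one('a.next')
    url = urljoin(url, next_link['href']) if next_link else None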

2. Hands-On: Scraping Douban Movie Top 250

import requests
from bs4 import BeautifulSoup
import csv
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

# Save to CSV
with open('douban_top250.csv', 'w', encoding='utf-8-sig', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['rank', 'title', 'rating', 'votes', 'link'])

    # The Top 250 spans 10 pages, 25 movies each; the start parameter steps by 25
    for page in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={page}&filter='
        print(f'Crawling: {url}')

        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'lxml')

        # Extract the movie entries
        movies = soup.find_all('div', class_='item')
        for movie in movies:
            # Rank
            rank = movie.find('em').text

            # Title (append the alternative titles if present)
            title = movie.find('span', class_='title').text
            title_en = movie.find('span', class_='other')
            if title_en:
                title += title_en.text[2:]  # strip the leading " / "

            # Rating
            rating = movie.find('span', class_='rating_num').text

            # Number of ratings (the page text is "xxx人评价")
            votes = movie.find('span', text=lambda t: t and '人评价' in t).text
            votes = votes.replace('人评价', '')

            # Detail-page link
            link = movie.find('a')['href']

            # Write one row to the CSV
            writer.writerow([rank, title, rating, votes, link])
            print(f'  ✅ {rank}. {title} - {rating}')

        time.sleep(1)  # throttle to avoid getting blocked

print('\n🎉 Douban Top 250 scraped! Data saved to douban_top250.csv')

3. URL Parameter Encoding

from urllib.parse import urlencode

params = {
    'wd': 'Python教程',  # search keyword
    'pn': 5              # page number
}

base_url = 'https://www.baidu.com/s?'

# urlencode percent-encodes the parameters automatically
url = base_url + urlencode(params)
print(url)
# Output: https://www.baidu.com/s?wd=Python%E6%95%99%E7%A8%8B&pn=5
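
requests can also build and encode the query string for you through the params argument, which saves the manual urlencode step:

import requests

# requests assembles and percent-encodes the query string from params=
response = requests.get(
    'https://www.baidu.com/s',
    params={'wd': 'Python教程', 'pn': 5},
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'},
    timeout=10,
)
print(response.url)  # the fully encoded URL that was actually requested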

III. Data Storage

1. Saving as JSON

import json

data = [
    {'name': '张三', 'age': 25, 'city': '北京'},
    {'name': '李四', 'age': 30, 'city': '上海'},
    {'name': '王五', 'age': 28, 'city': '广州'},
]

# Write JSON (ensure_ascii=False keeps non-ASCII text readable in the file)
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

# Read JSON
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
print(data)
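
A variation worth knowing for long crawls: JSON Lines (one JSON object per line). Records can be appended as they are scraped, so a crash doesn't lose the whole file. A minimal sketch with made-up data:

import json

# Append each scraped record as one JSON object per line
record = {'name': '张三', 'age': 25, 'city': '北京'}
with open('data.jsonl', 'a', encoding='utf-8') as f:
    f.write(json.dumps(record, ensure_ascii=False) + '\n')

# Read it back line by line
with open('data.jsonl', 'r', encoding='utf-8') as f:
    rows = [json.loads(line) for line in f if line.strip()]
print(rows)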

2. Saving as CSV

import csv

# Write CSV (utf-8-sig adds a BOM so Excel displays non-ASCII text correctly)
with open('data.csv', 'w', encoding='utf-8-sig', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'age', 'city'])  # header row
    writer.writerow(['张三', 25, '北京'])
    writer.writerow(['李四', 30, '上海'])

# Read CSV
with open('data.csv', 'r', encoding='utf-8-sig') as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)
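
When your scraped records are already dicts (as in the job example later), csv.DictWriter maps keys to columns directly. A short sketch with made-up rows:

import csv

rows = [
    {'name': '张三', 'age': 25, 'city': '北京'},
    {'name': '李四', 'age': 30, 'city': '上海'},
]

# DictWriter writes one column per dict key, which suits scraped records well
with open('people.csv', 'w', encoding='utf-8-sig', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['name', 'age', 'city'])
    writer.writeheader()
    writer.writerows(rows)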

3. Saving as TXT

# Append to a file
with open('log.txt', 'a', encoding='utf-8') as f:
    f.write('This is a log entry\n')

# Overwrite the file
with open('article.txt', 'w', encoding='utf-8') as f:
    f.write('This is the article body...\n')

4. Saving to Excel

import pandas as pd

data = {
    'name': ['张三', '李四', '王五'],
    'age': [25, 30, 28],
    'city': ['北京', '上海', '广州']
}

df = pd.DataFrame(data)
df.to_excel('data.xlsx', index=False)  # needs openpyxl: pip install openpyxl

# Read it back
df = pd.read_excel('data.xlsx')
print(df)
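
If you want several result sets in one workbook, pandas' ExcelWriter can write each DataFrame to its own sheet. The sample rows below are made up for illustration:

import pandas as pd

movies = pd.DataFrame({'title': ['肖申克的救赎'], 'rating': [9.7]})
jobs = pd.DataFrame({'title': ['Python工程师'], 'salary': ['20k-35k']})

# One workbook, one sheet per DataFrame
with pd.ExcelWriter('scraped_data.xlsx') as writer:
    movies.to_excel(writer, sheet_name='movies', index=False)
    jobs.to_excel(writer, sheet_name='jobs', index=False)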

IV. Putting It All Together: Scraping Job Listings

import requests
from bs4 import BeautifulSoup
import json
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

def parse_job(job_url):
    """Parse a single job detail page."""
    try:
        response = requests.get(job_url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'lxml')

        title = soup.find('h1').text.strip() if soup.find('h1') else ''
        salary = soup.find('span', class_='salary')
        salary = salary.text if salary else ''
        company = soup.find('a', class_='company')
        company = company.text if company else ''

        return {'title': title, 'salary': salary, 'company': company}
    except Exception:
        return None

# Job listing pages
base_url = 'https://www.liepin.com/zhaopin?pn='

all_jobs = []

for page in range(1, 6):  # first 5 pages
    url = base_url + str(page)
    print(f'Crawling page {page}...')

    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')

    # Find the job cards
    jobs = soup.find_all('div', class_='job-card')
    for job in jobs:
        title = job.find('h3').text.strip()
        salary = job.find('span', class_='salary').text.strip()
        link = job.find('a')['href']
        company = job.find('a', class_='company').text.strip()

        job_info = {
            'title': title,
            'salary': salary,
            'company': company,
            'link': link
        }
        all_jobs.append(job_info)
        print(f'  ✅ {title} - {salary}')

    time.sleep(1)

# Save the results
with open('jobs.json', 'w', encoding='utf-8') as f:
    json.dump(all_jobs, f, ensure_ascii=False, indent=2)

print(f'\n🎉 Scraped {len(all_jobs)} jobs in total!')
print('Data saved to jobs.json')
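
To tie this back to the storage section: since all_jobs is a list of dicts, pandas can turn it into a table and export CSV or Excel in one call each. This sketch assumes the script above has already populated all_jobs; to_excel additionally needs openpyxl installed.

import pandas as pd

# Assumes all_jobs was filled by the loop above
df = pd.DataFrame(all_jobs)
df.to_csv('jobs.csv', index=False, encoding='utf-8-sig')  # Excel-friendly CSV
df.to_excel('jobs.xlsx', index=False)                     # needs: pip install openpyxl
print(df.head())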

V. Anti-Scraping Considerations

Anti-scraping measure        Countermeasure
User-Agent checks            Rotate the User-Agent on each request
IP bans / rate limits        Use proxy IPs
Login requirements           Keep cookies (a session)
CAPTCHAs                     Solve manually or via a third-party API
Request-frequency limits     Add delays with time.sleep()
AJAX / dynamic loading       Use Selenium (covered in a later post)

Using Proxy IPs

import requests

# Proxy addresses (example values; a working proxy usually comes from a paid service)
proxies = {
    'http': 'http://123.45.67.89:8080',
    'https': 'http://123.45.67.89:8080',
}

url = 'https://example.com'  # target URL
response = requests.get(url, proxies=proxies, timeout=10)
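
The table above also mentions rotating the User-Agent per request. A minimal sketch using random.choice over a small UA pool (the strings are shortened examples; in practice you would keep a longer, up-to-date list):

import random
import requests

# A small pool of desktop User-Agent strings to rotate through
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
]

url = 'https://example.com'  # placeholder URL
# Pick a different UA for each request
headers = {'User-Agent': random.choice(user_agents)}
response = requests.get(url, headers=headers, timeout=10)
print(response.status_code)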

VI. Wrapping Up

What this post covered:

✅ Batch downloading images/files
✅ Crawling across multiple pages
✅ Data storage (JSON/CSV/Excel)
✅ Basic anti-scraping countermeasures

Next up: Selenium for dynamic pages (JS-rendered content), simulated login, and handling CAPTCHAs.

If this post helped you, a like and a share are much appreciated 👍

Questions are welcome in the comments; let's learn together!

#AILearning #PythonScraping #BatchDownload #Pagination #DataStorage

