import json
import time

import requests
from bs4 import BeautifulSoup
# Browser-like request headers so the target site serves normal HTML
# instead of rejecting the scraper as a bot.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36'
    ),
}
def parse_job(job_url):
    """Fetch and parse a single job-detail page.

    Args:
        job_url: Absolute URL of the job posting.

    Returns:
        A dict with 'title', 'salary' and 'company' keys (missing page
        elements become ''), or None when the HTTP request fails.
    """
    try:
        response = requests.get(job_url, headers=headers, timeout=10)
        # Treat 4xx/5xx responses as failures instead of parsing error pages.
        response.raise_for_status()
    except requests.RequestException:
        # Narrowed from the original bare `except:`, which also swallowed
        # programming errors; network/timeout/HTTP failures still yield None.
        return None

    soup = BeautifulSoup(response.text, 'lxml')

    def _text(node):
        # Stripped text of an element, '' when the element is absent.
        # (The original looked up <h1> twice and did not strip salary/company.)
        return node.text.strip() if node else ''

    return {
        'title': _text(soup.find('h1')),
        'salary': _text(soup.find('span', class_='salary')),
        'company': _text(soup.find('a', class_='company')),
    }
base_url = 'https://www.liepin.com/zhaopin?pn='

all_jobs = []

# Crawl listing pages 1-5 and collect the basic fields from each job card.
for page in range(1, 6):
    url = base_url + str(page)
    print(f'正在爬取第 {page} 页...')
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Fail fast on 4xx/5xx instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as exc:
        # A single bad page previously aborted the whole crawl; skip it.
        print(f'请求失败,跳过第 {page} 页: {exc}')
        continue

    soup = BeautifulSoup(response.text, 'lxml')
    jobs = soup.find_all('div', class_='job-card')
    for job in jobs:
        title_el = job.find('h3')
        salary_el = job.find('span', class_='salary')
        link_el = job.find('a')
        company_el = job.find('a', class_='company')
        # Cards with a different layout used to raise AttributeError on
        # `.text` of a missing element and crash the run — skip them instead.
        if not (title_el and salary_el and link_el and company_el):
            continue

        title = title_el.text.strip()
        salary = salary_el.text.strip()
        job_info = {
            '标题': title,
            '薪资': salary,
            '公司': company_el.text.strip(),
            # .get avoids KeyError when the anchor has no href attribute.
            '链接': link_el.get('href', ''),
        }
        all_jobs.append(job_info)
        print(f' ✅ {title} - {salary}')
    time.sleep(1)  # be polite: throttle requests between listing pages
# Persist every collected job as pretty-printed, UTF-8 JSON.
serialized = json.dumps(all_jobs, ensure_ascii=False, indent=2)
with open('jobs.json', 'w', encoding='utf-8') as output_file:
    output_file.write(serialized)
# Final operator-facing summary of the crawl.
job_count = len(all_jobs)
print(f'\n🎉 共爬取 {job_count} 个职位!')
print('数据已保存到 jobs.json')