1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
| import time from selenium import webdriver from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from webdriver_manager.chrome import ChromeDriverManager from selenium.common.exceptions import TimeoutException
def scrape_comments(product_url, max_pages=5):
    """Scrape user reviews from a product page using headless Chrome.

    Loads ``product_url``, clicks the element with class ``tab-comment``,
    then visits up to ``max_pages`` pages of ``comment-item`` elements,
    clicking the ``next-btn`` element to move between pages. The browser
    is always shut down before the function returns or raises.

    Args:
        product_url: URL of the product page to open.
        max_pages: Maximum number of review pages to visit.

    Returns:
        A list of dicts, in page order, with the keys ``'用户'`` (text of
        the ``user-name`` element), ``'评分'`` (``title`` attribute of the
        ``rating`` element) and ``'内容'`` (text of the ``comment-text``
        element). Review items missing any of these elements are skipped.

    Raises:
        selenium.common.exceptions.TimeoutException: If the review tab is
            not clickable, or no ``comment-item`` is present on a page,
            within 10 seconds.
    """
    # Imported locally so this function does not rely on changes to the
    # file-level import header.
    from selenium.common.exceptions import WebDriverException

    # ``webdriver.ChromeOptions`` is reachable through the already-imported
    # ``webdriver`` module; the bare name ``Options`` is not among the
    # imports visible at the top of the file and would raise NameError.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--window-size=1920,1080')

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )
    all_comments = []
    try:
        driver.get(product_url)
        print(f'正在爬取: {driver.title}')

        # Switch to the review tab once it becomes clickable.
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'tab-comment'))
        ).click()

        for page in range(1, max_pages + 1):
            print(f'正在爬取第 {page} 页...')
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'comment-item'))
            )

            for comment in driver.find_elements(By.CLASS_NAME, 'comment-item'):
                try:
                    text = comment.find_element(By.CLASS_NAME, 'comment-text').text
                    user = comment.find_element(By.CLASS_NAME, 'user-name').text
                    rating = comment.find_element(
                        By.CLASS_NAME, 'rating'
                    ).get_attribute('title')
                except WebDriverException:
                    # Best effort: skip items that are incomplete or went
                    # stale, without swallowing KeyboardInterrupt/SystemExit.
                    continue
                all_comments.append({
                    '用户': user,
                    '评分': rating,
                    '内容': text,
                })
            print(f' ✅ 已获取 {len(all_comments)} 条评论')

            # The requested number of pages has been read; do not click
            # "next" and sleep for a page that will never be scraped.
            if page == max_pages:
                break

            try:
                next_btn = driver.find_element(By.CLASS_NAME, 'next-btn')
                # get_attribute() may return None; treat that as "no classes".
                btn_classes = next_btn.get_attribute('class') or ''
                if 'disabled' in btn_classes:
                    print('✅ 已到达最后一页')
                    break
                next_btn.click()
            except WebDriverException:
                # Button missing or not clickable: treat as end of the list.
                print('✅ 没有下一页了')
                break

            # Fixed pause to let the next page of reviews load; the old
            # items are still in the DOM, so the presence wait above cannot
            # detect the page change by itself.
            time.sleep(2)

        return all_comments
    finally:
        driver.quit()
import json

# Scrape the first three review pages of the hard-coded product URL.
comments = scrape_comments(
    'https://item.jd.com/100000000.html',
    max_pages=3,
)
print(f'\n🎉 共爬取 {len(comments)} 条评论!')

# Save the reviews as indented UTF-8 JSON; ensure_ascii=False keeps the
# non-ASCII text readable instead of \u-escaped.
with open('comments.json', 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(comments, ensure_ascii=False, indent=2))
|