很多数据需要登录后才能访问,这篇教你搞定各种登录验证和验证码!
# 1. POST form login (submit the account and password directly)
import requests

# All requests go through one Session object so that the profile request
# below is sent with whatever cookies the login response set.
session = requests.Session()

login_url = 'https://example.com/login'
data = dict(
    username='your_account',
    password='your_password',
    remember='on',
)

response = session.post(login_url, data=data)

# This listing treats HTTP 200 as "logged in".
login_ok = response.status_code == 200
print('✅ 登录成功!' if login_ok else f'❌ 登录失败,状态码: {response.status_code} ')

# The profile page is requested regardless of the login outcome.
profile = session.get('https://example.com/profile')
print(profile.text)
# 2. Login with custom request headers
import requests

session = requests.Session()

login_url = 'https://example.com/login'

# Browser-like headers that are sent together with the login form.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/120.0.0.0 Safari/537.36'
    ),
    'Referer': 'https://example.com/login',
}
data = dict(
    username='your_account',
    password='your_password',
)

response = session.post(login_url, data=data, headers=headers)

# Success is decided by marker text in the returned page rather than by the
# HTTP status code: either marker being present counts as logged in.
success_markers = ('退出', '欢迎您')
if any(marker in response.text for marker in success_markers):
    print('✅ 登录成功!')
else:
    print('❌ 登录失败')
3. 先获取Cookie再登录 有些网站需要先访问首页获取Cookie:
import requests

session = requests.Session()

# Visit the home page first. The response itself is discarded; the request is
# made only so the session stores the cookies the site hands out.
session.get('https://example.com/')

login_url = 'https://example.com/doLogin'
data = dict(
    username='your_account',
    password='your_password',
)

# The login form is posted through the same session, i.e. with those cookies.
response = session.post(login_url, data=data)
print(response.text)
# 4. Reuse previously saved cookies
import json

import requests

session = requests.Session()


def save_cookies(session, filename):
    """Write the session's cookies to ``filename`` as a JSON object.

    Args:
        session: Object whose ``cookies.get_dict()`` returns the cookies.
        filename: Path of the UTF-8 text file to (over)write.
    """
    cookie_map = session.cookies.get_dict()
    with open(filename, 'w', encoding='utf-8') as fh:
        fh.write(json.dumps(cookie_map))


def load_cookies(session, filename):
    """Read a JSON cookie mapping from ``filename`` into the session.

    Args:
        session: Object whose ``cookies.update()`` accepts the mapping.
        filename: Path of the UTF-8 JSON file written by ``save_cookies``.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
    """
    with open(filename, 'r', encoding='utf-8') as fh:
        stored = json.loads(fh.read())
    session.cookies.update(stored)


# Restore the cookies, then request a page that requires login.
load_cookies(session, 'cookies.json')
response = session.get('https://example.com/profile')
print(response.text)
# Part 2: Simulated login with Selenium
# 1. Basic version
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager


def login_website(url, username, password):
    """Log in to a website through a headless Chrome browser.

    Args:
        url: Address of the login page.
        username: Text typed into the element named ``username``.
        password: Text typed into the element named ``password``.

    Returns:
        The live WebDriver when the page source contains the logout marker
        after submitting; the caller owns it and must call ``quit()``.
        ``None`` when the marker is missing or any step raised, in which case
        the browser has already been closed.
    """
    options = Options()
    options.add_argument('--headless')
    # The original listing never closed this call's parenthesis, which made
    # the whole snippet a syntax error.
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )

    try:
        driver.get(url)
        time.sleep(2)  # fixed pause for the login form to render

        driver.find_element(By.NAME, 'username').send_keys(username)
        driver.find_element(By.NAME, 'password').send_keys(password)
        driver.find_element(By.CLASS_NAME, 'login-btn').click()
        time.sleep(3)  # fixed pause for the post-login page to load

        if '退出' in driver.page_source:
            print('✅ 登录成功!')
            return driver
        print('❌ 登录失败')
    except Exception as e:
        print(f'❌ 登录出错: {e} ')

    # Only failure paths reach this point. Nobody receives the driver in that
    # case, so close the browser instead of leaking a Chrome process.
    driver.quit()
    return None


driver = login_website('https://example.com/login', 'user', 'pass')
# 2. Keep the session after login
import pickle

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


def _new_headless_driver():
    """Start a headless Chrome using a driver binary from webdriver_manager."""
    # add_argument() returns None, so the Options object has to be built
    # first and passed afterwards; chaining the call passes options=None.
    options = Options()
    options.add_argument('--headless')
    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )


def login_and_save_cookies(url, username, password, cookie_file):
    """Open ``url`` and pickle the browser's cookies into ``cookie_file``.

    Args:
        url: Page to open before the cookies are captured.
        username: Accepted for the caller's convenience; see the note below.
        password: Accepted for the caller's convenience; see the note below.
        cookie_file: Path of the pickle file to (over)write.

    Returns:
        The live WebDriver; the caller must call ``quit()`` on it.

    Raises:
        Exception: Whatever Selenium or file I/O raised; the browser is
            closed before the exception propagates.
    """
    driver = _new_headless_driver()
    try:
        driver.get(url)
        # NOTE(review): the listing never submits username/password, so the
        # cookies captured here are those of an anonymous visit. The form
        # filling presumably belongs at this point -- confirm and add it
        # (the basic Selenium login example shows one way).
        with open(cookie_file, 'wb') as f:
            pickle.dump(driver.get_cookies(), f)
        print('✅ Cookies已保存!')
        return driver
    except Exception:
        # The caller never receives the driver on this path.
        driver.quit()
        raise


def load_cookies_and_visit(url, cookie_file):
    """Open ``url`` with the cookies stored in ``cookie_file`` applied.

    Args:
        url: Page to open; cookies are added for its domain.
        cookie_file: Pickle file written by ``login_and_save_cookies``.

    Returns:
        The live WebDriver after the page was refreshed with the cookies.

    Raises:
        Exception: Whatever Selenium or file I/O raised; the browser is
            closed before the exception propagates.
    """
    driver = _new_headless_driver()
    try:
        # A page of the target site is loaded first, and the cookies are
        # added afterwards to that already-open page.
        driver.get(url)
        # SECURITY: unpickling executes code embedded in the file. Only load
        # cookie files this program wrote itself.
        with open(cookie_file, 'rb') as f:
            cookies = pickle.load(f)
        for cookie in cookies:
            driver.add_cookie(cookie)
        driver.refresh()
        return driver
    except Exception:
        driver.quit()
        raise
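三、验证码处理
1. 图片验证码(Tesseract识别)
安装Tesseract

```bash
pip install pytesseract pillow
```

注意:上面的命令只安装了 Python 封装库 pytesseract 和图像库 Pillow,Tesseract OCR 引擎本身需要另外单独安装;下面代码里的 tesseract_cmd 指向的就是引擎的可执行文件。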
# Recognize the captcha
import pytesseract
from PIL import Image

# Location of the Tesseract executable (Windows install path in this
# example); pytesseract runs this program to do the recognition.
TESSERACT_EXE = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
pytesseract.pytesseract.tesseract_cmd = TESSERACT_EXE

image = Image.open('captcha.png')
text = pytesseract.image_to_string(image)

print(f'识别结果: {text} ')
# Pre-process the image to improve the recognition rate
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter


def recognize_captcha(image_path):
    """Clean up a captcha image and return the text Tesseract reads from it.

    The image is converted to greyscale, binarized at a fixed threshold,
    median-filtered and contrast-enhanced. The processed image is also written
    to ``processed_captcha.png`` in the current directory.

    Args:
        image_path: Path of the captcha image file.

    Returns:
        The recognized text with surrounding whitespace stripped.
    """
    threshold = 128

    def binarize(pixel):
        # Pixels brighter than the threshold become white, the rest black.
        return 255 if pixel > threshold else 0

    grey = Image.open(image_path).convert('L')
    denoised = grey.point(binarize).filter(ImageFilter.MedianFilter())
    boosted = ImageEnhance.Contrast(denoised).enhance(2)
    boosted.save('processed_captcha.png')

    raw_text = pytesseract.image_to_string(boosted, config='--psm 6')
    return raw_text.strip()


result = recognize_captcha('captcha.png')
print(f'验证码: {result} ')
# Hands-on: logging in with a simple image captcha
import time

import pytesseract
import requests
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By


def login_with_captcha(url, username, password):
    """Log in to a page that is protected by an image captcha.

    Relies on ``recognize_captcha(image_path)`` from the pre-processing
    example being defined in the same module.

    Args:
        url: Address of the login page.
        username: Text typed into the element named ``username``.
        password: Text typed into the element named ``password``.

    Returns:
        The live WebDriver after the form was submitted (the caller must call
        ``quit()``), or ``None`` if any step raised; the browser is closed in
        that case.
    """
    driver = webdriver.Chrome()

    try:
        driver.get(url)

        captcha_element = driver.find_element(By.ID, 'captcha-img')
        # Capture the captcha exactly as the browser shows it. Downloading
        # the image's src with a separate requests.get() happens outside the
        # browser's session, and captcha endpoints typically answer such a
        # request with a freshly generated image, so the recognized text
        # would not match the one on the page.
        captcha_element.screenshot('captcha.png')

        captcha_text = recognize_captcha('captcha.png')
        print(f'识别到验证码: {captcha_text} ')

        driver.find_element(By.NAME, 'username').send_keys(username)
        driver.find_element(By.NAME, 'password').send_keys(password)
        driver.find_element(By.NAME, 'captcha').send_keys(captcha_text)
        driver.find_element(By.CLASS_NAME, 'login-btn').click()

        time.sleep(2)  # fixed pause for the post-login page to load
        return driver
    except Exception as e:
        print(f'出错: {e} ')
        # The caller gets None, so the browser must be closed here.
        driver.quit()
        return None
2. 第三方打码平台(推荐) 对于复杂验证码,使用第三方平台识别率更高:
# Chaojiying
import base64

import requests


def recognize_with_chaojiying(image_path, user, password, soft_id, codetype=1902):
    """Send a captcha image to the Chaojiying service and return its answer.

    Args:
        image_path: Path of the captcha image file.
        user: Chaojiying account name.
        password: Chaojiying account password.
        soft_id: Software ID generated in the Chaojiying user centre.
        codetype: Captcha type code from the Chaojiying price list. The
            default 1902 is the value used in the vendor's demo -- confirm
            the right code for your captcha.

    Returns:
        The recognized text (``pic_str``), or ``None`` when the request
        failed or the service reported an error.
    """
    with open(image_path, 'rb') as f:
        img_data = base64.b64encode(f.read()).decode('utf-8')

    # NOTE(review): per the vendor's API description the base64 image goes in
    # ``file_base64`` and ``codetype`` is mandatory; the original listing sent
    # it as ``imgdata`` without a codetype. Confirm against the current docs.
    data = {
        'user': user,
        'pass': password,
        'softid': soft_id,
        'codetype': codetype,
        'file_base64': img_data,
    }

    # NOTE(review): this URL is plain HTTP, so the account password travels
    # unencrypted; switch to HTTPS if the service offers it.
    try:
        response = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=data,
            timeout=30,
        )
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        # Network failure, timeout, or a body that is not JSON.
        print(f'识别失败: {e} ')
        return None

    if result.get('err_no') == 0:
        return result.get('pic_str')
    print(f'识别失败: {result} ')
    return None


code = recognize_with_chaojiying('captcha.png', '用户名', '密码', '软件ID')
print(f'验证码: {code} ')
# Alibaba Cloud captcha service
# NOTE(review): the module name ``alibabacloud_cr20220701`` and the
# ``predict_body`` method could not be matched to a documented Alibaba Cloud
# OCR SDK -- verify both against the official SDK reference before use.
from alibabacloud_cr20220701.client import Client
from alibabacloud_tea_openapi.models import Config

# Credentials and endpoint are placeholders to be replaced with real values.
config = Config(
    access_key_id='你的ID',
    access_key_secret='你的密钥',
    endpoint='ocr.cn-shanghai.aliyuncs.com'
)

client = Client(config)

# Reads the whole image into memory; the file object is not closed explicitly.
body = bytes(open('captcha.png', 'rb').read())

response = client.predict_body({
    'image': body
})
print(response.body)
四、滑块验证码处理 滑块验证码需要模拟人类滑动轨迹:
# 1. Basic idea
import random
import time

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By


def move_to_slider(driver, slider_element, distance):
    """Drag the slider handle to the right by ``distance`` pixels.

    Args:
        driver: WebDriver that controls the page.
        slider_element: WebElement of the draggable slider handle.
        distance: Horizontal distance to slide, in pixels.
    """
    slider_element.click()

    tracks = get_slide_tracks(distance)

    ActionChains(driver).click_and_hold(slider_element).perform()
    try:
        for step in tracks:
            # move_by_offset takes (xoffset, yoffset); the keyword names
            # ``x=`` / ``y=`` used by the original listing do not exist and
            # raise TypeError. A small random vertical jitter is added to
            # every step.
            ActionChains(driver).move_by_offset(
                step, random.randint(-2, 2)
            ).perform()
            time.sleep(random.randint(10, 30) / 1000)  # 10-30 ms per step
    finally:
        # Let go of the mouse button even if a step failed.
        ActionChains(driver).release().perform()


def get_slide_tracks(distance):
    """Generate per-step pixel offsets whose sum is exactly ``distance``.

    Steps are larger (5-10 px) for the first 70 % of the way and smaller
    (2-5 px) afterwards, i.e. fast first and slow at the end.

    Args:
        distance: Total distance in pixels; rounded to the nearest integer
            because pointer offsets must be whole pixels.

    Returns:
        A list of positive integer offsets. Empty when ``distance`` rounds to
        zero or less.
    """
    distance = int(round(distance))
    slow_from = distance * 0.7

    tracks = []
    current = 0
    while current < distance:
        if current < slow_from:
            step = random.randint(5, 10)
        else:
            step = random.randint(2, 5)
        # Clamp the last step. The original added the full random step and
        # only then checked ``current < distance``, which can never be true
        # after the loop, so the track overshot the gap by up to 4 px.
        step = min(step, distance - current)
        tracks.append(step)
        current += step

    return tracks
# 2. Complete slider-captcha handling
import random
import time

import numpy as np
import requests
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def _best_offset(bg_array, slider_array):
    """Return the x offset at which the slider differs least from the background.

    Both arguments are 2-D greyscale arrays with the same number of rows.
    Returns 0 when the slider is wider than the background.
    """
    # Work in a signed type: the images are uint8, and uint8 subtraction
    # wraps around (10 - 20 == 246), which makes np.abs a no-op and the
    # comparison meaningless.
    bg = np.asarray(bg_array, dtype=np.int32)
    piece = np.asarray(slider_array, dtype=np.int32)
    piece_width = piece.shape[1]

    min_diff = float('inf')
    best_x = 0
    # +1 so that the right-most position, where the piece touches the
    # background's right edge, is tried as well.
    for x in range(bg.shape[1] - piece_width + 1):
        diff = np.abs(bg[:, x:x + piece_width] - piece).sum()
        if diff < min_diff:
            min_diff = diff
            best_x = x
    return best_x


def get_distance(bg_image_url, slider_image_url):
    """Compute how far the slider has to travel to reach the gap.

    Downloads both images (saved as ``bg.jpg`` and ``slider.png`` in the
    current directory), converts them to greyscale and slides the piece across
    the background looking for the smallest absolute pixel difference.

    Args:
        bg_image_url: URL of the background image containing the gap.
        slider_image_url: URL of the slider piece image.

    Returns:
        The x offset, in image pixels, of the best match.

    Raises:
        requests.RequestException: If a download fails or returns an HTTP
            error status.
        ValueError: Raised by NumPy if the two images differ in height.
    """
    for image_url, file_name in ((bg_image_url, 'bg.jpg'),
                                 (slider_image_url, 'slider.png')):
        reply = requests.get(image_url, timeout=10)
        reply.raise_for_status()
        with open(file_name, 'wb') as f:
            f.write(reply.content)

    # The context managers close the image files once the pixel data has
    # been copied into arrays.
    with Image.open('bg.jpg') as bg, Image.open('slider.png') as slider:
        bg_array = np.array(bg.convert('L'))
        slider_array = np.array(slider.convert('L'))

    return _best_offset(bg_array, slider_array)


def slide_verify(driver, url):
    """Open ``url`` and try to pass its slider captcha.

    Relies on ``move_to_slider(driver, slider_element, distance)`` from the
    basic-idea example being defined in the same module.

    Args:
        driver: WebDriver that controls the browser.
        url: Page that shows the slider captcha.

    Returns:
        ``True`` if the page source contains the success marker after
        sliding, ``False`` otherwise or if any step raised.
    """
    driver.get(url)
    time.sleep(2)  # fixed pause for the captcha widget to render

    try:
        slider = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'slider-btn'))
        )

        bg_url = driver.find_element(By.CLASS_NAME, 'bg-image').get_attribute('src')
        slider_url = driver.find_element(By.CLASS_NAME, 'slider-image').get_attribute('src')

        # NOTE(review): the distance is measured in image pixels; if the page
        # displays the image scaled, it presumably has to be scaled by the
        # same factor -- confirm against the target site.
        distance = get_distance(bg_url, slider_url)
        print(f'需要滑动的距离: {distance} ')

        move_to_slider(driver, slider, distance)

        time.sleep(2)  # fixed pause for the verification result to appear
        if '验证通过' in driver.page_source:
            print('✅ 滑块验证通过!')
            return True
        print('❌ 滑块验证失败')
        return False
    except Exception as e:
        print(f'滑块验证出错: {e} ')
        return False
五、点选验证码 点选验证码需要识别图片中的特定元素:
import base64
import time

import requests
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By


def click_points(driver, xpath_template, points):
    """Click a sequence of positions inside captcha elements.

    Args:
        driver: WebDriver that controls the page.
        xpath_template: XPath with one ``{}`` placeholder; it is formatted
            with the index of each point to locate that point's element.
        points: ``[(x1, y1), (x2, y2), ...]``. Each value is multiplied by the
            element's width/height, so the values are presumably fractions in
            the range 0-1 measured from the top-left corner -- confirm against
            what the recognition service returns.
    """
    for i, (x, y) in enumerate(points):
        canvas = driver.find_element(By.XPATH, xpath_template.format(i))
        size = canvas.size

        # move_by_offset() is relative to the *current* pointer position, so
        # feeding it page coordinates (as the original did) drifts further
        # with every click. Anchor each move to the element instead.
        # NOTE(review): recent Selenium 4 releases measure these offsets from
        # the element's centre (older ones from the top-left corner), hence
        # the -0.5 -- check the behaviour of the installed version.
        x_offset = int((x - 0.5) * size['width'])
        y_offset = int((y - 0.5) * size['height'])

        ActionChains(driver).move_to_element_with_offset(
            canvas, x_offset, y_offset
        ).click().perform()
        time.sleep(0.5)


def recognize_click_captcha(image_path):
    """Ask a recognition service where a click captcha has to be clicked.

    Args:
        image_path: Path of the captcha image file.

    Returns:
        The ``points`` list from the service's JSON reply, or ``[]`` when the
        reply has no such key.
    """
    # Raw bytes cannot be JSON-serialized (the original call raised
    # TypeError), so the image is sent as base64 text.
    # NOTE(review): the endpoint is a placeholder; confirm the encoding the
    # real service expects.
    with open(image_path, 'rb') as f:
        image_b64 = base64.b64encode(f.read()).decode('ascii')

    response = requests.post('http://xxx.com/api', json={
        'image': image_b64,
        'type': 'click',
    })
    result = response.json()
    return result.get('points', [])
# Part 6: Hands-on -- Weibo login
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager


def login_weibo(username, password):
    """Log in to Weibo through a headless Chrome browser.

    Args:
        username: Weibo account name.
        password: Weibo account password.

    Returns:
        The live WebDriver when a logged-in marker is found after submitting
        (the caller must call ``quit()``). ``None`` when the marker is missing
        or any step raised; the browser is closed in that case.
    """
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-blink-features=AutomationControlled')

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )

    try:
        driver.get('https://weibo.com/login.php')
        time.sleep(3)  # fixed pause for the login page to render

        # Switching to the password-login tab is best-effort: if the link is
        # missing or cannot be clicked, carry on with the form as shown.
        # Only Selenium errors are ignored, not e.g. KeyboardInterrupt.
        try:
            pwd_login = driver.find_element(
                By.XPATH, '//a[@action-type="btn_pwd_login"]'
            )
            pwd_login.click()
            time.sleep(2)
        except WebDriverException:
            pass

        # WebDriverWait and EC were used but never imported in the original.
        username_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, 'username'))
        )
        username_input.clear()
        username_input.send_keys(username)

        password_input = driver.find_element(By.NAME, 'password')
        password_input.clear()
        password_input.send_keys(password)

        login_btn = driver.find_element(By.XPATH, '//button[@type="submit"]')
        login_btn.click()

        time.sleep(5)  # fixed pause for the post-login redirect

        if '我的首页' in driver.title or '个人中心' in driver.page_source:
            print('✅ 微博登录成功!')
            return driver
        print('❌ 微博登录失败')
    except Exception as e:
        print(f'出错: {e} ')

    # Only failure paths reach this point; close the browser instead of
    # leaking a Chrome process.
    driver.quit()
    return None


driver = login_weibo('你的微博账号', '你的微博密码')
# Part 7: Hands-on -- Zhihu login
import time

from selenium import webdriver
from selenium.webdriver.common.by import By


def login_zhihu(email, password):
    """Log in to Zhihu with e-mail and password through Chrome.

    Args:
        email: Account e-mail typed into the element named ``email``.
        password: Account password.

    Returns:
        The live WebDriver when the page title contains the home-page marker
        after submitting (the caller must call ``quit()``), otherwise ``None``
        with the browser already closed.

    Raises:
        Exception: Whatever Selenium raised; the browser is closed before the
            exception propagates.
    """
    # The original passed a literal ``...`` placeholder here, which is not a
    # valid argument; add options/service as needed.
    driver = webdriver.Chrome()

    try:
        driver.get('https://www.zhihu.com/signin')
        time.sleep(2)  # fixed pause for the sign-in page to render

        # Switch to the password-login tab.
        driver.find_element(
            By.XPATH, '//button[@class="SignFlow-tab SignFlow-tab--password"]'
        ).click()
        time.sleep(1)

        driver.find_element(By.NAME, 'email').send_keys(email)
        driver.find_element(By.NAME, 'password').send_keys(password)

        driver.find_element(By.XPATH, '//button[@type="submit"]').click()

        time.sleep(5)  # fixed pause for the post-login redirect

        if '首页' in driver.title:
            print('✅ 知乎登录成功!')
            return driver
        print('❌ 知乎登录失败')
    except Exception:
        # Errors still propagate as before (the original had a no-op
        # ``finally: pass``), but the browser is no longer leaked.
        driver.quit()
        raise

    # Login was not confirmed and nobody receives the driver.
    driver.quit()
    return None
# Part 8: A complete login manager
import os
import pickle

import requests


class LoginManager:
    """Log in with ``requests`` and reuse the cookies between runs."""

    def __init__(self, cookie_file='cookies.pkl'):
        """Create a fresh session.

        Args:
            cookie_file: Path of the pickle file used to persist cookies.
        """
        self.session = requests.Session()
        self.cookie_file = cookie_file

    def save_cookies(self):
        """Pickle the session's cookie jar into ``cookie_file``."""
        with open(self.cookie_file, 'wb') as f:
            pickle.dump(self.session.cookies, f)
        print(f'✅ Cookies已保存到 {self.cookie_file} ')

    def load_cookies(self):
        """Load cookies from ``cookie_file`` into the session.

        Returns:
            ``True`` if the file existed and was loaded; ``False`` if it is
            missing or unreadable, so the caller falls back to a fresh login.
        """
        if not os.path.exists(self.cookie_file):
            return False
        # SECURITY: unpickling executes code embedded in the file. Only load
        # cookie files this program wrote itself.
        try:
            with open(self.cookie_file, 'rb') as f:
                cookies = pickle.load(f)
        except (OSError, EOFError, pickle.UnpicklingError) as e:
            # A truncated or corrupt file should not abort the login.
            print(f'❌ Cookies加载失败: {e}')
            return False
        self.session.cookies.update(cookies)
        print('✅ Cookies已加载')
        return True

    def is_logged_in(self, check_url):
        """Report whether ``check_url`` answers with HTTP 200.

        Args:
            check_url: URL that is expected to succeed only when logged in.

        Returns:
            ``True`` on status 200; ``False`` on any other status or when the
            request itself failed.
        """
        # NOTE(review): redirects are followed, so a site that redirects
        # anonymous users to a login page returning 200 would be reported as
        # logged in -- confirm that check_url returns 401/403 instead.
        try:
            response = self.session.get(check_url, timeout=5)
        except requests.RequestException:
            # Only network-level errors mean "not logged in"; the original
            # bare ``except`` also swallowed KeyboardInterrupt.
            return False
        return response.status_code == 200

    def login(self, login_url, data, check_url):
        """Log in, preferring saved cookies over submitting credentials.

        Args:
            login_url: URL the login form is posted to.
            data: Form fields sent with the POST.
            check_url: URL passed to ``is_logged_in`` to confirm the result.

        Returns:
            ``True`` if the session is logged in afterwards, else ``False``.

        Raises:
            requests.RequestException: If the login POST itself fails.
        """
        if self.load_cookies() and self.is_logged_in(check_url):
            print('✅ 使用保存的Cookie登录成功!')
            return True

        # A timeout keeps a stalled server from hanging the login forever,
        # in line with the timeout already used by is_logged_in().
        self.session.post(login_url, data=data, timeout=10)

        if self.is_logged_in(check_url):
            self.save_cookies()
            print('✅ 登录成功并保存Cookie!')
            return True
        print('❌ 登录失败')
        return False


manager = LoginManager('zhihu_cookies.pkl')
if manager.login(
    login_url='https://www.zhihu.com/api/v3/oauth/captcha',
    data={'email': 'xxx', 'password': 'xxx'},
    check_url='https://www.zhihu.com/api/v4/me',
):
    response = manager.session.get('https://www.zhihu.com/api/v4/me')
    print(response.json())
九、常见问题
| 问题 | 解决方法 |
| --- | --- |
| 验证码识别率低 | 用第三方打码平台 |
| 滑块验证失败 | 优化滑动轨迹、加减速 |
| 登录后马上掉线 | 检查Token过期时间 |
| 账号被封 | 控制访问频率 |
| 多次滑动失败 | 尝试无痕模式 |
十、写在最后

这篇文章我们学会了:

- ✅ POST 表单登录
- ✅ Cookie保存与复用
- ✅ Selenium模拟登录
- ✅ 图片验证码识别
- ✅ 滑块验证码处理
- ✅ 第三方打码平台使用
- ✅ 登录管理器封装
下篇预告:Scrapy框架入门、高效分布式爬虫、IP代理池构建。
如果这篇文章对你有帮助,欢迎点赞+在看👍
有问题欢迎留言,我们一起进步!
#AI学习 #Python爬虫 #模拟登录 #验证码 #Selenium