1616
### 1.2 验证码的工作原理
1818
19- ```
20- 用户 前端 服务器
21- | | |
22- | 1.请求验证码 | |
23- | --------------------> | |
24- | | 2.请求验证码 |
25- | | ---------------------> |
26- | | |
27- | | 3.返回验证码图片+ID |
28- | | <--------------------- |
29- | | |
30- | 4.显示验证码 | |
31- | <-------------------- | |
32- | | |
33- | 5.输入验证码 | |
34- | --------------------> | |
35- | | 6.提交验证 |
36- | | ---------------------> |
37- | | |
38- | | 7.返回验证结果 |
39- | | <--------------------- |
19+ ``` mermaid
20+ sequenceDiagram
21+ participant User as 用户
22+ participant Frontend as 前端
23+ participant Server as 服务器
24+
25+ User->>Frontend: 1. 请求验证码
26+ Frontend->>Server: 2. 请求验证码
27+ Server-->>Frontend: 3. 返回验证码图片+ID
28+ Frontend-->>User: 4. 显示验证码
29+
30+ User->>Frontend: 5. 输入验证码
31+ Frontend->>Server: 6. 提交验证
32+ Server-->>Frontend: 7. 返回验证结果
33+
34+ Note over User,Server: 验证通过后继续操作
4035```
4136
## 二、图片验证码识别
@@ -872,6 +867,333 @@ async def demo_captcha_solving():
872867 await browser.close()
873868```
874869
870+ ## 七、B站验证码处理实战
871+
872+ B站在特定场景下会触发验证码,本节介绍B站验证码的特点和处理方法。
873+
874+ ### 7.1 B站验证码触发场景
875+
876+ ``` mermaid
877+ flowchart TD
878+ request[发起请求] --> check{B站风控检测}
879+ check -->|正常| success[返回数据]
880+ check -->|异常| trigger{触发验证码}
881+
882+ trigger -->|高频请求| slider[滑块验证码]
883+ trigger -->|IP异常| geetest[极验验证码]
884+ trigger -->|敏感操作| click[点选验证码]
885+
886+ slider --> verify{验证}
887+ geetest --> verify
888+ click --> verify
889+
890+ verify -->|成功| success
891+ verify -->|失败| block[临时封禁]
892+ ```
893+
894+ ### 7.2 B站常见验证码类型
895+
896+ | 场景 | 验证码类型 | 触发条件 | 处理难度 |
897+ | -----| -----------| ---------| ---------|
898+ | 登录保护 | 滑块验证码 | 异地登录、频繁登录 | ⭐⭐⭐ |
899+ | 接口防护 | 极验验证码 | 请求频率过高 | ⭐⭐⭐⭐ |
900+ | 评论/弹幕 | 点选验证码 | 短时间大量发送 | ⭐⭐⭐⭐⭐ |
901+ | 关注/收藏 | 简单确认 | 批量操作 | ⭐ |
902+
903+ ### 7.3 B站滑块验证码处理
904+
905+ ``` python
906+ import asyncio
907+ import httpx
908+ from playwright.async_api import async_playwright, Page
909+ from loguru import logger
910+
911+
class BilibiliSliderCaptcha:
    """Detect and solve the slider captcha shown on a Bilibili page.

    The handler looks for a captcha iframe on the given Playwright page,
    screenshots the background and slider-piece canvases, locates the gap
    with OpenCV edge detection + template matching, and drags the slider
    button along a randomized, decelerating mouse path.
    """

    def __init__(self, page: "Page"):
        """Store the Playwright page that all later operations act on."""
        self.page = page

    async def detect_and_solve(self) -> bool:
        """Detect the slider captcha and try to solve it.

        Returns:
            True when no captcha appeared within 5 seconds or when the
            captcha was solved; False when solving failed or any
            unexpected error occurred (the error is logged, not raised).
        """
        # Imported locally so this snippet only relies on the playwright
        # package the file already depends on.
        from playwright.async_api import TimeoutError as PlaywrightTimeoutError

        try:
            # The captcha widget is looked up inside an iframe whose src
            # contains "captcha".
            slider_frame = self.page.frame_locator("iframe[src*='captcha']")

            # Wait up to 5 seconds for the slider button to show up.
            try:
                await slider_frame.locator(".geetest_slider_button").wait_for(
                    timeout=5000
                )
            except PlaywrightTimeoutError:
                # Only a timeout means "no captcha". Any other failure
                # falls through to the outer handler and is reported as
                # a failure instead of a false success.
                return True

            logger.info("检测到B站滑块验证码")

            # Screenshot the background canvas and the slider-piece canvas.
            bg_element = slider_frame.locator(".geetest_canvas_bg")
            slider_element = slider_frame.locator(".geetest_canvas_slice")

            bg_bytes = await bg_element.screenshot()
            slider_bytes = await slider_element.screenshot()

            # Locate the horizontal offset of the gap.
            gap_x = self._detect_gap(bg_bytes, slider_bytes)

            # An offset of 0 (best match at the left edge) is treated as
            # "gap not found", as in the original truthiness check.
            if gap_x <= 0:
                logger.error("无法检测缺口位置")
                return False

            # NOTE(review): gap_x is measured in screenshot pixels and is
            # used unchanged as the drag distance — confirm the scale
            # matches the on-page slider travel.
            await self._drag_slider(slider_frame, gap_x)

            # Give the widget time to evaluate the drag.
            await asyncio.sleep(2)

            # The success marker must appear within 3 seconds.
            try:
                await slider_frame.locator(".geetest_success").wait_for(
                    timeout=3000
                )
            except PlaywrightTimeoutError:
                logger.warning("B站滑块验证码验证失败")
                return False

            logger.info("B站滑块验证码通过")
            return True

        except Exception as e:
            # Top-level boundary: callers get a boolean, never an exception.
            logger.error(f"B站滑块验证码处理异常: {e}")
            return False

    def _detect_gap(self, bg_bytes: bytes, slider_bytes: bytes) -> int:
        """Return the x offset where the slider piece best matches the background.

        Both images are decoded, reduced to Canny edge maps, and compared
        with normalized cross-correlation template matching.

        Args:
            bg_bytes: Encoded image bytes of the background.
            slider_bytes: Encoded image bytes of the slider piece.

        Returns:
            The x coordinate of the best match location.

        Raises:
            ValueError: If either image cannot be decoded.
        """
        import cv2
        import numpy as np

        bg = cv2.imdecode(np.frombuffer(bg_bytes, np.uint8), cv2.IMREAD_COLOR)
        slider = cv2.imdecode(
            np.frombuffer(slider_bytes, np.uint8), cv2.IMREAD_COLOR
        )
        # imdecode signals failure by returning None; fail with a clear
        # message instead of an opaque OpenCV assertion later on.
        if bg is None or slider is None:
            raise ValueError("failed to decode captcha screenshot")

        # Edge detection
        bg_edges = cv2.Canny(cv2.cvtColor(bg, cv2.COLOR_BGR2GRAY), 100, 200)
        slider_edges = cv2.Canny(
            cv2.cvtColor(slider, cv2.COLOR_BGR2GRAY), 100, 200
        )

        # Template matching; only the location of the maximum is used.
        result = cv2.matchTemplate(bg_edges, slider_edges, cv2.TM_CCOEFF_NORMED)
        _, _, _, max_loc = cv2.minMaxLoc(result)

        return int(max_loc[0])

    async def _drag_slider(self, frame, distance: int):
        """Drag the slider button horizontally by ``distance`` pixels.

        Args:
            frame: Frame locator containing the slider button.
            distance: Horizontal drag distance in pixels.

        Raises:
            RuntimeError: If the slider button has no bounding box.
        """
        import random

        slider_btn = frame.locator(".geetest_slider_button")
        box = await slider_btn.bounding_box()

        if not box:
            raise RuntimeError("无法获取滑块位置")

        # Start the drag from the center of the button.
        start_x = box["x"] + box["width"] / 2
        start_y = box["y"] + box["height"] / 2

        trajectory = self._generate_human_trajectory(distance)

        await self.page.mouse.move(start_x, start_y)
        await asyncio.sleep(random.uniform(0.1, 0.2))

        await self.page.mouse.down()
        try:
            for x, y, delay in trajectory:
                await asyncio.sleep(delay)
                await self.page.mouse.move(start_x + x, start_y + y)

            await asyncio.sleep(random.uniform(0.05, 0.1))
        finally:
            # Always release the button so a failed move cannot leave the
            # page in a permanently "dragging" state.
            await self.page.mouse.up()

    def _generate_human_trajectory(self, distance: int):
        """Build a decelerating mouse path with vertical jitter.

        Args:
            distance: Total horizontal distance to cover, in pixels.

        Returns:
            A list of ``(x, y, delay)`` tuples: x/y are offsets from the
            drag start point and delay is the pause in seconds before the
            move. The last entry is always ``(distance, 0, 0.05)``.
        """
        import random

        steps = random.randint(20, 30)
        trajectory = []

        for i in range(steps):
            progress = i / steps
            # Ease-out curve: fast at the start, slowing toward the target.
            eased = progress * (2 - progress)

            x = int(distance * eased)
            y = random.randint(-3, 3)
            delay = random.uniform(0.01, 0.03)
            trajectory.append((x, y, delay))

        # Finish exactly on the target with no vertical offset.
        trajectory.append((distance, 0, 0.05))
        return trajectory
1043+
1044+
async def bilibili_with_captcha_handling():
    """Demo: open Bilibili and run the slider-captcha handler once.

    Launches a headed Chromium browser, visits the Bilibili home page,
    runs ``BilibiliSliderCaptcha.detect_and_solve`` and prints the outcome.
    The browser is closed even when navigation or solving raises.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            context = await browser.new_context()
            page = await context.new_page()

            # Visit Bilibili.
            await page.goto("https://www.bilibili.com")

            # Simulate some actions here; check for a captcha after each
            # key operation and solve it automatically if one appears.
            captcha_handler = BilibiliSliderCaptcha(page)
            success = await captcha_handler.detect_and_solve()

            if success:
                print("操作成功,无验证码或验证码已处理")
            else:
                print("验证码处理失败")
        finally:
            # Runs on every exit path so a failure does not leak the browser.
            await browser.close()
1069+
1070+
if __name__ == "__main__":
    # Run the demo only when this snippet is executed as a script.
    asyncio.run(bilibili_with_captcha_handling())
1073+ ```
1074+
1075+ ### 7.4 避免触发验证码的策略
1076+
1077+ 在B站爬虫中,预防优于处理:
1078+
1079+ ``` mermaid
1080+ flowchart LR
1081+ subgraph 预防策略
1082+ A[控制请求频率] --> B[模拟真实行为]
1083+ B --> C[使用登录态]
1084+ C --> D[IP轮换]
1085+ end
1086+
1087+ subgraph B站建议配置
1088+ E[每分钟<30请求]
1089+ F[随机延迟2-5秒]
1090+ G[保持Cookie有效]
1091+ H[高匿代理池]
1092+ end
1093+
1094+ A --> E
1095+ B --> F
1096+ C --> G
1097+ D --> H
1098+ ```
1099+
1100+ ``` python
1101+ import asyncio
1102+ import random
1103+ from typing import Optional
1104+
1105+
class BilibiliRateLimiter:
    """Throttle outgoing Bilibili requests.

    Two limits are enforced by ``wait()``:

    * at most ``requests_per_minute`` requests per 60-second window, and
    * a randomized pause whenever two requests would be closer together
      than ``min_delay`` seconds.

    NOTE(review): ``wait()`` is not guarded by a lock; if several
    coroutines share one limiter they may pass the checks together —
    confirm whether concurrent use is intended.
    """

    def __init__(
        self,
        requests_per_minute: int = 20,
        min_delay: float = 2.0,
        max_delay: float = 5.0,
    ):
        """Configure the limiter.

        Args:
            requests_per_minute: Maximum requests allowed per 60-second window.
            min_delay: Spacing in seconds below which a pause is inserted.
            max_delay: Upper bound in seconds of the randomized spacing.
        """
        self.requests_per_minute = requests_per_minute
        self.min_delay = min_delay
        self.max_delay = max_delay
        # Event-loop timestamp of the most recent request (None before the first).
        self._last_request_time: Optional[float] = None
        # Requests counted in the current window.
        self._request_count = 0
        # Event-loop timestamp at which the current window started.
        self._minute_start: Optional[float] = None

    async def wait(self):
        """Sleep as long as needed, then record that a request is being sent."""
        loop = asyncio.get_running_loop()
        now = loop.time()

        # Start a new counting window once the current one is over 60 s old.
        if self._minute_start is None or now - self._minute_start > 60:
            self._minute_start = now
            self._request_count = 0

        # Window quota used up: sleep out the remainder, then start afresh.
        if self._request_count >= self.requests_per_minute:
            wait_time = 60 - (now - self._minute_start)
            if wait_time > 0:
                await asyncio.sleep(wait_time)
            # Refresh the clock: time passed while sleeping, and the
            # spacing check below must not work from the stale value.
            now = loop.time()
            self._minute_start = now
            self._request_count = 0

        # Randomized spacing between consecutive requests.
        if self._last_request_time is not None:
            elapsed = now - self._last_request_time
            if elapsed < self.min_delay:
                delay = random.uniform(self.min_delay, self.max_delay)
                await asyncio.sleep(delay - elapsed)

        self._last_request_time = loop.time()
        self._request_count += 1
1148+
1149+
# Usage example: a single module-level limiter
# (20 requests per minute, 2-5 second spacing).
rate_limiter = BilibiliRateLimiter(requests_per_minute=20, min_delay=2.0, max_delay=5.0)
1156+
async def safe_bilibili_request(client, url):
    """Send a rate-limited GET request to Bilibili.

    Waits on the module-level ``rate_limiter`` first, then awaits
    ``client.get(url)`` and returns its result.
    """
    await rate_limiter.wait()
    response = await client.get(url)
    return response
1161+ ```
1162+
1163+ ---
1164+
1165+ ## 八、与第11章的关联
1166+
1167+ 本章介绍的验证码处理技术在第11章综合实战项目中有实际应用场景:
1168+
1169+ ### 代码位置
- **验证码检测**:`源代码/爬虫进阶/11_进阶综合实战项目/tools/captcha.py`
- **频率控制**:`源代码/爬虫进阶/11_进阶综合实战项目/client/bilibili_client.py`
1172+
1173+ ### 技术要点对应
1174+ | 本章内容 | 第11章实现 |
1175+ | ---------| -----------|
1176+ | ` OCRCaptchaSolver ` | ` tools/captcha.py ` 中的识别模块 |
1177+ | ` SliderCaptchaSolver ` | 滑块验证码处理逻辑 |
1178+ | ` BilibiliRateLimiter ` | ` client/ ` 中的请求频率控制 |
1179+ | 人类轨迹生成 | 防检测行为模拟 |
1180+
1181+ ### 学习路径
1182+
1183+ ``` mermaid
1184+ graph LR
1185+ A[第07章<br/>扫码登录] --> B[第08章<br/>验证码处理]
1186+ B --> C[第09章<br/>数据清洗]
1187+ C --> D[第11章<br/>综合实战]
1188+
1189+ style B fill:#e1f5fe
1190+ style D fill:#fff9c4
1191+ ```
1192+
1193+ 掌握本章的验证码处理技术后,你能够处理B站爬虫中可能遇到的各类验证码场景,为第11章的综合项目打下基础。
1194+
1195+ ---
1196+
## 本章小结
8761198
8771199本章介绍了验证码识别与处理的核心技术:
@@ -881,6 +1203,7 @@ async def demo_captcha_solving():
3. **滑块处理**:缺口检测和人类轨迹模拟
4. **打码平台**:第三方服务的接入和成本控制
5. **合规考虑**:法律边界和替代方案
6. **B站实战**:B站验证码触发场景和处理策略
8841207
8851208验证码处理是爬虫进阶的重要技能,但务必在合法合规的前提下使用。
8861209
0 commit comments