Skip to content

Commit ed91eb4

Browse files
committed
refactor: 代码和文档优化
1 parent a3c8bd9 commit ed91eb4

File tree

44 files changed

+16162
-1580
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+16162
-1580
lines changed

docs/爬虫进价/01_工程化爬虫开发规范.md

Lines changed: 445 additions & 7 deletions
Large diffs are not rendered by default.

docs/爬虫进价/02_反爬虫对抗基础_请求伪装.md

Lines changed: 428 additions & 8 deletions
Large diffs are not rendered by default.

docs/爬虫进价/03_代理IP的使用与管理.md

Lines changed: 493 additions & 15 deletions
Large diffs are not rendered by default.

docs/爬虫进价/04_Playwright浏览器自动化入门.md

Lines changed: 420 additions & 45 deletions
Large diffs are not rendered by default.

docs/爬虫进价/05_Playwright进阶_反检测与性能优化.md

Lines changed: 510 additions & 30 deletions
Large diffs are not rendered by default.

docs/爬虫进价/06_登录认证_Cookie与Session管理.md

Lines changed: 445 additions & 10 deletions
Large diffs are not rendered by default.

docs/爬虫进价/07_登录认证_扫码与短信登录实现.md

Lines changed: 514 additions & 41 deletions
Large diffs are not rendered by default.

docs/爬虫进价/08_验证码识别与处理.md

Lines changed: 344 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -16,27 +16,22 @@
1616

1717
### 1.2 验证码的工作原理
1818

19-
```
20-
用户 前端 服务器
21-
| | |
22-
| 1.请求验证码 | |
23-
| --------------------> | |
24-
| | 2.请求验证码 |
25-
| | ---------------------> |
26-
| | |
27-
| | 3.返回验证码图片+ID |
28-
| | <--------------------- |
29-
| | |
30-
| 4.显示验证码 | |
31-
| <-------------------- | |
32-
| | |
33-
| 5.输入验证码 | |
34-
| --------------------> | |
35-
| | 6.提交验证 |
36-
| | ---------------------> |
37-
| | |
38-
| | 7.返回验证结果 |
39-
| | <--------------------- |
19+
```mermaid
20+
sequenceDiagram
21+
participant User as 用户
22+
participant Frontend as 前端
23+
participant Server as 服务器
24+
25+
User->>Frontend: 1. 请求验证码
26+
Frontend->>Server: 2. 请求验证码
27+
Server-->>Frontend: 3. 返回验证码图片+ID
28+
Frontend-->>User: 4. 显示验证码
29+
30+
User->>Frontend: 5. 输入验证码
31+
Frontend->>Server: 6. 提交验证
32+
Server-->>Frontend: 7. 返回验证结果
33+
34+
Note over User,Server: 验证通过后继续操作
4035
```
4136

4237
## 二、图片验证码识别
@@ -872,6 +867,333 @@ async def demo_captcha_solving():
872867
await browser.close()
873868
```
874869

870+
## 七、B站验证码处理实战
871+
872+
B站在特定场景下会触发验证码,本节介绍B站验证码的特点和处理方法。
873+
874+
### 7.1 B站验证码触发场景
875+
876+
```mermaid
877+
flowchart TD
878+
request[发起请求] --> check{B站风控检测}
879+
check -->|正常| success[返回数据]
880+
check -->|异常| trigger{触发验证码}
881+
882+
trigger -->|高频请求| slider[滑块验证码]
883+
trigger -->|IP异常| geetest[极验验证码]
884+
trigger -->|敏感操作| click[点选验证码]
885+
886+
slider --> verify{验证}
887+
geetest --> verify
888+
click --> verify
889+
890+
verify -->|成功| success
891+
verify -->|失败| block[临时封禁]
892+
```
893+
894+
### 7.2 B站常见验证码类型
895+
896+
| 场景 | 验证码类型 | 触发条件 | 处理难度 |
897+
|-----|-----------|---------|---------|
898+
| 登录保护 | 滑块验证码 | 异地登录、频繁登录 | ⭐⭐⭐ |
899+
| 接口防护 | 极验验证码 | 请求频率过高 | ⭐⭐⭐⭐ |
900+
| 评论/弹幕 | 点选验证码 | 短时间大量发送 | ⭐⭐⭐⭐⭐ |
901+
| 关注/收藏 | 简单确认 | 批量操作 | ⭐ |
902+
903+
### 7.3 B站滑块验证码处理
904+
905+
```python
906+
import asyncio
907+
import httpx
908+
from playwright.async_api import async_playwright, Page
909+
from loguru import logger
910+
911+
912+
class BilibiliSliderCaptcha:
    """Detect and solve the slider captcha that may appear on a B站 page.

    The handler looks for a captcha iframe, screenshots the background and
    slice canvases, locates the gap with OpenCV edge/template matching, and
    drags the slider button along a randomized, decelerating path.
    """

    def __init__(self, page: "Page"):
        # Quoted annotation: the Page type is only needed by type checkers,
        # so the class stays importable where it is not in scope.
        self.page = page

    async def detect_and_solve(self) -> bool:
        """Detect the slider captcha and try to solve it.

        Returns:
            True when no captcha shows up within 5 seconds or when the
            captcha is solved; False when the gap cannot be located, the
            success marker does not appear, or any unexpected error occurs.
        """
        # Imported locally so this block does not depend on edits to the
        # file-level import list.
        from playwright.async_api import TimeoutError as PlaywrightTimeoutError

        try:
            slider_frame = self.page.frame_locator("iframe[src*='captcha']")

            # Wait up to 5 seconds for the slider button. Only a timeout
            # means "no captcha"; any other failure falls through to the
            # outer handler instead of being reported as success.
            try:
                await slider_frame.locator(".geetest_slider_button").wait_for(
                    timeout=5000
                )
            except PlaywrightTimeoutError:
                return True

            logger.info("检测到B站滑块验证码")

            # Capture the background and the slider piece as image bytes.
            bg_element = slider_frame.locator(".geetest_canvas_bg")
            slider_element = slider_frame.locator(".geetest_canvas_slice")

            bg_bytes = await bg_element.screenshot()
            slider_bytes = await slider_element.screenshot()

            gap_x = self._detect_gap(bg_bytes, slider_bytes)

            # _detect_gap returns 0 when it cannot produce a usable offset.
            if not gap_x:
                logger.error("无法检测缺口位置")
                return False

            await self._drag_slider(slider_frame, gap_x)

            # Give the widget time to evaluate the drag.
            await asyncio.sleep(2)

            try:
                await slider_frame.locator(".geetest_success").wait_for(
                    timeout=3000
                )
            except PlaywrightTimeoutError:
                logger.warning("B站滑块验证码验证失败")
                return False

            logger.info("B站滑块验证码通过")
            return True

        except Exception as e:
            # Top-level boundary for the handler: log and report failure.
            logger.error(f"B站滑块验证码处理异常: {e}")
            return False

    def _detect_gap(self, bg_bytes: bytes, slider_bytes: bytes) -> int:
        """Locate the horizontal offset of the gap in the background image.

        Args:
            bg_bytes: Encoded image bytes of the captcha background.
            slider_bytes: Encoded image bytes of the slider piece.

        Returns:
            The x coordinate of the best template match, or 0 when either
            image cannot be decoded or the slider image is larger than the
            background (template matching is impossible in that case).
        """
        import cv2
        import numpy as np

        bg = cv2.imdecode(np.frombuffer(bg_bytes, np.uint8), cv2.IMREAD_COLOR)
        slider = cv2.imdecode(np.frombuffer(slider_bytes, np.uint8), cv2.IMREAD_COLOR)

        # imdecode returns None for data it cannot decode.
        if bg is None or slider is None:
            return 0

        # Match on edges rather than raw pixels.
        bg_edges = cv2.Canny(cv2.cvtColor(bg, cv2.COLOR_BGR2GRAY), 100, 200)
        slider_edges = cv2.Canny(cv2.cvtColor(slider, cv2.COLOR_BGR2GRAY), 100, 200)

        # matchTemplate requires the template to fit inside the image.
        if (
            slider_edges.shape[0] > bg_edges.shape[0]
            or slider_edges.shape[1] > bg_edges.shape[1]
        ):
            return 0

        result = cv2.matchTemplate(bg_edges, slider_edges, cv2.TM_CCOEFF_NORMED)
        _, _, _, max_loc = cv2.minMaxLoc(result)

        return int(max_loc[0])

    async def _drag_slider(self, frame, distance: int):
        """Drag the slider button horizontally by ``distance`` pixels.

        Args:
            frame: Frame locator that contains the slider button.
            distance: Horizontal offset to drag, in pixels.

        Raises:
            RuntimeError: If the slider button has no bounding box.
        """
        import random

        slider_btn = frame.locator(".geetest_slider_button")
        box = await slider_btn.bounding_box()

        if not box:
            raise RuntimeError("无法获取滑块位置")

        # Start the drag from the center of the button.
        start_x = box['x'] + box['width'] / 2
        start_y = box['y'] + box['height'] / 2

        trajectory = self._generate_human_trajectory(distance)

        await self.page.mouse.move(start_x, start_y)
        await asyncio.sleep(random.uniform(0.1, 0.2))

        await self.page.mouse.down()
        try:
            for x, y, delay in trajectory:
                await asyncio.sleep(delay)
                await self.page.mouse.move(start_x + x, start_y + y)

            await asyncio.sleep(random.uniform(0.05, 0.1))
        finally:
            # Always release the button, even if a move fails mid-drag.
            await self.page.mouse.up()

    def _generate_human_trajectory(self, distance: int):
        """Build a randomized, decelerating drag path.

        Args:
            distance: Total horizontal distance to cover, in pixels.

        Returns:
            A list of ``(x, y, delay)`` tuples: x is the offset from the
            start point, y is a vertical jitter in [-3, 3], and delay is the
            pause in seconds before the move. The last entry is always
            ``(distance, 0, 0.05)`` so the drag ends exactly on target.
        """
        import random

        trajectory = []
        steps = random.randint(20, 30)

        for i in range(steps):
            progress = i / steps
            # Ease-out curve: fast at the start, slowing toward the end.
            eased = progress * (2 - progress)

            x = int(distance * eased)
            y = random.randint(-3, 3)
            delay = random.uniform(0.01, 0.03)

            trajectory.append((x, y, delay))

        trajectory.append((distance, 0, 0.05))
        return trajectory
1043+
1044+
1045+
async def bilibili_with_captcha_handling():
    """Open the B站 home page and run the slider-captcha handler once.

    Launches a visible Chromium window, navigates to the site, asks
    BilibiliSliderCaptcha to detect and solve any captcha, and prints the
    outcome. The browser is closed even when navigation or solving raises.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            context = await browser.new_context()
            page = await context.new_page()

            await page.goto("https://www.bilibili.com")

            # Real page interactions would go here; the captcha check is
            # meant to run after each key operation.
            captcha_handler = BilibiliSliderCaptcha(page)
            success = await captcha_handler.detect_and_solve()

            if success:
                print("操作成功,无验证码或验证码已处理")
            else:
                print("验证码处理失败")
        finally:
            # Close the browser on every exit path, including exceptions.
            await browser.close()


if __name__ == "__main__":
    asyncio.run(bilibili_with_captcha_handling())
1073+
```
1074+
1075+
### 7.4 避免触发验证码的策略
1076+
1077+
在B站爬虫中,预防优于处理:
1078+
1079+
```mermaid
1080+
flowchart LR
1081+
subgraph 预防策略
1082+
A[控制请求频率] --> B[模拟真实行为]
1083+
B --> C[使用登录态]
1084+
C --> D[IP轮换]
1085+
end
1086+
1087+
subgraph B站建议配置
1088+
E[每分钟<30请求]
1089+
F[随机延迟2-5秒]
1090+
G[保持Cookie有效]
1091+
H[高匿代理池]
1092+
end
1093+
1094+
A --> E
1095+
B --> F
1096+
C --> G
1097+
D --> H
1098+
```
1099+
1100+
```python
1101+
import asyncio
1102+
import random
1103+
from typing import Optional
1104+
1105+
1106+
class BilibiliRateLimiter:
    """Throttle requests with a per-minute cap and a randomized minimum gap.

    Args:
        requests_per_minute: Maximum number of requests per 60-second window.
        min_delay: Minimum spacing between two requests, in seconds.
        max_delay: Upper bound of the randomized spacing, in seconds.
    """

    def __init__(
        self,
        requests_per_minute: int = 20,
        min_delay: float = 2.0,
        max_delay: float = 5.0
    ):
        self.requests_per_minute = requests_per_minute
        self.min_delay = min_delay
        self.max_delay = max_delay
        # Event-loop timestamp of the previous request; None before the first.
        self._last_request_time: Optional[float] = None
        # Requests counted in the current window.
        self._request_count = 0
        # Event-loop timestamp at which the current window started.
        self._minute_start: Optional[float] = None

    async def wait(self):
        """Block until the next request is allowed, then record it.

        Sleeps out the rest of the current window when the per-minute cap
        has been reached. If the previous request was less than ``min_delay``
        seconds ago, it then sleeps so the gap becomes a random value between
        ``min_delay`` and ``max_delay``.
        """
        loop = asyncio.get_running_loop()
        now = loop.time()

        # Start a new counting window on first use or after 60 seconds.
        if self._minute_start is None or now - self._minute_start > 60:
            self._minute_start = now
            self._request_count = 0

        # Cap reached: wait for the window to end, then open a new one.
        if self._request_count >= self.requests_per_minute:
            wait_time = 60 - (now - self._minute_start)
            if wait_time > 0:
                await asyncio.sleep(wait_time)
            # Re-read the clock so the spacing check below does not use a
            # timestamp taken before the sleep.
            now = loop.time()
            self._minute_start = now
            self._request_count = 0

        # Compare against None explicitly: a timestamp of 0.0 is valid.
        if self._last_request_time is not None:
            elapsed = now - self._last_request_time
            if elapsed < self.min_delay:
                delay = random.uniform(self.min_delay, self.max_delay)
                await asyncio.sleep(delay - elapsed)

        self._last_request_time = loop.time()
        self._request_count += 1
1148+
1149+
1150+
# 使用示例
1151+
# Shared limiter instance used by every request helper in this example.
rate_limiter = BilibiliRateLimiter(requests_per_minute=20, min_delay=2.0, max_delay=5.0)


async def safe_bilibili_request(client, url):
    """Issue a GET through ``client`` once the shared rate limiter allows it.

    Args:
        client: Object exposing an awaitable ``get(url)`` method.
        url: Address to request.

    Returns:
        Whatever ``client.get(url)`` resolves to.
    """
    await rate_limiter.wait()
    response = await client.get(url)
    return response
1161+
```
1162+
1163+
---
1164+
1165+
## 八、与第11章的关联
1166+
1167+
本章介绍的验证码处理技术在第11章综合实战项目中有实际应用场景:
1168+
1169+
### 代码位置
1170+
- **验证码检测**:`源代码/爬虫进阶/11_进阶综合实战项目/tools/captcha.py`
1171+
- **频率控制**:`源代码/爬虫进阶/11_进阶综合实战项目/client/bilibili_client.py`
1172+
1173+
### 技术要点对应
1174+
| 本章内容 | 第11章实现 |
1175+
|---------|-----------|
1176+
| `OCRCaptchaSolver` | `tools/captcha.py` 中的识别模块 |
1177+
| `SliderCaptchaSolver` | 滑块验证码处理逻辑 |
1178+
| `BilibiliRateLimiter` | `client/` 中的请求频率控制 |
1179+
| 人类轨迹生成 | 防检测行为模拟 |
1180+
1181+
### 学习路径
1182+
1183+
```mermaid
1184+
graph LR
1185+
A[第07章<br/>扫码登录] --> B[第08章<br/>验证码处理]
1186+
B --> C[第09章<br/>数据清洗]
1187+
C --> D[第11章<br/>综合实战]
1188+
1189+
style B fill:#e1f5fe
1190+
style D fill:#fff9c4
1191+
```
1192+
1193+
掌握本章的验证码处理技术后,你能够处理B站爬虫中可能遇到的各类验证码场景,为第11章的综合项目打下基础。
1194+
1195+
---
1196+
8751197
## 本章小结
8761198

8771199
本章介绍了验证码识别与处理的核心技术:
@@ -881,6 +1203,7 @@ async def demo_captcha_solving():
8811203
3. **滑块处理**:缺口检测和人类轨迹模拟
8821204
4. **打码平台**:第三方服务的接入和成本控制
8831205
5. **合规考虑**:法律边界和替代方案
1206+
6. **B站实战**:B站验证码触发场景和处理策略
8841207

8851208
验证码处理是爬虫进阶的重要技能,但务必在合法合规的前提下使用。
8861209

0 commit comments

Comments
 (0)