Skip to content

Commit ad954ee

Browse files
committed
refactor: update
1 parent ed91eb4 commit ad954ee

File tree

16 files changed

+557
-4918
lines changed

16 files changed

+557
-4918
lines changed

docs/爬虫进价/01_工程化爬虫开发规范.md

Lines changed: 92 additions & 218 deletions
Large diffs are not rendered by default.

docs/爬虫进价/02_反爬虫对抗基础_请求伪装.md

Lines changed: 95 additions & 333 deletions
Large diffs are not rendered by default.

docs/爬虫进价/03_代理IP的使用与管理.md

Lines changed: 66 additions & 158 deletions
Large diffs are not rendered by default.

docs/爬虫进价/04_Playwright浏览器自动化入门.md

Lines changed: 157 additions & 230 deletions
Large diffs are not rendered by default.

docs/爬虫进价/05_Playwright进阶_反检测与性能优化.md

Lines changed: 60 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -492,15 +492,15 @@ class BrowserManager:
492492

493493
---
494494

495-
## B站 Playwright 反检测实战
495+
## 反检测实战
496496

497-
### B站的反自动化检测机制
497+
### 网站的反自动化检测机制
498498

499-
B站作为大型视频平台,有较为完善的反自动化检测
499+
大型网站通常有较为完善的反自动化检测机制,其典型检测流程如下:
500500

501501
```mermaid
502502
flowchart LR
503-
subgraph B站检测机制
503+
subgraph 检测机制
504504
Check1["WebDriver检测"]
505505
Check2["请求头验证"]
506506
Check3["行为分析"]
@@ -509,29 +509,29 @@ flowchart LR
509509
510510
subgraph 检测结果
511511
Pass["正常访问"]
512-
Block412["412风控"]
512+
Block["触发风控"]
513513
Captcha["触发验证码"]
514514
end
515515
516516
Check1 -->|通过| Check2
517-
Check1 -->|失败| Block412
517+
Check1 -->|失败| Block
518518
Check2 -->|通过| Check3
519-
Check2 -->|异常| Block412
519+
Check2 -->|异常| Block
520520
Check3 -->|正常| Pass
521521
Check3 -->|异常| Captcha
522-
Check4 -->|超限| Block412
522+
Check4 -->|超限| Block
523523
524524
style Pass fill:#c8e6c9,stroke:#4caf50
525-
style Block412 fill:#ffcdd2,stroke:#f44336
525+
style Block fill:#ffcdd2,stroke:#f44336
526526
style Captcha fill:#fff3e0,stroke:#ff9800
527527
```
528528

529-
### B站完整反检测配置
529+
### 完整反检测配置
530530

531531
```python
532532
# -*- coding: utf-8 -*-
533533
"""
534-
B站 Playwright 反检测配置
534+
Playwright 反检测配置
535535
"""
536536

537537
import asyncio
@@ -540,8 +540,8 @@ from loguru import logger
540540
from typing import Optional
541541

542542

543-
# B站专用 stealth 脚本
544-
BILIBILI_STEALTH_JS = """
543+
# 通用 stealth 脚本
544+
STEALTH_JS = """
545545
// 隐藏 webdriver 标志
546546
Object.defineProperty(navigator, 'webdriver', {
547547
get: () => undefined
@@ -612,9 +612,9 @@ delete window.cdc_adoQpoasnfa76pfcZLmcfl_Symbol;
612612
"""
613613

614614

615-
class BilibiliStealthBrowser:
615+
class StealthBrowser:
616616
"""
617-
B站反检测浏览器封装
617+
反检测浏览器封装
618618
619619
特性:
620620
- 自动注入 stealth 脚本
@@ -663,9 +663,9 @@ class BilibiliStealthBrowser:
663663
)
664664

665665
# 注入 stealth 脚本
666-
await self._context.add_init_script(BILIBILI_STEALTH_JS)
666+
await self._context.add_init_script(STEALTH_JS)
667667

668-
logger.info("B站反检测浏览器已启动")
668+
logger.info("反检测浏览器已启动")
669669
return self._context
670670

671671
async def create_optimized_page(self) -> Page:
@@ -702,7 +702,7 @@ class BilibiliStealthBrowser:
702702
viewport={"width": 1920, "height": 1080},
703703
locale="zh-CN"
704704
)
705-
await self._context.add_init_script(BILIBILI_STEALTH_JS)
705+
await self._context.add_init_script(STEALTH_JS)
706706
logger.info(f"Cookie 已从 {path} 加载")
707707

708708
async def stop(self):
@@ -716,17 +716,17 @@ class BilibiliStealthBrowser:
716716
logger.info("浏览器已关闭")
717717

718718

719-
async def test_bilibili_stealth():
720-
"""测试B站反检测效果"""
721-
browser = BilibiliStealthBrowser(headless=True)
719+
async def test_stealth_browser():
720+
"""测试反检测效果(使用 bot.sannysoft.com)"""
721+
browser = StealthBrowser(headless=True)
722722
context = await browser.start()
723723

724724
try:
725725
page = await browser.create_optimized_page()
726726

727-
# 访问B站首页
728-
logger.info("访问B站首页...")
729-
await page.goto("https://www.bilibili.com", wait_until="networkidle")
727+
# 访问 WebDriver 检测网站
728+
logger.info("访问 bot.sannysoft.com 测试反检测效果...")
729+
await page.goto("https://bot.sannysoft.com/", wait_until="networkidle")
730730

731731
# 检查反检测效果
732732
webdriver = await page.evaluate("navigator.webdriver")
@@ -738,29 +738,24 @@ async def test_bilibili_stealth():
738738
logger.info(f" - window.chrome 存在: {chrome}")
739739
logger.info(f" - plugins 数量: {plugins}")
740740

741-
# 等待视频卡片加载
742-
await page.wait_for_selector(".bili-video-card", timeout=10000)
743-
cards = await page.locator(".bili-video-card").count()
744-
logger.info(f"成功加载 {cards} 个视频卡片")
745-
746-
# 截图
747-
await page.screenshot(path="bilibili_stealth_test.png")
748-
logger.info("截图已保存")
741+
# 截图保存测试结果
742+
await page.screenshot(path="stealth_test.png", full_page=True)
743+
logger.info("截图已保存到 stealth_test.png")
749744

750745
finally:
751746
await browser.stop()
752747

753748

754749
if __name__ == "__main__":
755-
asyncio.run(test_bilibili_stealth())
750+
asyncio.run(test_stealth_browser())
756751
```
757752

758-
### B站性能优化配置
753+
### 性能优化爬虫配置
759754

760755
```python
761756
# -*- coding: utf-8 -*-
762757
"""
763-
B站 Playwright 性能优化配置
758+
Playwright 性能优化配置
764759
"""
765760

766761
import asyncio
@@ -769,9 +764,9 @@ from loguru import logger
769764
from typing import Set
770765

771766

772-
class BilibiliOptimizedCrawler:
767+
class OptimizedCrawler:
773768
"""
774-
B站性能优化爬虫
769+
性能优化爬虫
775770
776771
优化策略:
777772
- 禁用图片/字体/CSS加载
@@ -790,9 +785,9 @@ class BilibiliOptimizedCrawler:
790785

791786
# 需要阻止的URL模式
792787
BLOCKED_URL_PATTERNS = [
793-
"**/cm.bilibili.com/**", # 广告
794-
"**/api.bilibili.com/x/web-show/**", # 广告
795-
"**/s1.hdslb.com/bfs/seed/**", # 追踪
788+
"**/analytics**",
789+
"**/tracking**",
790+
"**/ads**",
796791
"**/*.gif",
797792
"**/*.png",
798793
"**/*.jpg",
@@ -816,7 +811,7 @@ class BilibiliOptimizedCrawler:
816811

817812
# 检查URL模式(广告和追踪)
818813
url = request.url
819-
for pattern in ["cm.bilibili.com", "web-show", "tracking"]:
814+
for pattern in ["analytics", "tracking", "ads"]:
820815
if pattern in url:
821816
await route.abort()
822817
return
@@ -842,55 +837,34 @@ class BilibiliOptimizedCrawler:
842837

843838
logger.info("性能优化浏览器已启动")
844839

845-
async def crawl_video_page(self, bvid: str) -> dict:
840+
async def crawl_page(self, url: str) -> dict:
846841
"""
847-
爬取视频详情页
842+
爬取页面
848843
849844
Args:
850-
bvid: 视频BV号
845+
url: 目标URL
851846
852847
Returns:
853-
视频信息
848+
页面信息
854849
"""
855850
page = await self._context.new_page()
856851

857852
try:
858-
url = f"https://www.bilibili.com/video/{bvid}"
859-
logger.info(f"爬取视频: {bvid}")
853+
logger.info(f"爬取页面: {url}")
860854

861855
# 访问页面
862856
await page.goto(url, wait_until="domcontentloaded")
863857

864-
# 等待标题加载
865-
await page.wait_for_selector("h1.video-title", timeout=10000)
866-
867-
# 提取信息
868-
title = await page.locator("h1.video-title").text_content()
869-
870-
# 尝试获取播放量
871-
view_count = "0"
872-
try:
873-
view_el = page.locator(".view-text")
874-
if await view_el.count() > 0:
875-
view_count = await view_el.text_content()
876-
except Exception:
877-
pass
858+
# 获取页面标题
859+
title = await page.title()
878860

879-
# 尝试获取UP主
880-
author = ""
881-
try:
882-
author_el = page.locator(".up-name")
883-
if await author_el.count() > 0:
884-
author = await author_el.text_content()
885-
except Exception:
886-
pass
861+
# 获取页面内容
862+
content = await page.content()
887863

888864
return {
889-
"bvid": bvid,
865+
"url": url,
890866
"title": title.strip() if title else "",
891-
"view_count": view_count.strip() if view_count else "0",
892-
"author": author.strip() if author else "",
893-
"url": url
867+
"content_length": len(content)
894868
}
895869

896870
finally:
@@ -910,21 +884,25 @@ async def benchmark_optimization():
910884
"""性能对比测试"""
911885
import time
912886

913-
# 测试BV号列表
914-
bvids = ["BV1GJ411x7h7", "BV1uT4y1P7CX", "BV1Ys411c7xT"]
887+
# 测试URL列表
888+
urls = [
889+
"https://quotes.toscrape.com/",
890+
"https://quotes.toscrape.com/page/2/",
891+
"https://quotes.toscrape.com/page/3/",
892+
]
915893

916-
crawler = BilibiliOptimizedCrawler(headless=True)
894+
crawler = OptimizedCrawler(headless=True)
917895
await crawler.start()
918896

919897
try:
920898
start = time.time()
921899

922-
for bvid in bvids:
923-
result = await crawler.crawl_video_page(bvid)
924-
logger.info(f"视频: {result['title'][:30]}... | 播放: {result['view_count']}")
900+
for url in urls:
901+
result = await crawler.crawl_page(url)
902+
logger.info(f"页面: {result['title']} | 大小: {result['content_length']} bytes")
925903

926904
elapsed = time.time() - start
927-
logger.info(f"总耗时: {elapsed:.2f}s | 平均: {elapsed/len(bvids):.2f}s/页")
905+
logger.info(f"总耗时: {elapsed:.2f}s | 平均: {elapsed/len(urls):.2f}s/页")
928906

929907
finally:
930908
await crawler.stop()
@@ -939,7 +917,7 @@ if __name__ == "__main__":
939917
```mermaid
940918
flowchart LR
941919
Start["启动浏览器"] --> Inject["注入stealth.js"]
942-
Inject --> Visit["访问B站"]
920+
Inject --> Visit["访问测试网站"]
943921
Visit --> Check{"检测验证"}
944922
945923
Check -->|webdriver=undefined| Pass1["✓ 通过"]
@@ -964,52 +942,7 @@ flowchart LR
964942
3. **CDP 模式**:直接使用 Chrome DevTools Protocol
965943
4. **性能优化**:禁用资源加载、上下文复用、并发管理
966944
5. **异常处理**:页面崩溃恢复、资源清理
967-
6. **B站实战**:专用反检测配置、性能优化爬虫
968-
969-
---
970-
971-
## 与第11章实战项目的关联
972-
973-
本章反检测与性能优化技术在第11章 B站综合实战项目中有核心应用:
974-
975-
| 本章内容 | 第11章对应实现 | 文件位置 |
976-
|---------|--------------|---------|
977-
| stealth.js 配置 | 浏览器初始化脚本 | `tools/stealth.min.js` |
978-
| 反检测浏览器类 | BrowserManager | `tools/browser_manager.py` |
979-
| 性能优化配置 | 资源拦截规则 | `config/bilibili_config.py` |
980-
| Cookie 管理 | 登录态保持 | `login/auth.py` |
981-
982-
```mermaid
983-
graph LR
984-
subgraph 本章知识点
985-
A1["stealth.js"]
986-
A2["反检测配置"]
987-
A3["性能优化"]
988-
end
989-
990-
subgraph 第11章实战应用
991-
B1["扫码登录"]
992-
B2["数据采集"]
993-
B3["批量爬取"]
994-
end
995-
996-
A1 --> B1
997-
A2 --> B2
998-
A3 --> B3
999-
1000-
style A1 fill:#e3f2fd,stroke:#2196f3
1001-
style A2 fill:#e3f2fd,stroke:#2196f3
1002-
style A3 fill:#e3f2fd,stroke:#2196f3
1003-
style B1 fill:#c8e6c9,stroke:#4caf50
1004-
style B2 fill:#c8e6c9,stroke:#4caf50
1005-
style B3 fill:#c8e6c9,stroke:#4caf50
1006-
```
1007-
1008-
**学习建议**
1009-
1010-
1. 本章的 `BILIBILI_STEALTH_JS` 脚本是第11章浏览器自动化的基础
1011-
2. 资源拦截策略直接影响爬虫性能
1012-
3. 建议结合第11章 `tools/browser_manager.py` 学习完整实现
945+
6. **实战演练**:反检测配置、性能优化爬虫
1013946

1014947
---
1015948

0 commit comments

Comments
 (0)