@@ -492,15 +492,15 @@ class BrowserManager:
492492
493493---
494494
495- ## B站 Playwright 反检测实战
495+ ## 反检测实战
496496
497- ### B站的反自动化检测机制
497+ ### 网站的反自动化检测机制
498498
499- B站作为大型视频平台,有较为完善的反自动化检测 :
499+ 大型网站通常有较为完善的反自动化检测:
500500
501501``` mermaid
502502flowchart LR
503- subgraph B站检测机制
503+ subgraph 检测机制
504504 Check1["WebDriver检测"]
505505 Check2["请求头验证"]
506506 Check3["行为分析"]
@@ -509,29 +509,29 @@ flowchart LR
509509
510510 subgraph 检测结果
511511 Pass["正常访问"]
512- Block412["412风控 "]
512+ Block["触发风控"]
513513 Captcha["触发验证码"]
514514 end
515515
516516 Check1 -->|通过| Check2
517- Check1 -->|失败| Block412
517+ Check1 -->|失败| Block
518518 Check2 -->|通过| Check3
519- Check2 -->|异常| Block412
519+ Check2 -->|异常| Block
520520 Check3 -->|正常| Pass
521521 Check3 -->|异常| Captcha
522- Check4 -->|超限| Block412
522+ Check4 -->|超限| Block
523523
524524 style Pass fill:#c8e6c9,stroke:#4caf50
525- style Block412 fill:#ffcdd2,stroke:#f44336
525+ style Block fill:#ffcdd2,stroke:#f44336
526526 style Captcha fill:#fff3e0,stroke:#ff9800
527527```
528528
529- ### B站完整反检测配置
529+ ### 完整反检测配置
530530
531531``` python
532532# -*- coding: utf-8 -*-
533533"""
534- B站 Playwright 反检测配置
534+ Playwright 反检测配置
535535"""
536536
537537import asyncio
@@ -540,8 +540,8 @@ from loguru import logger
540540from typing import Optional
541541
542542
543- # B站专用 stealth 脚本
544- BILIBILI_STEALTH_JS = """
543+ # 通用 stealth 脚本
544+ STEALTH_JS = """
545545// 隐藏 webdriver 标志
546546Object.defineProperty(navigator, 'webdriver', {
547547 get: () => undefined
@@ -612,9 +612,9 @@ delete window.cdc_adoQpoasnfa76pfcZLmcfl_Symbol;
612612"""
613613
614614
615- class BilibiliStealthBrowser :
615+ class StealthBrowser :
616616 """
617- B站反检测浏览器封装
617+ 反检测浏览器封装
618618
619619 特性:
620620 - 自动注入 stealth 脚本
@@ -663,9 +663,9 @@ class BilibiliStealthBrowser:
663663 )
664664
665665 # 注入 stealth 脚本
666- await self ._context.add_init_script(BILIBILI_STEALTH_JS )
666+ await self ._context.add_init_script(STEALTH_JS )
667667
668- logger.info(" B站反检测浏览器已启动 " )
668+ logger.info(" 反检测浏览器已启动 " )
669669 return self ._context
670670
671671 async def create_optimized_page (self ) -> Page:
@@ -702,7 +702,7 @@ class BilibiliStealthBrowser:
702702 viewport = {" width" : 1920 , " height" : 1080 },
703703 locale = " zh-CN"
704704 )
705- await self ._context.add_init_script(BILIBILI_STEALTH_JS )
705+ await self ._context.add_init_script(STEALTH_JS )
706706 logger.info(f " Cookie 已从 { path} 加载 " )
707707
708708 async def stop (self ):
@@ -716,17 +716,17 @@ class BilibiliStealthBrowser:
716716 logger.info(" 浏览器已关闭" )
717717
718718
719- async def test_bilibili_stealth ():
720- """ 测试B站反检测效果 """
721- browser = BilibiliStealthBrowser (headless = True )
719+ async def test_stealth_browser ():
720+ """ 测试反检测效果(使用 bot.sannysoft.com) """
721+ browser = StealthBrowser (headless = True )
722722 context = await browser.start()
723723
724724 try :
725725 page = await browser.create_optimized_page()
726726
727- # 访问B站首页
728- logger.info(" 访问B站首页 ..." )
729- await page.goto(" https://www.bilibili .com" , wait_until = " networkidle" )
727+ # 访问 WebDriver 检测网站
728+ logger.info(" 访问 bot.sannysoft.com 测试反检测效果 ..." )
729+ await page.goto(" https://bot.sannysoft .com/ " , wait_until = " networkidle" )
730730
731731 # 检查反检测效果
732732 webdriver = await page.evaluate(" navigator.webdriver" )
@@ -738,29 +738,24 @@ async def test_bilibili_stealth():
738738 logger.info(f " - window.chrome 存在: { chrome} " )
739739 logger.info(f " - plugins 数量: { plugins} " )
740740
741- # 等待视频卡片加载
742- await page.wait_for_selector(" .bili-video-card" , timeout = 10000 )
743- cards = await page.locator(" .bili-video-card" ).count()
744- logger.info(f " 成功加载 { cards} 个视频卡片 " )
745-
746- # 截图
747- await page.screenshot(path = " bilibili_stealth_test.png" )
748- logger.info(" 截图已保存" )
741+ # 截图保存测试结果
742+ await page.screenshot(path = " stealth_test.png" , full_page = True )
743+ logger.info(" 截图已保存到 stealth_test.png" )
749744
750745 finally :
751746 await browser.stop()
752747
753748
754749if __name__ == " __main__" :
755- asyncio.run(test_bilibili_stealth ())
750+ asyncio.run(test_stealth_browser ())
756751```
757752
758- ### B站性能优化配置
753+ ### 性能优化爬虫配置
759754
760755``` python
761756# -*- coding: utf-8 -*-
762757"""
763- B站 Playwright 性能优化配置
758+ Playwright 性能优化配置
764759"""
765760
766761import asyncio
@@ -769,9 +764,9 @@ from loguru import logger
769764from typing import Set
770765
771766
772- class BilibiliOptimizedCrawler :
767+ class OptimizedCrawler :
773768 """
774- B站性能优化爬虫
769+ 性能优化爬虫
775770
776771 优化策略:
777772 - 禁用图片/字体/CSS加载
@@ -790,9 +785,9 @@ class BilibiliOptimizedCrawler:
790785
791786 # 需要阻止的URL模式
792787 BLOCKED_URL_PATTERNS = [
793- " **/cm.bilibili.com/ **" , # 广告
794- " **/api.bilibili.com/x/web-show/ **" , # 广告
795- " **/s1.hdslb.com/bfs/seed/ **" , # 追踪
788+ " **/analytics **" ,
789+ " **/tracking **" ,
790+ " **/ads **" ,
796791 " **/*.gif" ,
797792 " **/*.png" ,
798793 " **/*.jpg" ,
@@ -816,7 +811,7 @@ class BilibiliOptimizedCrawler:
816811
817812 # 检查URL模式(广告和追踪)
818813 url = request.url
819- for pattern in [" cm.bilibili.com " , " web-show " , " tracking " ]:
814+ for pattern in [" analytics " , " tracking " , " ads " ]:
820815 if pattern in url:
821816 await route.abort()
822817 return
@@ -842,55 +837,34 @@ class BilibiliOptimizedCrawler:
842837
843838 logger.info(" 性能优化浏览器已启动" )
844839
845- async def crawl_video_page (self , bvid : str ) -> dict :
840+ async def crawl_page (self , url : str ) -> dict :
846841 """
847- 爬取视频详情页
842+ 爬取页面
848843
849844 Args:
850- bvid: 视频BV号
845+ url: 目标URL
851846
852847 Returns:
853- 视频信息
848+ 页面信息
854849 """
855850 page = await self ._context.new_page()
856851
857852 try :
858- url = f " https://www.bilibili.com/video/ { bvid} "
859- logger.info(f " 爬取视频: { bvid} " )
853+ logger.info(f " 爬取页面: { url} " )
860854
861855 # 访问页面
862856 await page.goto(url, wait_until = " domcontentloaded" )
863857
864- # 等待标题加载
865- await page.wait_for_selector(" h1.video-title" , timeout = 10000 )
866-
867- # 提取信息
868- title = await page.locator(" h1.video-title" ).text_content()
869-
870- # 尝试获取播放量
871- view_count = " 0"
872- try :
873- view_el = page.locator(" .view-text" )
874- if await view_el.count() > 0 :
875- view_count = await view_el.text_content()
876- except Exception :
877- pass
858+ # 获取页面标题
859+ title = await page.title()
878860
879- # 尝试获取UP主
880- author = " "
881- try :
882- author_el = page.locator(" .up-name" )
883- if await author_el.count() > 0 :
884- author = await author_el.text_content()
885- except Exception :
886- pass
861+ # 获取页面内容
862+ content = await page.content()
887863
888864 return {
889- " bvid " : bvid ,
865+ " url " : url ,
890866 " title" : title.strip() if title else " " ,
891- " view_count" : view_count.strip() if view_count else " 0" ,
892- " author" : author.strip() if author else " " ,
893- " url" : url
867+ " content_length" : len (content)
894868 }
895869
896870 finally :
@@ -910,21 +884,25 @@ async def benchmark_optimization():
910884 """ 性能对比测试"""
911885 import time
912886
913- # 测试BV号列表
914- bvids = [" BV1GJ411x7h7" , " BV1uT4y1P7CX" , " BV1Ys411c7xT" ]
887+ # 测试URL列表
888+ urls = [
889+ " https://quotes.toscrape.com/" ,
890+ " https://quotes.toscrape.com/page/2/" ,
891+ " https://quotes.toscrape.com/page/3/" ,
892+ ]
915893
916- crawler = BilibiliOptimizedCrawler (headless = True )
894+ crawler = OptimizedCrawler (headless = True )
917895 await crawler.start()
918896
919897 try :
920898 start = time.time()
921899
922- for bvid in bvids :
923- result = await crawler.crawl_video_page(bvid )
924- logger.info(f " 视频 : { result[' title' ][: 30 ] } ... | 播放 : { result[' view_count ' ]} " )
900+ for url in urls :
901+ result = await crawler.crawl_page(url )
902+ logger.info(f " 页面 : { result[' title' ]} | 大小 : { result[' content_length ' ]} bytes " )
925903
926904 elapsed = time.time() - start
927- logger.info(f " 总耗时: { elapsed:.2f } s | 平均: { elapsed/ len (bvids ):.2f } s/页 " )
905+ logger.info(f " 总耗时: { elapsed:.2f } s | 平均: { elapsed/ len (urls ):.2f } s/页 " )
928906
929907 finally :
930908 await crawler.stop()
@@ -939,7 +917,7 @@ if __name__ == "__main__":
939917``` mermaid
940918flowchart LR
941919 Start["启动浏览器"] --> Inject["注入stealth.js"]
942- Inject --> Visit["访问B站 "]
920+ Inject --> Visit["访问测试网站"]
943921 Visit --> Check{"检测验证"}
944922
945923 Check -->|webdriver=undefined| Pass1["✓ 通过"]
@@ -964,52 +942,7 @@ flowchart LR
9649423. **CDP 模式**:直接使用 Chrome DevTools Protocol
9659434. **性能优化**:禁用资源加载、上下文复用、并发管理
9669445. **异常处理**:页面崩溃恢复、资源清理
967- 6 . ** B站实战** :专用反检测配置、性能优化爬虫
968-
969- ---
970-
971- ## 与第11章实战项目的关联
972-
973- 本章反检测与性能优化技术在第11章 B站综合实战项目中有核心应用:
974-
975- | 本章内容 | 第11章对应实现 | 文件位置 |
976- | ---------| --------------| ---------|
977- | stealth.js 配置 | 浏览器初始化脚本 | ` tools/stealth.min.js ` |
978- | 反检测浏览器类 | BrowserManager | ` tools/browser_manager.py ` |
979- | 性能优化配置 | 资源拦截规则 | ` config/bilibili_config.py ` |
980- | Cookie 管理 | 登录态保持 | ` login/auth.py ` |
981-
982- ``` mermaid
983- graph LR
984- subgraph 本章知识点
985- A1["stealth.js"]
986- A2["反检测配置"]
987- A3["性能优化"]
988- end
989-
990- subgraph 第11章实战应用
991- B1["扫码登录"]
992- B2["数据采集"]
993- B3["批量爬取"]
994- end
995-
996- A1 --> B1
997- A2 --> B2
998- A3 --> B3
999-
1000- style A1 fill:#e3f2fd,stroke:#2196f3
1001- style A2 fill:#e3f2fd,stroke:#2196f3
1002- style A3 fill:#e3f2fd,stroke:#2196f3
1003- style B1 fill:#c8e6c9,stroke:#4caf50
1004- style B2 fill:#c8e6c9,stroke:#4caf50
1005- style B3 fill:#c8e6c9,stroke:#4caf50
1006- ```
1007-
1008- ** 学习建议** :
1009-
1010- 1 . 本章的 ` BILIBILI_STEALTH_JS ` 脚本是第11章浏览器自动化的基础
1011- 2 . 资源拦截策略直接影响爬虫性能
1012- 3 . 建议结合第11章 ` tools/browser_manager.py ` 学习完整实现
945+ 6. **实战演练**:反检测配置、性能优化爬虫
1013946
1014947---
1015948
0 commit comments