99from typing import Optional
1010from playwright .sync_api import sync_playwright , BrowserContext , Page , Playwright
1111
# Optional dependency: playwright-stealth supplies the bot-evasion patches.
# The import is attempted once at module load; when the package is missing the
# module still imports and STEALTH_AVAILABLE tells callers to skip stealth.
try:
    from playwright_stealth import stealth_sync
except ImportError:
    # Keep the name bound so a reference to it never raises NameError; callers
    # must still check STEALTH_AVAILABLE before calling it.
    stealth_sync = None
    STEALTH_AVAILABLE = False
else:
    # Only reached when the import above succeeded.
    STEALTH_AVAILABLE = True
18+
1219
1320class SentienceBrowser :
1421 """Main browser session with Sentience extension loaded"""
@@ -25,13 +32,19 @@ def __init__(
2532 Args:
2633 api_key: Optional API key for server-side processing (Pro/Enterprise tiers)
2734 If None, uses free tier (local extension only)
28- api_url: Optional server URL for API calls (defaults to https://api.sentienceapi.com)
35+ api_url: Server URL for API calls (defaults to https://api.sentienceapi.com if api_key provided )
2936 If None and api_key is provided, uses default URL
37+ If None and no api_key, uses free tier (local extension only)
3038 If 'local' or Docker sidecar URL, uses Enterprise tier
3139 headless: Whether to run in headless mode
3240 """
3341 self .api_key = api_key
34- self .api_url = api_url or ("https://api.sentienceapi.com" if api_key else None )
42+ # Only set api_url if api_key is provided, otherwise None (free tier)
43+ # Default to https://api.sentienceapi.com if api_key is provided but api_url is not
44+ if api_key :
45+ self .api_url = api_url or "https://api.sentienceapi.com"
46+ else :
47+ self .api_url = None
3548 self .headless = headless
3649 self .playwright : Optional [Playwright ] = None
3750 self .context : Optional [BrowserContext ] = None
@@ -85,15 +98,58 @@ def start(self) -> None:
8598 # Launch Playwright
8699 self .playwright = sync_playwright ().start ()
87100
88- # Create persistent context with extension
89- self .context = self .playwright .chromium .launch_persistent_context (
90- user_data_dir = tempfile .mkdtemp (prefix = "sentience-profile-" ),
91- headless = self .headless ,
92- args = [
93- f"--load-extension={ temp_dir } " ,
94- f"--disable-extensions-except={ temp_dir } " ,
95- ],
96- )
101+ # Stealth arguments for bot evasion
102+ stealth_args = [
103+ f"--load-extension={ temp_dir } " ,
104+ f"--disable-extensions-except={ temp_dir } " ,
105+ "--disable-blink-features=AutomationControlled" , # Hide automation indicators
106+ "--no-sandbox" , # Required for some environments
107+ "--disable-infobars" , # Hide "Chrome is being controlled" message
108+ ]
109+
110+ # Realistic viewport and user-agent for better evasion
111+ viewport_config = {"width" : 1920 , "height" : 1080 }
112+ user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
113+
114+ # Launch browser with extension
115+ # Note: channel="chrome" (system Chrome) has known issues with extension loading
116+ # We use bundled Chromium for reliable extension loading, but still apply stealth features
117+ user_data_dir = tempfile .mkdtemp (prefix = "sentience-profile-" )
118+ use_chrome_channel = False # Disable for now due to extension loading issues
119+
120+ try :
121+ if use_chrome_channel :
122+ # Try with system Chrome first (better evasion, but may have extension issues)
123+ self .context = self .playwright .chromium .launch_persistent_context (
124+ user_data_dir = user_data_dir ,
125+ channel = "chrome" , # Use system Chrome (better evasion)
126+ headless = self .headless ,
127+ args = stealth_args ,
128+ viewport = viewport_config ,
129+ user_agent = user_agent ,
130+ timeout = 30000 ,
131+ )
132+ else :
133+ # Use bundled Chromium (more reliable for extensions)
134+ self .context = self .playwright .chromium .launch_persistent_context (
135+ user_data_dir = user_data_dir ,
136+ headless = self .headless ,
137+ args = stealth_args ,
138+ viewport = viewport_config ,
139+ user_agent = user_agent ,
140+ timeout = 30000 ,
141+ )
142+ except Exception as launch_error :
143+ # Clean up on failure
144+ if os .path .exists (user_data_dir ):
145+ try :
146+ shutil .rmtree (user_data_dir )
147+ except Exception :
148+ pass
149+ raise RuntimeError (
150+ f"Failed to launch browser: { launch_error } \n "
151+ "Make sure Playwright browsers are installed: playwright install chromium"
152+ ) from launch_error
97153
98154 # Get first page or create new one
99155 pages = self .context .pages
@@ -102,31 +158,77 @@ def start(self) -> None:
102158 else :
103159 self .page = self .context .new_page ()
104160
161+ # Apply stealth patches for bot evasion (if available)
162+ if STEALTH_AVAILABLE :
163+ try :
164+ stealth_sync (self .page )
165+ except Exception :
166+ # Silently fail if stealth application fails - not critical
167+ # This is expected if playwright-stealth has compatibility issues
168+ pass
169+
170+ # Verify extension is loaded by checking background page
171+ # This helps catch extension loading issues early
172+ try :
173+ background_pages = [p for p in self .context .background_pages ]
174+ if not background_pages :
175+ # Extension might not have a background page, or it's not loaded yet
176+ # Wait a bit for extension to initialize
177+ self .page .wait_for_timeout (1000 )
178+ except Exception :
179+ # Background pages might not be accessible, continue anyway
180+ pass
181+
105182 # Navigate to a real page so extension can inject
106183 # Extension content scripts only run on actual pages (not about:blank)
107184 # Use a simple page that loads quickly
108- self .page .goto ("https://example.com" , wait_until = "domcontentloaded" )
185+ self .page .goto ("https://example.com" , wait_until = "domcontentloaded" , timeout = 15000 )
109186
110187 # Give extension time to initialize (WASM loading is async)
111- self .page .wait_for_timeout (1000 )
188+ # Content scripts run at document_idle, so we need to wait for that
189+ # Also wait for extension ID to be set by content.js
190+ self .page .wait_for_timeout (3000 )
112191
113192 # Wait for extension to load
114- if not self ._wait_for_extension ():
193+ if not self ._wait_for_extension (timeout = 25000 ):
115194 # Extension might need more time, try waiting a bit longer
116- self .page .wait_for_timeout (2000 )
117- if not self ._wait_for_extension ():
195+ self .page .wait_for_timeout (3000 )
196+ if not self ._wait_for_extension (timeout = 15000 ):
197+ # Get diagnostic info before failing
198+ try :
199+ diagnostic_info = self .page .evaluate ("""
200+ () => {
201+ const info = {
202+ sentience_defined: typeof window.sentience !== 'undefined',
203+ registry_defined: typeof window.sentience_registry !== 'undefined',
204+ snapshot_defined: typeof window.sentience?.snapshot === 'function',
205+ extension_id: document.documentElement.dataset.sentienceExtensionId || 'not set',
206+ url: window.location.href
207+ };
208+ if (window.sentience) {
209+ info.sentience_keys = Object.keys(window.sentience);
210+ }
211+ return info;
212+ }
213+ """ )
214+ diagnostic_str = f"\n 5. Diagnostic info: { diagnostic_info } "
215+ except Exception :
216+ diagnostic_str = "\n 5. Could not get diagnostic info"
217+
118218 raise RuntimeError (
119219 "Extension failed to load after navigation. Make sure:\n "
120220 "1. Extension is built (cd sentience-chrome && ./build.sh)\n "
121221 "2. All files are present (manifest.json, content.js, injected_api.js, pkg/)\n "
122- "3. Check browser console for errors\n "
222+ "3. Check browser console for errors (run with headless=False to see console) \n "
123223 f"4. Extension path: { temp_dir } "
224+ + diagnostic_str
124225 )
125226
126- def _wait_for_extension (self , timeout : int = 15000 ) -> bool :
227+ def _wait_for_extension (self , timeout : int = 20000 ) -> bool :
127228 """Wait for window.sentience API to be available"""
128229 import time
129230 start = time .time ()
231+ last_error = None
130232
131233 while time .time () - start < timeout / 1000 :
132234 try :
@@ -140,21 +242,35 @@ def _wait_for_extension(self, timeout: int = 15000) -> bool:
140242 if (typeof window.sentience.snapshot !== 'function') {
141243 return { ready: false, reason: 'snapshot function not available' };
142244 }
143- // Check if WASM module is loaded
245+ // Check if registry is initialized
144246 if (window.sentience_registry === undefined) {
145247 return { ready: false, reason: 'registry not initialized' };
146248 }
249+ // Check if WASM module is loaded (check internal _wasmModule if available)
250+ const sentience = window.sentience;
251+ if (sentience._wasmModule && !sentience._wasmModule.analyze_page) {
252+ return { ready: false, reason: 'WASM module not fully loaded' };
253+ }
254+ // If _wasmModule is not exposed, that's okay - it might be internal
255+ // Just verify the API structure is correct
147256 return { ready: true };
148257 }
149258 """ )
150259
151- if isinstance (result , dict ) and result .get ("ready" ):
152- return True
260+ if isinstance (result , dict ):
261+ if result .get ("ready" ):
262+ return True
263+ last_error = result .get ("reason" , "Unknown error" )
153264 except Exception as e :
154265 # Continue waiting on errors
155- pass
266+ last_error = f"Evaluation error: { str ( e ) } "
156267
157- time .sleep (0.2 )
268+ time .sleep (0.3 )
269+
270+ # Log the last error for debugging
271+ if last_error :
272+ import warnings
273+ warnings .warn (f"Extension wait timeout. Last status: { last_error } " )
158274
159275 return False
160276
0 commit comments