From aa1c1e874755482d21b509ca1b39dc5444370361 Mon Sep 17 00:00:00 2001
From: YxmMyth
Date: Sat, 10 Jan 2026 23:32:57 +0800
Subject: [PATCH] Add CSS background image extraction feature (Issue #1691)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit adds support for extracting CSS background images during
crawling, addressing issue #1691, where background images were being
skipped.

## Changes

### New Files
- crawl4ai/js_snippet/extract_css_backgrounds.js: JavaScript snippet that
  extracts background images from computed styles in the browser

### Modified Files
- crawl4ai/models.py:
  - Added `css_images` field to the Media class
  - Added `css_images_data` field to AsyncCrawlResponse
- crawl4ai/async_configs.py:
  - Added CSS background image configuration parameters to CrawlerRunConfig:
    - extract_css_images (bool, default False)
    - css_image_min_width (int, default 100)
    - css_image_min_height (int, default 100)
    - css_image_score_threshold (int, default 2)
    - css_exclude_repeating (bool, default True)
- crawl4ai/content_scraping_strategy.py:
  - Added process_css_background_images() method
  - Integrated CSS image extraction into _process_element()
  - Added css_images to the media dictionary
- crawl4ai/async_crawler_strategy.py:
  - Added JavaScript execution in _crawl_web() to extract CSS backgrounds
  - Included css_images_data in AsyncCrawlResponse
- crawl4ai/async_webcrawler.py:
  - Modified aprocess_html() to accept and pass css_images_data
  - Added Dict type import

## Features

- Extracts background images from both inline styles and stylesheets
- Uses window.getComputedStyle() for accurate extraction
- Smart filtering that drops small elements and repeating patterns
- Scoring system based on element size and properties
- Opt-in (disabled by default) for backward compatibility
- Separate storage in media.css_images

## Usage

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

config = CrawlerRunConfig(
    extract_css_images=True,
    css_image_min_width=100,
    css_image_min_height=100,
)

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com", config=config)
    css_images = result.media.get("css_images", [])
```

Closes #1691

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5
---
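Two reviewer notes below the fold (kept out of the commit message). First,
the scoring rules in process_css_background_images() are easiest to
sanity-check with a worked example. The sketch below mirrors the logic this
patch adds; the img_data record is hypothetical, shaped like one entry
emitted by extract_css_backgrounds.js.

```python
# Hypothetical extraction record, shaped like the output of
# extract_css_backgrounds.js for a large hero banner.
img_data = {
    "src": "https://example.com/assets/hero.jpg",  # made-up URL
    "computed_width": 800,   # element width in px
    "computed_height": 400,  # element height in px
    "is_repeating": False,   # background-repeat was 'no-repeat'
}

score = 0
if img_data["computed_width"] > 300:   # wide element: +1
    score += 1
if img_data["computed_height"] > 300:  # tall element: +1
    score += 1
if not img_data["is_repeating"]:       # non-repeating background: +1
    score += 1
if "jpg" in img_data["src"].lower():   # recognizable image format: +1
    score += 1

# The patch keeps an item only when score is STRICTLY greater than
# css_image_score_threshold (default 2), so this image (score 4) is kept,
# while a 200x200 repeating tile (score 1) would be dropped.
assert score == 4
```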
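Second, a minimal end-to-end sketch of consuming the new output. It assumes
only what this patch introduces: the CrawlerRunConfig flags and
media.css_images items carrying src, score, and desc fields.

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig


async def main():
    config = CrawlerRunConfig(
        extract_css_images=True,     # opt in; the feature is off by default
        css_exclude_repeating=True,  # drop tiled/pattern backgrounds
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)

    # css_images sits alongside images/videos/audios/tables in result.media
    css_images = result.media.get("css_images", [])

    # Example post-processing: highest-scoring backgrounds first
    for img in sorted(css_images, key=lambda i: i["score"], reverse=True):
        print(f"{img['score']:>2}  {img['src']}  ({img['desc']})")


asyncio.run(main())
```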
 crawl4ai/async_configs.py                 |  12 ++
 crawl4ai/async_crawler_strategy.py        |  27 +++-
 crawl4ai/async_webcrawler.py              |   7 +-
 crawl4ai/content_scraping_strategy.py     | 110 +++++++++++++-
 .../js_snippet/extract_css_backgrounds.js | 138 ++++++++++++++++++
 crawl4ai/models.py                        |   2 +
 6 files changed, 290 insertions(+), 6 deletions(-)
 create mode 100644 crawl4ai/js_snippet/extract_css_backgrounds.js

diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index 10cc48d08..316fb31c5 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1228,6 +1228,12 @@ def __init__(
         table_extraction: TableExtractionStrategy = None,
         exclude_external_images: bool = False,
         exclude_all_images: bool = False,
+        # CSS Background Images Parameters
+        extract_css_images: bool = False,
+        css_image_min_width: int = 100,
+        css_image_min_height: int = 100,
+        css_image_score_threshold: int = 2,
+        css_exclude_repeating: bool = True,
         # Link and Domain Handling Parameters
         exclude_social_media_domains: list = None,
         exclude_external_links: bool = False,
@@ -1343,6 +1349,12 @@ def __init__(
         self.exclude_external_images = exclude_external_images
         self.exclude_all_images = exclude_all_images
         self.table_score_threshold = table_score_threshold
+        # CSS Background Images Parameters
+        self.extract_css_images = extract_css_images
+        self.css_image_min_width = css_image_min_width
+        self.css_image_min_height = css_image_min_height
+        self.css_image_score_threshold = css_image_score_threshold
+        self.css_exclude_repeating = css_exclude_repeating

         # Table extraction strategy (default to DefaultTableExtraction if not specified)
         if table_extraction is None:
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 2850b36a6..2f1222afa 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -957,7 +957,7 @@ async def handle_request_failed_capture(request):
                 # Handle comma-separated selectors by splitting them
                 selectors = [s.strip() for s in config.css_selector.split(',')]
                 html_parts = []
-
+
                 for selector in selectors:
                     try:
                         content = await self.adapter.evaluate(page,
@@ -968,13 +968,33 @@ async def handle_request_failed_capture(request):
                             html_parts.append(content)
                     except Error as e:
                         print(f"Warning: Could not get content for selector '{selector}': {str(e)}")
-
+
                 # Wrap in a div to create a valid HTML structure
-                html = f"<div>\n" + "\n".join(html_parts) + "\n</div>"
+                html = f"<div>\n" + "\n".join(html_parts) + "\n</div>"
             except Error as e:
                 raise RuntimeError(f"Failed to extract HTML content: {str(e)}")
         else:
             html = await page.content()
+
+        # Extract CSS background images if enabled
+        css_images_data = None
+        if config.extract_css_images:
+            try:
+                js_script = load_js_script("extract_css_backgrounds")
+                result = await self.adapter.evaluate(page, js_script)
+                css_images_data = result.get("css_images", []) if result else []
+                if self.logger and config.verbose:
+                    self.logger.info(
+                        message=f"Extracted {len(css_images_data)} CSS background images",
+                        tag="CSS_IMAGES",
+                    )
+            except Exception as e:
+                if self.logger:
+                    self.logger.warning(
+                        message=f"Failed to extract CSS background images: {str(e)}",
+                        tag="CSS_IMAGES",
+                    )
+                css_images_data = None

         # # Get final HTML content
         # html = await page.content()
@@ -1047,6 +1067,7 @@ async def get_delayed_content(delay: float = 5.0) -> str:
             # Include captured data if enabled
             network_requests=captured_requests if config.capture_network_requests else None,
             console_messages=captured_console if config.capture_console_messages else None,
+            css_images_data=css_images_data,
         )

     except Exception as e:
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 4dc52adc1..d8d022855 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -3,7 +3,7 @@
 import sys
 import time
 from pathlib import Path
-from typing import Optional, List
+from typing import Optional, List, Dict
 import json
 import asyncio

@@ -342,6 +342,7 @@ async def arun(
                 screenshot_data = async_response.screenshot
                 pdf_data = async_response.pdf_data
                 js_execution_result = async_response.js_execution_result
+                css_images_data = async_response.css_images_data

                 t2 = time.perf_counter()
                 self.logger.url_status(
@@ -366,6 +367,7 @@ async def arun(
                     is_raw_html=True if url.startswith("raw:") else False,
                     redirected_url=async_response.redirected_url,
                     original_scheme=urlparse(url).scheme,
+                    css_images_data=css_images_data,
                     **kwargs,
                 )

@@ -441,6 +443,7 @@ async def aprocess_html(
         screenshot_data: str,
         pdf_data: str,
         verbose: bool,
+        css_images_data: List[Dict] = None,
         **kwargs,
     ) -> CrawlResult:
         """
@@ -480,7 +483,7 @@ async def aprocess_html(
         # Scraping Strategy Execution  #
         ################################
         result: ScrapingResult = scraping_strategy.scrap(
-            url, html, **params)
+            url, html, css_images_data=css_images_data, **params)

         if result is None:
             raise ValueError(
diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index e915ff5bf..89f93457a 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -340,6 +340,18 @@ def _process_element(
             except Exception as e:
                 self._log("error", f"Error processing image: {str(e)}", "SCRAPE")

+            # Process CSS background images (if enabled)
+            if kwargs.get("extract_css_images", False):
+                try:
+                    css_images_data = kwargs.get("css_images_data")
+                    processed_css_images = self.process_css_background_images(
+                        css_images_data, url, **kwargs
+                    )
+                    if processed_css_images:
+                        media["css_images"].extend(processed_css_images)
+                except Exception as e:
+                    self._log("error", f"Error processing CSS images: {str(e)}", "SCRAPE")
+
             # Process videos and audios
             for media_type in ["video", "audio"]:
                 for elem in element.xpath(f".//{media_type}"):
@@ -514,6 +526,102 @@ def add_variant(src: str, width: Optional[str] = None):

         return image_variants if image_variants else None

+    def process_css_background_images(
+        self, css_images_data: List[Dict], url: str, **kwargs
+    ) -> Optional[List[Dict]]:
+        """
+        Process CSS background images extracted in the browser.
+
+        Args:
+            css_images_data: Raw data from the JavaScript extraction
+            url: Page URL for resolving relative URLs
+            **kwargs: Configuration options
+
+        Returns:
+            List of MediaItem dictionaries, or None if no images
+        """
+        from urllib.parse import urljoin
+
+        if not css_images_data:
+            return None
+
+        processed_images = []
+        min_width = kwargs.get("css_image_min_width", 100)
+        min_height = kwargs.get("css_image_min_height", 100)
+        exclude_repeating = kwargs.get("css_exclude_repeating", True)
+        score_threshold = kwargs.get("css_image_score_threshold", 2)
+
+        for img_data in css_images_data:
+            # Filter by element size
+            computed_width = img_data.get("computed_width", 0)
+            computed_height = img_data.get("computed_height", 0)
+
+            if computed_width < min_width or computed_height < min_height:
+                continue
+
+            # Filter repeating patterns
+            if exclude_repeating and img_data.get("is_repeating", False):
+                continue
+
+            # Resolve URL
+            src = img_data["src"]
+            if not src.startswith(("http://", "https://", "data:")):
+                src = urljoin(url, src)

+            # Skip data URLs
+            if src.startswith("data:"):
+                continue
+
+            # Calculate score based on element properties
+            score = 0
+
+            # Larger elements get higher scores
+            if computed_width > 300:
+                score += 1
+            if computed_height > 300:
+                score += 1
+
+            # Non-repeating backgrounds are more likely to be content images
+            if not img_data.get("is_repeating", False):
+                score += 1
+
+            # Detect image format
+            image_formats = {"jpg", "jpeg", "png", "webp", "avif", "gif", "svg"}
+            detected_format = None
+            src_lower = src.lower()
+            for fmt in image_formats:
+                if fmt in src_lower:
+                    detected_format = fmt
+                    score += 1
+                    break
+
+            # Apply score threshold (keep only if strictly above it)
+            if score <= score_threshold:
+                continue
+
+            # Build selector string for description
+            element_tag = img_data.get("element_tag", "element")
+            selector = img_data.get("selector", "")
+            desc = f"CSS background image on {element_tag}"
+            if selector:
+                desc += f" ({selector})"
+
+            # Create MediaItem
+            media_item = {
+                "src": src,
+                "alt": f"Background of {element_tag}",
+                "desc": desc,
+                "score": score,
+                "type": "css_image",
+                "group_id": 0,
+                "format": detected_format,
+                "width": computed_width,
+            }
+
+            processed_images.append(media_item)
+
+        return processed_images if processed_images else None
+
     def remove_empty_elements_fast(self, root, word_count_threshold=5):
         """
         Remove elements that fall below the desired word threshold in a single pass from the bottom up.
@@ -730,7 +838,7 @@ def _scrap(
                 form.getparent().remove(form)

         # Process content
-        media = {"images": [], "videos": [], "audios": [], "tables": []}
+        media = {"images": [], "videos": [], "audios": [], "tables": [], "css_images": []}
         internal_links_dict = {}
         external_links_dict = {}

diff --git a/crawl4ai/js_snippet/extract_css_backgrounds.js b/crawl4ai/js_snippet/extract_css_backgrounds.js
new file mode 100644
index 000000000..3554a78d2
--- /dev/null
+++ b/crawl4ai/js_snippet/extract_css_backgrounds.js
@@ -0,0 +1,138 @@
+/**
+ * Extract CSS background images from all elements on the page.
+ * This script is executed by crawl4ai to extract CSS background images.
+ *
+ * Returns a JSON object with a css_images array containing:
+ * - src: Image URL
+ * - selector: CSS selector for the element
+ * - element_tag: Tag name of the element
+ * - element_class: Class names of the element
+ * - element_id: ID of the element
+ * - style_property: Which CSS property carried the image
+ * - computed_width: Element width in pixels
+ * - computed_height: Element height in pixels
+ * - is_repeating: Whether the background repeats
+ * - background_position: CSS background-position value
+ * - background_size: CSS background-size value
+ */
+
+(function() {
+  const results = [];
+  const allElements = document.querySelectorAll('*');
+  const processedUrls = new Set();
+
+  /**
+   * Generate a unique CSS selector for an element
+   */
+  function getElementSelector(element) {
+    if (element.id) {
+      return '#' + element.id;
+    }
+
+    let selector = element.tagName.toLowerCase();
+
+    if (element.className && typeof element.className === 'string') {
+      const classes = element.className.trim().split(/\s+/).filter(c => c);
+      if (classes.length > 0) {
+        selector += '.' + classes.join('.');
+      }
+    }
+
+    // Add nth-child if needed for uniqueness
+    const parent = element.parentElement;
+    if (parent) {
+      const siblings = Array.from(parent.children).filter(
+        child => child.tagName === element.tagName
+      );
+      if (siblings.length > 1) {
+        const index = siblings.indexOf(element) + 1;
+        selector += `:nth-child(${index})`;
+      }
+    }
+
+    return selector;
+  }
+
+  /**
+   * Check if element is visible and has meaningful dimensions
+   */
+  function isElementVisible(element) {
+    const rect = element.getBoundingClientRect();
+    const style = window.getComputedStyle(element);
+
+    // Check if element has display: none or visibility: hidden
+    if (style.display === 'none' || style.visibility === 'hidden') {
+      return false;
+    }
+
+    // Check if element has meaningful dimensions
+    return rect.width > 0 && rect.height > 0;
+  }
+
+  allElements.forEach(element => {
+    // Skip invisible elements
+    if (!isElementVisible(element)) {
+      return;
+    }
+
+    const style = window.getComputedStyle(element);
+    const backgroundImage = style.backgroundImage;
+
+    // Skip if no background image or if it's 'none'
+    if (!backgroundImage || backgroundImage === 'none' || backgroundImage === 'initial') {
+      return;
+    }
+
+    // Parse url() from the background-image property.
+    // Handles: url(...), url("..."), url('...'), and multiple backgrounds
+    const urlPattern = /url\(['"]?([^'")\s]+)['"]?\)/g;
+    let match;
+    const urls = [];
+
+    while ((match = urlPattern.exec(backgroundImage)) !== null) {
+      urls.push(match[1]);
+    }
+
+    if (urls.length === 0) {
+      return;
+    }
+
+    // Process each URL
+    urls.forEach(url => {
+      // Skip data URLs
+      if (url.startsWith('data:')) {
+        return;
+      }
+
+      // Skip already processed URLs (deduplication)
+      if (processedUrls.has(url)) {
+        return;
+      }
+      processedUrls.add(url);
+
+      // Get element dimensions
+      const rect = element.getBoundingClientRect();
+
+      // Create result object; is_repeating matches the field name read by
+      // process_css_background_images() on the Python side
+      const result = {
+        src: url,
+        selector: getElementSelector(element),
+        element_tag: element.tagName.toLowerCase(),
+        element_class: typeof element.className === 'string' ? element.className : '',
+        element_id: element.id || '',
+        style_property: 'background-image',
+        computed_width: Math.round(rect.width),
+        computed_height: Math.round(rect.height),
+        is_repeating: style.backgroundRepeat !== 'no-repeat',
+        background_position: style.backgroundPosition,
+        background_size: style.backgroundSize
+      };
+
+      results.push(result);
+    });
+  });
+
+  return {
+    css_images: results
+  };
+})();
diff --git a/crawl4ai/models.py b/crawl4ai/models.py
index e46bb7fa8..812c0d261 100644
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -330,6 +330,7 @@ class AsyncCrawlResponse(BaseModel):
     redirected_url: Optional[str] = None
     network_requests: Optional[List[Dict[str, Any]]] = None
     console_messages: Optional[List[Dict[str, Any]]] = None
+    css_images_data: Optional[List[Dict[str, Any]]] = None  # CSS background images extracted from browser

     model_config = ConfigDict(arbitrary_types_allowed=True)

@@ -370,6 +371,7 @@ class Media(BaseModel):
         MediaItem
     ] = []  # Using MediaItem model for now, can be extended with Audio model if needed
     tables: List[Dict] = []  # Table data extracted from HTML tables
+    css_images: List[MediaItem] = []  # CSS background images extracted from styles


 class Links(BaseModel):