From aa1c1e874755482d21b509ca1b39dc5444370361 Mon Sep 17 00:00:00 2001
From: YxmMyth
Date: Sat, 10 Jan 2026 23:32:57 +0800
Subject: [PATCH] Add CSS background image extraction feature (Issue #1691)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit adds support for extracting CSS background images during
crawling, addressing issue #1691, where background images were being
skipped.

## Changes

### New Files
- crawl4ai/js_snippet/extract_css_backgrounds.js: JavaScript snippet that
  extracts background images from computed styles in the browser

### Modified Files
- crawl4ai/models.py:
  - Added `css_images` field to the Media class
  - Added `css_images_data` field to AsyncCrawlResponse
- crawl4ai/async_configs.py:
  - Added CSS background image configuration parameters to CrawlerRunConfig:
    - extract_css_images (bool, default False)
    - css_image_min_width (int, default 100)
    - css_image_min_height (int, default 100)
    - css_image_score_threshold (int, default 2)
    - css_exclude_repeating (bool, default True)
- crawl4ai/content_scraping_strategy.py:
  - Added process_css_background_images() method
  - Integrated CSS image extraction into _process_element()
  - Added css_images to the media dictionary
- crawl4ai/async_crawler_strategy.py:
  - Added JavaScript execution in _crawl_web() to extract CSS backgrounds
  - Included css_images_data in AsyncCrawlResponse
- crawl4ai/async_webcrawler.py:
  - Modified aprocess_html() to accept and pass css_images_data
  - Added Dict type import

## Features

- Extracts background images from both inline styles and stylesheets
- Uses window.getComputedStyle() for accurate extraction
- Smart filtering that drops small elements and repeating patterns
- Scoring system based on element size and properties
- Opt-in (disabled by default) for backward compatibility
- Separate storage in media.css_images

## Usage

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

config = CrawlerRunConfig(
    extract_css_images=True,
    css_image_min_width=100,
    css_image_min_height=100,
)

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com", config=config)
    css_images = result.media.get("css_images", [])
```

Closes #1691

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5
---
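Two reviewer notes below the fold (kept out of the commit message). First,
the scoring rules in process_css_background_images() are easiest to
sanity-check with a worked example. The sketch below mirrors the logic this
patch adds; the img_data record is hypothetical, shaped like one entry
emitted by extract_css_backgrounds.js.

```python
# Hypothetical extraction record, shaped like the output of
# extract_css_backgrounds.js for a large hero banner.
img_data = {
    "src": "https://example.com/assets/hero.jpg",  # made-up URL
    "computed_width": 800,   # element width in px
    "computed_height": 400,  # element height in px
    "is_repeating": False,   # background-repeat was 'no-repeat'
}

score = 0
if img_data["computed_width"] > 300:   # wide element: +1
    score += 1
if img_data["computed_height"] > 300:  # tall element: +1
    score += 1
if not img_data["is_repeating"]:       # non-repeating background: +1
    score += 1
if "jpg" in img_data["src"].lower():   # recognizable image format: +1
    score += 1

# The patch keeps an item only when score is STRICTLY greater than
# css_image_score_threshold (default 2), so this image (score 4) is kept,
# while a 200x200 repeating tile (score 1) would be dropped.
assert score == 4
```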
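Second, a minimal end-to-end sketch of consuming the new output. It assumes
only what this patch introduces: the CrawlerRunConfig flags and
media.css_images items carrying src, score, and desc fields.

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig


async def main():
    config = CrawlerRunConfig(
        extract_css_images=True,     # opt in; the feature is off by default
        css_exclude_repeating=True,  # drop tiled/pattern backgrounds
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)

    # css_images sits alongside images/videos/audios/tables in result.media
    css_images = result.media.get("css_images", [])

    # Example post-processing: highest-scoring backgrounds first
    for img in sorted(css_images, key=lambda i: i["score"], reverse=True):
        print(f"{img['score']:>2}  {img['src']}  ({img['desc']})")


asyncio.run(main())
```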
 crawl4ai/async_configs.py                 |  12 ++
 crawl4ai/async_crawler_strategy.py        |  27 +++-
 crawl4ai/async_webcrawler.py              |   7 +-
 crawl4ai/content_scraping_strategy.py     | 110 +++++++++++++-
 .../js_snippet/extract_css_backgrounds.js | 138 ++++++++++++++++++
 crawl4ai/models.py                        |   2 +
 6 files changed, 290 insertions(+), 6 deletions(-)
 create mode 100644 crawl4ai/js_snippet/extract_css_backgrounds.js

diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index 10cc48d08..316fb31c5 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1228,6 +1228,12 @@ def __init__(
         table_extraction: TableExtractionStrategy = None,
         exclude_external_images: bool = False,
         exclude_all_images: bool = False,
+        # CSS Background Images Parameters
+        extract_css_images: bool = False,
+        css_image_min_width: int = 100,
+        css_image_min_height: int = 100,
+        css_image_score_threshold: int = 2,
+        css_exclude_repeating: bool = True,
         # Link and Domain Handling Parameters
         exclude_social_media_domains: list = None,
         exclude_external_links: bool = False,
@@ -1343,6 +1349,12 @@ def __init__(
         self.exclude_external_images = exclude_external_images
         self.exclude_all_images = exclude_all_images
         self.table_score_threshold = table_score_threshold
+        # CSS Background Images Parameters
+        self.extract_css_images = extract_css_images
+        self.css_image_min_width = css_image_min_width
+        self.css_image_min_height = css_image_min_height
+        self.css_image_score_threshold = css_image_score_threshold
+        self.css_exclude_repeating = css_exclude_repeating

         # Table extraction strategy (default to DefaultTableExtraction if not specified)
         if table_extraction is None:
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 2850b36a6..2f1222afa 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -957,7 +957,7 @@ async def handle_request_failed_capture(request):
                 # Handle comma-separated selectors by splitting them
                 selectors = [s.strip() for s in config.css_selector.split(',')]
                 html_parts = []
-
+
                 for selector in selectors:
                     try:
                         content = await self.adapter.evaluate(page,
@@ -968,13 +968,33 @@ async def handle_request_failed_capture(request):
                             html_parts.append(content)
                     except Error as e:
                         print(f"Warning: Could not get content for selector '{selector}': {str(e)}")
-
+
                 # Wrap in a div to create a valid HTML structure
-                html = f"<div>\n" + "\n".join(html_parts) + "\n</div>"
+                html = f"<div>\n" + "\n".join(html_parts) + "\n</div>"
             except Error as e:
                 raise RuntimeError(f"Failed to extract HTML content: {str(e)}")
         else:
             html = await page.content()
+
+        # Extract CSS background images if enabled
+        css_images_data = None
+        if config.extract_css_images:
+            try:
+                js_script = load_js_script("extract_css_backgrounds")
+                result = await self.adapter.evaluate(page, js_script)
+                css_images_data = result.get("css_images", []) if result else []
+                if self.logger and config.verbose:
+                    self.logger.info(
+                        message=f"Extracted {len(css_images_data)} CSS background images",
+                        tag="CSS_IMAGES",
+                    )
+            except Exception as e:
+                if self.logger:
+                    self.logger.warning(
+                        message=f"Failed to extract CSS background images: {str(e)}",
+                        tag="CSS_IMAGES",
+                    )
+                css_images_data = None

         # # Get final HTML content
         # html = await page.content()
@@ -1047,6 +1067,7 @@ async def get_delayed_content(delay: float = 5.0) -> str:
             # Include captured data if enabled
             network_requests=captured_requests if config.capture_network_requests else None,
             console_messages=captured_console if config.capture_console_messages else None,
+            css_images_data=css_images_data,
         )

     except Exception as e:
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 4dc52adc1..d8d022855 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -3,7 +3,7 @@
 import sys
 import time
 from pathlib import Path
-from typing import Optional, List
+from typing import Optional, List, Dict
 import json
 import asyncio

@@ -342,6 +342,7 @@ async def arun(
                 screenshot_data = async_response.screenshot
                 pdf_data = async_response.pdf_data
                 js_execution_result = async_response.js_execution_result
+                css_images_data = async_response.css_images_data

                 t2 = time.perf_counter()
                 self.logger.url_status(
@@ -366,6 +367,7 @@ async def arun(
                     is_raw_html=True if url.startswith("raw:") else False,
                     redirected_url=async_response.redirected_url,
                     original_scheme=urlparse(url).scheme,
+                    css_images_data=css_images_data,
                     **kwargs,
                 )

@@ -441,6 +443,7 @@ async def aprocess_html(
         screenshot_data: str,
         pdf_data: str,
         verbose: bool,
+        css_images_data: List[Dict] = None,
         **kwargs,
     ) -> CrawlResult:
         """
@@ -480,7 +483,7 @@ async def aprocess_html(
         # Scraping Strategy Execution  #
         ################################
         result: ScrapingResult = scraping_strategy.scrap(
-            url, html, **params)
+            url, html, css_images_data=css_images_data, **params)

         if result is None:
             raise ValueError(
diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index e915ff5bf..89f93457a 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -340,6 +340,18 @@ def _process_element(
             except Exception as e:
                 self._log("error", f"Error processing image: {str(e)}", "SCRAPE")

+            # Process CSS background images (if enabled)
+            if kwargs.get("extract_css_images", False):
+                try:
+                    css_images_data = kwargs.get("css_images_data")
+                    processed_css_images = self.process_css_background_images(
+                        css_images_data, url, **kwargs
+                    )
+                    if processed_css_images:
+                        media["css_images"].extend(processed_css_images)
+                except Exception as e:
+                    self._log("error", f"Error processing CSS images: {str(e)}", "SCRAPE")
+
             # Process videos and audios
             for media_type in ["video", "audio"]:
                 for elem in element.xpath(f".//{media_type}"):
@@ -514,6 +526,102 @@ def add_variant(src: str, width: Optional[str] = None):

         return image_variants if image_variants else None

+    def process_css_background_images(
+        self, css_images_data: List[Dict], url: str, **kwargs
+    ) -> Optional[List[Dict]]:
+        """
+        Process CSS background images extracted in the browser.
+
+        Args:
+            css_images_data: Raw data from the JavaScript extraction
+            url: Page URL for resolving relative URLs
+            **kwargs: Configuration options
+
+        Returns:
+            List of MediaItem dictionaries, or None if no images
+        """
+        from urllib.parse import urljoin
+
+        if not css_images_data:
+            return None
+
+        processed_images = []
+        min_width = kwargs.get("css_image_min_width", 100)
+        min_height = kwargs.get("css_image_min_height", 100)
+        exclude_repeating = kwargs.get("css_exclude_repeating", True)
+        score_threshold = kwargs.get("css_image_score_threshold", 2)
+
+        for img_data in css_images_data:
+            # Filter by element size
+            computed_width = img_data.get("computed_width", 0)
+            computed_height = img_data.get("computed_height", 0)
+
+            if computed_width < min_width or computed_height < min_height:
+                continue
+
+            # Filter repeating patterns
+            if exclude_repeating and img_data.get("is_repeating", False):
+                continue
+
+            # Resolve URL
+            src = img_data["src"]
+            if not src.startswith(("http://", "https://", "data:")):
+                src = urljoin(url, src)

+            # Skip data URLs
+            if src.startswith("data:"):
+                continue
+
+            # Calculate score based on element properties
+            score = 0
+
+            # Larger elements get higher scores
+            if computed_width > 300:
+                score += 1
+            if computed_height > 300:
+                score += 1
+
+            # Non-repeating backgrounds are more likely to be content images
+            if not img_data.get("is_repeating", False):
+                score += 1
+
+            # Detect image format
+            image_formats = {"jpg", "jpeg", "png", "webp", "avif", "gif", "svg"}
+            detected_format = None
+            src_lower = src.lower()
+            for fmt in image_formats:
+                if fmt in src_lower:
+                    detected_format = fmt
+                    score += 1
+                    break
+
+            # Apply score threshold (keep only if strictly above it)
+            if score <= score_threshold:
+                continue
+
+            # Build selector string for description
+            element_tag = img_data.get("element_tag", "element")
+            selector = img_data.get("selector", "")
+            desc = f"CSS background image on {element_tag}"
+            if selector:
+                desc += f" ({selector})"
+
+            # Create MediaItem
+            media_item = {
+                "src": src,
+                "alt": f"Background of {element_tag}",
+                "desc": desc,
+                "score": score,
+                "type": "css_image",
+                "group_id": 0,
+                "format": detected_format,
+                "width": computed_width,
+            }
+
+            processed_images.append(media_item)
+
+        return processed_images if processed_images else None
+
     def remove_empty_elements_fast(self, root, word_count_threshold=5):
         """
         Remove elements that fall below the desired word threshold in a single pass from the bottom up.
@@ -730,7 +838,7 @@ def _scrap(
                 form.getparent().remove(form)

         # Process content
-        media = {"images": [], "videos": [], "audios": [], "tables": []}
+        media = {"images": [], "videos": [], "audios": [], "tables": [], "css_images": []}
         internal_links_dict = {}
         external_links_dict = {}

diff --git a/crawl4ai/js_snippet/extract_css_backgrounds.js b/crawl4ai/js_snippet/extract_css_backgrounds.js
new file mode 100644
index 000000000..3554a78d2
--- /dev/null
+++ b/crawl4ai/js_snippet/extract_css_backgrounds.js
@@ -0,0 +1,138 @@
+/**
+ * Extract CSS background images from all elements on the page.
+ * This script is executed by crawl4ai to extract CSS background images.
+ *
+ * Returns a JSON object with a css_images array containing:
+ * - src: Image URL
+ * - selector: CSS selector for the element
+ * - element_tag: Tag name of the element
+ * - element_class: Class names of the element
+ * - element_id: ID of the element
+ * - style_property: Which CSS property carried the image
+ * - computed_width: Element width in pixels
+ * - computed_height: Element height in pixels
+ * - is_repeating: Whether the background repeats
+ * - background_position: CSS background-position value
+ * - background_size: CSS background-size value
+ */
+
+(function() {
+  const results = [];
+  const allElements = document.querySelectorAll('*');
+  const processedUrls = new Set();
+
+  /**
+   * Generate a unique CSS selector for an element
+   */
+  function getElementSelector(element) {
+    if (element.id) {
+      return '#' + element.id;
+    }
+
+    let selector = element.tagName.toLowerCase();
+
+    if (element.className && typeof element.className === 'string') {
+      const classes = element.className.trim().split(/\s+/).filter(c => c);
+      if (classes.length > 0) {
+        selector += '.' + classes.join('.');
+      }
+    }
+
+    // Add nth-child if needed for uniqueness
+    const parent = element.parentElement;
+    if (parent) {
+      const siblings = Array.from(parent.children).filter(
+        child => child.tagName === element.tagName
+      );
+      if (siblings.length > 1) {
+        const index = siblings.indexOf(element) + 1;
+        selector += `:nth-child(${index})`;
+      }
+    }
+
+    return selector;
+  }
+
+  /**
+   * Check if element is visible and has meaningful dimensions
+   */
+  function isElementVisible(element) {
+    const rect = element.getBoundingClientRect();
+    const style = window.getComputedStyle(element);
+
+    // Check if element has display: none or visibility: hidden
+    if (style.display === 'none' || style.visibility === 'hidden') {
+      return false;
+    }
+
+    // Check if element has meaningful dimensions
+    return rect.width > 0 && rect.height > 0;
+  }
+
+  allElements.forEach(element => {
+    // Skip invisible elements
+    if (!isElementVisible(element)) {
+      return;
+    }
+
+    const style = window.getComputedStyle(element);
+    const backgroundImage = style.backgroundImage;
+
+    // Skip if no background image or if it's 'none'
+    if (!backgroundImage || backgroundImage === 'none' || backgroundImage === 'initial') {
+      return;
+    }
+
+    // Parse url() from the background-image property.
+    // Handles: url(...), url("..."), url('...'), and multiple backgrounds
+    const urlPattern = /url\(['"]?([^'")\s]+)['"]?\)/g;
+    let match;
+    const urls = [];
+
+    while ((match = urlPattern.exec(backgroundImage)) !== null) {
+      urls.push(match[1]);
+    }
+
+    if (urls.length === 0) {
+      return;
+    }
+
+    // Process each URL
+    urls.forEach(url => {
+      // Skip data URLs
+      if (url.startsWith('data:')) {
+        return;
+      }
+
+      // Skip already processed URLs (deduplication)
+      if (processedUrls.has(url)) {
+        return;
+      }
+      processedUrls.add(url);
+
+      // Get element dimensions
+      const rect = element.getBoundingClientRect();
+
+      // Create result object; is_repeating matches the field name read by
+      // process_css_background_images() on the Python side
+      const result = {
+        src: url,
+        selector: getElementSelector(element),
+        element_tag: element.tagName.toLowerCase(),
+        element_class: typeof element.className === 'string' ? element.className : '',
+        element_id: element.id || '',
+        style_property: 'background-image',
+        computed_width: Math.round(rect.width),
+        computed_height: Math.round(rect.height),
+        is_repeating: style.backgroundRepeat !== 'no-repeat',
+        background_position: style.backgroundPosition,
+        background_size: style.backgroundSize
+      };
+
+      results.push(result);
+    });
+  });
+
+  return {
+    css_images: results
+  };
+})();
diff --git a/crawl4ai/models.py b/crawl4ai/models.py
index e46bb7fa8..812c0d261 100644
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -330,6 +330,7 @@ class AsyncCrawlResponse(BaseModel):
     redirected_url: Optional[str] = None
     network_requests: Optional[List[Dict[str, Any]]] = None
     console_messages: Optional[List[Dict[str, Any]]] = None
+    css_images_data: Optional[List[Dict[str, Any]]] = None  # CSS background images extracted from browser

     model_config = ConfigDict(arbitrary_types_allowed=True)

@@ -370,6 +371,7 @@ class Media(BaseModel):
         MediaItem
     ] = []  # Using MediaItem model for now, can be extended with Audio model if needed
     tables: List[Dict] = []  # Table data extracted from HTML tables
+    css_images: List[MediaItem] = []  # CSS background images extracted from styles


 class Links(BaseModel):