diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index 10cc48d08..316fb31c5 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1228,6 +1228,12 @@ def __init__(
table_extraction: TableExtractionStrategy = None,
exclude_external_images: bool = False,
exclude_all_images: bool = False,
+ # CSS Background Images Parameters
+ extract_css_images: bool = False,
+ css_image_min_width: int = 100,
+ css_image_min_height: int = 100,
+ css_image_score_threshold: int = 2,
+ css_exclude_repeating: bool = True,
# Link and Domain Handling Parameters
exclude_social_media_domains: list = None,
exclude_external_links: bool = False,
@@ -1343,6 +1349,12 @@ def __init__(
self.exclude_external_images = exclude_external_images
self.exclude_all_images = exclude_all_images
self.table_score_threshold = table_score_threshold
+ # CSS Background Images Parameters
+ self.extract_css_images = extract_css_images
+ self.css_image_min_width = css_image_min_width
+ self.css_image_min_height = css_image_min_height
+ self.css_image_score_threshold = css_image_score_threshold
+ self.css_exclude_repeating = css_exclude_repeating
# Table extraction strategy (default to DefaultTableExtraction if not specified)
if table_extraction is None:
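For reviewers: a minimal opt-in sketch using the existing AsyncWebCrawler /
CrawlerRunConfig public API (the URL and parameter values are illustrative,
not part of this diff):

    import asyncio
    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

    async def main():
        config = CrawlerRunConfig(
            extract_css_images=True,      # feature is off by default
            css_image_min_width=200,      # raise the 100 px defaults to
            css_image_min_height=200,     # skip small decorative elements
            css_image_score_threshold=2,  # keep images scoring above 2
            css_exclude_repeating=True,   # drop tiled/pattern backgrounds
        )
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://example.com", config=config)

    asyncio.run(main())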
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 2850b36a6..2f1222afa 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -957,7 +957,7 @@ async def handle_request_failed_capture(request):
# Handle comma-separated selectors by splitting them
selectors = [s.strip() for s in config.css_selector.split(',')]
html_parts = []
-
+
for selector in selectors:
try:
content = await self.adapter.evaluate(page,
@@ -968,13 +968,33 @@ async def handle_request_failed_capture(request):
html_parts.append(content)
except Error as e:
print(f"Warning: Could not get content for selector '{selector}': {str(e)}")
-
+
# Wrap in a div to create a valid HTML structure
-            html = f"<div>\n" + "\n".join(html_parts) + "\n</div>"
+            html = f"<div>\n" + "\n".join(html_parts) + "\n</div>"
except Error as e:
raise RuntimeError(f"Failed to extract HTML content: {str(e)}")
else:
html = await page.content()
+
+ # Extract CSS background images if enabled
+ css_images_data = None
+ if config.extract_css_images:
+ try:
+ js_script = load_js_script("extract_css_backgrounds")
+ result = await self.adapter.evaluate(page, js_script)
+ css_images_data = result.get("css_images", []) if result else []
+ if self.logger and config.verbose:
+ self.logger.info(
+ message=f"Extracted {len(css_images_data)} CSS background images",
+ tag="CSS_IMAGES",
+ )
+ except Exception as e:
+ if self.logger:
+ self.logger.warning(
+ message=f"Failed to extract CSS background images: {str(e)}",
+ tag="CSS_IMAGES",
+ )
+ css_images_data = None
# # Get final HTML content
# html = await page.content()
@@ -1047,6 +1067,7 @@ async def get_delayed_content(delay: float = 5.0) -> str:
# Include captured data if enabled
network_requests=captured_requests if config.capture_network_requests else None,
console_messages=captured_console if config.capture_console_messages else None,
+ css_images_data=css_images_data,
)
except Exception as e:
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 4dc52adc1..d8d022855 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -3,7 +3,7 @@
import sys
import time
from pathlib import Path
-from typing import Optional, List
+from typing import Optional, List, Dict
import json
import asyncio
@@ -342,6 +342,7 @@ async def arun(
screenshot_data = async_response.screenshot
pdf_data = async_response.pdf_data
js_execution_result = async_response.js_execution_result
+ css_images_data = async_response.css_images_data
t2 = time.perf_counter()
self.logger.url_status(
@@ -366,6 +367,7 @@ async def arun(
is_raw_html=True if url.startswith("raw:") else False,
redirected_url=async_response.redirected_url,
original_scheme=urlparse(url).scheme,
+ css_images_data=css_images_data,
**kwargs,
)
@@ -441,6 +443,7 @@ async def aprocess_html(
screenshot_data: str,
pdf_data: str,
verbose: bool,
+    css_images_data: Optional[List[Dict]] = None,
**kwargs,
) -> CrawlResult:
"""
@@ -480,7 +483,7 @@ async def aprocess_html(
# Scraping Strategy Execution #
################################
result: ScrapingResult = scraping_strategy.scrap(
- url, html, **params)
+ url, html, css_images_data=css_images_data, **params)
if result is None:
raise ValueError(
diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index e915ff5bf..89f93457a 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -340,6 +340,18 @@ def _process_element(
except Exception as e:
self._log("error", f"Error processing image: {str(e)}", "SCRAPE")
+ # Process CSS background images (if enabled)
+ if kwargs.get("extract_css_images", False):
+ try:
+ css_images_data = kwargs.get("css_images_data")
+ processed_css_images = self.process_css_background_images(
+ css_images_data, url, **kwargs
+ )
+ if processed_css_images:
+ media["css_images"].extend(processed_css_images)
+ except Exception as e:
+ self._log("error", f"Error processing CSS images: {str(e)}", "SCRAPE")
+
# Process videos and audios
for media_type in ["video", "audio"]:
for elem in element.xpath(f".//{media_type}"):
@@ -514,6 +526,102 @@ def add_variant(src: str, width: Optional[str] = None):
return image_variants if image_variants else None
+ def process_css_background_images(
+        self, css_images_data: Optional[List[Dict]], url: str, **kwargs
+ ) -> Optional[List[Dict]]:
+ """
+ Process CSS background images extracted from browser.
+
+ Args:
+ css_images_data: Raw data from JavaScript extraction
+ url: Page URL for resolving relative URLs
+ **kwargs: Configuration options
+
+ Returns:
+ List of MediaItem dictionaries or None if no images
+ """
+ from urllib.parse import urljoin
+
+ if not css_images_data:
+ return None
+
+ processed_images = []
+ min_width = kwargs.get("css_image_min_width", 100)
+ min_height = kwargs.get("css_image_min_height", 100)
+ exclude_repeating = kwargs.get("css_exclude_repeating", True)
+ score_threshold = kwargs.get("css_image_score_threshold", 2)
+
+ for img_data in css_images_data:
+ # Filter by element size
+ computed_width = img_data.get("computed_width", 0)
+ computed_height = img_data.get("computed_height", 0)
+
+ if computed_width < min_width or computed_height < min_height:
+ continue
+
+ # Filter repeating patterns
+        if exclude_repeating and img_data.get("is_repeated", False):
+ continue
+
+ # Resolve URL
+ src = img_data["src"]
+ if not src.startswith(("http://", "https://", "data:")):
+ src = urljoin(url, src)
+
+ # Skip data URLs
+ if src.startswith("data:"):
+ continue
+
+ # Calculate score based on element properties
+ score = 0
+
+ # Larger elements get higher scores
+ if computed_width > 300:
+ score += 1
+ if computed_height > 300:
+ score += 1
+
+ # Non-repeating backgrounds are more likely to be content images
+            if not img_data.get("is_repeated", False):
+ score += 1
+
+            # Detect image format (loose substring match, so query strings
+            # like "?format=webp" also count); tuple keeps precedence stable
+            image_formats = ("jpeg", "jpg", "png", "webp", "avif", "gif", "svg")
+ detected_format = None
+ src_lower = src.lower()
+ for fmt in image_formats:
+ if fmt in src_lower:
+ detected_format = fmt
+ score += 1
+ break
+
+            # Keep only images scoring strictly above the threshold
+            if score <= score_threshold:
+ continue
+
+ # Build selector string for description
+ element_tag = img_data.get("element_tag", "element")
+ selector = img_data.get("selector", "")
+ desc = f"CSS background image on {element_tag}"
+ if selector:
+ desc += f" ({selector})"
+
+ # Create MediaItem
+ media_item = {
+ "src": src,
+ "alt": f"Background of {element_tag}",
+ "desc": desc,
+ "score": score,
+ "type": "css_image",
+ "group_id": 0,
+ "format": detected_format,
+ "width": computed_width,
+ }
+
+ processed_images.append(media_item)
+
+ return processed_images if processed_images else None
+
def remove_empty_elements_fast(self, root, word_count_threshold=5):
"""
Remove elements that fall below the desired word threshold in a single pass from the bottom up.
@@ -730,7 +838,7 @@ def _scrap(
form.getparent().remove(form)
# Process content
- media = {"images": [], "videos": [], "audios": [], "tables": []}
+ media = {"images": [], "videos": [], "audios": [], "tables": [], "css_images": []}
internal_links_dict = {}
external_links_dict = {}
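To make the scoring heuristic above concrete, a worked pass through
process_css_background_images() with the default filters (min size 100x100,
threshold 2; URLs illustrative):

    # 1280x480 non-repeating "https://site/hero.jpg":
    #   width > 300 (+1), height > 300 (+1), non-repeating (+1),
    #   "jpg" in URL (+1) -> score 4 > 2, kept
    # 250x250 non-repeating "https://site/icon.png":
    #   no size bonus, non-repeating (+1), "png" in URL (+1)
    #   -> score 2 <= 2, dropped (the threshold is strict)
    # 1200x80 banner of any kind:
    #   height 80 < css_image_min_height -> filtered before scoring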
diff --git a/crawl4ai/js_snippet/extract_css_backgrounds.js b/crawl4ai/js_snippet/extract_css_backgrounds.js
new file mode 100644
index 000000000..3554a78d2
--- /dev/null
+++ b/crawl4ai/js_snippet/extract_css_backgrounds.js
@@ -0,0 +1,138 @@
+/**
+ * Extract CSS background images from all elements on the page.
+ * This script is executed by crawl4ai to extract CSS background images.
+ *
+ * Returns a JSON object with css_images array containing:
+ * - src: Image URL
+ * - selector: CSS selector for the element
+ * - element_tag: Tag name of the element
+ * - element_class: Class names of the element
+ * - element_id: ID of the element
+ * - style_property: Which CSS property had the image
+ * - computed_width: Element width in pixels
+ * - computed_height: Element height in pixels
+ * - is_repeated: Whether background repeats
+ * - background_position: CSS background-position value
+ * - background_size: CSS background-size value
+ */
+
+(function() {
+ const results = [];
+ const allElements = document.querySelectorAll('*');
+ const processedUrls = new Set();
+
+ /**
+ * Generate a unique CSS selector for an element
+ */
+ function getElementSelector(element) {
+ if (element.id) {
+ return '#' + element.id;
+ }
+
+ let selector = element.tagName.toLowerCase();
+
+ if (element.className && typeof element.className === 'string') {
+ const classes = element.className.trim().split(/\s+/).filter(c => c);
+ if (classes.length > 0) {
+ selector += '.' + classes.join('.');
+ }
+ }
+
+ // Add nth-child if needed for uniqueness
+ const parent = element.parentElement;
+ if (parent) {
+ const siblings = Array.from(parent.children).filter(
+ child => child.tagName === element.tagName
+ );
+ if (siblings.length > 1) {
+ const index = siblings.indexOf(element) + 1;
+ selector += `:nth-child(${index})`;
+ }
+ }
+
+ return selector;
+ }
+
+ /**
+ * Check if element is visible and has meaningful dimensions
+ */
+ function isElementVisible(element) {
+ const rect = element.getBoundingClientRect();
+ const style = window.getComputedStyle(element);
+
+ // Check if element has display: none or visibility: hidden
+ if (style.display === 'none' || style.visibility === 'hidden') {
+ return false;
+ }
+
+ // Check if element has meaningful dimensions
+ return rect.width > 0 && rect.height > 0;
+ }
+
+ allElements.forEach(element => {
+ // Skip invisible elements
+ if (!isElementVisible(element)) {
+ return;
+ }
+
+ const style = window.getComputedStyle(element);
+ const backgroundImage = style.backgroundImage;
+
+ // Skip if no background image or if it's 'none'
+ if (!backgroundImage || backgroundImage === 'none' || backgroundImage === 'initial') {
+ return;
+ }
+
+ // Parse url() from background-image property
+ // Handles: url(...), url("..."), url('...'), and multiple backgrounds
+ const urlPattern = /url\(['"]?([^'")\s]+)['"]?\)/g;
+ let match;
+ const urls = [];
+
+ while ((match = urlPattern.exec(backgroundImage)) !== null) {
+ urls.push(match[1]);
+ }
+
+ if (urls.length === 0) {
+ return;
+ }
+
+ // Process each URL
+ urls.forEach(url => {
+ // Skip data URLs
+ if (url.startsWith('data:')) {
+ return;
+ }
+
+ // Skip already processed URLs (deduplication)
+ if (processedUrls.has(url)) {
+ return;
+ }
+ processedUrls.add(url);
+
+ // Get element dimensions
+ const rect = element.getBoundingClientRect();
+
+ // Create result object
+ const result = {
+ src: url,
+ selector: getElementSelector(element),
+ element_tag: element.tagName.toLowerCase(),
+ element_class: element.className || '',
+ element_id: element.id || '',
+ style_property: 'background-image',
+ computed_width: Math.round(rect.width),
+ computed_height: Math.round(rect.height),
+ is_repeated: style.backgroundRepeat !== 'no-repeat',
+ background_position: style.backgroundPosition,
+ background_size: style.backgroundSize
+ };
+
+ results.push(result);
+ });
+ });
+
+ return {
+ css_images: results
+ };
+})();
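For reference, one entry of the returned css_images array, which is exactly
the shape process_css_background_images() consumes on the Python side (all
values illustrative):

    sample_entry = {
        "src": "https://example.com/hero.jpg",
        "selector": "div.hero:nth-child(2)",
        "element_tag": "div",
        "element_class": "hero",
        "element_id": "",
        "style_property": "background-image",
        "computed_width": 1280,
        "computed_height": 480,
        "is_repeated": False,
        "background_position": "50% 50%",
        "background_size": "cover",
    }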
diff --git a/crawl4ai/models.py b/crawl4ai/models.py
index e46bb7fa8..812c0d261 100644
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -330,6 +330,7 @@ class AsyncCrawlResponse(BaseModel):
redirected_url: Optional[str] = None
network_requests: Optional[List[Dict[str, Any]]] = None
console_messages: Optional[List[Dict[str, Any]]] = None
+ css_images_data: Optional[List[Dict[str, Any]]] = None # CSS background images extracted from browser
model_config = ConfigDict(arbitrary_types_allowed=True)
@@ -370,6 +371,7 @@ class Media(BaseModel):
MediaItem
] = [] # Using MediaItem model for now, can be extended with Audio model if needed
tables: List[Dict] = [] # Table data extracted from HTML tables
+ css_images: List[MediaItem] = [] # CSS background images extracted from styles
class Links(BaseModel):
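Downstream, the new field can be read like this (a sketch: _scrap() builds
media as a plain dict, while the Media model mirrors the same shape, so
attribute access may also work depending on the code path):

    def list_css_backgrounds(result) -> None:
        # Each entry is a MediaItem-shaped dict produced by this PR
        for item in result.media.get("css_images", []):
            print(item["src"], item["score"], item["desc"])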