From 8d619f57065c05619a14ad8da9f94f334500c17f Mon Sep 17 00:00:00 2001 From: Yurii Chukhlib Date: Sat, 17 Jan 2026 11:16:24 +0100 Subject: [PATCH] fix: Handle markdown-wrapped JSON in LLM schema generation Fixes #1663 Claude Sonnet and other LLMs sometimes return valid JSON wrapped in markdown code blocks (```json...```), causing JSONDecodeError in JsonCssExtractionStrategy.generate_schema(). Added pre-processing to strip markdown code block markers before JSON parsing, handling both ```json and ``` formats. Co-Authored-By: Claude --- crawl4ai/extraction_strategy.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 7033e3800..8ecf73446 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -1380,7 +1380,15 @@ def generate_schema( ) # Extract and return schema - return json.loads(response.choices[0].message.content) + # Clean markdown code blocks that LLMs sometimes wrap JSON in + content = response.choices[0].message.content + # Remove markdown code block markers if present + if "```json" in content: + content = content.replace("```json\n", "").replace("\n```", "") + elif "```" in content: + content = content.replace("```\n", "").replace("\n```", "") + content = content.strip() + return json.loads(content) except Exception as e: raise Exception(f"Failed to generate schema: {str(e)}")