From c07799df75f3e08214783a7aef36fb7cbb953dd8 Mon Sep 17 00:00:00 2001 From: Gal Shubeli Date: Tue, 16 Dec 2025 16:08:26 +0200 Subject: [PATCH 01/15] sql-syntax-feedback --- api/agents/__init__.py | 2 + api/agents/analysis_agent.py | 25 ++- api/agents/healer_agent.py | 288 +++++++++++++++++++++++++++++++++++ api/agents/utils.py | 37 ++++- api/core/text2sql.py | 90 +++++++---- 5 files changed, 408 insertions(+), 34 deletions(-) create mode 100644 api/agents/healer_agent.py diff --git a/api/agents/__init__.py b/api/agents/__init__.py index efd63f4e..a15e120e 100644 --- a/api/agents/__init__.py +++ b/api/agents/__init__.py @@ -4,6 +4,7 @@ from .relevancy_agent import RelevancyAgent from .follow_up_agent import FollowUpAgent from .response_formatter_agent import ResponseFormatterAgent +from .healer_agent import HealerAgent from .utils import parse_response __all__ = [ @@ -11,5 +12,6 @@ "RelevancyAgent", "FollowUpAgent", "ResponseFormatterAgent", + "HealerAgent", "parse_response" ] diff --git a/api/agents/analysis_agent.py b/api/agents/analysis_agent.py index ccd7c98a..ff2f76e9 100644 --- a/api/agents/analysis_agent.py +++ b/api/agents/analysis_agent.py @@ -18,18 +18,25 @@ def get_analysis( # pylint: disable=too-many-arguments, too-many-positional-arg db_description: str, instructions: str | None = None, memory_context: str | None = None, + database_type: str | None = None, ) -> dict: """Get analysis of user query against database schema.""" formatted_schema = self._format_schema(combined_tables) + # Add system message with database type if not already present + if not self.messages or self.messages[0].get("role") != "system": + self.messages.insert(0, { + "role": "system", + "content": f"You are a SQL expert. TARGET DATABASE: {database_type.upper() if database_type else 'UNKNOWN'}" + }) + prompt = self._build_prompt( - user_query, formatted_schema, db_description, instructions, memory_context + user_query, formatted_schema, db_description, instructions, memory_context, database_type ) self.messages.append({"role": "user", "content": prompt}) completion_result = completion( model=Config.COMPLETION_MODEL, messages=self.messages, temperature=0, - top_p=1, ) response = completion_result.choices[0].message.content @@ -158,7 +165,8 @@ def _format_foreign_keys(self, foreign_keys: dict) -> str: def _build_prompt( # pylint: disable=too-many-arguments, too-many-positional-arguments self, user_input: str, formatted_schema: str, - db_description: str, instructions, memory_context: str | None = None + db_description: str, instructions, memory_context: str | None = None, + database_type: str | None = None, ) -> str: """ Build the prompt for Claude to analyze the query. @@ -169,6 +177,7 @@ def _build_prompt( # pylint: disable=too-many-arguments, too-many-positional-a db_description: Description of the database instructions: Custom instructions for the query memory_context: User and database memory context from previous interactions + database_type: Target database type (sqlite, postgresql, mysql, etc.) Returns: The formatted prompt for Claude @@ -196,6 +205,8 @@ def _build_prompt( # pylint: disable=too-many-arguments, too-many-positional-a prompt = f""" You must strictly follow the instructions below. Deviations will result in a penalty to your confidence score. + TARGET DATABASE: {database_type.upper() if database_type else 'UNKNOWN'} + MANDATORY RULES: - Always explain if you cannot fully follow the instructions. - Always reduce the confidence score if instructions cannot be fully applied. @@ -203,6 +214,10 @@ def _build_prompt( # pylint: disable=too-many-arguments, too-many-positional-a - Respond ONLY in strict JSON format, without extra text. - If the query relates to a previous question, you MUST take into account the previous question and its answer, and answer based on the context and information provided so far. - CRITICAL: When table or column names contain special characters (especially dashes/hyphens like '-'), you MUST wrap them in double quotes for PostgreSQL (e.g., "table-name") or backticks for MySQL (e.g., `table-name`). This is NON-NEGOTIABLE. + - CRITICAL NULL HANDLING: When using calculated columns (divisions, ratios, arithmetic) with ORDER BY or LIMIT, you MUST filter out NULL values. Add "WHERE calculated_expression IS NOT NULL" or include the NULL check in your WHERE clause. NULL values sort first in ascending order and can produce incorrect results. + - CRITICAL SELECT CLAUSE: Only return columns explicitly requested in the question. If the question asks for "the highest rate" or "the lowest value", return ONLY that calculated value, not additional columns like names or IDs unless specifically asked. Use aggregate functions (MAX, MIN, AVG) when appropriate for "highest", "lowest", "average" queries instead of ORDER BY + LIMIT. + - CRITICAL VALUE MATCHING: When multiple columns could answer a question (e.g., "continuation schools"), prefer the column whose allowed values list contains an EXACT or CLOSEST string match to the question term. For example, if the question mentions "continuation schools", prefer a column with value "Continuation School" over "Continuation High Schools". Check the column descriptions for "Optional values" lists and match question terminology to those exact value strings. + - CRITICAL SINGLE SQL STATEMENT: You MUST generate exactly ONE SQL statement that answers all parts of the question. NEVER generate multiple separate SELECT statements. If a question asks multiple things (e.g., "How many X? List Y"), combine them into a single query using subqueries, JOINs, multiple columns in SELECT, or aggregate functions. Multiple SQL statements separated by semicolons are FORBIDDEN and will fail execution. If the user is asking a follow-up or continuing question, use the conversation history and previous answers to resolve references, context, or ambiguities. Always base your analysis on the cumulative context, not just the current question. @@ -299,6 +314,10 @@ def _build_prompt( # pylint: disable=too-many-arguments, too-many-positional-a 12. Learn from successful query patterns in memory context and avoid failed approaches. 13. For personal queries, FIRST check memory context for user identification. If user identity is found in memory context (user name, previous personal queries, etc.), the query IS translatable. 14. CRITICAL PERSONALIZATION CHECK: If missing user identification/personalization is a significant or primary component of the query (e.g., "show my orders", "my account balance", "my recent purchases", "how many employees I have", "products I own") AND no user identification is available in memory context or schema, set "is_sql_translatable" to false. However, if memory context contains user identification (like user name or previous successful personal queries), then personal queries ARE translatable even if they are the primary component of the query. + 15. CRITICAL: When generating queries with calculated columns (division, multiplication, etc.) that are used in ORDER BY or compared with LIMIT, ALWAYS add NULL filtering. For example: "WHERE (column1 / column2) IS NOT NULL" before ORDER BY. This prevents NULL values (from NULL numerators or denominators) from appearing in results. + 16. SELECT CLAUSE PRECISION: Only include columns explicitly requested in the question. If a question asks "What is the highest rate?" return ONLY the rate value, not additional columns. Questions asking for "the highest/lowest/average X" should prefer aggregate functions (MAX, MIN, AVG) over ORDER BY + LIMIT, as aggregates are more concise and automatically handle what to return. + 17. VALUE-BASED COLUMN SELECTION: When choosing between similar columns (e.g., "School Type" vs "Educational Option Type"), examine the "Optional values" lists in column descriptions. Prefer the column where a value EXACTLY or MOST CLOSELY matches the terminology in the question. For example, "continuation schools" should map to a column with value "Continuation School" rather than "Continuation High Schools". This string matching takes priority over column name similarity. + 18. NULL HANDLING IN CALCULATIONS: When a query involves calculated expressions (like col1/col2) used with ORDER BY, filtering (WHERE), or LIMIT, ensure NULL values are explicitly filtered out. Use "AND (expression) IS NOT NULL" in the WHERE clause. This is especially important for division operations where either numerator or denominator can be NULL. Again: OUTPUT ONLY VALID JSON. No explanations outside the JSON block. """ # pylint: disable=line-too-long return prompt diff --git a/api/agents/healer_agent.py b/api/agents/healer_agent.py new file mode 100644 index 00000000..5759faee --- /dev/null +++ b/api/agents/healer_agent.py @@ -0,0 +1,288 @@ +""" +HealerAgent - Specialized agent for fixing SQL syntax errors. + +This agent focuses solely on correcting SQL queries that failed execution, +without requiring full graph context. It uses the error message and the +failed query to generate a corrected version. +""" + +import json +import re +from typing import Dict, Optional +from litellm import completion +from .utils import parse_response +from api.config import Config + + + +class HealerAgent: + """Agent specialized in fixing SQL syntax errors.""" + + def __init__(self): + """ + Initialize the HealerAgent. + + """ + + @staticmethod + def validate_sql_syntax(sql_query: str) -> dict: + """ + Validate SQL query for basic syntax errors. + Similar to CypherValidator in the text-to-cypher PR. + + Args: + sql_query: The SQL query to validate + + Returns: + dict with 'is_valid', 'errors', and 'warnings' keys + """ + errors = [] + warnings = [] + + query = sql_query.strip() + + # Check if query is empty + if not query: + errors.append("Query is empty") + return {"is_valid": False, "errors": errors, "warnings": warnings} + + # Check for basic SQL keywords + query_upper = query.upper() + has_sql_keywords = any( + kw in query_upper for kw in ["SELECT", "INSERT", "UPDATE", "DELETE", "WITH", "CREATE"] + ) + if not has_sql_keywords: + errors.append("Query does not contain valid SQL keywords") + + # Check for dangerous operations (for dev/test safety) + dangerous_patterns = [ + r'\bDROP\s+TABLE\b', r'\bTRUNCATE\b', r'\bDELETE\s+FROM\s+\w+\s*;?\s*$' + ] + for pattern in dangerous_patterns: + if re.search(pattern, query_upper): + warnings.append(f"Query contains potentially dangerous operation: {pattern}") + + # Check for balanced parentheses + paren_count = 0 + for char in query: + if char == '(': + paren_count += 1 + elif char == ')': + paren_count -= 1 + if paren_count < 0: + errors.append("Unbalanced parentheses in query") + break + if paren_count != 0: + errors.append("Unbalanced parentheses in query") + + # Check for SELECT queries have proper structure + if query_upper.startswith("SELECT") or "SELECT" in query_upper: + if "FROM" not in query_upper and "DUAL" not in query_upper: + warnings.append("SELECT query missing FROM clause") + + return { + "is_valid": len(errors) == 0, + "errors": errors, + "warnings": warnings + } + + def heal_query( + self, + failed_sql: str, + error_message: str, + db_description: str = "", + question: str = "", + database_type: str = "sqlite" + ) -> Dict[str, any]: + """ + Attempt to fix a failed SQL query using only the error message. + + Args: + failed_sql: The SQL query that failed + error_message: The error message from execution + db_description: Optional database description + question: Optional original question + database_type: Type of database (sqlite, postgresql, mysql, etc.) + + Returns: + Dict containing: + - sql_query: Fixed SQL query + - confidence: Confidence score + - explanation: Explanation of the fix + - changes_made: List of changes applied + """ + # Validate SQL syntax for additional error context + validation_result = self.validate_sql_syntax(failed_sql) + additional_context = "" + if validation_result["errors"]: + additional_context += f"\nSyntax errors: {', '.join(validation_result['errors'])}" + if validation_result["warnings"]: + additional_context += f"\nWarnings: {', '.join(validation_result['warnings'])}" + + # Enhance error message with validation context + enhanced_error = error_message + additional_context + + # Build focused prompt for SQL healing + prompt = self._build_healing_prompt( + failed_sql=failed_sql, + error_message=enhanced_error, + db_description=db_description, + question=question, + database_type=database_type + ) + + try: + # Call LLM for healing + response = completion( + model=Config.COMPLETION_MODEL, + messages=[{"role": "user", "content": prompt}], + temperature=0.1, # Low temperature for precision + max_tokens=2000 + ) + + content = response.choices[0].message.content + + # Parse the response + result = parse_response(content) + + # Validate the result has required fields + if not result.get("sql_query"): + return { + "sql_query": failed_sql, # Return original if healing failed + "confidence": 0.0, + "explanation": "Failed to parse healed SQL from response", + "changes_made": [], + "healing_failed": True + } + + return { + "sql_query": result.get("sql_query", ""), + "confidence": result.get("confidence", 50), + "explanation": result.get("explanation", ""), + "changes_made": result.get("changes_made", []), + "healing_failed": False + } + + except Exception as e: + return { + "sql_query": failed_sql, # Return original on error + "confidence": 0.0, + "explanation": f"Healing error: {str(e)}", + "changes_made": [], + "healing_failed": True + } + + def _build_healing_prompt( + self, + failed_sql: str, + error_message: str, + db_description: str, + question: str, + database_type: str + ) -> str: + """Build a focused prompt for SQL query healing.""" + + # Analyze error to provide targeted hints + error_hints = self._analyze_error(error_message, database_type) + + prompt = f"""You are a SQL query debugging expert. Your task is to fix a SQL query that failed execution. + +DATABASE TYPE: {database_type.upper()} + +FAILED SQL QUERY: +```sql +{failed_sql} +``` + +EXECUTION ERROR: +{error_message} + +{f"ORIGINAL QUESTION: {question}" if question else ""} + +{f"DATABASE INFO: {db_description[:500]}" if db_description else ""} + +COMMON ERROR PATTERNS: +{error_hints} + +YOUR TASK: +1. Identify the exact cause of the error +2. Fix ONLY what's broken - don't rewrite the entire query +3. Ensure the fix is compatible with {database_type.upper()} +4. Maintain the original query logic and intent + +CRITICAL RULES FOR {database_type.upper()}: +""" + + if database_type == "sqlite": + prompt += """ +- SQLite does NOT support EXTRACT() function - use strftime() instead + * EXTRACT(YEAR FROM date_col) → strftime('%Y', date_col) + * EXTRACT(MONTH FROM date_col) → strftime('%m', date_col) + * EXTRACT(DAY FROM date_col) → strftime('%d', date_col) +- SQLite column/table names are case-insensitive BUT must exist +- SQLite uses double quotes "column" for identifiers with special characters +- Use backticks `column` for compatibility +- No schema qualifiers (database.table.column) +""" + elif database_type == "postgresql": + prompt += """ +- PostgreSQL is case-sensitive - use double quotes for mixed-case identifiers +- EXTRACT() is supported: EXTRACT(YEAR FROM date_col) +- Column references must match exact case when quoted +""" + + prompt += """ +RESPONSE FORMAT (valid JSON only): +{ + "sql_query": "-- your fixed SQL query here", + "confidence": 85, + "explanation": "Brief explanation of what was fixed", + "changes_made": ["Changed EXTRACT to strftime", "Fixed column casing"] +} + +IMPORTANT: +- Return ONLY the JSON object, no other text +- Fix ONLY the specific error, preserve the rest +- Test your fix mentally before responding +- If error is about a column/table name, check spelling carefully +""" + + return prompt + + def _analyze_error(self, error_message: str, database_type: str) -> str: + """Analyze error message and provide targeted hints.""" + + error_lower = error_message.lower() + hints = [] + + # Common SQLite errors + if database_type == "sqlite": + if "near \"from\"" in error_lower or "syntax error" in error_lower: + hints.append("⚠️ EXTRACT() is NOT supported in SQLite - use strftime() instead!") + hints.append(" Example: strftime('%Y', date_column) for year") + + if "no such column" in error_lower: + hints.append("⚠️ Column name doesn't exist - check spelling and case") + hints.append(" SQLite is case-insensitive but the column must exist") + + if "no such table" in error_lower: + hints.append("⚠️ Table name doesn't exist - check spelling") + + if "ambiguous column" in error_lower: + hints.append("⚠️ Ambiguous column - use table alias: table.column or alias.column") + + # PostgreSQL errors + elif database_type == "postgresql": + if "column" in error_lower and "does not exist" in error_lower: + hints.append("⚠️ Column case mismatch - PostgreSQL is case-sensitive") + hints.append(' Use double quotes for mixed-case: "ColumnName"') + + if "relation" in error_lower and "does not exist" in error_lower: + hints.append("⚠️ Table doesn't exist or case mismatch") + + # Generic hints if no specific patterns matched + if not hints: + hints.append("⚠️ Check syntax compatibility with " + database_type.upper()) + hints.append("⚠️ Verify column and table names exist") + + return "\n".join(hints) diff --git a/api/agents/utils.py b/api/agents/utils.py index 53e678a0..9fa0c9ac 100644 --- a/api/agents/utils.py +++ b/api/agents/utils.py @@ -21,6 +21,7 @@ def __init__(self, queries_history: list, result_history: list): def parse_response(response: str) -> Dict[str, Any]: """ Parse Claude's response to extract the analysis. + Handles cases where LLM returns multiple JSON blocks by extracting the last valid one. Args: response: Claude's response string @@ -29,14 +30,42 @@ def parse_response(response: str) -> Dict[str, Any]: Parsed analysis results """ try: - # Extract JSON from the response + # Try to find all JSON blocks (anything between { and }) + # and parse the last valid one (LLM sometimes corrects itself) + + # Find all potential JSON blocks + json_blocks = [] + depth = 0 + start_idx = None + + for i, char in enumerate(response): + if char == '{': + if depth == 0: + start_idx = i + depth += 1 + elif char == '}': + depth -= 1 + if depth == 0 and start_idx is not None: + json_blocks.append(response[start_idx:i+1]) + start_idx = None + + # Try to parse JSON blocks from last to first (prefer the corrected version) + for json_str in reversed(json_blocks): + try: + analysis = json.loads(json_str) + # Validate it has required fields + if "is_sql_translatable" in analysis and "sql_query" in analysis: + return analysis + except json.JSONDecodeError: + continue + + # Fallback to original method if block parsing fails json_start = response.find("{") json_end = response.rfind("}") + 1 json_str = response[json_start:json_end] - - # Parse the JSON analysis = json.loads(json_str) return analysis + except (json.JSONDecodeError, ValueError) as e: # Fallback if JSON parsing fails return { @@ -44,4 +73,4 @@ def parse_response(response: str) -> Dict[str, Any]: "confidence": 0, "explanation": f"Failed to parse response: {str(e)}", "error": str(response), - } + } \ No newline at end of file diff --git a/api/core/text2sql.py b/api/core/text2sql.py index 70ad1143..f94a2249 100644 --- a/api/core/text2sql.py +++ b/api/core/text2sql.py @@ -12,6 +12,7 @@ from api.core.errors import GraphNotFoundError, InternalError, InvalidArgumentError from api.core.schema_loader import load_database from api.agents import AnalysisAgent, RelevancyAgent, ResponseFormatterAgent, FollowUpAgent +from api.agents.healer_agent import HealerAgent from api.config import Config from api.extensions import db from api.graph import find, get_db_description @@ -252,7 +253,7 @@ async def generate(): # pylint: disable=too-many-locals,too-many-branches,too-m db_description, db_url = await get_db_description(graph_id) # Determine database type and get appropriate loader - _, loader_class = get_database_type_and_loader(db_url) + db_type, loader_class = get_database_type_and_loader(db_url) if not loader_class: overall_elapsed = time.perf_counter() - overall_start @@ -309,7 +310,8 @@ async def generate(): # pylint: disable=too-many-locals,too-many-branches,too-m logging.info("Starting SQL generation with analysis agent") answer_an = agent_an.get_analysis( - queries_history[-1], result, db_description, instructions, memory_context + queries_history[-1], result, db_description, instructions, memory_context, + db_type ) # Initialize response variables @@ -317,14 +319,27 @@ async def generate(): # pylint: disable=too-many-locals,too-many-branches,too-m follow_up_result = "" execution_error = False - # Auto-quote table names with special characters (like dashes) - original_sql = answer_an['sql_query'] - if original_sql: + logging.info("Generated SQL query: %s", answer_an['sql_query']) # nosemgrep + yield json.dumps( + { + "type": "sql_query", + "data": answer_an["sql_query"], + "conf": answer_an["confidence"], + "miss": answer_an["missing_information"], + "amb": answer_an["ambiguities"], + "exp": answer_an["explanation"], + "is_valid": answer_an["is_sql_translatable"], + "final_response": False, + } + ) + MESSAGE_DELIMITER + + # If the SQL query is valid, execute it using the postgres database db_url + if answer_an["is_sql_translatable"]: + # Auto-quote table names with special characters (like dashes) # Extract known table names from the result schema known_tables = {table[0] for table in result} if result else set() # Determine database type and get appropriate quote character - db_type, _ = get_database_type_and_loader(db_url) quote_char = DatabaseSpecificQuoter.get_quote_char( db_type or 'postgresql' ) @@ -332,7 +347,7 @@ async def generate(): # pylint: disable=too-many-locals,too-many-branches,too-m # Auto-quote identifiers with special characters sanitized_sql, was_modified = ( SQLIdentifierQuoter.auto_quote_identifiers( - original_sql, known_tables, quote_char + answer_an['sql_query'], known_tables, quote_char ) ) @@ -344,22 +359,6 @@ async def generate(): # pylint: disable=too-many-locals,too-many-branches,too-m logging.info(msg) answer_an['sql_query'] = sanitized_sql - logging.info("Generated SQL query: %s", answer_an['sql_query']) # nosemgrep - yield json.dumps( - { - "type": "sql_query", - "data": answer_an["sql_query"], - "conf": answer_an["confidence"], - "miss": answer_an["missing_information"], - "amb": answer_an["ambiguities"], - "exp": answer_an["explanation"], - "is_valid": answer_an["is_sql_translatable"], - "final_response": False, - } - ) + MESSAGE_DELIMITER - - # If the SQL query is valid, execute it using the postgres database db_url - if answer_an["is_sql_translatable"]: # Check if this is a destructive operation that requires confirmation sql_query = answer_an["sql_query"] sql_type = sql_query.strip().split()[0].upper() if sql_query else "" @@ -441,10 +440,47 @@ async def generate(): # pylint: disable=too-many-locals,too-many-branches,too-m loader_class.is_schema_modifying_query(sql_query) ) - query_results = loader_class.execute_sql_query( - answer_an["sql_query"], - db_url - ) + # Try executing the SQL query, with healing on failure + try: + query_results = loader_class.execute_sql_query( + answer_an["sql_query"], + db_url + ) + except Exception as exec_error: # pylint: disable=broad-exception-caught + # Attempt healing + step = {"type": "reasoning_step", + "final_response": False, + "message": "Step 2a: SQL execution failed, attempting to heal query..."} + yield json.dumps(step) + MESSAGE_DELIMITER + + healing_result = HealerAgent().heal_query( + failed_sql=answer_an["sql_query"], + error_message=str(exec_error), + db_description=db_description[:500] if db_description else "", + question=queries_history[-1], + database_type=db_type + ) + + yield json.dumps({ + "type": "healing_attempt", + "final_response": False, + "message": f"Query was automatically fixed. Changes made: {', '.join(healing_result.get('changes_made', []))}", + "original_error": str(exec_error), + "healed_sql": healing_result.get("sql_query", "") + }) + MESSAGE_DELIMITER + + # Execute healed SQL + query_results = loader_class.execute_sql_query( + healing_result["sql_query"], + db_url + ) + answer_an["sql_query"] = healing_result["sql_query"] + + yield json.dumps({ + "type": "healing_success", + "final_response": False, + "message": "✅ Healed query executed successfully" + }) + MESSAGE_DELIMITER if len(query_results) != 0: yield json.dumps( { From 410080793a79ba95c7b7139100bbb2292f3887b4 Mon Sep 17 00:00:00 2001 From: Gal Shubeli Date: Tue, 16 Dec 2025 16:44:02 +0200 Subject: [PATCH 02/15] add-exception-feedback --- api/core/text2sql.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/api/core/text2sql.py b/api/core/text2sql.py index f94a2249..0a27c399 100644 --- a/api/core/text2sql.py +++ b/api/core/text2sql.py @@ -461,6 +461,9 @@ async def generate(): # pylint: disable=too-many-locals,too-many-branches,too-m database_type=db_type ) + if healing_result.get("healing_failed"): + raise exec_error + yield json.dumps({ "type": "healing_attempt", "final_response": False, @@ -470,17 +473,21 @@ async def generate(): # pylint: disable=too-many-locals,too-many-branches,too-m }) + MESSAGE_DELIMITER # Execute healed SQL - query_results = loader_class.execute_sql_query( - healing_result["sql_query"], - db_url - ) - answer_an["sql_query"] = healing_result["sql_query"] - - yield json.dumps({ - "type": "healing_success", - "final_response": False, - "message": "✅ Healed query executed successfully" - }) + MESSAGE_DELIMITER + try: + query_results = loader_class.execute_sql_query( + healing_result["sql_query"], + db_url + ) + answer_an["sql_query"] = healing_result["sql_query"] + + yield json.dumps({ + "type": "healing_success", + "final_response": False, + "message": "✅ Healed query executed successfully" + }) + MESSAGE_DELIMITER + except Exception as healed_error: # pylint: disable=broad-exception-caught + logging.error("Healed query also failed: %s", str(healed_error)) + raise healed_error if len(query_results) != 0: yield json.dumps( { From 971f23a23ef62e5082206c0c49fe1a7341529d51 Mon Sep 17 00:00:00 2001 From: Gal Shubeli Date: Tue, 16 Dec 2025 16:47:29 +0200 Subject: [PATCH 03/15] update-prompt --- api/agents/analysis_agent.py | 1 - 1 file changed, 1 deletion(-) diff --git a/api/agents/analysis_agent.py b/api/agents/analysis_agent.py index ff2f76e9..e650a104 100644 --- a/api/agents/analysis_agent.py +++ b/api/agents/analysis_agent.py @@ -217,7 +217,6 @@ def _build_prompt( # pylint: disable=too-many-arguments, too-many-positional-a - CRITICAL NULL HANDLING: When using calculated columns (divisions, ratios, arithmetic) with ORDER BY or LIMIT, you MUST filter out NULL values. Add "WHERE calculated_expression IS NOT NULL" or include the NULL check in your WHERE clause. NULL values sort first in ascending order and can produce incorrect results. - CRITICAL SELECT CLAUSE: Only return columns explicitly requested in the question. If the question asks for "the highest rate" or "the lowest value", return ONLY that calculated value, not additional columns like names or IDs unless specifically asked. Use aggregate functions (MAX, MIN, AVG) when appropriate for "highest", "lowest", "average" queries instead of ORDER BY + LIMIT. - CRITICAL VALUE MATCHING: When multiple columns could answer a question (e.g., "continuation schools"), prefer the column whose allowed values list contains an EXACT or CLOSEST string match to the question term. For example, if the question mentions "continuation schools", prefer a column with value "Continuation School" over "Continuation High Schools". Check the column descriptions for "Optional values" lists and match question terminology to those exact value strings. - - CRITICAL SINGLE SQL STATEMENT: You MUST generate exactly ONE SQL statement that answers all parts of the question. NEVER generate multiple separate SELECT statements. If a question asks multiple things (e.g., "How many X? List Y"), combine them into a single query using subqueries, JOINs, multiple columns in SELECT, or aggregate functions. Multiple SQL statements separated by semicolons are FORBIDDEN and will fail execution. If the user is asking a follow-up or continuing question, use the conversation history and previous answers to resolve references, context, or ambiguities. Always base your analysis on the cumulative context, not just the current question. From 66914ff74a92146d5f57b3d24cc370523dcc27bd Mon Sep 17 00:00:00 2001 From: Gal Shubeli Date: Tue, 16 Dec 2025 16:50:35 +0200 Subject: [PATCH 04/15] Update api/agents/utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- api/agents/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/api/agents/utils.py b/api/agents/utils.py index 9fa0c9ac..071b8ddd 100644 --- a/api/agents/utils.py +++ b/api/agents/utils.py @@ -65,7 +65,6 @@ def parse_response(response: str) -> Dict[str, Any]: json_str = response[json_start:json_end] analysis = json.loads(json_str) return analysis - except (json.JSONDecodeError, ValueError) as e: # Fallback if JSON parsing fails return { From d606e6a89e98e6e81256a5931224ed3b4cbbeb79 Mon Sep 17 00:00:00 2001 From: Gal Shubeli Date: Tue, 16 Dec 2025 16:51:54 +0200 Subject: [PATCH 05/15] Update api/core/text2sql.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- api/core/text2sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/core/text2sql.py b/api/core/text2sql.py index 0a27c399..2e8622c8 100644 --- a/api/core/text2sql.py +++ b/api/core/text2sql.py @@ -333,7 +333,7 @@ async def generate(): # pylint: disable=too-many-locals,too-many-branches,too-m } ) + MESSAGE_DELIMITER - # If the SQL query is valid, execute it using the postgres database db_url + # If the SQL query is valid, execute it using the configured database and db_url if answer_an["is_sql_translatable"]: # Auto-quote table names with special characters (like dashes) # Extract known table names from the result schema From ef82f87af14353fde69b7c1b55730d0148cd46c0 Mon Sep 17 00:00:00 2001 From: Gal Shubeli Date: Tue, 16 Dec 2025 16:52:04 +0200 Subject: [PATCH 06/15] Update api/agents/utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- api/agents/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/api/agents/utils.py b/api/agents/utils.py index 071b8ddd..964ba814 100644 --- a/api/agents/utils.py +++ b/api/agents/utils.py @@ -32,7 +32,6 @@ def parse_response(response: str) -> Dict[str, Any]: try: # Try to find all JSON blocks (anything between { and }) # and parse the last valid one (LLM sometimes corrects itself) - # Find all potential JSON blocks json_blocks = [] depth = 0 From f4981485f026d7e5c92e47f7d54b67b5bba01dbb Mon Sep 17 00:00:00 2001 From: Gal Shubeli Date: Tue, 16 Dec 2025 16:58:10 +0200 Subject: [PATCH 07/15] pylint --- api/agents/healer_agent.py | 7 ++++--- api/core/text2sql.py | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/api/agents/healer_agent.py b/api/agents/healer_agent.py index 5759faee..2013bf0e 100644 --- a/api/agents/healer_agent.py +++ b/api/agents/healer_agent.py @@ -5,13 +5,14 @@ without requiring full graph context. It uses the error message and the failed query to generate a corrected version. """ +# pylint: disable=trailing-whitespace,line-too-long,too-many-arguments +# pylint: disable=too-many-positional-arguments,broad-exception-caught -import json import re -from typing import Dict, Optional +from typing import Dict from litellm import completion -from .utils import parse_response from api.config import Config +from .utils import parse_response diff --git a/api/core/text2sql.py b/api/core/text2sql.py index 2e8622c8..27db7e81 100644 --- a/api/core/text2sql.py +++ b/api/core/text2sql.py @@ -1,4 +1,5 @@ """Graph-related routes for the text2sql API.""" +# pylint: disable=line-too-long,trailing-whitespace import asyncio import json From d5f4a90049ea699c9955cba6126497265b68b6c3 Mon Sep 17 00:00:00 2001 From: Gal Shubeli Date: Thu, 18 Dec 2025 16:55:11 +0200 Subject: [PATCH 08/15] Update api/agents/healer_agent.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- api/agents/healer_agent.py | 1 - 1 file changed, 1 deletion(-) diff --git a/api/agents/healer_agent.py b/api/agents/healer_agent.py index 2013bf0e..9b882571 100644 --- a/api/agents/healer_agent.py +++ b/api/agents/healer_agent.py @@ -119,7 +119,6 @@ def heal_query( additional_context += f"\nSyntax errors: {', '.join(validation_result['errors'])}" if validation_result["warnings"]: additional_context += f"\nWarnings: {', '.join(validation_result['warnings'])}" - # Enhance error message with validation context enhanced_error = error_message + additional_context From f282ae86cb7d321aab786fb44c17906884f90154 Mon Sep 17 00:00:00 2001 From: Gal Shubeli Date: Sun, 4 Jan 2026 15:01:52 +0200 Subject: [PATCH 09/15] pylint-fixes --- api/agents/analysis_agent.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/api/agents/analysis_agent.py b/api/agents/analysis_agent.py index e650a104..d941da27 100644 --- a/api/agents/analysis_agent.py +++ b/api/agents/analysis_agent.py @@ -26,11 +26,15 @@ def get_analysis( # pylint: disable=too-many-arguments, too-many-positional-arg if not self.messages or self.messages[0].get("role") != "system": self.messages.insert(0, { "role": "system", - "content": f"You are a SQL expert. TARGET DATABASE: {database_type.upper() if database_type else 'UNKNOWN'}" + "content": ( + f"You are a SQL expert. TARGET DATABASE: " + f"{database_type.upper() if database_type else 'UNKNOWN'}" + ) }) - + prompt = self._build_prompt( - user_query, formatted_schema, db_description, instructions, memory_context, database_type + user_query, formatted_schema, db_description, + instructions, memory_context, database_type ) self.messages.append({"role": "user", "content": prompt}) completion_result = completion( From a8a3b45e5056e645a122cc9e1e7081480d03a9a7 Mon Sep 17 00:00:00 2001 From: Gal Shubeli Date: Sun, 4 Jan 2026 15:03:52 +0200 Subject: [PATCH 10/15] pylint-fixes --- api/agents/utils.py | 8 ++++---- api/core/text2sql.py | 8 +++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/api/agents/utils.py b/api/agents/utils.py index 964ba814..ceafff23 100644 --- a/api/agents/utils.py +++ b/api/agents/utils.py @@ -36,7 +36,7 @@ def parse_response(response: str) -> Dict[str, Any]: json_blocks = [] depth = 0 start_idx = None - + for i, char in enumerate(response): if char == '{': if depth == 0: @@ -47,7 +47,7 @@ def parse_response(response: str) -> Dict[str, Any]: if depth == 0 and start_idx is not None: json_blocks.append(response[start_idx:i+1]) start_idx = None - + # Try to parse JSON blocks from last to first (prefer the corrected version) for json_str in reversed(json_blocks): try: @@ -57,7 +57,7 @@ def parse_response(response: str) -> Dict[str, Any]: return analysis except json.JSONDecodeError: continue - + # Fallback to original method if block parsing fails json_start = response.find("{") json_end = response.rfind("}") + 1 @@ -71,4 +71,4 @@ def parse_response(response: str) -> Dict[str, Any]: "confidence": 0, "explanation": f"Failed to parse response: {str(e)}", "error": str(response), - } \ No newline at end of file + } diff --git a/api/core/text2sql.py b/api/core/text2sql.py index 08efa714..e797a690 100644 --- a/api/core/text2sql.py +++ b/api/core/text2sql.py @@ -449,15 +449,17 @@ async def generate(): # pylint: disable=too-many-locals,too-many-branches,too-m ) except Exception as exec_error: # pylint: disable=broad-exception-caught # Attempt healing - step = {"type": "reasoning_step", + step = { + "type": "reasoning_step", "final_response": False, - "message": "Step 2a: SQL execution failed, attempting to heal query..."} + "message": "Step 2a: SQL execution failed, attempting to heal query..." + } yield json.dumps(step) + MESSAGE_DELIMITER healing_result = HealerAgent().heal_query( failed_sql=answer_an["sql_query"], error_message=str(exec_error), - db_description=db_description[:500] if db_description else "", + db_description=db_description, question=queries_history[-1], database_type=db_type ) From dea38a34fa05df11c0f077834515da0b0596d89a Mon Sep 17 00:00:00 2001 From: Gal Shubeli Date: Sun, 4 Jan 2026 16:40:15 +0200 Subject: [PATCH 11/15] update-fl --- api/agents/healer_agent.py | 213 +++++++++++++++++++++---------------- api/core/text2sql.py | 80 ++++++++------ 2 files changed, 172 insertions(+), 121 deletions(-) diff --git a/api/agents/healer_agent.py b/api/agents/healer_agent.py index 9b882571..371543aa 100644 --- a/api/agents/healer_agent.py +++ b/api/agents/healer_agent.py @@ -9,7 +9,7 @@ # pylint: disable=too-many-positional-arguments,broad-exception-caught import re -from typing import Dict +from typing import Dict, Callable, Any from litellm import completion from api.config import Config from .utils import parse_response @@ -19,11 +19,13 @@ class HealerAgent: """Agent specialized in fixing SQL syntax errors.""" - def __init__(self): - """ - Initialize the HealerAgent. - + def __init__(self, max_healing_attempts: int = 3): + """Initialize the healer agent. + + Args: + max_healing_attempts: Maximum number of healing attempts before giving up """ + self.max_healing_attempts = max_healing_attempts @staticmethod def validate_sql_syntax(sql_query: str) -> dict: @@ -87,91 +89,6 @@ def validate_sql_syntax(sql_query: str) -> dict: "warnings": warnings } - def heal_query( - self, - failed_sql: str, - error_message: str, - db_description: str = "", - question: str = "", - database_type: str = "sqlite" - ) -> Dict[str, any]: - """ - Attempt to fix a failed SQL query using only the error message. - - Args: - failed_sql: The SQL query that failed - error_message: The error message from execution - db_description: Optional database description - question: Optional original question - database_type: Type of database (sqlite, postgresql, mysql, etc.) - - Returns: - Dict containing: - - sql_query: Fixed SQL query - - confidence: Confidence score - - explanation: Explanation of the fix - - changes_made: List of changes applied - """ - # Validate SQL syntax for additional error context - validation_result = self.validate_sql_syntax(failed_sql) - additional_context = "" - if validation_result["errors"]: - additional_context += f"\nSyntax errors: {', '.join(validation_result['errors'])}" - if validation_result["warnings"]: - additional_context += f"\nWarnings: {', '.join(validation_result['warnings'])}" - # Enhance error message with validation context - enhanced_error = error_message + additional_context - - # Build focused prompt for SQL healing - prompt = self._build_healing_prompt( - failed_sql=failed_sql, - error_message=enhanced_error, - db_description=db_description, - question=question, - database_type=database_type - ) - - try: - # Call LLM for healing - response = completion( - model=Config.COMPLETION_MODEL, - messages=[{"role": "user", "content": prompt}], - temperature=0.1, # Low temperature for precision - max_tokens=2000 - ) - - content = response.choices[0].message.content - - # Parse the response - result = parse_response(content) - - # Validate the result has required fields - if not result.get("sql_query"): - return { - "sql_query": failed_sql, # Return original if healing failed - "confidence": 0.0, - "explanation": "Failed to parse healed SQL from response", - "changes_made": [], - "healing_failed": True - } - - return { - "sql_query": result.get("sql_query", ""), - "confidence": result.get("confidence", 50), - "explanation": result.get("explanation", ""), - "changes_made": result.get("changes_made", []), - "healing_failed": False - } - - except Exception as e: - return { - "sql_query": failed_sql, # Return original on error - "confidence": 0.0, - "explanation": f"Healing error: {str(e)}", - "changes_made": [], - "healing_failed": True - } - def _build_healing_prompt( self, failed_sql: str, @@ -199,7 +116,7 @@ def _build_healing_prompt( {f"ORIGINAL QUESTION: {question}" if question else ""} -{f"DATABASE INFO: {db_description[:500]}" if db_description else ""} +{f"DATABASE INFO: {db_description}"} COMMON ERROR PATTERNS: {error_hints} @@ -249,6 +166,120 @@ def _build_healing_prompt( return prompt + def heal_and_execute( + self, + initial_sql: str, + initial_error: str, + execute_sql_func: Callable[[str], Any], + db_description: str = "", + question: str = "", + database_type: str = "sqlite" + ) -> Dict[str, any]: + """Iteratively heal and execute SQL query until success or max attempts. + + This method creates a conversation loop between the healer and the database: + 1. Build initial prompt once with the failed SQL and error (including syntax validation) + 2. Loop: Call LLM → Parse healed SQL → Execute → Check if successful + 3. If successful, return results + 4. If failed and not last attempt, add error feedback and repeat + 5. If failed on last attempt, return failure + + Args: + initial_sql: The initial SQL query that failed + initial_error: The error message from the initial execution failure + execute_sql_func: Function that executes SQL and returns results or raises exception + db_description: Optional database description + question: Optional original question + database_type: Type of database (sqlite, postgresql, mysql, etc.) + + Returns: + Dict containing: + - success: Whether healing succeeded + - sql_query: Final SQL query (healed or original) + - query_results: Results from successful execution (if success=True) + - attempts: Number of healing attempts made + - final_error: Final error message (if success=False) + """ + self.messages = [] + + # Validate SQL syntax for additional error context + validation_result = self.validate_sql_syntax(initial_sql) + additional_context = "" + if validation_result["errors"]: + additional_context += f"\nSyntax errors: {', '.join(validation_result['errors'])}" + if validation_result["warnings"]: + additional_context += f"\nWarnings: {', '.join(validation_result['warnings'])}" + # Enhance error message with validation context + enhanced_error = initial_error + additional_context + + # Build initial prompt once before the loop + prompt = self._build_healing_prompt( + failed_sql=initial_sql, + error_message=enhanced_error, + db_description=db_description, + question=question, + database_type=database_type + ) + self.messages.append({"role": "user", "content": prompt}) + + for attempt in range(self.max_healing_attempts): + # Call LLM + response = completion( + model=Config.COMPLETION_MODEL, + messages=self.messages, + temperature=0.1, + max_tokens=2000 + ) + + content = response.choices[0].message.content + self.messages.append({"role": "assistant", "content": content}) + + # Parse response + result = parse_response(content) + healed_sql = result.get("sql_query", "") + + # Execute against database + error = None + try: + query_results = execute_sql_func(healed_sql) + except Exception as e: + error = str(e) + + # Check if it worked + if error is None: + # Success! + return { + "success": True, + "sql_query": healed_sql, + "query_results": query_results, + "attempts": attempt + 1, + "final_error": None + } + + # Failed - check if last attempt + if attempt >= self.max_healing_attempts - 1: + return { + "success": False, + "sql_query": healed_sql, + "query_results": None, + "attempts": attempt + 1, + "final_error": error + } + + # Not last attempt - add feedback and continue + feedback = f"""The healed query failed with error: + +```sql +{healed_sql} +``` + +ERROR: +{error} + +Please fix this error.""" + self.messages.append({"role": "user", "content": feedback}) + + def _analyze_error(self, error_message: str, database_type: str) -> str: """Analyze error message and provide targeted hints.""" diff --git a/api/core/text2sql.py b/api/core/text2sql.py index e797a690..9db90c4e 100644 --- a/api/core/text2sql.py +++ b/api/core/text2sql.py @@ -441,56 +441,76 @@ async def generate(): # pylint: disable=too-many-locals,too-many-branches,too-m loader_class.is_schema_modifying_query(sql_query) ) - # Try executing the SQL query, with healing on failure + # Try executing the SQL query first try: query_results = loader_class.execute_sql_query( answer_an["sql_query"], db_url ) except Exception as exec_error: # pylint: disable=broad-exception-caught - # Attempt healing + # Initial execution failed - start iterative healing process step = { "type": "reasoning_step", - "final_response": False, - "message": "Step 2a: SQL execution failed, attempting to heal query..." - } + "final_response": False, + "message": "Step 2a: SQL execution failed, attempting to heal query..." + } yield json.dumps(step) + MESSAGE_DELIMITER - healing_result = HealerAgent().heal_query( - failed_sql=answer_an["sql_query"], - error_message=str(exec_error), + # Create healer agent and attempt iterative healing + healer_agent = HealerAgent(max_healing_attempts=3) + + # Create a wrapper function for execute_sql_query + def execute_sql(sql: str): + return loader_class.execute_sql_query(sql, db_url) + + healing_result = healer_agent.heal_and_execute( + initial_sql=answer_an["sql_query"], + initial_error=str(exec_error), + execute_sql_func=execute_sql, db_description=db_description, question=queries_history[-1], database_type=db_type ) - if healing_result.get("healing_failed"): + if not healing_result.get("success"): + # Healing failed after all attempts + yield json.dumps({ + "type": "healing_failed", + "final_response": False, + "message": f"❌ Failed to heal query after {healing_result['attempts']} attempt(s)", + "final_error": healing_result.get("final_error", str(exec_error)), + "healing_log": healing_result.get("healing_log", []) + }) + MESSAGE_DELIMITER raise exec_error + # Healing succeeded! + healing_log = healing_result.get("healing_log", []) + + # Show healing progress + for log_entry in healing_log: + if log_entry.get("status") == "healed": + changes_msg = ", ".join(log_entry.get("changes_made", [])) + yield json.dumps({ + "type": "healing_attempt", + "final_response": False, + "message": f"Attempt {log_entry['attempt']}: {changes_msg}", + "attempt": log_entry["attempt"], + "changes": log_entry.get("changes_made", []), + "confidence": log_entry.get("confidence", 0) + }) + MESSAGE_DELIMITER + + # Update the SQL query to the healed version + answer_an["sql_query"] = healing_result["sql_query"] + query_results = healing_result["query_results"] + yield json.dumps({ - "type": "healing_attempt", + "type": "healing_success", "final_response": False, - "message": f"Query was automatically fixed. Changes made: {', '.join(healing_result.get('changes_made', []))}", - "original_error": str(exec_error), - "healed_sql": healing_result.get("sql_query", "") + "message": f"✅ Query healed and executed successfully after {healing_result['attempts'] + 1} attempt(s)", + "healed_sql": healing_result["sql_query"], + "attempts": healing_result["attempts"] + 1 }) + MESSAGE_DELIMITER - - # Execute healed SQL - try: - query_results = loader_class.execute_sql_query( - healing_result["sql_query"], - db_url - ) - answer_an["sql_query"] = healing_result["sql_query"] - - yield json.dumps({ - "type": "healing_success", - "final_response": False, - "message": "✅ Healed query executed successfully" - }) + MESSAGE_DELIMITER - except Exception as healed_error: # pylint: disable=broad-exception-caught - logging.error("Healed query also failed: %s", str(healed_error)) - raise healed_error + if len(query_results) != 0: yield json.dumps( { From e99ded38d67535aadd9eea41e99671cf818ed3a4 Mon Sep 17 00:00:00 2001 From: Gal Shubeli Date: Sun, 4 Jan 2026 16:48:40 +0200 Subject: [PATCH 12/15] Update api/agents/healer_agent.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- api/agents/healer_agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/agents/healer_agent.py b/api/agents/healer_agent.py index 371543aa..f0b11294 100644 --- a/api/agents/healer_agent.py +++ b/api/agents/healer_agent.py @@ -174,7 +174,7 @@ def heal_and_execute( db_description: str = "", question: str = "", database_type: str = "sqlite" - ) -> Dict[str, any]: + ) -> Dict[str, Any]: """Iteratively heal and execute SQL query until success or max attempts. This method creates a conversation loop between the healer and the database: From 536d5068c5953ebec63ec4759999203f23b2268b Mon Sep 17 00:00:00 2001 From: Gal Shubeli Date: Mon, 5 Jan 2026 09:57:25 +0200 Subject: [PATCH 13/15] pylint-fix --- api/agents/healer_agent.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/api/agents/healer_agent.py b/api/agents/healer_agent.py index f0b11294..b3899a70 100644 --- a/api/agents/healer_agent.py +++ b/api/agents/healer_agent.py @@ -26,6 +26,7 @@ def __init__(self, max_healing_attempts: int = 3): max_healing_attempts: Maximum number of healing attempts before giving up """ self.max_healing_attempts = max_healing_attempts + self.messages = [] @staticmethod def validate_sql_syntax(sql_query: str) -> dict: @@ -166,7 +167,7 @@ def _build_healing_prompt( return prompt - def heal_and_execute( + def heal_and_execute( # pylint: disable=too-many-locals self, initial_sql: str, initial_error: str, @@ -279,6 +280,15 @@ def heal_and_execute( Please fix this error.""" self.messages.append({"role": "user", "content": feedback}) + # Fallback return + return { + "success": False, + "sql_query": initial_sql, + "query_results": None, + "attempts": self.max_healing_attempts, + "final_error": initial_error + } + def _analyze_error(self, error_message: str, database_type: str) -> str: """Analyze error message and provide targeted hints.""" From 3c9db6e226a936151d2a93b6157fb901c275f615 Mon Sep 17 00:00:00 2001 From: Gal Shubeli Date: Mon, 5 Jan 2026 10:25:05 +0200 Subject: [PATCH 14/15] whiteline --- api/agents/healer_agent.py | 1 - 1 file changed, 1 deletion(-) diff --git a/api/agents/healer_agent.py b/api/agents/healer_agent.py index b3899a70..e0ab66a6 100644 --- a/api/agents/healer_agent.py +++ b/api/agents/healer_agent.py @@ -15,7 +15,6 @@ from .utils import parse_response - class HealerAgent: """Agent specialized in fixing SQL syntax errors.""" From c226dba9b5212216353172207c25c237f2325519 Mon Sep 17 00:00:00 2001 From: Gal Shubeli Date: Mon, 5 Jan 2026 11:23:38 +0200 Subject: [PATCH 15/15] rm-analysis-prompt --- api/agents/analysis_agent.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/api/agents/analysis_agent.py b/api/agents/analysis_agent.py index d941da27..c7bccc8a 100644 --- a/api/agents/analysis_agent.py +++ b/api/agents/analysis_agent.py @@ -218,9 +218,6 @@ def _build_prompt( # pylint: disable=too-many-arguments, too-many-positional-a - Respond ONLY in strict JSON format, without extra text. - If the query relates to a previous question, you MUST take into account the previous question and its answer, and answer based on the context and information provided so far. - CRITICAL: When table or column names contain special characters (especially dashes/hyphens like '-'), you MUST wrap them in double quotes for PostgreSQL (e.g., "table-name") or backticks for MySQL (e.g., `table-name`). This is NON-NEGOTIABLE. - - CRITICAL NULL HANDLING: When using calculated columns (divisions, ratios, arithmetic) with ORDER BY or LIMIT, you MUST filter out NULL values. Add "WHERE calculated_expression IS NOT NULL" or include the NULL check in your WHERE clause. NULL values sort first in ascending order and can produce incorrect results. - - CRITICAL SELECT CLAUSE: Only return columns explicitly requested in the question. If the question asks for "the highest rate" or "the lowest value", return ONLY that calculated value, not additional columns like names or IDs unless specifically asked. Use aggregate functions (MAX, MIN, AVG) when appropriate for "highest", "lowest", "average" queries instead of ORDER BY + LIMIT. - - CRITICAL VALUE MATCHING: When multiple columns could answer a question (e.g., "continuation schools"), prefer the column whose allowed values list contains an EXACT or CLOSEST string match to the question term. For example, if the question mentions "continuation schools", prefer a column with value "Continuation School" over "Continuation High Schools". Check the column descriptions for "Optional values" lists and match question terminology to those exact value strings. If the user is asking a follow-up or continuing question, use the conversation history and previous answers to resolve references, context, or ambiguities. Always base your analysis on the cumulative context, not just the current question. @@ -317,10 +314,6 @@ def _build_prompt( # pylint: disable=too-many-arguments, too-many-positional-a 12. Learn from successful query patterns in memory context and avoid failed approaches. 13. For personal queries, FIRST check memory context for user identification. If user identity is found in memory context (user name, previous personal queries, etc.), the query IS translatable. 14. CRITICAL PERSONALIZATION CHECK: If missing user identification/personalization is a significant or primary component of the query (e.g., "show my orders", "my account balance", "my recent purchases", "how many employees I have", "products I own") AND no user identification is available in memory context or schema, set "is_sql_translatable" to false. However, if memory context contains user identification (like user name or previous successful personal queries), then personal queries ARE translatable even if they are the primary component of the query. - 15. CRITICAL: When generating queries with calculated columns (division, multiplication, etc.) that are used in ORDER BY or compared with LIMIT, ALWAYS add NULL filtering. For example: "WHERE (column1 / column2) IS NOT NULL" before ORDER BY. This prevents NULL values (from NULL numerators or denominators) from appearing in results. - 16. SELECT CLAUSE PRECISION: Only include columns explicitly requested in the question. If a question asks "What is the highest rate?" return ONLY the rate value, not additional columns. Questions asking for "the highest/lowest/average X" should prefer aggregate functions (MAX, MIN, AVG) over ORDER BY + LIMIT, as aggregates are more concise and automatically handle what to return. - 17. VALUE-BASED COLUMN SELECTION: When choosing between similar columns (e.g., "School Type" vs "Educational Option Type"), examine the "Optional values" lists in column descriptions. Prefer the column where a value EXACTLY or MOST CLOSELY matches the terminology in the question. For example, "continuation schools" should map to a column with value "Continuation School" rather than "Continuation High Schools". This string matching takes priority over column name similarity. - 18. NULL HANDLING IN CALCULATIONS: When a query involves calculated expressions (like col1/col2) used with ORDER BY, filtering (WHERE), or LIMIT, ensure NULL values are explicitly filtered out. Use "AND (expression) IS NOT NULL" in the WHERE clause. This is especially important for division operations where either numerator or denominator can be NULL. Again: OUTPUT ONLY VALID JSON. No explanations outside the JSON block. """ # pylint: disable=line-too-long return prompt