diff --git a/deploy_ai_search/README.md b/deploy_ai_search/README.md index 5df7adb..bd3ebb1 100644 --- a/deploy_ai_search/README.md +++ b/deploy_ai_search/README.md @@ -1,8 +1,8 @@ # AI Search Indexing Pre-built Index Setup -The associated scripts in this portion of the repository contains pre-built scripts to deploy the skillset with Azure Document Intelligence. +The associated scripts in this portion of the repository contain pre-built scripts to deploy the skillsets needed for both Text2SQL and Image Processing. -## Steps for Rag Documents Index Deployment (For Unstructured RAG) +## Steps for Rag Documents Index Deployment (For Image Processing) 1. Update `.env` file with the associated values. Not all values are required dependent on whether you are using System / User Assigned Identities or a Key based authentication. 2. Adjust `rag_documents.py` with any changes to the index / indexer. The `get_skills()` method implements the skills pipeline. Make any adjustments here in the skills needed to enrich the data source. @@ -13,7 +13,7 @@ The associated scripts in this portion of the repository contains pre-built scri - `rebuild`. Whether to delete and rebuild the index. - `suffix`. Optional parameter that will apply a suffix onto the deployed index and indexer. This is useful if you want deploy a test version, before overwriting the main version. 
-## Steps for Text2SQL Index Deployment (For Structured RAG) +## Steps for Text2SQL Index Deployment (For Text2SQL) ### Schema Store Index diff --git a/text_2_sql/.env.example b/text_2_sql/.env.example index f08cf6d..21c358f 100644 --- a/text_2_sql/.env.example +++ b/text_2_sql/.env.example @@ -1,6 +1,11 @@ # Environment variables for Text2SQL IdentityType= # system_assigned or user_assigned or key +Text2Sql__DatabaseEngine= # TSQL or PostgreSQL or Snowflake or Databricks +Text2Sql__UseQueryCache= # True or False +Text2Sql__PreRunQueryCache= # True or False +Text2Sql__UseColumnValueStore= # True or False + # Open AI Connection Details OpenAI__CompletionDeployment= OpenAI__MiniCompletionDeployment= @@ -17,17 +22,20 @@ AIService__AzureSearchOptions__Text2SqlQueryCache__Index= AIService__AzureSearchOptions__Text2SqlColumnValueStore__Index= -# All SQL Engine specific connection details -Text2Sql__DatabaseName= +# TSQL +Text2Sql__Tsql__ConnectionString= +Text2Sql__Tsql__Database= -# TSQL or PostgreSQL Specific Connection Details -Text2Sql__DatabaseConnectionString= +# PostgreSQL Specific Connection Details +Text2Sql__Postgresql__ConnectionString= +Text2Sql__Postgresql__Database= # Snowflake Specific Connection Details Text2Sql__Snowflake__User= Text2Sql__Snowflake__Password= Text2Sql__Snowflake__Account= Text2Sql__Snowflake__Warehouse= +Text2Sql__Snowflake__Database= # Databricks Specific Connection Details Text2Sql__Databricks__Catalog= diff --git a/text_2_sql/GETTING_STARTED.md b/text_2_sql/GETTING_STARTED.md index 9acaea5..4338238 100644 --- a/text_2_sql/GETTING_STARTED.md +++ b/text_2_sql/GETTING_STARTED.md @@ -2,12 +2,25 @@ To get started, perform the following steps: +**Execute the following commands in the `deploy_ai_search` directory:** + 1. Setup Azure OpenAI in your subscription with **gpt-4o-mini** & an embedding model, alongside a SQL Server sample database, AI Search and a storage account. -2. 
Clone this repository and deploy the AI Search text2sql indexes from `deploy_ai_search`. -3. Run `uv sync` within the text_2_sql directory to install dependencies. +2. Create your `.env` file based on the provided sample `deploy_ai_search/.env.example`. Place this file at `deploy_ai_search/.env`. +3. Clone this repository and deploy the AI Search text2sql indexes from `deploy_ai_search`. See the instructions in the **Steps for Text2SQL Index Deployment (For Text2SQL)** section of the `deploy_ai_search/README.md`. + +**Execute the following commands in the `text_2_sql_core` directory:** + +4. Create your `.env` file based on the provided sample `text_2_sql/.env.example`. Place this file at `text_2_sql/.env`. +5. Run `uv sync` within the text_2_sql directory to install dependencies. + - Install the optional dependencies if you need a database connector other than TSQL. `uv sync --extra ` + - See the supported connectors in `text_2_sql_core/src/text_2_sql_core/connectors`. +6. Create your `.env` file based on the provided sample `text_2_sql/.env.example`. Place this file at `text_2_sql/.env`. +7. Generate a data dictionary for your target server using the instructions in the **Running** section of the `data_dictionary/README.md`. +8. Upload these generated data dictionary files to the relevant containers in your storage account. Wait for them to be automatically indexed with the included skillsets. + +**Execute the following commands in the `autogen` directory:** + +9. Run `uv sync` within the text_2_sql directory to install dependencies. - Install the optional dependencies if you need a database connector other than TSQL. `uv sync --extra ` - See the supported connectors in `text_2_sql_core/src/text_2_sql_core/connectors`. -4. Create your `.env` file based on the provided sample `.env.example`. Place this file in the same place as the `.env.example`. -5. 
Generate a data dictionary for your target server using the instructions in the **Running** section of the `data_dictionary/README.md`. -6. Upload these generated data dictionaries files to the relevant containers in your storage account. Wait for them to be automatically indexed with the included skillsets. -7. Navigate to `autogen` directory to view the AutoGen implementation. Follow the steps in `Iteration 5 - Agentic Vector Based Text2SQL.ipynb` to get started. +10. Navigate to `autogen` directory to view the AutoGen implementation. Follow the steps in `Iteration 5 - Agentic Vector Based Text2SQL.ipynb` to get started. diff --git a/text_2_sql/__init__.py b/text_2_sql/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/text_2_sql/autogen/evaluate_autogen_text2sql.ipynb b/text_2_sql/autogen/evaluate_autogen_text2sql.ipynb index 2b38a09..16c8c16 100644 --- a/text_2_sql/autogen/evaluate_autogen_text2sql.ipynb +++ b/text_2_sql/autogen/evaluate_autogen_text2sql.ipynb @@ -215,8 +215,8 @@ " \n", " # Update database connection string for current database\n", " db_path = DATABASE_DIR / db_id / f\"{db_id}.sqlite\"\n", - " os.environ[\"Text2Sql__DatabaseConnectionString\"] = str(db_path)\n", - " os.environ[\"Text2Sql__DatabaseName\"] = db_id\n", + " os.environ[\"Text2Sql__Tsql__ConnectionString\"] = str(db_path)\n", + " os.environ[\"Text2Sql__Tsql__Database\"] = db_id\n", " \n", " sql = await generate_sql(question)\n", " predictions.append(f\"{sql}\\t{db_id}\")\n", diff --git a/text_2_sql/autogen/src/autogen_text_2_sql/custom_agents/parallel_query_solving_agent.py b/text_2_sql/autogen/src/autogen_text_2_sql/custom_agents/parallel_query_solving_agent.py index e343729..bbd7758 100644 --- a/text_2_sql/autogen/src/autogen_text_2_sql/custom_agents/parallel_query_solving_agent.py +++ b/text_2_sql/autogen/src/autogen_text_2_sql/custom_agents/parallel_query_solving_agent.py @@ -219,12 +219,12 @@ async def consume_inner_messages_from_agentic_flow( # Add database
connection info to injected parameters query_params = injected_parameters.copy() if injected_parameters else {} - if "Text2Sql__DatabaseConnectionString" in os.environ: + if "Text2Sql__Tsql__ConnectionString" in os.environ: query_params["database_connection_string"] = os.environ[ - "Text2Sql__DatabaseConnectionString" + "Text2Sql__Tsql__ConnectionString" ] - if "Text2Sql__DatabaseName" in os.environ: - query_params["database_name"] = os.environ["Text2Sql__DatabaseName"] + if "Text2Sql__Tsql__Database" in os.environ: + query_params["database_name"] = os.environ["Text2Sql__Tsql__Database"] # Launch tasks for each sub-query inner_solving_generators.append( diff --git a/text_2_sql/autogen/src/autogen_text_2_sql/inner_autogen_text_2_sql.py b/text_2_sql/autogen/src/autogen_text_2_sql/inner_autogen_text_2_sql.py index a83000d..454ec33 100644 --- a/text_2_sql/autogen/src/autogen_text_2_sql/inner_autogen_text_2_sql.py +++ b/text_2_sql/autogen/src/autogen_text_2_sql/inner_autogen_text_2_sql.py @@ -45,27 +45,27 @@ def __init__(self, **kwargs: dict): self.set_mode() # Store original environment variables - self.original_db_conn = os.environ.get("Text2Sql__DatabaseConnectionString") - self.original_db_name = os.environ.get("Text2Sql__DatabaseName") + self.original_db_conn = os.environ.get("Text2Sql__Tsql__ConnectionString") + self.original_db_name = os.environ.get("Text2Sql__Tsql__Database") def _update_environment(self, injected_parameters: dict = None): """Update environment variables with injected parameters.""" if injected_parameters: if "database_connection_string" in injected_parameters: - os.environ["Text2Sql__DatabaseConnectionString"] = injected_parameters[ + os.environ["Text2Sql__Tsql__ConnectionString"] = injected_parameters[ "database_connection_string" ] if "database_name" in injected_parameters: - os.environ["Text2Sql__DatabaseName"] = injected_parameters[ + os.environ["Text2Sql__Tsql__Database"] = injected_parameters[ "database_name" ] def 
_restore_environment(self): """Restore original environment variables.""" if self.original_db_conn: - os.environ["Text2Sql__DatabaseConnectionString"] = self.original_db_conn + os.environ["Text2Sql__Tsql__ConnectionString"] = self.original_db_conn if self.original_db_name: - os.environ["Text2Sql__DatabaseName"] = self.original_db_name + os.environ["Text2Sql__Tsql__Database"] = self.original_db_name def set_mode(self): """Set the mode of the plugin based on the environment variables.""" diff --git a/text_2_sql/data_dictionary/README.md b/text_2_sql/data_dictionary/README.md index 92dd92f..d2002ae 100644 --- a/text_2_sql/data_dictionary/README.md +++ b/text_2_sql/data_dictionary/README.md @@ -224,7 +224,11 @@ If there is no pre-built script for your database engine, take one of the above ## Running -1. Create your `.env` file based on the provided sample `.env.example`. Place this file in the same place as the `.env.example`. +To generate a data dictionary, perform the following steps: + +1. Create your `.env` file based on the provided sample `text_2_sql/.env.example`. Place this file in the same place in `text_2_sql/.env`. + +**Execute the following commands in the `text_2_sql_core` directory:** 2. Package and install the `text_2_sql_core` library. See [build](https://docs.astral.sh/uv/concepts/projects/build/) if you want to build as a wheel and install on an agent. Or you can run from within a `uv` environment and skip packaging. - Install the optional dependencies if you need a database connector other than TSQL. `uv sync --extra ` 3. 
Run `data_dictionary ` diff --git a/text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/postgresql_sql.py b/text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/postgresql_sql.py index f106041..4192e8d 100644 --- a/text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/postgresql_sql.py +++ b/text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/postgresql_sql.py @@ -16,6 +16,11 @@ def __init__(self): self.database_engine = DatabaseEngine.POSTGRESQL + @property + def engine_specific_rules(self) -> str: + """Get the engine specific rules.""" + return "" + @property def engine_specific_fields(self) -> list[str]: """Get the engine specific fields.""" @@ -61,10 +66,10 @@ async def query_execution( """ logging.info(f"Running query: {sql_query}") results = [] - connection_string = os.environ["Text2Sql__DatabaseConnectionString"] + connection_string = os.environ["Text2Sql__Postgresql__ConnectionString"] # Establish an asynchronous connection to the PostgreSQL database - async with psycopg.AsyncConnection.connect(connection_string) as conn: + async with await psycopg.AsyncConnection.connect(connection_string) as conn: # Create an asynchronous cursor async with conn.cursor() as cursor: await cursor.execute(sql_query) diff --git a/text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/snowflake_sql.py b/text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/snowflake_sql.py index 5f40627..49d7a43 100644 --- a/text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/snowflake_sql.py +++ b/text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/snowflake_sql.py @@ -100,7 +100,7 @@ async def query_execution( password=os.environ["Text2Sql__Snowflake__Password"], account=os.environ["Text2Sql__Snowflake__Account"], warehouse=os.environ["Text2Sql__Snowflake__Warehouse"], - database=os.environ["Text2Sql__DatabaseName"], + database=os.environ["Text2Sql__Snowflake__Database"], ) try: diff --git 
a/text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/sqlite_sql.py b/text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/sqlite_sql.py index 16548cd..5e35df6 100644 --- a/text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/sqlite_sql.py +++ b/text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/sqlite_sql.py @@ -63,7 +63,7 @@ async def query_execution( Returns: List of dictionaries containing query results. """ - db_file = os.environ["Text2Sql__DatabaseConnectionString"] + db_file = os.environ["Text2Sql__Tsql__ConnectionString"] if not os.path.exists(db_file): raise FileNotFoundError(f"Database file not found: {db_file}") @@ -127,7 +127,9 @@ def find_matching_tables(self, text: str, table_names: list[str]) -> list[int]: List of matching table indices """ matches = [] - logging.info(f"Looking for tables matching '{text}' in tables: {table_names}") + logging.info( + "Looking for tables matching '%s' in tables: %s", text, table_names + ) # First try exact matches for idx, name in enumerate(table_names): @@ -144,7 +146,9 @@ def find_matching_tables(self, text: str, table_names: list[str]) -> list[int]: for idx, name in enumerate(table_names): table_terms = set(re.split(r"[_\s]+", name.lower())) if search_terms & table_terms: # If there's any overlap in terms - logging.info(f"Found partial match: '{name}' with terms {table_terms}") + logging.info( + "Found partial match: '%s' with terms %s", name, table_terms + ) matches.append(idx) return matches @@ -181,7 +185,7 @@ async def get_entity_schemas( spider_schemas = json.load(f) # Get current database name from path - db_path = os.environ["Text2Sql__DatabaseConnectionString"] + db_path = os.environ["Text2Sql__Tsql__ConnectionString"] db_name = os.path.splitext(os.path.basename(db_path))[0] logging.info(f"Looking for schemas in database: {db_name}") @@ -196,7 +200,7 @@ async def get_entity_schemas( if not db_schema: raise ValueError(f"Schema not found for database: {db_name}") - 
logging.info(f"Looking for tables matching '{text}' in database '{db_name}'") + logging.info("Looking for tables matching '%s' in database '%s'", text, db_name) logging.info(f"Available tables: {db_schema['table_names']}") # Find all matching tables using flexible matching @@ -228,7 +232,9 @@ async def get_entity_schemas( } schemas.append(schema) logging.info( - f"Added schema for table '{db_schema['table_names'][table_idx]}': {schema}" + "Added schema for table '%s': %s", + db_schema["table_names"][table_idx], + schema, ) if as_json: diff --git a/text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/tsql_sql.py b/text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/tsql_sql.py index 3cf6bcd..adca1f8 100644 --- a/text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/tsql_sql.py +++ b/text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/tsql_sql.py @@ -86,7 +86,7 @@ async def query_execution( """ logging.info(f"Running query: {sql_query}") results = [] - connection_string = os.environ["Text2Sql__DatabaseConnectionString"] + connection_string = os.environ["Text2Sql__Tsql__ConnectionString"] async with await aioodbc.connect(dsn=connection_string) as sql_db_client: async with sql_db_client.cursor() as cursor: await cursor.execute(sql_query) diff --git a/text_2_sql/text_2_sql_core/src/text_2_sql_core/data_dictionary/cli.py b/text_2_sql/text_2_sql_core/src/text_2_sql_core/data_dictionary/cli.py index 586f466..9c858ae 100644 --- a/text_2_sql/text_2_sql_core/src/text_2_sql_core/data_dictionary/cli.py +++ b/text_2_sql/text_2_sql_core/src/text_2_sql_core/data_dictionary/cli.py @@ -5,6 +5,7 @@ import logging import typer from rich import print as rich_print +from tenacity import RetryError logging.basicConfig(level=logging.INFO) @@ -112,8 +113,18 @@ def create( try: asyncio.run(data_dictionary_creator.create_data_dictionary()) + except RetryError as e: + # Fetch the actual exception + e = e.last_attempt.exception() + logging.error(e) + rich_print("Text2SQL 
Data Dictionary Creator Failed ❌") + + rich_print(f"Error Messages: {e}") + + raise typer.Exit(code=1) except Exception as e: logging.error(e) + rich_print("Text2SQL Data Dictionary Creator Failed ❌") rich_print(f"Error Messages: {e}") diff --git a/text_2_sql/text_2_sql_core/src/text_2_sql_core/data_dictionary/data_dictionary_creator.py b/text_2_sql/text_2_sql_core/src/text_2_sql_core/data_dictionary/data_dictionary_creator.py index e61c138..890d5e9 100644 --- a/text_2_sql/text_2_sql_core/src/text_2_sql_core/data_dictionary/data_dictionary_creator.py +++ b/text_2_sql/text_2_sql_core/src/text_2_sql_core/data_dictionary/data_dictionary_creator.py @@ -21,7 +21,7 @@ class ForeignKeyRelationship(BaseModel): column: str = Field(..., alias="Column") foreign_column: str = Field(..., alias="ForeignColumn") - model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True) + model_config = ConfigDict(populate_by_name=True) class EntityRelationship(BaseModel): @@ -39,7 +39,7 @@ class EntityRelationship(BaseModel): foreign_database: Optional[str] = Field(default=None, alias="ForeignDatabase") foreign_catalog: Optional[str] = Field(default=None, alias="ForeignCatalog") - model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True) + model_config = ConfigDict(populate_by_name=True) def pivot(self): """A method to pivot the entity relationship.""" @@ -128,12 +128,10 @@ class ColumnItem(BaseModel): name: str = Field(..., alias="Name") data_type: str = Field(..., alias="DataType") definition: Optional[str] = Field(..., alias="Definition") - distinct_values: Optional[list[any]] = Field( - None, alias="DistinctValues", exclude=True - ) - sample_values: Optional[list[any]] = Field(None, alias="SampleValues") + distinct_values: Optional[list] = Field(None, alias="DistinctValues", exclude=True) + sample_values: Optional[list] = Field(None, alias="SampleValues") - model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True) + model_config 
= ConfigDict(populate_by_name=True) def value_store_entry( self, entity, distinct_value, excluded_fields_for_database_engine diff --git a/text_2_sql/text_2_sql_core/src/text_2_sql_core/data_dictionary/postgresql_data_dictionary_creator.py b/text_2_sql/text_2_sql_core/src/text_2_sql_core/data_dictionary/postgresql_data_dictionary_creator.py index d511659..a99b56f 100644 --- a/text_2_sql/text_2_sql_core/src/text_2_sql_core/data_dictionary/postgresql_data_dictionary_creator.py +++ b/text_2_sql/text_2_sql_core/src/text_2_sql_core/data_dictionary/postgresql_data_dictionary_creator.py @@ -16,7 +16,7 @@ def __init__(self, **kwargs): excluded_schemas = ["information_schema", "pg_catalog"] super().__init__(excluded_schemas=excluded_schemas, **kwargs) - self.database = os.environ["Text2Sql__DatabaseName"] + self.database = os.environ["Text2Sql__Postgresql__Database"] self.database_engine = DatabaseEngine.POSTGRESQL self.sql_connector = PostgresqlSqlConnector() diff --git a/text_2_sql/text_2_sql_core/src/text_2_sql_core/data_dictionary/snowflake_data_dictionary_creator.py b/text_2_sql/text_2_sql_core/src/text_2_sql_core/data_dictionary/snowflake_data_dictionary_creator.py index fc5a1de..b4570ee 100644 --- a/text_2_sql/text_2_sql_core/src/text_2_sql_core/data_dictionary/snowflake_data_dictionary_creator.py +++ b/text_2_sql/text_2_sql_core/src/text_2_sql_core/data_dictionary/snowflake_data_dictionary_creator.py @@ -18,7 +18,7 @@ def __init__(self, **kwargs): excluded_schemas = ["INFORMATION_SCHEMA"] super().__init__(excluded_schemas=excluded_schemas, **kwargs) - self.database = os.environ["Text2Sql__DatabaseName"] + self.database = os.environ["Text2Sql__Snowflake__Database"] self.warehouse = os.environ["Text2Sql__Snowflake__Warehouse"] self.database_engine = DatabaseEngine.SNOWFLAKE diff --git a/text_2_sql/text_2_sql_core/src/text_2_sql_core/data_dictionary/tsql_data_dictionary_creator.py 
b/text_2_sql/text_2_sql_core/src/text_2_sql_core/data_dictionary/tsql_data_dictionary_creator.py index 2d36085..7557f4b 100644 --- a/text_2_sql/text_2_sql_core/src/text_2_sql_core/data_dictionary/tsql_data_dictionary_creator.py +++ b/text_2_sql/text_2_sql_core/src/text_2_sql_core/data_dictionary/tsql_data_dictionary_creator.py @@ -21,7 +21,7 @@ def __init__(self, **kwargs): """ excluded_schemas = ["dbo", "sys"] super().__init__(excluded_schemas=excluded_schemas, **kwargs) - self.database = os.environ["Text2Sql__DatabaseName"] + self.database = os.environ["Text2Sql__Tsql__Database"] self.database_engine = DatabaseEngine.TSQL diff --git a/text_2_sql/text_2_sql_core/src/text_2_sql_core/utils/environment.py b/text_2_sql/text_2_sql_core/src/text_2_sql_core/utils/environment.py index 232254e..c43edec 100644 --- a/text_2_sql/text_2_sql_core/src/text_2_sql_core/utils/environment.py +++ b/text_2_sql/text_2_sql_core/src/text_2_sql_core/utils/environment.py @@ -18,7 +18,7 @@ def get_identity_type() -> IdentityType: Returns: IdentityType: The identity type """ - identity = os.environ.get("IdentityType") + identity = os.environ["IdentityType"] if identity == "user_assigned": return IdentityType.USER_ASSIGNED