diff --git a/ai/select-algorithm-dotnet/.devcontainer/devcontainer.json b/ai/select-algorithm-dotnet/.devcontainer/devcontainer.json new file mode 100644 index 0000000..aafd623 --- /dev/null +++ b/ai/select-algorithm-dotnet/.devcontainer/devcontainer.json @@ -0,0 +1,48 @@ +{ + "name": "Azure DocumentDB Select Algorithm - .NET 8", + "image": "mcr.microsoft.com/devcontainers/dotnet:1-8.0-bookworm", + + "features": { + "ghcr.io/devcontainers/features/azure-cli:1": {}, + "ghcr.io/devcontainers/features/github-cli:1": {}, + "ghcr.io/devcontainers/features/common-utils:2": { + "installZsh": true, + "configureZshAsDefaultShell": true, + "installOhMyZsh": true + } + }, + + "customizations": { + "vscode": { + "extensions": [ + "ms-dotnettools.csdevkit", + "ms-dotnettools.vscodeintellicode-csharp", + "ms-azuretools.vscode-azureresourcegroups", + "ms-azuretools.vscode-cosmosdb", + "mongodb.mongodb-vscode" + ], + "settings": { + "dotnet.completion.showCompletionItemsFromUnimportedNamespaces": true, + "files.exclude": { + "**/bin": true, + "**/obj": true + } + } + } + }, + + "postCreateCommand": "dotnet restore && dotnet build", + "remoteUser": "vscode", + + "containerEnv": { + "DOTNET_CLI_TELEMETRY_OPTOUT": "1", + "DOTNET_NOLOGO": "1" + }, + + "mounts": [ + "source=${localEnv:HOME}${localEnv:USERPROFILE}/.azure,target=/home/vscode/.azure,type=bind,consistency=cached" + ], + + "capAdd": ["SYS_PTRACE"], + "securityOpt": ["seccomp:unconfined"] +} diff --git a/ai/select-algorithm-dotnet/.gitignore b/ai/select-algorithm-dotnet/.gitignore new file mode 100644 index 0000000..de285c3 --- /dev/null +++ b/ai/select-algorithm-dotnet/.gitignore @@ -0,0 +1,7 @@ +bin/ +obj/ +.env + +# Local data copy (user copies from ai/data/) +data/*.json +!data/README.md diff --git a/ai/select-algorithm-dotnet/CompareAll.cs b/ai/select-algorithm-dotnet/CompareAll.cs new file mode 100644 index 0000000..5a763b2 --- /dev/null +++ b/ai/select-algorithm-dotnet/CompareAll.cs @@ -0,0 +1,259 @@ +/// Unified 
comparison runner for all 9 combinations (3 algorithms × 3 similarity metrics). +/// Executes vector searches sequentially for fair timing and prints a formatted comparison table. + +namespace SelectAlgorithm; + +using MongoDB.Driver; +using MongoDB.Bson; +using OpenAI.Embeddings; +using SelectAlgorithm.Models; + +public static class CompareAll +{ + private record IndexConfig(string Name, string Kind, string Similarity, BsonDocument ExtraParams); + + private record SearchResult(string Algorithm, string Metric, string Top1Name, double Top1Score, string Top2Name, double Top2Score); + + private static string GetAlgoDisplay(string kind) => kind switch + { + "vector-ivf" => "IVF", + "vector-hnsw" => "HNSW", + "vector-diskann" => "DiskANN", + _ => kind + }; + + public static void Run(AppConfiguration appConfig) + { + Console.WriteLine(new string('=', 60)); + Console.WriteLine(" Compare All Algorithms × Metrics"); + Console.WriteLine(" 9 combinations: IVF, HNSW, DiskANN × COS, L2, IP"); + Console.WriteLine(new string('=', 60)); + + // Use config values with env var overrides for compare-specific settings + var databaseName = appConfig.MongoDB.DatabaseName; + var dataFile = appConfig.DataFiles.WithVectors; + var vectorField = appConfig.Embedding.EmbeddedField; + var dimensions = appConfig.Embedding.Dimensions; + var batchSize = appConfig.MongoDB.LoadBatchSize; + var queryText = Environment.GetEnvironmentVariable("QUERY_TEXT") ?? "luxury hotel near the beach"; + var topK = int.Parse(Environment.GetEnvironmentVariable("TOP_K") ?? 
"5"); + + var mongoClient = Utils.GetMongoClientPasswordless(appConfig); + var embeddingClient = Utils.GetEmbeddingClient(appConfig); + + try + { + var database = mongoClient.GetDatabase(databaseName); + + // Drop collection for a clean comparison + database.DropCollection("hotels"); + Console.WriteLine("Dropped existing 'hotels' collection (if any)"); + + var collection = database.GetCollection("hotels"); + + // Load data once into single collection + var data = Utils.ReadJsonFile(dataFile); + var documents = data.Where(d => d.Contains(vectorField)).ToList(); + Console.WriteLine($"\nLoaded {documents.Count} documents with embeddings"); + Utils.InsertData(collection, documents, batchSize); + + // Generate ONE embedding for the query (reused for all 9 searches) + Console.WriteLine($"\nQuery: \"{queryText}\""); + Console.WriteLine($"Top K: {topK}"); + var embeddingResult = embeddingClient.GenerateEmbedding(queryText); + var queryVector = embeddingResult.Value.ToFloats().ToArray(); + Console.WriteLine("Embedding generated (reused for all searches)\n"); + + // Define 9 index configurations + var configs = BuildIndexConfigs(dimensions); + + // Run each config sequentially: drop→create→wait→search + // DocumentDB doesn't allow multiple vector indexes of the same kind on the same field + Console.WriteLine("Running 9 algorithm × metric combinations...\n"); + var results = new List(); + foreach (var config in configs) + { + // 1. Drop all existing vector indexes + DropVectorIndexes(collection, vectorField); + + // 2. Create this specific index + CreateIndex(collection, vectorField, config); + Console.WriteLine($" ✓ {config.Name} created"); + + // 3. Wait for index to build + Thread.Sleep(5000); + + // 4. Search + var searchResults = RunVectorSearch(collection, queryVector, vectorField, config.Name, topK); + + // 5. 
Extract top 2 results and record + var algoDisplay = GetAlgoDisplay(config.Kind); + var top1Name = "-"; var top1Score = 0.0; + var top2Name = "-"; var top2Score = 0.0; + if (searchResults.Count > 0) + { + var doc1 = searchResults[0]; + top1Name = doc1.Contains("HotelName") ? doc1["HotelName"].AsString : "Unknown"; + top1Score = doc1.Contains("score") ? doc1["score"].ToDouble() : 0.0; + } + if (searchResults.Count > 1) + { + var doc2 = searchResults[1]; + top2Name = doc2.Contains("HotelName") ? doc2["HotelName"].AsString : "Unknown"; + top2Score = doc2.Contains("score") ? doc2["score"].ToDouble() : 0.0; + } + results.Add(new SearchResult(algoDisplay, config.Similarity, top1Name, top1Score, top2Name, top2Score)); + } + + // Print comparison table + PrintComparisonTable(results); + } + finally + { + // Cleanup: drop the comparison collection + try + { + var database = mongoClient.GetDatabase(databaseName); + database.DropCollection("hotels"); + Console.WriteLine("\nCleanup: dropped collection 'hotels'"); + } + catch (Exception ex) + { + Console.WriteLine($"Cleanup warning: {ex.Message}"); + } + mongoClient.Cluster.Dispose(); + } + } + + private static List BuildIndexConfigs(int dimensions) + { + string[] metrics = ["COS", "L2", "IP"]; + var configs = new List(); + + // IVF + foreach (var metric in metrics) + configs.Add(new IndexConfig($"vector_ivf_{metric.ToLower()}", "vector-ivf", metric, new BsonDocument { { "numLists", 1 } })); + + // HNSW + foreach (var metric in metrics) + configs.Add(new IndexConfig($"vector_hnsw_{metric.ToLower()}", "vector-hnsw", metric, new BsonDocument { { "m", 16 }, { "efConstruction", 64 } })); + + // DiskANN + foreach (var metric in metrics) + configs.Add(new IndexConfig($"vector_diskann_{metric.ToLower()}", "vector-diskann", metric, new BsonDocument { { "maxDegree", 20 }, { "lBuild", 10 } })); + + return configs; + } + + private static void DropVectorIndexes(IMongoCollection collection, string vectorField) + { + try + { + using var 
cursor = collection.Indexes.List(); + foreach (var idx in cursor.ToList()) + { + var name = idx.GetValue("name", "").AsString; + var key = idx.GetValue("key", new BsonDocument()).AsBsonDocument; + if (key.Contains(vectorField) && key[vectorField].AsString == "cosmosSearch") + { + try { collection.Indexes.DropOne(name); } catch { } + } + } + } + catch { } + } + + private static void CreateIndex(IMongoCollection collection, string vectorField, IndexConfig config) + { + // Drop existing index with same name if present + try + { + collection.Indexes.DropOne(config.Name); + } + catch (MongoCommandException) + { + // Index doesn't exist, that's fine + } + + var cosmosSearchOptions = new BsonDocument + { + { "kind", config.Kind }, + { "dimensions", int.Parse(Environment.GetEnvironmentVariable("EMBEDDING_DIMENSIONS") ?? "1536") }, + { "similarity", config.Similarity } + }; + + foreach (var param in config.ExtraParams) + { + cosmosSearchOptions.Add(param); + } + + var command = new BsonDocument + { + { "createIndexes", collection.CollectionNamespace.CollectionName }, + { "indexes", new BsonArray + { + new BsonDocument + { + { "name", config.Name }, + { "key", new BsonDocument(vectorField, "cosmosSearch") }, + { "cosmosSearchOptions", cosmosSearchOptions } + } + } + } + }; + + try + { + collection.Database.RunCommand(command); + } + catch (MongoCommandException ex) when (ex.Message.Contains("already exists")) + { + // Index already exists with same config — idempotent + } + } + + private static List RunVectorSearch( + IMongoCollection collection, + float[] queryVector, + string vectorField, + string indexName, + int topK) + { + var pipeline = new[] + { + new BsonDocument("$search", new BsonDocument("cosmosSearch", new BsonDocument + { + { "vector", new BsonArray(queryVector.Select(f => (double)f)) }, + { "path", vectorField }, + { "k", topK } + })), + new BsonDocument("$project", new BsonDocument + { + { "HotelName", 1 }, + { "score", new BsonDocument("$meta", "searchScore") 
} + }) + }; + + return collection.Aggregate(pipeline).ToList(); + } + + private static void PrintComparisonTable(List results) + { + Console.WriteLine(); + Console.WriteLine("┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐"); + Console.WriteLine($"│ {"Algorithm",-9}│ {"Metric",-7}│ {"Top 1 Result",-27}│ {"Score",-7}│ {"Top 2 Result",-27}│ {"Score",-7}│ {"Diff",-6}│"); + Console.WriteLine("├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤"); + + for (var i = 0; i < results.Count; i++) + { + var r = results[i]; + var diff = Math.Abs(r.Top1Score - r.Top2Score); + var top1Display = r.Top1Name.Length > 27 ? r.Top1Name[..24] + "..." : r.Top1Name; + var top2Display = r.Top2Name.Length > 27 ? r.Top2Name[..24] + "..." : r.Top2Name; + Console.WriteLine($"│ {r.Algorithm,-9}│ {r.Metric,-7}│ {top1Display,-27}│ {r.Top1Score,-7:F4}│ {top2Display,-27}│ {r.Top2Score,-7:F4}│ {diff,-6:F4}│"); + if (i < results.Count - 1) + Console.WriteLine("├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤"); + } + Console.WriteLine("└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘"); + } +} diff --git a/ai/select-algorithm-dotnet/Models/Configuration.cs b/ai/select-algorithm-dotnet/Models/Configuration.cs new file mode 100644 index 0000000..cbca25b --- /dev/null +++ b/ai/select-algorithm-dotnet/Models/Configuration.cs @@ -0,0 +1,41 @@ +namespace SelectAlgorithm.Models; + +public class AppConfiguration +{ + public AzureOpenAIConfiguration AzureOpenAI { get; set; } = new(); + public MongoDBConfiguration MongoDB { get; set; } = new(); + public EmbeddingConfiguration Embedding { get; set; } = new(); + public VectorSearchConfiguration VectorSearch { get; set; } = new(); + public DataFilesConfiguration DataFiles { get; set; } = new(); +} + +public class AzureOpenAIConfiguration +{ 
+ public string Endpoint { get; set; } = string.Empty; + public string EmbeddingModel { get; set; } = "text-embedding-3-small"; +} + +public class MongoDBConfiguration +{ + public string ClusterName { get; set; } = string.Empty; + public string DatabaseName { get; set; } = "Hotels"; + public int LoadBatchSize { get; set; } = 100; +} + +public class EmbeddingConfiguration +{ + public string EmbeddedField { get; set; } = "DescriptionVector"; + public int Dimensions { get; set; } = 1536; +} + +public class VectorSearchConfiguration +{ + public string Query { get; set; } = "luxury hotel near the beach"; + public string Similarity { get; set; } = ""; + public int TopK { get; set; } = 5; +} + +public class DataFilesConfiguration +{ + public string WithVectors { get; set; } = "data/Hotels_Vector.json"; +} diff --git a/ai/select-algorithm-dotnet/Models/HotelData.cs b/ai/select-algorithm-dotnet/Models/HotelData.cs new file mode 100644 index 0000000..4821ee3 --- /dev/null +++ b/ai/select-algorithm-dotnet/Models/HotelData.cs @@ -0,0 +1,19 @@ +using MongoDB.Bson; +using MongoDB.Bson.Serialization.Attributes; + +namespace SelectAlgorithm.Models; + +public class HotelData +{ + [BsonId] + [BsonRepresentation(BsonType.ObjectId)] + public string? Id { get; set; } + + public string HotelId { get; set; } = string.Empty; + public string HotelName { get; set; } = string.Empty; + public string Description { get; set; } = string.Empty; + public string Category { get; set; } = string.Empty; + + [BsonExtraElements] + public BsonDocument? 
ExtraElements { get; set; } +} diff --git a/ai/select-algorithm-dotnet/Program.cs b/ai/select-algorithm-dotnet/Program.cs new file mode 100644 index 0000000..37992ad --- /dev/null +++ b/ai/select-algorithm-dotnet/Program.cs @@ -0,0 +1,40 @@ +using Microsoft.Extensions.Configuration; +using SelectAlgorithm.Models; + +namespace SelectAlgorithm; + +class Program +{ + static void Main(string[] args) + { + Console.WriteLine(); + Console.WriteLine("Select Algorithm Demo - Azure DocumentDB Vector Search (.NET)"); + Console.WriteLine(new string('-', 60)); + Console.WriteLine(); + + var configuration = new ConfigurationBuilder() + .SetBasePath(Directory.GetCurrentDirectory()) + .AddJsonFile("appsettings.json", optional: false, reloadOnChange: true) + .AddEnvironmentVariables() + .Build(); + + var appConfig = new AppConfiguration(); + configuration.Bind(appConfig); + + var command = args.Length > 0 ? args[0].ToLower() : "compare-all"; + + switch (command) + { + case "compare-all": + CompareAll.Run(appConfig); + break; + default: + Console.WriteLine($"Unknown command: {command}"); + Console.WriteLine("Usage: dotnet run -- compare-all"); + return; + } + + Console.WriteLine(); + Console.WriteLine("Done!"); + } +} diff --git a/ai/select-algorithm-dotnet/README.md b/ai/select-algorithm-dotnet/README.md new file mode 100644 index 0000000..0263013 --- /dev/null +++ b/ai/select-algorithm-dotnet/README.md @@ -0,0 +1,137 @@ +# Select Algorithm - .NET (C#) + +Demonstrates three vector index algorithms available in Azure DocumentDB: + +| Algorithm | Best For | Cluster Tier | Key Parameters | +|-----------|----------|--------------|----------------| +| **IVF** | < 10,000 documents | M10+ | `numLists` | +| **HNSW** | 10,000–50,000 documents | M30+ | `m`, `efConstruction` | +| **DiskANN** | 50,000+ documents | M40+ | `maxDegree`, `lBuild` | + +## Prerequisites + +- [.NET 8 SDK](https://dotnet.microsoft.com/download/dotnet/8.0) +- Azure DocumentDB cluster +- Azure OpenAI resource with an 
embedding model deployed +- Azure CLI logged in (`az login`) for passwordless authentication + +## Setup + +1. **Configure environment:** + + The .NET sample uses `appsettings.json` for configuration. After deploying with `azd up`, you can export values: + + ```bash + azd env get-values + ``` + + Then update `appsettings.json` with your Azure resource values. + +2. Edit `appsettings.json` with your configuration: + + ```json + { + "AzureOpenAI": { + "EmbeddingModel": "text-embedding-3-small", + "Endpoint": "https://.openai.azure.com" + }, + "MongoDB": { + "ClusterName": "", + "DatabaseName": "Hotels", + "LoadBatchSize": 100 + }, + "Embedding": { + "EmbeddedField": "DescriptionVector", + "Dimensions": 1536, + "EmbeddingSizeBatch": 16 + }, + "DataFiles": { + "WithVectors": "../data/Hotels_Vector.json" + } + } + ``` + +3. Copy the shared data file: + + ```bash + cp ../data/Hotels_Vector.json . + ``` + +4. Restore packages: + + ```bash + dotnet restore + ``` + +## Usage + +Run all 9 combinations (default): + +```bash +dotnet run +``` + +## Configuration + +| Setting (appsettings.json) | Default | Description | +|---------------------------|---------|-------------| +| `MongoDB:ClusterName` | (required) | DocumentDB cluster name | +| `AzureOpenAI:Endpoint` | (required) | Azure OpenAI endpoint | +| `AzureOpenAI:EmbeddingModel` | (required) | Embedding model deployment name | +| `DataFiles:WithVectors` | `../data/Hotels_Vector.json` | Path to vectors JSON file | +| `Embedding:EmbeddedField` | `DescriptionVector` | Field name containing embeddings | +| `Embedding:Dimensions` | `1536` | Vector dimensions | +| `MongoDB:DatabaseName` | `Hotels` | Target database name | +| `MongoDB:LoadBatchSize` | `100` | Batch size for data loading | +| `Embedding:EmbeddingSizeBatch` | `16` | Batch size for embedding requests | + +**Additional environment variables for compare mode:** + +| Variable | Default | Description | +|----------|---------|-------------| +| `QUERY_TEXT` | `luxury hotel 
near the beach` | Search query text | +| `TOP_K` | `5` | Number of results per search | +| `VERBOSE` | `false` | Show detailed per-result output | + +## How It Works + +1. **Connect** to DocumentDB using Microsoft Entra ID (OIDC) passwordless authentication +2. **Load** hotel documents with pre-computed embeddings from `Hotels_Vector.json` +3. For each of 9 algorithm/metric combinations: creates the index → searches → drops the index +4. DocumentDB only allows one vector index per kind per field, so indexes are created sequentially +5. Prints a formatted comparison table with scores, top results, and key insights + +## Index Parameters + +| Algorithm | Kind | Parameters | +|-----------|------|------------| +| IVF | `vector-ivf` | numLists=1 | +| HNSW | `vector-hnsw` | m=16, efConstruction=64 | +| DiskANN | `vector-diskann` | maxDegree=20, lBuild=10 | + +## Authentication + +This sample uses `DefaultAzureCredential` for both: +- **DocumentDB**: OIDC-based MongoDB authentication +- **Azure OpenAI**: Token-based authentication with `https://cognitiveservices.azure.com/.default` scope + +Ensure you are logged in with `az login` and have appropriate RBAC roles assigned. 
+ +## Project Structure + +``` +select-algorithm-dotnet/ +├── .devcontainer/ +│ └── devcontainer.json # Dev container configuration +├── Models/ +│ ├── Configuration.cs # App configuration model +│ └── HotelData.cs # Hotel document model +├── Utilities/ +│ └── AzureIdentityTokenHandler.cs # OIDC token handler +├── appsettings.json # Configuration file +├── CompareAll.cs # Unified 9-combination comparison runner +├── Program.cs # Entry point +├── README.md # This file +├── SelectAlgorithm.csproj # Project file +└── Utils.cs # Shared helpers (connection, embedding, search) +``` diff --git a/ai/select-algorithm-dotnet/SelectAlgorithm.csproj b/ai/select-algorithm-dotnet/SelectAlgorithm.csproj new file mode 100644 index 0000000..331e522 --- /dev/null +++ b/ai/select-algorithm-dotnet/SelectAlgorithm.csproj @@ -0,0 +1,23 @@ + + + Exe + net8.0 + enable + enable + SelectAlgorithm + + + + + + + + + + + + + PreserveNewest + + + diff --git a/ai/select-algorithm-dotnet/Utilities/AzureIdentityTokenHandler.cs b/ai/select-algorithm-dotnet/Utilities/AzureIdentityTokenHandler.cs new file mode 100644 index 0000000..eca94fd --- /dev/null +++ b/ai/select-algorithm-dotnet/Utilities/AzureIdentityTokenHandler.cs @@ -0,0 +1,32 @@ +using Azure.Core; +using MongoDB.Driver.Authentication.Oidc; + +namespace SelectAlgorithm.Utilities; + +internal sealed class AzureIdentityTokenHandler( + TokenCredential credential, + string? 
tenantId +) : IOidcCallback +{ + private readonly string[] scopes = ["https://ossrdbms-aad.database.windows.net/.default"]; + + public OidcAccessToken GetOidcAccessToken(OidcCallbackParameters parameters, CancellationToken cancellationToken) + { + AccessToken token = credential.GetToken( + new TokenRequestContext(scopes, tenantId: tenantId), + cancellationToken + ); + + return new OidcAccessToken(token.Token, token.ExpiresOn - DateTimeOffset.UtcNow); + } + + public async Task GetOidcAccessTokenAsync(OidcCallbackParameters parameters, CancellationToken cancellationToken) + { + AccessToken token = await credential.GetTokenAsync( + new TokenRequestContext(scopes, parentRequestId: null, tenantId: tenantId), + cancellationToken + ); + + return new OidcAccessToken(token.Token, token.ExpiresOn - DateTimeOffset.UtcNow); + } +} diff --git a/ai/select-algorithm-dotnet/Utils.cs b/ai/select-algorithm-dotnet/Utils.cs new file mode 100644 index 0000000..62590ad --- /dev/null +++ b/ai/select-algorithm-dotnet/Utils.cs @@ -0,0 +1,190 @@ +using MongoDB.Driver; +using MongoDB.Driver.Authentication.Oidc; +using MongoDB.Bson; +using MongoDB.Bson.Serialization; +using Azure.Identity; +using Azure.Core; +using Azure.AI.OpenAI; +using OpenAI.Embeddings; +using SelectAlgorithm.Models; + +namespace SelectAlgorithm; + +public class AzureOidcCallback : IOidcCallback +{ + private readonly DefaultAzureCredential _credential; + private static readonly string[] Scopes = { "https://ossrdbms-aad.database.windows.net/.default" }; + + public AzureOidcCallback(DefaultAzureCredential credential) => _credential = credential; + + public OidcAccessToken GetOidcAccessToken(OidcCallbackParameters parameters, CancellationToken cancellationToken) + { + var token = _credential.GetToken(new TokenRequestContext(Scopes), cancellationToken); + return new OidcAccessToken(token.Token, token.ExpiresOn - DateTimeOffset.UtcNow); + } + + public async Task GetOidcAccessTokenAsync(OidcCallbackParameters parameters, 
CancellationToken cancellationToken) + { + var token = await _credential.GetTokenAsync(new TokenRequestContext(Scopes), cancellationToken); + return new OidcAccessToken(token.Token, token.ExpiresOn - DateTimeOffset.UtcNow); + } +} + +public static class Utils +{ + public static IMongoClient GetMongoClientPasswordless(AppConfiguration config) + { + var clusterName = config.MongoDB.ClusterName; + if (string.IsNullOrEmpty(clusterName)) + throw new InvalidOperationException("MongoDB:ClusterName is required in appsettings.json"); + + var credential = new DefaultAzureCredential(); + + var connectionString = $"mongodb+srv://{clusterName}.global.mongocluster.cosmos.azure.com/"; + var settings = MongoClientSettings.FromConnectionString(connectionString); + settings.ConnectTimeout = TimeSpan.FromSeconds(120); + settings.UseTls = true; + settings.RetryWrites = false; + + // Custom OIDC callback using DefaultAzureCredential + // Chains through CLI, managed identity, etc. + var oidcCallback = new AzureOidcCallback(credential); + settings.Credential = MongoCredential.CreateOidcCredential(oidcCallback, null); + + return new MongoClient(settings); + } + + public static EmbeddingClient GetEmbeddingClient(AppConfiguration config) + { + var endpoint = config.AzureOpenAI.Endpoint; + if (string.IsNullOrEmpty(endpoint)) + throw new InvalidOperationException("AzureOpenAI:Endpoint is required in appsettings.json"); + + var model = config.AzureOpenAI.EmbeddingModel; + + var credential = new DefaultAzureCredential(); + var azureClient = new AzureOpenAIClient(new Uri(endpoint), credential); + return azureClient.GetEmbeddingClient(model); + } + + public static List ReadJsonFile(string path) + { + if (!File.Exists(path)) + throw new FileNotFoundException($"Data file not found: {path}"); + + var json = File.ReadAllText(path); + return BsonSerializer.Deserialize>(json); + } + + public static void InsertData(IMongoCollection collection, List data, int batchSize) + { + var totalDocuments = 
data.Count; + var existingCount = collection.CountDocuments(new BsonDocument()); + + if (existingCount >= totalDocuments) + { + Console.WriteLine($"Collection already has {existingCount} documents, skipping insert"); + return; + } + + if (existingCount > 0) + { + collection.DeleteMany(new BsonDocument()); + } + + var insertedCount = 0; + for (var i = 0; i < totalDocuments; i += batchSize) + { + var batch = data.Skip(i).Take(batchSize).ToList(); + try + { + collection.InsertMany(batch, new InsertManyOptions { IsOrdered = false }); + insertedCount += batch.Count; + } + catch (MongoBulkWriteException) + { + // Some documents may have been inserted before the error + insertedCount += batch.Count; + } + Thread.Sleep(100); + } + + Console.WriteLine($"Inserted {insertedCount}/{totalDocuments} documents"); + } + + public static void DropVectorIndexes(IMongoCollection collection, string vectorField) + { + try + { + using var cursor = collection.Indexes.List(); + var indexes = cursor.ToList(); + foreach (var index in indexes) + { + if (index.Contains("key")) + { + var key = index["key"].AsBsonDocument; + if (key.Contains(vectorField) && key[vectorField].AsString == "cosmosSearch") + { + var indexName = index["name"].AsString; + collection.Indexes.DropOne(indexName); + Console.WriteLine($"Dropped existing vector index: {indexName}"); + } + } + } + } + catch (Exception ex) + { + Console.WriteLine($"Warning: Error dropping indexes: {ex.Message}"); + } + } + + public static List PerformVectorSearch( + IMongoCollection collection, + EmbeddingClient client, + string query, + string vectorField, + string model, + int topK = 5) + { + var embeddingResult = client.GenerateEmbedding(query); + var queryVector = embeddingResult.Value.ToFloats().ToArray(); + + var pipeline = new[] + { + new BsonDocument("$search", new BsonDocument("cosmosSearch", new BsonDocument + { + { "vector", new BsonArray(queryVector.Select(f => (double)f)) }, + { "path", vectorField }, + { "k", topK } + })), + new 
BsonDocument("$project", new BsonDocument + { + { "document", "$$ROOT" }, + { "score", new BsonDocument("$meta", "searchScore") } + }) + }; + + return collection.Aggregate(pipeline).ToList(); + } + + public static void PrintSearchResults(List results, string algorithm) + { + Console.WriteLine(); + Console.WriteLine(new string('=', 60)); + Console.WriteLine($" {algorithm} Search Results ({results.Count} found)"); + Console.WriteLine(new string('=', 60)); + + for (var i = 0; i < results.Count; i++) + { + var result = results[i]; + var doc = result.Contains("document") ? result["document"].AsBsonDocument : result; + var name = doc.Contains("HotelName") ? doc["HotelName"].AsString + : doc.Contains("name") ? doc["name"].AsString + : "Unknown"; + var score = result.Contains("score") ? result["score"].ToDouble() : 0.0; + Console.WriteLine($" {i + 1}. {name} (score: {score:F4})"); + } + + Console.WriteLine(); + } +} diff --git a/ai/select-algorithm-dotnet/appsettings.json b/ai/select-algorithm-dotnet/appsettings.json new file mode 100644 index 0000000..5572a48 --- /dev/null +++ b/ai/select-algorithm-dotnet/appsettings.json @@ -0,0 +1,24 @@ +{ + "AzureOpenAI": { + "Endpoint": "https://oaidctfqpct77ndi.openai.azure.com/", + "EmbeddingModel": "text-embedding-3-small" + }, + "MongoDB": { + "ClusterName": "docdb-dctfqpct77ndi", + "DatabaseName": "Hotels", + "LoadBatchSize": 100 + }, + "Embedding": { + "EmbeddedField": "DescriptionVector", + "Dimensions": 1536, + "EmbeddingSizeBatch": 16 + }, + "VectorSearch": { + "Query": "quintessential lodging near running trails, eateries, retail", + "Similarity": "", + "TopK": 5 + }, + "DataFiles": { + "WithVectors": "data/Hotels_Vector.json" + } +} diff --git a/ai/select-algorithm-dotnet/data/README.md b/ai/select-algorithm-dotnet/data/README.md new file mode 100644 index 0000000..c918009 --- /dev/null +++ b/ai/select-algorithm-dotnet/data/README.md @@ -0,0 +1,5 @@ +# Data Files + +Copy `Hotels_Vector.json` into this folder before running 
the sample. + +The file is available in the repository at `ai/data/Hotels_Vector.json`. diff --git a/ai/select-algorithm-dotnet/output/compare_all.txt b/ai/select-algorithm-dotnet/output/compare_all.txt new file mode 100644 index 0000000..4f4d995 --- /dev/null +++ b/ai/select-algorithm-dotnet/output/compare_all.txt @@ -0,0 +1,47 @@ +============================================================ + Compare All Algorithms × Metrics + 9 combinations: IVF, HNSW, DiskANN × COS, L2, IP +============================================================ +Dropped existing 'hotels' collection (if any) + +Loaded 50 documents with embeddings +Inserted 50/50 documents + +Query: "luxury hotel near the beach" +Top K: 5 +Embedding generated (reused for all searches) + +Running 9 algorithm × metric combinations... + ✓ vector_ivf_cos created + ✓ vector_ivf_l2 created + ✓ vector_ivf_ip created + ✓ vector_hnsw_cos created + ✓ vector_hnsw_l2 created + ✓ vector_hnsw_ip created + ✓ vector_diskann_cos created + ✓ vector_diskann_l2 created + ✓ vector_diskann_ip created + +┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ +│ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 
0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ + +Cleanup: dropped collection 'hotels' diff --git a/ai/select-algorithm-go/.env.example b/ai/select-algorithm-go/.env.example new file mode 100644 index 0000000..8ea8381 --- /dev/null +++ b/ai/select-algorithm-go/.env.example @@ -0,0 +1,43 @@ +# DocumentDB Configuration +# Name of the DocumentDB cluster (used for passwordless OIDC authentication) +MONGO_CLUSTER_NAME=your-cluster-name + +# Azure OpenAI Embedding Configuration +# Azure OpenAI service endpoint URL +AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-resource.openai.azure.com + +# Azure OpenAI embedding model name +AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + +# Azure OpenAI API version for embeddings +AZURE_OPENAI_EMBEDDING_API_VERSION=2023-05-15 + +# Database name in DocumentDB +AZURE_DOCUMENTDB_DATABASENAME=Hotels + +# Path to JSON file with generated vector embeddings 
+DATA_FILE_WITH_VECTORS=./Hotels_Vector.json + +# Name of the field where embeddings are stored +EMBEDDED_FIELD=DescriptionVector + +# Number of dimensions in the embedding vectors (1536 for text-embedding-3-small) +EMBEDDING_DIMENSIONS=1536 + +# Number of records to load per batch during data insertion +LOAD_SIZE_BATCH=100 + +# Algorithm to run: leave empty to run all, or set to: ivf, hnsw, diskann +ALGORITHM= + +# SIMILARITY - leave empty to run all similarity types, or set to: COS, L2, IP +SIMILARITY= + +# Notes: +# 1. Replace all placeholder values with your actual Azure resource information +# 2. For production, use Azure Key Vault or environment variables instead of storing keys in files +# 3. The EMBEDDING_DIMENSIONS must match your chosen embedding model: +# - text-embedding-3-small: 1536 dimensions +# - text-embedding-3-large: 3072 dimensions +# 4. Adjust batch sizes based on your API rate limits and performance requirements +# 5. For passwordless authentication, ensure your Azure identity has appropriate RBAC permissions diff --git a/ai/select-algorithm-go/.gitignore b/ai/select-algorithm-go/.gitignore new file mode 100644 index 0000000..76985d9 --- /dev/null +++ b/ai/select-algorithm-go/.gitignore @@ -0,0 +1,7 @@ +*.exe +.env + +# Local data copy (user copies from ai/data/) +data/*.json +!data/README.md +Hotels_Vector.json diff --git a/ai/select-algorithm-go/README.md b/ai/select-algorithm-go/README.md new file mode 100644 index 0000000..e471530 --- /dev/null +++ b/ai/select-algorithm-go/README.md @@ -0,0 +1,199 @@ +# DocumentDB Vector Search - Go Algorithm Comparison Sample + +This sample demonstrates how to compare different vector search algorithms (IVF, HNSW, DiskANN) and similarity metrics (Cosine, L2, Inner Product) with Azure DocumentDB. 
+ +## Prerequisites + +- [Go 1.24+](https://golang.org/dl/) +- [Azure DocumentDB cluster](/azure/documentdb/) (M40+ tier for DiskANN) +- [Azure OpenAI resource](https://learn.microsoft.com/azure/ai-services/openai/) with an embedding model deployed +- [Azure CLI](https://learn.microsoft.com/cli/azure/) (for passwordless authentication) +- Pre-generated embeddings file (`Hotels_Vector.json`) — see the `vector-search-go` sample + +## Setup + +1. **Clone the repository** and navigate to this directory: + + ```bash + cd ai/select-algorithm-go + ``` + +2. **Configure environment variables:** + + After deploying with `azd up`, create a `.env` file with your provisioned resource values: + + ```bash + azd env get-values > .env + ``` + + Alternatively, copy the example and fill in values manually: + + ```bash + cp .env.example .env + ``` + + Required variables: + ```env + MONGO_CLUSTER_NAME=your-cluster-name + AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-resource.openai.azure.com + AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + AZURE_DOCUMENTDB_DATABASENAME=Hotels + DATA_FILE_WITH_VECTORS=data/Hotels_Vector.json + EMBEDDED_FIELD=DescriptionVector + EMBEDDING_DIMENSIONS=1536 + ``` + +3. **Copy the shared data file** into the local `data/` folder: + + ```bash + cp ../data/Hotels_Vector.json data/ + ``` + + The `DATA_FILE_WITH_VECTORS` env var defaults to `data/Hotels_Vector.json`. + +4. **Install dependencies**: + + ```bash + go mod download + ``` + +5. **Sign in to Azure** (for passwordless authentication): + + ```bash + az login + ``` + +## Usage + +### Compare All Algorithms + +Run all 9 combinations (3 algorithms × 3 similarity metrics) in a single execution: + +```bash +go run ./src/... +``` + +This creates indexes sequentially (create/search/drop per combo — DocumentDB allows one vector index per kind per field) and prints a comparison table showing scores and top results. 
+ +**Output:** +``` +====================================================================== + COMPARE ALL: 3 Algorithms × 3 Similarity Metrics (9 combinations) +====================================================================== + ... +==================================================================================================== + COMPARISON RESULTS +==================================================================================================== +ALGORITHM SIMILARITY #1 RESULT #1 SCORE #2 RESULT #2 SCORE DIFF +---------------------------------------------------------------------------------------------------- +IVF COS Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +IVF L2 Ocean Water Resort &.. 0.8736 Windy Ocean Motel 0.9943 -0.1208 +IVF IP Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +... +==================================================================================================== + KEY INSIGHTS +==================================================================================================== + 🎯 Highest #1 score: IVF/COS (0.6184) + 📊 Biggest separation: IVF/COS (diff: 0.1128) + 🔑 All algorithms return the same top results — algorithm choice + affects performance at scale, not accuracy on small datasets. + 📐 COS and IP produce identical scores (normalized embeddings). + 📏 L2 scores are distances (lower = closer), not similarities. +==================================================================================================== +``` + +### On Windows (PowerShell) + +```powershell +go run ./src/... 
+``` + +## Environment Variables + +| Variable | Default | Description | +|--------------|----------------------------------|---------------------------------| +| `MONGO_CLUSTER_NAME` | *(required)* | DocumentDB cluster name | +| `AZURE_OPENAI_EMBEDDING_ENDPOINT` | *(required)* | Azure OpenAI endpoint | +| `AZURE_OPENAI_EMBEDDING_MODEL` | `text-embedding-3-small` | Embedding model name | +| `AZURE_DOCUMENTDB_DATABASENAME` | `Hotels` | Database name | +| `DATA_FILE_WITH_VECTORS` | `data/Hotels_Vector.json` | Path to data file | +| `EMBEDDED_FIELD` | `DescriptionVector` | Field containing embeddings | +| `EMBEDDING_DIMENSIONS` | `1536` | Embedding vector dimensions | +| `LOAD_SIZE_BATCH` | `100` | Batch size for data insertion | +| `QUERY_TEXT` | `luxury hotel near the beach` | Search query | +| `TOP_K` | `5` | Number of search results to return | +| `VERBOSE` | `false` | Show full results | + +## How It Works + +### Comparison Mode (`compare_all.go`) + +1. **Data Loading:** Loads hotel data with pre-generated embeddings +2. **Index Creation:** Creates vector indexes sequentially (one at a time): + - For each algorithm (IVF, HNSW, DiskANN) × each metric (COS, L2, IP): + - Create the index → wait for readiness → search → drop the index + - DocumentDB only allows one vector index per kind per field +3. **Query Execution:** Generates embedding once, reuses for all 9 searches +4. 
**Result Comparison:** Prints formatted table with #1/#2 results, scores, and diff + +## Index Parameters + +| Algorithm | Kind | Key Parameters | Values Used | +|-----------|-----------------|-----------------------------|-----------------------------| +| IVF | `vector-ivf` | `numLists` | 1 (optimized for small datasets) | +| HNSW | `vector-hnsw` | `m`, `efConstruction` | 16, 64 | +| DiskANN | `vector-diskann`| `maxDegree`, `lBuild` | 20, 10 | + +## Project Structure + +``` +select-algorithm-go/ +├── .env.example # Environment variable template +├── go.mod # Go module dependencies +├── go.sum # Go module checksums +├── output/ # Sample output files +├── README.md # This file +└── src/ + ├── main.go # Entry point + ├── utils.go # Shared config, auth, data, and search helpers + └── compare_all.go # Unified 9-combination comparison runner (create/search/drop) +``` + +## Authentication + +This sample uses **passwordless (OIDC) authentication** with `DefaultAzureCredential`. Ensure your Azure identity has: + +- **DocumentDB**: Appropriate RBAC role on the cluster +- **Azure OpenAI**: `Cognitive Services OpenAI User` role on the OpenAI resource + +The MongoDB OIDC auth uses the `https://ossrdbms-aad.database.windows.net/.default` scope, and the OpenAI client uses Azure token credentials. 
+ +## Important Notes + +- **COS/IP scores:** Higher = more similar (0–1 range) +- **L2 scores:** Lower = more similar (distance metric) +- **Sequential indexing:** DocumentDB requires create/search/drop per combo (one vector index per kind per field) +- **Cleanup:** The sample automatically drops collections on exit +- **bson.D ordering:** All MongoDB commands use `bson.D` (ordered) instead of `bson.M` (unordered) to avoid "multi-key map" errors + +## Troubleshooting + +**"OIDC authentication failed"** +- Run `az login` and ensure you're authenticated +- Verify your Azure identity has RBAC permissions on the DocumentDB cluster +- Check that `MONGO_CLUSTER_NAME` matches your cluster name + +**"DiskANN indexes require a higher cluster tier"** +- DiskANN requires M40+ cluster tier +- Try IVF or HNSW instead, or upgrade your cluster + +**"No documents found with embeddings"** +- Ensure `DATA_FILE_WITH_VECTORS` points to the correct file +- Verify the file contains the field specified in `EMBEDDED_FIELD` +- Check that embeddings were generated with the correct dimensions + +## Learn More + +- [Azure DocumentDB Documentation](/azure/documentdb/) +- [Vector Search in DocumentDB](/azure/documentdb/vector-search) +- [Choosing a Vector Index Algorithm](/azure/documentdb/vector-search-algorithms) +- [Go MongoDB driver](https://pkg.go.dev/go.mongodb.org/mongo-driver) diff --git a/ai/select-algorithm-go/data/README.md b/ai/select-algorithm-go/data/README.md new file mode 100644 index 0000000..c918009 --- /dev/null +++ b/ai/select-algorithm-go/data/README.md @@ -0,0 +1,5 @@ +# Data Files + +Copy `Hotels_Vector.json` into this folder before running the sample. + +The file is available in the repository at `ai/data/Hotels_Vector.json`. 
diff --git a/ai/select-algorithm-go/go.mod b/ai/select-algorithm-go/go.mod new file mode 100644 index 0000000..f669ace --- /dev/null +++ b/ai/select-algorithm-go/go.mod @@ -0,0 +1,35 @@ +module documentdb-select-algorithm + +go 1.24.0 + +require ( + github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 + github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 + github.com/openai/openai-go/v3 v3.12.0 + go.mongodb.org/mongo-driver v1.17.6 +) + +require ( + github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect + github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 // indirect + github.com/golang-jwt/jwt/v5 v5.3.0 // indirect + github.com/golang/snappy v0.0.4 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/klauspost/compress v1.16.7 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect + github.com/montanaflynn/stats v0.7.1 // indirect + github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect + github.com/tidwall/gjson v1.18.0 // indirect + github.com/tidwall/match v1.1.1 // indirect + github.com/tidwall/pretty v1.2.1 // indirect + github.com/tidwall/sjson v1.2.5 // indirect + github.com/xdg-go/pbkdf2 v1.0.0 // indirect + github.com/xdg-go/scram v1.1.2 // indirect + github.com/xdg-go/stringprep v1.0.4 // indirect + github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect + golang.org/x/crypto v0.41.0 // indirect + golang.org/x/net v0.43.0 // indirect + golang.org/x/sync v0.16.0 // indirect + golang.org/x/sys v0.35.0 // indirect + golang.org/x/text v0.28.0 // indirect +) diff --git a/ai/select-algorithm-go/go.sum b/ai/select-algorithm-go/go.sum new file mode 100644 index 0000000..6263657 --- /dev/null +++ b/ai/select-algorithm-go/go.sum @@ -0,0 +1,95 @@ +github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 h1:JXg2dwJUmPB9JmtVmdEB16APJ7jurfbY5jnfXpJoRMc= +github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0/go.mod h1:YD5h/ldMsG0XiIw7PdyNhLxaM317eFh5yNLccNfGdyw= 
+github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 h1:Hk5QBxZQC1jb2Fwj6mpzme37xbCDdNTxU7O9eb5+LB4= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1/go.mod h1:IYus9qsFobWIc2YVwe/WPjcnyCkPKtnHAqUYeebc8z0= +github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2 h1:yz1bePFlP5Vws5+8ez6T3HWXPmwOK7Yvq8QxDBD3SKY= +github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2/go.mod h1:Pa9ZNPuoNu/GztvBSKk9J1cDJW6vk/n0zLtV4mgd8N8= +github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 h1:9iefClla7iYpfYWdzPCRDozdmndjTm8DXdpCzPajMgA= +github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2/go.mod h1:XtLgD3ZD34DAaVIIAyG3objl5DynM3CQ/vMcbBNJZGI= +github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 h1:WJTmL004Abzc5wDB5VtZG2PJk5ndYDgVacGqfirKxjM= +github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1/go.mod h1:tCcJZ0uHAmvjsVYzEFivsRTN00oz5BEsRgQHu5JZ9WE= +github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 h1:XRzhVemXdgvJqCH0sFfrBUTnUJSBrBf7++ypk+twtRs= +github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo= +github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= +github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= +github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod 
h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/keybase/go-keychain v0.0.1 h1:way+bWYa6lDppZoZcgMbYsvC7GxljxrskdNInRtuthU= +github.com/keybase/go-keychain v0.0.1/go.mod h1:PdEILRW3i9D8JcdM+FmY6RwkHGnhHxXwkPPMeUgOK1k= +github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= +github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8eaE= +github.com/montanaflynn/stats v0.7.1/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow= +github.com/openai/openai-go/v3 v3.12.0 h1:NkrImaglFQeDycc/n/fEmpFV8kKr8snl9/8X2x4eHOg= +github.com/openai/openai-go/v3 v3.12.0/go.mod h1:cdufnVK14cWcT9qA1rRtrXx4FTRsgbDPW7Ia7SS5cZo= +github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ= +github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.0/go.mod 
h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= +github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c= +github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI= +github.com/xdg-go/scram v1.1.2 h1:FHX5I5B4i4hKRVRBCFRxq1iQRej7WO3hhBuJf+UUySY= +github.com/xdg-go/scram v1.1.2/go.mod h1:RT/sEzTbU5y00aCK8UOx6R7YryM0iF1N2MOmC3kKLN4= +github.com/xdg-go/stringprep v1.0.4 h1:XLI/Ng3O1Atzq0oBs3TWm+5ZVgkq2aqdlvP9JtoZ6c8= +github.com/xdg-go/stringprep v1.0.4/go.mod h1:mPGuuIYwz7CmR2bT9j4GbQqutWS1zV24gijq1dTyGkM= +github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 h1:ilQV1hzziu+LLM3zUTJ0trRztfwgjqKnBWNtSRkbmwM= +github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.mongodb.org/mongo-driver v1.17.6 h1:87JUG1wZfWsr6rIz3ZmpH90rL5tea7O3IHuSwHUpsss= +go.mongodb.org/mongo-driver v1.17.6/go.mod h1:Hy04i7O2kC4RS06ZrhPRqj/u4DTYkFDAAccj+rVKqgQ= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= +golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= +golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 
+golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/ai/select-algorithm-go/output/compare_all.txt b/ai/select-algorithm-go/output/compare_all.txt new file mode 100644 index 0000000..0eeb9a3 --- /dev/null +++ b/ai/select-algorithm-go/output/compare_all.txt @@ -0,0 +1,39 @@ +====================================================================== + COMPARE ALL: 3 Algorithms × 3 Similarity Metrics (9 combinations) +====================================================================== +Query: "luxury hotel near the beach" +Top-K: 5 + +Loading data from data/Hotels_Vector.json... +Loaded 50 documents with embeddings +Insertion completed: 50 inserted, 0 failed + +Generating embedding for query: "luxury hotel near the beach" +Embedding generated (1536 dimensions) + +Running 9 vector index comparisons (create→search→drop)... 
+ ✓ vector_ivf_cos created + ✓ vector_ivf_l2 created + ✓ vector_ivf_ip created + ✓ vector_hnsw_cos created + ✓ vector_hnsw_l2 created + ✓ vector_hnsw_ip created + ✓ vector_diskann_cos created + ✓ vector_diskann_l2 created + ✓ vector_diskann_ip created + +┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ +│ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ IVF │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +│ IVF │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ HNSW │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ HNSW │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +│ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ DiskANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ DiskANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ + +Cleanup: dropped collection 'hotels' diff --git a/ai/select-algorithm-go/src/compare_all.go b/ai/select-algorithm-go/src/compare_all.go new file mode 100644 index 0000000..3585085 --- /dev/null +++ b/ai/select-algorithm-go/src/compare_all.go @@ -0,0 +1,325 @@ +package main + +import ( + "context" + "fmt" + "math" + "strconv" + "strings" + "time" + + "github.com/openai/openai-go/v3" + "go.mongodb.org/mongo-driver/bson" + "go.mongodb.org/mongo-driver/mongo" +) + +// CompareResult holds the result of a single algorithm+metric search +type 
CompareResult struct { + Algorithm string + Metric string + Results []SearchResult + Top1Name string + Top1Score float64 + Top2Name string + Top2Score float64 + Error error +} + +// indexSpec defines one of the 9 combinations +type indexSpec struct { + Algorithm string + Kind string + Metric string + IndexName string + Options bson.D +} + +// RunCompareAll executes all 9 algorithm×metric combinations on a single collection +func RunCompareAll(ctx context.Context, config *Config, dbClient *mongo.Client, aiClient openai.Client) error { + queryText := getEnvOrDefault("QUERY_TEXT", "luxury hotel near the beach") + topK, _ := strconv.Atoi(getEnvOrDefault("TOP_K", "5")) + + fmt.Println("\n" + strings.Repeat("=", 70)) + fmt.Println(" COMPARE ALL: 3 Algorithms × 3 Similarity Metrics (9 combinations)") + fmt.Println(strings.Repeat("=", 70)) + fmt.Printf("Query: %q\n", queryText) + fmt.Printf("Top-K: %d\n", topK) + + // 1. Drop collection for clean comparison, then load data + database := dbClient.Database(config.DatabaseName) + collection := database.Collection("hotels") + + // Drop existing collection for a clean comparison + if err := collection.Drop(ctx); err != nil { + fmt.Printf("Note: could not drop collection (may not exist): %v\n", err) + } else { + fmt.Println("Dropped existing 'hotels' collection") + } + + // Ensure cleanup on exit + defer func() { + fmt.Println("\nCleanup: dropping comparison collection...") + if dropErr := collection.Drop(ctx); dropErr != nil { + fmt.Printf("Cleanup warning: %v\n", dropErr) + } else { + fmt.Println("Cleanup: dropped collection 'hotels'") + } + }() + + fmt.Printf("\nLoading data from %s...\n", config.DataFile) + data, err := ReadFileReturnJSON(config.DataFile) + if err != nil { + return fmt.Errorf("failed to load data: %v", err) + } + + documentsWithEmbeddings := FilterDocumentsWithEmbeddings(data, config.VectorField) + if len(documentsWithEmbeddings) == 0 { + return fmt.Errorf("no documents found with embeddings in field '%s'", 
config.VectorField) + } + fmt.Printf("Loaded %d documents with embeddings\n", len(documentsWithEmbeddings)) + + stats, err := PrepareCollection(ctx, collection, documentsWithEmbeddings, config.BatchSize) + if err != nil { + return err + } + fmt.Printf("Insertion completed: %d inserted, %d failed\n", stats.Inserted, stats.Failed) + + // 2. Generate ONE embedding for the query (reused for all 9 searches) + fmt.Printf("\nGenerating embedding for query: %q\n", queryText) + queryEmbedding, err := GenerateEmbedding(ctx, aiClient, queryText, config.ModelName) + if err != nil { + return fmt.Errorf("failed to generate query embedding: %v", err) + } + fmt.Printf("Embedding generated (%d dimensions)\n", len(queryEmbedding)) + + // 3. Define all 9 index specs + metrics := []string{"COS", "L2", "IP"} + specs := buildIndexSpecs(config.VectorField, config.Dimensions, metrics) + + // 4. Create→search→drop each index sequentially (DocumentDB only allows one vector index per field) + fmt.Printf("\nRunning %d vector index comparisons (create→search→drop)...\n", len(specs)) + var results []CompareResult + + for _, spec := range specs { + // Drop all existing vector indexes on this field + DropVectorIndexes(ctx, collection, config.VectorField) + + // Create this specific index with retry (drop may still be in progress) + var createErr error + for attempt := 0; attempt < 3; attempt++ { + if attempt > 0 { + time.Sleep(3 * time.Second) + } + createErr = createNamedVectorIndex(ctx, collection, config.VectorField, spec) + if createErr == nil { + break + } + } + if createErr != nil { + results = append(results, CompareResult{ + Algorithm: spec.Algorithm, + Metric: spec.Metric, + Error: createErr, + }) + fmt.Printf(" ⚠ %s: %v\n", spec.IndexName, createErr) + continue + } + fmt.Printf(" ✓ %s created\n", spec.IndexName) + + // Wait for index to become ready + time.Sleep(10 * time.Second) + + // Search using simple cosmosSearch (with retry for index readiness) + var searchResults []SearchResult 
+ var searchErr error + for searchAttempt := 0; searchAttempt < 3; searchAttempt++ { + if searchAttempt > 0 { + time.Sleep(5 * time.Second) + } + searchResults, searchErr = vectorSearchSimple(ctx, collection, queryEmbedding, config.VectorField, topK) + if searchErr == nil && len(searchResults) > 0 { + break + } + } + + top1Name, top1Score := extractResult(searchResults, 0) + top2Name, top2Score := extractResult(searchResults, 1) + + cr := CompareResult{ + Algorithm: spec.Algorithm, + Metric: spec.Metric, + Results: searchResults, + Top1Name: top1Name, + Top1Score: top1Score, + Top2Name: top2Name, + Top2Score: top2Score, + Error: searchErr, + } + results = append(results, cr) + } + + // 6. Print comparison table + fmt.Println() + printComparisonTable(results) + + return nil +} + +// buildIndexSpecs creates the 9 index specifications +func buildIndexSpecs(vectorField string, dimensions int, metrics []string) []indexSpec { + var specs []indexSpec + + type algoConfig struct { + name string + kind string + options bson.D + } + + algos := []algoConfig{ + {"IVF", "vector-ivf", bson.D{{"numLists", 1}}}, + {"HNSW", "vector-hnsw", bson.D{{"m", 16}, {"efConstruction", 64}}}, + {"DiskANN", "vector-diskann", bson.D{{"maxDegree", 20}, {"lBuild", 10}}}, + } + + for _, algo := range algos { + for _, metric := range metrics { + metricLower := strings.ToLower(metric) + opts := bson.D{ + {"kind", algo.kind}, + {"dimensions", dimensions}, + {"similarity", metric}, + } + for _, o := range algo.options { + opts = append(opts, o) + } + + specs = append(specs, indexSpec{ + Algorithm: algo.name, + Kind: algo.kind, + Metric: metric, + IndexName: fmt.Sprintf("vector_%s_%s", strings.ToLower(algo.name), metricLower), + Options: opts, + }) + } + } + + return specs +} + +// createNamedVectorIndex creates a single named vector index +func createNamedVectorIndex(ctx context.Context, collection *mongo.Collection, vectorField string, spec indexSpec) error { + indexCommand := bson.D{ + 
{"createIndexes", collection.Name()}, + {"indexes", []bson.D{ + { + {"name", spec.IndexName}, + {"key", bson.D{ + {vectorField, "cosmosSearch"}, + }}, + {"cosmosSearchOptions", spec.Options}, + }, + }}, + } + + var result bson.M + err := collection.Database().RunCommand(ctx, indexCommand).Decode(&result) + if err != nil { + if strings.Contains(err.Error(), "already exists") || strings.Contains(err.Error(), "IndexAlreadyExists") { + return nil + } + return err + } + return nil +} + +// vectorSearchSimple performs a vector search using the active vector index +func vectorSearchSimple(ctx context.Context, collection *mongo.Collection, embedding []float64, vectorField string, topK int) ([]SearchResult, error) { + pipeline := []bson.M{ + { + "$search": bson.M{ + "cosmosSearch": bson.M{ + "vector": embedding, + "path": vectorField, + "k": topK, + }, + }, + }, + { + "$project": bson.M{ + "document": "$$ROOT", + "score": bson.M{"$meta": "searchScore"}, + }, + }, + } + + cursor, err := collection.Aggregate(ctx, pipeline) + if err != nil { + return nil, err + } + defer cursor.Close(ctx) + + var results []SearchResult + for cursor.Next(ctx) { + var result SearchResult + if err := cursor.Decode(&result); err != nil { + continue + } + results = append(results, result) + } + + if err := cursor.Err(); err != nil { + return nil, err + } + + return results, nil +} + +// extractResult returns the name and score of the result at the given index +func extractResult(results []SearchResult, idx int) (string, float64) { + if idx >= len(results) { + return "(no results)", 0 + } + doc := results[idx].Document.(bson.D) + var name string + for _, elem := range doc { + if elem.Key == "HotelName" { + name = fmt.Sprintf("%v", elem.Value) + break + } + } + if name == "" { + name = "Unknown" + } + return name, results[idx].Score +} + +// printComparisonTable outputs a formatted table of results +func printComparisonTable(results []CompareResult) { + 
fmt.Println("┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐") + fmt.Printf("│ %-8s │ %-6s │ %-26s │ %-6s │ %-26s │ %-6s │ %-5s │\n", + "Algorithm", "Metric", "Top 1 Result", "Score", "Top 2 Result", "Score", "Diff") + fmt.Println("├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤") + + for _, r := range results { + if r.Error != nil { + fmt.Printf("│ %-8s │ %-6s │ %-26s │ %-6s │ %-26s │ %-6s │ %-5s │\n", + r.Algorithm, r.Metric, "ERROR", "-", "-", "-", "-") + continue + } + + top1 := r.Top1Name + if len(top1) > 26 { + top1 = top1[:26] + } + top2 := r.Top2Name + if len(top2) > 26 { + top2 = top2[:26] + } + diff := math.Abs(r.Top1Score - r.Top2Score) + + fmt.Printf("│ %-8s │ %-6s │ %-26s │ %6.4f │ %-26s │ %6.4f │%6.4f │\n", + r.Algorithm, r.Metric, top1, r.Top1Score, top2, r.Top2Score, diff) + } + + fmt.Println("└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘") +} diff --git a/ai/select-algorithm-go/src/main.go b/ai/select-algorithm-go/src/main.go new file mode 100644 index 0000000..85e7e6e --- /dev/null +++ b/ai/select-algorithm-go/src/main.go @@ -0,0 +1,28 @@ +package main + +import ( + "context" + "fmt" + "log" +) + +func main() { + fmt.Println("Starting vector algorithm comparison...") + + ctx := context.Background() + config := LoadConfig() + + fmt.Println("\nInitializing clients with passwordless authentication...") + mongoClient, azureOpenAIClient, err := GetClientsPasswordless(ctx, config) + if err != nil { + log.Fatalf("Failed to initialize clients: %v", err) + } + defer mongoClient.Disconnect(ctx) + + err = RunCompareAll(ctx, config, mongoClient, azureOpenAIClient) + if err != nil { + log.Fatalf("Compare all failed: %v", err) + } + + fmt.Println("\nComparison completed successfully!") +} diff --git a/ai/select-algorithm-go/src/utils.go b/ai/select-algorithm-go/src/utils.go new file mode 
100644 index 0000000..8b415db --- /dev/null +++ b/ai/select-algorithm-go/src/utils.go @@ -0,0 +1,385 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "os" + "strconv" + "strings" + "time" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore/policy" + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + "github.com/openai/openai-go/v3" + "github.com/openai/openai-go/v3/azure" + "github.com/openai/openai-go/v3/option" + "go.mongodb.org/mongo-driver/bson" + "go.mongodb.org/mongo-driver/mongo" + "go.mongodb.org/mongo-driver/mongo/options" +) + +// Config holds the application configuration +type Config struct { + ClusterName string + DatabaseName string + DataFile string + VectorField string + ModelName string + Dimensions int + BatchSize int + Similarity string + Algorithm string +} + +// SearchResult represents a search result document +type SearchResult struct { + Document interface{} `bson:"document"` + Score float64 `bson:"score"` +} + +// InsertStats holds statistics about data insertion +type InsertStats struct { + Total int `json:"total"` + Inserted int `json:"inserted"` + Failed int `json:"failed"` +} + +// LoadConfig loads configuration from environment variables +func LoadConfig() *Config { + dimensions, _ := strconv.Atoi(getEnvOrDefault("EMBEDDING_DIMENSIONS", "1536")) + batchSize, _ := strconv.Atoi(getEnvOrDefault("LOAD_SIZE_BATCH", "100")) + + return &Config{ + ClusterName: getEnvOrDefault("MONGO_CLUSTER_NAME", ""), + DatabaseName: getEnvOrDefault("AZURE_DOCUMENTDB_DATABASENAME", "Hotels"), + DataFile: getEnvOrDefault("DATA_FILE_WITH_VECTORS", "data/Hotels_Vector.json"), + VectorField: getEnvOrDefault("EMBEDDED_FIELD", "DescriptionVector"), + ModelName: getEnvOrDefault("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"), + Dimensions: dimensions, + BatchSize: batchSize, + Similarity: getEnvOrDefault("SIMILARITY", ""), + Algorithm: strings.ToLower(getEnvOrDefault("ALGORITHM", "")), + } +} + +// getEnvOrDefault returns environment 
variable value or default if not set +func getEnvOrDefault(key, defaultValue string) string { + if value := os.Getenv(key); value != "" { + return value + } + return defaultValue +} + +// GetClientsPasswordless creates MongoDB and Azure OpenAI clients with passwordless authentication +func GetClientsPasswordless(ctx context.Context, config *Config) (*mongo.Client, openai.Client, error) { + if config.ClusterName == "" { + return nil, openai.Client{}, fmt.Errorf("MONGO_CLUSTER_NAME environment variable is required") + } + + // Create Azure credential + credential, err := azidentity.NewDefaultAzureCredential(nil) + if err != nil { + return nil, openai.Client{}, fmt.Errorf("failed to create Azure credential: %v", err) + } + + // Connect to DocumentDB with OIDC authentication + mongoURI := fmt.Sprintf("mongodb+srv://%s.global.mongocluster.cosmos.azure.com/", config.ClusterName) + + fmt.Println("Attempting OIDC authentication...") + mongoClient, err := connectWithOIDC(ctx, mongoURI, credential) + if err != nil { + return nil, openai.Client{}, fmt.Errorf("OIDC authentication failed: %v", err) + } + fmt.Println("OIDC authentication successful!") + + // Get Azure OpenAI endpoint + azureOpenAIEndpoint := os.Getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT") + if azureOpenAIEndpoint == "" { + return nil, openai.Client{}, fmt.Errorf("AZURE_OPENAI_EMBEDDING_ENDPOINT environment variable is required") + } + + // Create Azure OpenAI client with credential-based authentication + openAIClient := openai.NewClient( + option.WithBaseURL(fmt.Sprintf("%s/openai/v1", azureOpenAIEndpoint)), + azure.WithTokenCredential(credential)) + + return mongoClient, openAIClient, nil +} + +// connectWithOIDC attempts to connect using OIDC authentication +func connectWithOIDC(ctx context.Context, mongoURI string, credential *azidentity.DefaultAzureCredential) (*mongo.Client, error) { + oidcCallback := func(ctx context.Context, args *options.OIDCArgs) (*options.OIDCCredential, error) { + scope := 
"https://ossrdbms-aad.database.windows.net/.default" + fmt.Printf("Getting token with scope: %s\n", scope) + token, err := credential.GetToken(ctx, policy.TokenRequestOptions{ + Scopes: []string{scope}, + }) + if err != nil { + return nil, fmt.Errorf("failed to get token with scope %s: %v", scope, err) + } + + fmt.Printf("Successfully obtained token\n") + + return &options.OIDCCredential{ + AccessToken: token.Token, + }, nil + } + + clientOptions := options.Client(). + ApplyURI(mongoURI). + SetConnectTimeout(30 * time.Second). + SetServerSelectionTimeout(30 * time.Second). + SetRetryWrites(false). + SetAuth(options.Credential{ + AuthMechanism: "MONGODB-OIDC", + AuthMechanismProperties: map[string]string{ + "TOKEN_RESOURCE": "https://ossrdbms-aad.database.windows.net", + }, + OIDCMachineCallback: oidcCallback, + }) + + mongoClient, err := mongo.Connect(ctx, clientOptions) + if err != nil { + return nil, err + } + + return mongoClient, nil +} + +// ReadFileReturnJSON reads a JSON file and returns the data as a slice of maps +func ReadFileReturnJSON(filePath string) ([]map[string]interface{}, error) { + file, err := os.ReadFile(filePath) + if err != nil { + return nil, fmt.Errorf("error reading file '%s': %v", filePath, err) + } + + var data []map[string]interface{} + err = json.Unmarshal(file, &data) + if err != nil { + return nil, fmt.Errorf("error parsing JSON in file '%s': %v", filePath, err) + } + + return data, nil +} + +// InsertData inserts data into a MongoDB collection in batches +func InsertData(ctx context.Context, collection *mongo.Collection, data []map[string]interface{}, batchSize int) (*InsertStats, error) { + totalDocuments := len(data) + insertedCount := 0 + failedCount := 0 + + fmt.Printf("Starting batch insertion of %d documents...\n", totalDocuments) + + for i := 0; i < totalDocuments; i += batchSize { + end := i + batchSize + if end > totalDocuments { + end = totalDocuments + } + + batch := data[i:end] + batchNum := (i / batchSize) + 1 + + 
documents := make([]interface{}, len(batch)) + for j, doc := range batch { + documents[j] = doc + } + + result, err := collection.InsertMany(ctx, documents, options.InsertMany().SetOrdered(false)) + if err != nil { + if bulkErr, ok := err.(mongo.BulkWriteException); ok { + errorCount := len(bulkErr.WriteErrors) + insertedCount += len(batch) - errorCount + failedCount += errorCount + fmt.Printf("Batch %d had errors: %d inserted, %d failed\n", batchNum, len(batch)-errorCount, errorCount) + for _, writeErr := range bulkErr.WriteErrors { + fmt.Printf(" Error: %s\n", writeErr.Message) + } + } else { + failedCount += len(batch) + fmt.Printf("Batch %d failed completely: %v\n", batchNum, err) + } + } else { + insertedCount += len(result.InsertedIDs) + fmt.Printf("Batch %d completed: %d documents inserted\n", batchNum, len(result.InsertedIDs)) + } + + time.Sleep(100 * time.Millisecond) + } + + return &InsertStats{ + Total: totalDocuments, + Inserted: insertedCount, + Failed: failedCount, + }, nil +} + +// DropVectorIndexes drops existing vector indexes on the specified field +func DropVectorIndexes(ctx context.Context, collection *mongo.Collection, vectorField string) error { + cursor, err := collection.Indexes().List(ctx) + if err != nil { + return fmt.Errorf("could not list indexes: %v", err) + } + defer cursor.Close(ctx) + + var vectorIndexes []string + for cursor.Next(ctx) { + var index bson.M + if err := cursor.Decode(&index); err != nil { + continue + } + + if key, ok := index["key"].(bson.M); ok { + if indexType, exists := key[vectorField]; exists && indexType == "cosmosSearch" { + if name, ok := index["name"].(string); ok { + vectorIndexes = append(vectorIndexes, name) + } + } + } + } + + for _, indexName := range vectorIndexes { + fmt.Printf("Dropping existing vector index: %s\n", indexName) + _, err := collection.Indexes().DropOne(ctx, indexName) + if err != nil { + fmt.Printf("Warning: Could not drop index %s: %v\n", indexName, err) + } + } + + if 
len(vectorIndexes) > 0 { + fmt.Printf("Dropped %d existing vector index(es)\n", len(vectorIndexes)) + } else { + fmt.Println("No existing vector indexes found to drop") + } + + return nil +} + +// PerformVectorSearch performs a vector search using the cosmosSearch aggregation pipeline +func PerformVectorSearch(ctx context.Context, collection *mongo.Collection, client openai.Client, query, vectorField, model string, topK int) ([]SearchResult, error) { + fmt.Printf("Performing vector search for: '%s'\n", query) + + queryEmbedding, err := GenerateEmbedding(ctx, client, query, model) + if err != nil { + return nil, fmt.Errorf("error generating embedding: %v", err) + } + + pipeline := []bson.M{ + { + "$search": bson.M{ + "cosmosSearch": bson.M{ + "vector": queryEmbedding, + "path": vectorField, + "k": topK, + }, + }, + }, + { + "$project": bson.M{ + "document": "$$ROOT", + "score": bson.M{"$meta": "searchScore"}, + }, + }, + } + + cursor, err := collection.Aggregate(ctx, pipeline) + if err != nil { + return nil, fmt.Errorf("error performing vector search: %v", err) + } + defer cursor.Close(ctx) + + var results []SearchResult + for cursor.Next(ctx) { + var result SearchResult + if err := cursor.Decode(&result); err != nil { + fmt.Printf("Warning: Could not decode result: %v\n", err) + continue + } + results = append(results, result) + } + + if err := cursor.Err(); err != nil { + return nil, fmt.Errorf("cursor error: %v", err) + } + + return results, nil +} + +// GenerateEmbedding generates an embedding for the given text using Azure OpenAI +func GenerateEmbedding(ctx context.Context, client openai.Client, text, modelName string) ([]float64, error) { + resp, err := client.Embeddings.New(ctx, openai.EmbeddingNewParams{ + Input: openai.EmbeddingNewParamsInputUnion{ + OfString: openai.String(text), + }, + Model: modelName, + }) + if err != nil { + return nil, fmt.Errorf("failed to generate embedding: %v", err) + } + + if len(resp.Data) == 0 { + return nil, fmt.Errorf("no 
embedding data received") + } + + embedding := make([]float64, len(resp.Data[0].Embedding)) + for i, v := range resp.Data[0].Embedding { + embedding[i] = float64(v) + } + + return embedding, nil +} + +// PrintSearchResults prints search results in a formatted way +func PrintSearchResults(results []SearchResult, algorithm string) { + if len(results) == 0 { + fmt.Println("No search results found.") + return + } + + fmt.Printf("\n%s Search Results (top %d):\n", strings.ToUpper(algorithm), len(results)) + fmt.Println(strings.Repeat("=", 80)) + + for i, result := range results { + doc := result.Document.(bson.D) + var hotelName string + for _, elem := range doc { + if elem.Key == "HotelName" { + hotelName = fmt.Sprintf("%v", elem.Value) + break + } + } + + fmt.Printf("%d. HotelName: %s, Score: %.4f\n", i+1, hotelName, result.Score) + } +} + +// FilterDocumentsWithEmbeddings returns only documents that contain the vector field +func FilterDocumentsWithEmbeddings(data []map[string]interface{}, vectorField string) []map[string]interface{} { + var filtered []map[string]interface{} + for _, doc := range data { + if _, exists := doc[vectorField]; exists { + filtered = append(filtered, doc) + } + } + return filtered +} + +// PrepareCollection clears existing data and inserts new documents +func PrepareCollection(ctx context.Context, collection *mongo.Collection, data []map[string]interface{}, batchSize int) (*InsertStats, error) { + fmt.Printf("Preparing collection '%s'...\n", collection.Name()) + + deleteResult, err := collection.DeleteMany(ctx, bson.M{}) + if err != nil { + return nil, fmt.Errorf("failed to clear existing data: %v", err) + } + if deleteResult.DeletedCount > 0 { + fmt.Printf("Cleared %d existing documents from collection\n", deleteResult.DeletedCount) + } + + stats, err := InsertData(ctx, collection, data, batchSize) + if err != nil { + return nil, fmt.Errorf("failed to insert data: %v", err) + } + + return stats, nil +} diff --git 
a/ai/select-algorithm-java/.env.example b/ai/select-algorithm-java/.env.example new file mode 100644 index 0000000..59637e5 --- /dev/null +++ b/ai/select-algorithm-java/.env.example @@ -0,0 +1,32 @@ +# Azure DocumentDB cluster name (find in Azure Portal > DocumentDB > Overview) +MONGO_CLUSTER_NAME=your-cluster-name + +# Azure OpenAI embedding endpoint (find in Azure Portal > Azure OpenAI > Keys and Endpoint) +AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-resource.openai.azure.com + +# Azure OpenAI embedding model deployment name +AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + +# Path to pre-computed vectors JSON file +DATA_FILE_WITH_VECTORS=./Hotels_Vector.json + +# Database name (default: Hotels) +AZURE_DOCUMENTDB_DATABASENAME=Hotels + +# Field name containing embeddings in the data file +EMBEDDED_FIELD=DescriptionVector + +# Embedding dimensions (default: 1536) +EMBEDDING_DIMENSIONS=1536 + +# Batch size for loading data (default: 100) +LOAD_SIZE_BATCH=100 + +# Batch size for embedding requests (default: 16) +EMBEDDING_SIZE_BATCH=16 + +# Algorithm to test: leave empty to run all, or set to: diskann, hnsw, ivf +ALGORITHM= + +# SIMILARITY - leave empty to run all similarity types, or set to: COS, L2, IP +SIMILARITY= diff --git a/ai/select-algorithm-java/.gitignore b/ai/select-algorithm-java/.gitignore new file mode 100644 index 0000000..9ae5e73 --- /dev/null +++ b/ai/select-algorithm-java/.gitignore @@ -0,0 +1,7 @@ +target/ +.env +*.class + +# Local data copy (user copies from ai/data/) +data/*.json +!data/README.md diff --git a/ai/select-algorithm-java/README.md b/ai/select-algorithm-java/README.md new file mode 100644 index 0000000..a94b022 --- /dev/null +++ b/ai/select-algorithm-java/README.md @@ -0,0 +1,128 @@ +# Select Algorithm - Java + +This sample demonstrates how to compare all three vector search index algorithms (IVF, HNSW, DiskANN) with Azure DocumentDB using the MongoDB Java driver. 
+ +## Prerequisites + +- Java 17 or later +- Maven 3.8+ +- Azure DocumentDB cluster with vector search enabled +- Azure OpenAI resource with an embedding model deployed +- Azure CLI logged in (`az login`) for passwordless authentication + +## Setup + +1. ### Configure environment variables + + After deploying with `azd up`, create a `.env` file with your provisioned resource values: + + ```bash + azd env get-values > .env + ``` + + This creates a `.env` file at the repository root with the connection strings and endpoints needed to run the sample. + + Alternatively, copy the example and fill in values manually: + + ```bash + cp .env.example .env + ``` + +2. Update `.env` with your Azure resource details (if not using `azd`): + - `MONGO_CLUSTER_NAME` — your DocumentDB cluster name + - `AZURE_OPENAI_EMBEDDING_ENDPOINT` — your Azure OpenAI endpoint + - `AZURE_OPENAI_EMBEDDING_MODEL` — deployment name (e.g., `text-embedding-3-small`) + - `DATA_FILE_WITH_VECTORS` — path to the pre-computed vectors JSON file + +3. Copy the shared data file: + + ```bash + cp ../data/Hotels_Vector.json . 
+ ``` + +## Build + +```bash +mvn clean compile +``` + +## Run + +Compare all 9 algorithm × similarity combinations: + +```bash +mvn exec:java -Pcompare +``` + +Or via the `ALGORITHM` environment variable: + +```bash +ALGORITHM=compare mvn exec:java +``` + +On Windows (PowerShell): + +```powershell +$env:ALGORITHM="compare"; mvn exec:java +``` + +## Algorithms + +| Algorithm | Description | Best For | +|-----------|-------------|----------| +| **IVF** | Inverted File index — partitions vectors into clusters | Large datasets with batch queries | +| **HNSW** | Hierarchical Navigable Small World graph | Low-latency, high-recall searches | +| **DiskANN** | Disk-based Approximate Nearest Neighbor | Very large datasets that exceed memory | + +## Configuration + +| Variable | Default | Description | +|----------|---------|-------------| +| `MONGO_CLUSTER_NAME` | (required) | DocumentDB cluster name | +| `AZURE_OPENAI_EMBEDDING_ENDPOINT` | (required) | Azure OpenAI endpoint | +| `AZURE_OPENAI_EMBEDDING_MODEL` | (required) | Embedding model deployment name | +| `DATA_FILE_WITH_VECTORS` | `../data/Hotels_Vector.json` | Path to vectors JSON file | +| `EMBEDDED_FIELD` | `DescriptionVector` | Field name containing embeddings | +| `EMBEDDING_DIMENSIONS` | `1536` | Vector dimensions | +| `AZURE_DOCUMENTDB_DATABASENAME` | `Hotels` | Target database name | +| `LOAD_SIZE_BATCH` | `100` | Batch size for data loading | +| `EMBEDDING_SIZE_BATCH` | `16` | Batch size for embedding requests | +| `ALGORITHM` | (empty = all) | Which algorithm to run | +| `SIMILARITY` | (empty = all) | Similarity metric: `COS`, `L2`, `IP` | +| `QUERY_TEXT` | `luxury hotel near the beach` | Search query text | +| `TOP_K` | `5` | Number of results per search | +| `VERBOSE` | `false` | Print detailed per-index results | + +## Authentication + +This sample uses **passwordless authentication** via `DefaultAzureCredential`: + +- **DocumentDB**: OIDC mechanism with Azure identity +- **Azure OpenAI**: Entra ID 
token-based auth + +Ensure your identity has the appropriate RBAC roles assigned on both resources. + +### What It Does + +1. Connects to DocumentDB and loads hotel data into a single `hotels` collection +2. Generates one embedding for the query text (reused for all searches) +3. For each of the 9 algorithm/metric combinations: creates the index → searches → drops the index +4. DocumentDB only allows one vector index per kind per field, so indexes are created sequentially +5. Prints a formatted comparison table with scores, top results, and key insights + +### Index Parameters + +| Algorithm | Kind | Parameters | +|-----------|------|------------| +| IVF | `vector-ivf` | numLists=1 | +| HNSW | `vector-hnsw` | m=16, efConstruction=64 | +| DiskANN | `vector-diskann` | maxDegree=20, lBuild=10 | + +## Project Structure + +``` +src/main/java/com/azure/documentdb/selectalgorithm/ +├── Main.java — Entry point, runs CompareAll +├── Utils.java — Shared helpers (connection, embedding, data loading) +└── CompareAll.java — Unified comparison runner (all 9 combinations) +``` diff --git a/ai/select-algorithm-java/data/README.md b/ai/select-algorithm-java/data/README.md new file mode 100644 index 0000000..c918009 --- /dev/null +++ b/ai/select-algorithm-java/data/README.md @@ -0,0 +1,5 @@ +# Data Files + +Copy `Hotels_Vector.json` into this folder before running the sample. + +The file is available in the repository at `ai/data/Hotels_Vector.json`. 
diff --git a/ai/select-algorithm-java/output/compare_all.txt b/ai/select-algorithm-java/output/compare_all.txt new file mode 100644 index 0000000..7794fd5 --- /dev/null +++ b/ai/select-algorithm-java/output/compare_all.txt @@ -0,0 +1,50 @@ +============================================== + Azure DocumentDB - Compare All Algorithms +============================================== + Query: "luxury hotel near the beach" + Top K: 5 + Metrics: COS, L2, IP + Algos: IVF, HNSW, DiskANN + + Loading data from: data/Hotels_Vector.json + Loaded 50 documents + Collection reset. + + Generating embedding for: "luxury hotel near the beach" + Embedding generated (1536 dimensions) + + Running 9 algorithm × metric combinations... + ✓ vector_ivf_cos created + ✓ vector_ivf_l2 created + ✓ vector_ivf_ip created + ✓ vector_hnsw_cos created + ✓ vector_hnsw_l2 created + ✓ vector_hnsw_ip created + ✓ vector_diskann_cos created + ✓ vector_diskann_l2 created + ✓ vector_diskann_ip created + + Cleanup: dropping comparison collection... 
+ Cleanup: dropped collection 'hotels' + +┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ +│ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ 
+└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ diff --git a/ai/select-algorithm-java/pom.xml b/ai/select-algorithm-java/pom.xml new file mode 100644 index 0000000..99c57e9 --- /dev/null +++ b/ai/select-algorithm-java/pom.xml @@ -0,0 +1,79 @@ + + + 4.0.0 + + com.azure.documentdb + select-algorithm-java + 1.0.0 + jar + + DocumentDB Select Algorithm - Java + Demonstrates IVF, HNSW, and DiskANN vector search indexes with Azure DocumentDB + + + 17 + 17 + UTF-8 + + + + + org.mongodb + mongodb-driver-sync + 5.4.0 + + + com.azure + azure-identity + 1.16.0 + + + com.azure + azure-ai-openai + 1.0.0-beta.16 + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.13.0 + + 17 + 17 + + + + org.codehaus.mojo + exec-maven-plugin + 3.4.1 + + com.azure.documentdb.selectalgorithm.Main + + + + + + + + compare + + + + org.codehaus.mojo + exec-maven-plugin + 3.4.1 + + com.azure.documentdb.selectalgorithm.CompareAll + + + + + + + diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java new file mode 100644 index 0000000..942dc69 --- /dev/null +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java @@ -0,0 +1,223 @@ +package com.azure.documentdb.selectalgorithm; + +import com.azure.ai.openai.OpenAIClient; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; +import org.bson.Document; + +import java.util.ArrayList; +import java.util.List; + +/** + * Unified comparison runner that executes all 9 combinations + * (3 algorithms x 3 similarity metrics) and prints a formatted table. 
+ */ +public class CompareAll { + + private static final String COLLECTION_NAME = "hotels"; + private static final String[] ALGORITHMS = {"ivf", "hnsw", "diskann"}; + private static final String[] METRICS = {"COS", "L2", "IP"}; + + public static void main(String[] args) { + run(); + } + + public static void run() { + String queryText = Utils.getEnv("QUERY_TEXT", "luxury hotel near the beach"); + int topK = Integer.parseInt(Utils.getEnv("TOP_K", "5")); + + String databaseName = Utils.getEnv("AZURE_DOCUMENTDB_DATABASENAME", "Hotels"); + String dataFile = Utils.getEnv("DATA_FILE_WITH_VECTORS", "data/Hotels_Vector.json"); + String vectorField = Utils.getEnv("EMBEDDED_FIELD", "DescriptionVector"); + int dimensions = Integer.parseInt(Utils.getEnv("EMBEDDING_DIMENSIONS", "1536")); + String model = Utils.getEnv("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"); + + System.out.println("=============================================="); + System.out.println(" Azure DocumentDB - Compare All Algorithms"); + System.out.println("=============================================="); + System.out.printf(" Query: \"%s\"%n", queryText); + System.out.printf(" Top K: %d%n", topK); + System.out.printf(" Metrics: COS, L2, IP%n"); + System.out.printf(" Algos: IVF, HNSW, DiskANN%n"); + System.out.println(); + + List results = new ArrayList<>(); + + try (MongoClient mongoClient = Utils.getMongoClient()) { + MongoDatabase database = mongoClient.getDatabase(databaseName); + MongoCollection collection = database.getCollection(COLLECTION_NAME); + + // Load data ONCE into the single collection + System.out.println(" Loading data from: " + dataFile); + List data = Utils.readJsonFile(dataFile); + System.out.printf(" Loaded %d documents%n", data.size()); + + collection.drop(); + System.out.println(" Collection reset."); + Utils.insertData(collection, data, 100); + + // Generate ONE embedding for the query (reused for all 9 searches) + OpenAIClient aiClient = Utils.getOpenAIClient(); + 
System.out.printf("%n Generating embedding for: \"%s\"%n", queryText); + List queryVector = Utils.getEmbedding(aiClient, queryText, model); + System.out.printf(" Embedding generated (%d dimensions)%n%n", queryVector.size()); + + // Convert to doubles for BSON + List vectorAsDoubles = queryVector.stream() + .map(Float::doubleValue) + .toList(); + + // Run 9 algorithm × metric combinations sequentially (create→search→drop) + // DocumentDB does not allow multiple vector indexes of the same kind + // on the same field path simultaneously. + System.out.println(" Running 9 algorithm × metric combinations...\n"); + for (String algo : ALGORITHMS) { + for (String metric : METRICS) { + String indexName = String.format("vector_%s_%s", algo, metric.toLowerCase()); + + // 1. Drop all existing vector indexes + dropVectorIndexes(collection, vectorField); + + // 2. Create this specific index + createIndex(database, collection, vectorField, dimensions, algo, metric); + System.out.printf(" ✓ %s created%n", indexName); + + // 3. Wait for index to build + try { Thread.sleep(5000); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } + + // 4. Search + List searchResults = performSearch( + collection, vectorAsDoubles, vectorField, topK); + + // 5. Extract top 2 results + String top1Name = "-"; double top1Score = 0.0; + String top2Name = "-"; double top2Score = 0.0; + if (!searchResults.isEmpty()) { + Document top1 = searchResults.get(0); + top1Name = top1.getString("HotelName") != null ? top1.getString("HotelName") : "-"; + top1Score = top1.getDouble("score") != null ? top1.getDouble("score") : 0.0; + } + if (searchResults.size() > 1) { + Document top2 = searchResults.get(1); + top2Name = top2.getString("HotelName") != null ? top2.getString("HotelName") : "-"; + top2Score = top2.getDouble("score") != null ? 
top2.getDouble("score") : 0.0; + } + results.add(new SearchResult(algo.toUpperCase(), metric, top1Name, top1Score, top2Name, top2Score)); + } + } + + // Cleanup: drop the comparison collection + System.out.println("\n Cleanup: dropping comparison collection..."); + collection.drop(); + System.out.println(" Cleanup: dropped collection 'hotels'"); + } + + // Print comparison table + printComparisonTable(results); + } + + private static void dropVectorIndexes(MongoCollection collection, String vectorField) { + for (Document idx : collection.listIndexes()) { + String name = idx.getString("name"); + Document key = idx.get("key", Document.class); + if (key != null && "cosmosSearch".equals(key.getString(vectorField))) { + try { + collection.dropIndex(name); + } catch (Exception e) { + // Ignore if index doesn't exist + } + } + } + } + + private static void createIndex(MongoDatabase database, MongoCollection collection, + String vectorField, int dimensions, + String algo, String metric) { + String indexName = String.format("vector_%s_%s", algo, metric.toLowerCase()); + + Document cosmosSearchOptions = new Document() + .append("dimensions", dimensions) + .append("similarity", metric); + + switch (algo) { + case "ivf" -> cosmosSearchOptions + .append("kind", "vector-ivf") + .append("numLists", 1); + case "hnsw" -> cosmosSearchOptions + .append("kind", "vector-hnsw") + .append("m", 16) + .append("efConstruction", 64); + case "diskann" -> cosmosSearchOptions + .append("kind", "vector-diskann") + .append("maxDegree", 20) + .append("lBuild", 10); + } + + Document indexDefinition = new Document() + .append("name", indexName) + .append("key", new Document(vectorField, "cosmosSearch")) + .append("cosmosSearchOptions", cosmosSearchOptions); + + Document command = new Document("createIndexes", collection.getNamespace().getCollectionName()) + .append("indexes", List.of(indexDefinition)); + + try { + database.runCommand(command); + } catch (Exception e) { + // Idempotent: ignore if 
index already exists + if (!e.getMessage().contains("already exists")) { + throw e; + } + } + } + + private static List performSearch(MongoCollection collection, + List vectorAsDoubles, + String vectorField, int topK) { + Document searchStage = new Document("$search", new Document("cosmosSearch", new Document() + .append("vector", vectorAsDoubles) + .append("path", vectorField) + .append("k", topK))); + + Document projectStage = new Document("$project", new Document() + .append("_id", 0) + .append("HotelName", 1) + .append("Description", 1) + .append("score", new Document("$meta", "searchScore"))); + + List pipeline = List.of(searchStage, projectStage); + List results = new ArrayList<>(); + collection.aggregate(pipeline).forEach(results::add); + return results; + } + + private static void printComparisonTable(List results) { + System.out.println("┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐"); + System.out.printf("│ %-9s│ %-7s│ %-27s│ %-7s│ %-27s│ %-7s│ %-6s│%n", + "Algorithm", "Metric", "Top 1 Result", "Score", "Top 2 Result", "Score", "Diff"); + System.out.println("├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤"); + + for (int i = 0; i < results.size(); i++) { + SearchResult r = results.get(i); + double diff = Math.abs(r.top1Score() - r.top2Score()); + String top1Display = r.top1Name().length() > 27 ? r.top1Name().substring(0, 24) + "..." : r.top1Name(); + String top2Display = r.top2Name().length() > 27 ? r.top2Name().substring(0, 24) + "..." 
: r.top2Name(); + System.out.printf("│ %-9s│ %-7s│ %-27s│ %-7.4f│ %-27s│ %-7.4f│ %-6.4f│%n", + r.algorithm(), r.metric(), top1Display, r.top1Score(), top2Display, r.top2Score(), diff); + if (i < results.size() - 1) { + System.out.println("├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤"); + } + } + System.out.println("└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘"); + } + + private record SearchResult( + String algorithm, + String metric, + String top1Name, + double top1Score, + String top2Name, + double top2Score) { + } +} diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java new file mode 100644 index 0000000..5a9d54c --- /dev/null +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java @@ -0,0 +1,17 @@ +package com.azure.documentdb.selectalgorithm; + +public class Main { + + public static void main(String[] args) { + System.out.println("=============================================="); + System.out.println(" Azure DocumentDB - Compare All Algorithms"); + System.out.println("=============================================="); + System.out.println(); + + CompareAll.run(); + + System.out.println("=============================================="); + System.out.println(" Comparison complete."); + System.out.println("=============================================="); + } +} diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java new file mode 100644 index 0000000..8ed19d0 --- /dev/null +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java @@ -0,0 +1,190 @@ +package com.azure.documentdb.selectalgorithm; + +import 
com.azure.ai.openai.OpenAIClient; +import com.azure.ai.openai.OpenAIClientBuilder; +import com.azure.ai.openai.models.EmbeddingItem; +import com.azure.ai.openai.models.EmbeddingsOptions; +import com.azure.core.credential.AccessToken; +import com.azure.identity.DefaultAzureCredential; +import com.azure.identity.DefaultAzureCredentialBuilder; +import com.mongodb.ConnectionString; +import com.mongodb.MongoClientSettings; +import com.mongodb.MongoCredential; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoClients; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.model.InsertManyOptions; +import org.bson.Document; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +public class Utils { + + public static String getEnv(String key, String defaultValue) { + String value = System.getenv(key); + return (value != null && !value.isBlank()) ? value : defaultValue; + } + + public static String getEnv(String key) { + return getEnv(key, null); + } + + public static MongoClient getMongoClient() { + String clusterName = getEnv("MONGO_CLUSTER_NAME"); + if (clusterName == null) { + throw new IllegalStateException("MONGO_CLUSTER_NAME environment variable is required"); + } + + String connectionUri = String.format( + "mongodb+srv://%s.global.mongocluster.cosmos.azure.com/", clusterName); + + // Use custom OIDC callback with DefaultAzureCredential + // This chains through CLI, managed identity, etc. 
+ DefaultAzureCredential credential = new DefaultAzureCredentialBuilder().build(); + String tokenResource = "https://ossrdbms-aad.database.windows.net/.default"; + + MongoCredential mongoCredential = MongoCredential.createOidcCredential(null) + .withMechanismProperty("OIDC_CALLBACK", (MongoCredential.OidcCallback) context -> { + AccessToken token = credential.getToken( + new com.azure.core.credential.TokenRequestContext() + .addScopes(tokenResource)).block(); + return new MongoCredential.OidcCallbackResult(token.getToken()); + }); + + MongoClientSettings settings = MongoClientSettings.builder() + .applyConnectionString(new ConnectionString(connectionUri)) + .credential(mongoCredential) + .retryWrites(false) + .build(); + + return MongoClients.create(settings); + } + + public static OpenAIClient getOpenAIClient() { + String endpoint = getEnv("AZURE_OPENAI_EMBEDDING_ENDPOINT"); + if (endpoint == null) { + throw new IllegalStateException("AZURE_OPENAI_EMBEDDING_ENDPOINT environment variable is required"); + } + + DefaultAzureCredential credential = new DefaultAzureCredentialBuilder().build(); + + return new OpenAIClientBuilder() + .endpoint(endpoint) + .credential(credential) + .buildClient(); + } + + public static List readJsonFile(String path) { + try { + String content = Files.readString(Path.of(path)); + // Parse JSON array of documents + @SuppressWarnings("unchecked") + List docs = Document.parse("{\"data\":" + content + "}").getList("data", Document.class); + return docs; + } catch (IOException e) { + throw new RuntimeException("Failed to read data file: " + path, e); + } + } + + public static void insertData(MongoCollection collection, List data, int batchSize) { + System.out.printf(" Inserting %d documents in batches of %d...%n", data.size(), batchSize); + InsertManyOptions options = new InsertManyOptions().ordered(false); + + for (int i = 0; i < data.size(); i += batchSize) { + List batch = data.subList(i, Math.min(i + batchSize, data.size())); + // Remove 
_id to avoid duplicate key errors on re-run + List cleaned = new ArrayList<>(); + for (Document doc : batch) { + Document copy = new Document(doc); + copy.remove("_id"); + cleaned.add(copy); + } + try { + collection.insertMany(cleaned, options); + } catch (Exception e) { + // Ignore duplicate key errors on re-insert + if (!e.getMessage().contains("duplicate key")) { + throw e; + } + } + System.out.printf(" Inserted batch %d-%d%n", i + 1, Math.min(i + batchSize, data.size())); + } + System.out.println(" Data insertion complete."); + } + + public static void dropVectorIndexes(MongoCollection collection, String vectorField) { + try { + for (Document idx : collection.listIndexes()) { + String name = idx.getString("name"); + if (name != null && name.contains(vectorField) && !name.equals("_id_")) { + System.out.printf(" Dropping existing index: %s%n", name); + collection.dropIndex(name); + } + } + } catch (Exception e) { + // Ignore errors when indexes don't exist + System.out.println(" No existing vector indexes to drop."); + } + } + + public static List getEmbedding(OpenAIClient client, String text, String model) { + EmbeddingsOptions options = new EmbeddingsOptions(List.of(text)); + List embeddings = client.getEmbeddings(model, options).getData(); + if (embeddings.isEmpty()) { + throw new RuntimeException("No embedding returned for query text"); + } + return embeddings.get(0).getEmbedding(); + } + + public static List performVectorSearch( + MongoCollection collection, + OpenAIClient aiClient, + String query, + String vectorField, + String model, + int topK) { + + System.out.printf(" Generating embedding for query: \"%s\"%n", query); + List queryVector = getEmbedding(aiClient, query, model); + System.out.printf(" Embedding generated (%d dimensions)%n", queryVector.size()); + + // Convert List to List for BSON + List vectorAsDoubles = queryVector.stream() + .map(Float::doubleValue) + .toList(); + + Document searchStage = new Document("$search", new 
Document("cosmosSearch", new Document() + .append("vector", vectorAsDoubles) + .append("path", vectorField) + .append("k", topK))); + + Document projectStage = new Document("$project", new Document() + .append("_id", 0) + .append("HotelName", 1) + .append("Description", 1) + .append("score", new Document("$meta", "searchScore"))); + + List pipeline = List.of(searchStage, projectStage); + List results = new ArrayList<>(); + collection.aggregate(pipeline).forEach(results::add); + + return results; + } + + public static void printResults(List results) { + System.out.println("\n === Search Results ==="); + for (int i = 0; i < results.size(); i++) { + Document doc = results.get(i); + System.out.printf(" %d. %s (score: %.4f)%n", + i + 1, + doc.getString("HotelName"), + doc.getDouble("score")); + System.out.printf(" %s%n", doc.getString("Description")); + } + System.out.println(); + } +} diff --git a/ai/select-algorithm-python/.env.example b/ai/select-algorithm-python/.env.example new file mode 100644 index 0000000..4a2baf8 --- /dev/null +++ b/ai/select-algorithm-python/.env.example @@ -0,0 +1,32 @@ +# Azure DocumentDB cluster name (find in Azure Portal > DocumentDB > Overview) +MONGO_CLUSTER_NAME=your-cluster-name + +# Azure OpenAI embedding endpoint (find in Azure Portal > Azure OpenAI > Keys and Endpoint) +AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-resource.openai.azure.com + +# Azure OpenAI embedding model deployment name +AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + +# Path to pre-computed vectors JSON file +DATA_FILE_WITH_VECTORS=./Hotels_Vector.json + +# Database name (default: Hotels) +AZURE_DOCUMENTDB_DATABASENAME=Hotels + +# Field name containing embeddings in the data file +EMBEDDED_FIELD=DescriptionVector + +# Embedding dimensions (default: 1536) +EMBEDDING_DIMENSIONS=1536 + +# Batch size for loading data (default: 100) +LOAD_SIZE_BATCH=100 + +# Batch size for embedding requests (default: 16) +EMBEDDING_SIZE_BATCH=16 + +# Algorithm to test: diskann, 
hnsw, ivf (leave empty to run compare_all.py instead) +ALGORITHM= + +# SIMILARITY - leave empty to run all similarity types, or set to: COS, L2, IP +SIMILARITY= \ No newline at end of file diff --git a/ai/select-algorithm-python/.gitignore b/ai/select-algorithm-python/.gitignore new file mode 100644 index 0000000..87965ce --- /dev/null +++ b/ai/select-algorithm-python/.gitignore @@ -0,0 +1,8 @@ +__pycache__/ +*.pyc +.env +.venv/ + +# Local data copy (user copies from ai/data/) +data/*.json +!data/README.md diff --git a/ai/select-algorithm-python/README.md b/ai/select-algorithm-python/README.md new file mode 100644 index 0000000..3393ce5 --- /dev/null +++ b/ai/select-algorithm-python/README.md @@ -0,0 +1,96 @@ + +# Select Vector Algorithm (Python) + +Compare IVF, HNSW, and DiskANN vector index algorithms in Azure DocumentDB. Each algorithm is optimized for different dataset sizes and performance requirements. + +## Algorithm Selection Guide + +| Algorithm | Dataset Size | Cluster Tier | Key Parameters | +|-----------|-------------|--------------|----------------| +| IVF | < 10K docs | M10+ | numLists | +| HNSW | 10K-50K | M30+ | m, efConstruction | +| DiskANN | 50K+ | M40+ | maxDegree, lBuild | + +## Prerequisites + +- Azure subscription +- Azure DocumentDB cluster (M40+ for all algorithms, M10+ for IVF only) +- Azure OpenAI resource with `text-embedding-3-small` deployed +- Python 3.10+ +- Azure CLI (`az login` for passwordless auth) + +## Setup + +1. ### Configure environment variables + + After deploying with `azd up`, create a `.env` file with your provisioned resource values: + + ```bash + azd env get-values > .env + ``` + + This creates a `.env` file at the repository root with the connection strings and endpoints needed to run the sample. + + Alternatively, copy the example and fill in values manually: + + ```bash + cp .env.example .env + ``` + +2. Install dependencies: + ```bash + cd src + pip install -r ../requirements.txt + ``` + +3. 
Copy the shared data file: + + ```bash + cp ../data/Hotels_Vector.json . + ``` + +4. Ensure you're logged in to Azure: + ```bash + az login + ``` + +## Run + +Compare all 9 combinations (3 algorithms × 3 similarity metrics) in a single invocation: + +```bash +cd src +python compare_all.py +``` + +The script creates a single `hotels` collection, loads data once, then for each of the 9 algorithm/metric combinations: creates the index → searches → drops the index. DocumentDB only allows one vector index per kind per field, so indexes are created sequentially. + +## Configuration + +| Variable | Default | Description | +|----------|---------|-------------| +| `MONGO_CLUSTER_NAME` | (required) | DocumentDB cluster name | +| `AZURE_OPENAI_EMBEDDING_ENDPOINT` | (required) | Azure OpenAI endpoint | +| `AZURE_OPENAI_EMBEDDING_MODEL` | (required) | Embedding model deployment name | +| `DATA_FILE_WITH_VECTORS` | `data/Hotels_Vector.json` | Path to vectors JSON file | +| `EMBEDDED_FIELD` | `DescriptionVector` | Field name containing embeddings | +| `EMBEDDING_DIMENSIONS` | `1536` | Vector dimensions | +| `AZURE_DOCUMENTDB_DATABASENAME` | `Hotels` | Target database name | +| `LOAD_SIZE_BATCH` | `100` | Batch size for data loading | +| `EMBEDDING_SIZE_BATCH` | `16` | Batch size for embedding requests | +| `ALGORITHM` | (empty = all) | Which algorithm to run | +| `SIMILARITY` | (empty = all) | Similarity metric: `COS`, `L2`, `IP` | +| `QUERY_TEXT` | `luxury hotel near the beach` | Search query text | +| `TOP_K` | `5` | Number of results per search | +| `VERBOSE` | `false` | Show all k results per combo | diff --git a/ai/select-algorithm-python/data/README.md b/ai/select-algorithm-python/data/README.md new file mode 100644 index 0000000..c918009 --- /dev/null +++ b/ai/select-algorithm-python/data/README.md @@ -0,0 +1,5 @@ +# Data Files + +Copy `Hotels_Vector.json` into this folder before running the sample. 
+ +The file is available in the repository at `ai/data/Hotels_Vector.json`. diff --git a/ai/select-algorithm-python/output/compare_all.txt b/ai/select-algorithm-python/output/compare_all.txt new file mode 100644 index 0000000..aa96c4f --- /dev/null +++ b/ai/select-algorithm-python/output/compare_all.txt @@ -0,0 +1,47 @@ +====================================================================== + Compare All Algorithms — 9 Combinations + (3 Algorithms × 3 Similarity Metrics) +====================================================================== + + Query: "luxury hotel near the beach" + Top K: 5 + +Dropped existing 'hotels' collection (if any) +Loaded 50 documents with embeddings +Inserted 50/50 documents + +Generating embedding for query... +Running 9 vector searches... + + Created index 'vector_ivf_cos' + Created index 'vector_ivf_l2' + Created index 'vector_ivf_ip' + Created index 'vector_hnsw_cos' + Created index 'vector_hnsw_l2' + Created index 'vector_hnsw_ip' + Created index 'vector_diskann_cos' + Created index 'vector_diskann_l2' + Created index 'vector_diskann_ip' ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| Algorithm | Metric | Top 1 Result | Score | Top 2 Result | Score | Diff | ++=============+==========+==========================+=========+===================+=========+========+ +| IVF | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| IVF | L2 | Ocean Water Resort & Spa | 0.8736 | Windy Ocean Motel | 0.9943 | 0.1208 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| IVF | IP | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| HNSW | COS | Ocean Water Resort & Spa | 0.6184 
| Windy Ocean Motel | 0.5056 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| HNSW | L2 | Ocean Water Resort & Spa | 0.8736 | Windy Ocean Motel | 0.9943 | 0.1208 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| HNSW | IP | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| DiskANN | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| DiskANN | L2 | Ocean Water Resort & Spa | 0.8736 | Windy Ocean Motel | 0.9943 | 0.1208 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| DiskANN | IP | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ + +Cleanup: dropped collection 'hotels' diff --git a/ai/select-algorithm-python/requirements.txt b/ai/select-algorithm-python/requirements.txt new file mode 100644 index 0000000..36e664e --- /dev/null +++ b/ai/select-algorithm-python/requirements.txt @@ -0,0 +1,11 @@ +# MongoDB driver for connecting to DocumentDB +pymongo>=4.7.0 + +# Azure OpenAI SDK for generating embeddings +openai>=1.0.0,<2.0.0 + +# Azure authentication library for passwordless connection +azure-identity>=1.15.0 + +# Formatted table output for compare_all.py +tabulate>=0.9.0 diff --git a/ai/select-algorithm-python/src/compare_all.py b/ai/select-algorithm-python/src/compare_all.py new file mode 100644 index 0000000..ad62cee --- /dev/null +++ b/ai/select-algorithm-python/src/compare_all.py @@ -0,0 +1,206 @@ +""" +Compare All Algorithms — Unified comparison runner. 
+ +Executes all 9 combinations (3 algorithms × 3 similarity metrics) in a single +invocation and prints a formatted comparison table. + +Algorithms: IVF, HNSW, DiskANN +Metrics: COS, L2, IP +""" +import os +import time +from typing import Dict, List, Any + +from tabulate import tabulate +from utils import ( + get_clients_passwordless, get_config, read_file_return_json, + insert_data +) + +# Index definitions: (algo_label, kind, extra_params) +ALGORITHMS = [ + ("IVF", "vector-ivf", {"numLists": 1}), + ("HNSW", "vector-hnsw", {"m": 16, "efConstruction": 64}), + ("DiskANN", "vector-diskann", {"maxDegree": 20, "lBuild": 10}), +] + +METRICS = ["COS", "L2", "IP"] + + +def get_compare_config() -> Dict[str, Any]: + """Load comparison-specific configuration from environment variables.""" + config = get_config() + config["query_text"] = os.getenv("QUERY_TEXT", "luxury hotel near the beach") + config["top_k"] = int(os.getenv("TOP_K", "5")) + return config + + +def index_name(algo: str, metric: str) -> str: + """Generate canonical index name: vector_{algo}_{metric}.""" + return f"vector_{algo.lower()}_{metric.lower()}" + + +def get_existing_index_names(collection) -> List[str]: + """Return names of existing indexes on the collection.""" + return [idx["name"] for idx in collection.list_indexes()] + + +def drop_vector_indexes(collection, vector_field: str) -> None: + """Drop all existing vector indexes on *vector_field*.""" + for idx in collection.list_indexes(): + name = idx.get("name", "") + key = idx.get("key", {}) + if vector_field in key and key[vector_field] == "cosmosSearch": + collection.drop_index(name) + + +def create_vector_index(collection, name: str, kind: str, vector_field: str, + dimensions: int, similarity: str, + extra_params: Dict[str, Any]) -> None: + """Create a single vector index.""" + cosmos_options = { + "kind": kind, + "dimensions": dimensions, + "similarity": similarity, + **extra_params, + } + + index_command = { + "createIndexes": collection.name, + 
"indexes": [ + { + "name": name, + "key": {vector_field: "cosmosSearch"}, + "cosmosSearchOptions": cosmos_options, + } + ], + } + collection.database.command(index_command) + + +def generate_embedding(azure_openai_client, query_text: str, + model_name: str) -> List[float]: + """Generate a single embedding for the query text.""" + response = azure_openai_client.embeddings.create( + input=[query_text], + model=model_name + ) + return response.data[0].embedding + + +def vector_search_with_index(collection, query_embedding: List[float], + vector_field: str, + top_k: int) -> List[Dict[str, Any]]: + """Run vector search using the single active index and return results.""" + pipeline = [ + { + "$search": { + "cosmosSearch": { + "vector": query_embedding, + "path": vector_field, + "k": top_k + } + } + }, + { + "$project": { + "document": "$$ROOT", + "score": {"$meta": "searchScore"} + } + } + ] + + results = list(collection.aggregate(pipeline)) + + return results + + +def main(): + print("=" * 70) + print(" Compare All Algorithms — 9 Combinations") + print(" (3 Algorithms × 3 Similarity Metrics)") + print("=" * 70) + + config = get_compare_config() + query_text = config["query_text"] + top_k = config["top_k"] + + print(f"\n Query: \"{query_text}\"") + print(f" Top K: {top_k}\n") + + mongo_client, azure_openai_client = get_clients_passwordless() + + try: + database = mongo_client[config["database_name"]] + + # Drop collection for a clean comparison + database.drop_collection("hotels") + print("Dropped existing 'hotels' collection (if any)") + + # Create fresh collection and load data + collection = database["hotels"] + data = read_file_return_json(config["data_file"]) + documents = [doc for doc in data if config["vector_field"] in doc] + print(f"Loaded {len(documents)} documents with embeddings") + insert_data(collection, documents, config["batch_size"]) + + # Generate ONE embedding for the query + print("\nGenerating embedding for query...") + query_embedding = 
generate_embedding( + azure_openai_client, query_text, config["model_name"] + ) + + # Run all 9 searches sequentially (create→search→drop for each) + print("Running 9 vector searches...\n") + table_rows = [] + + for algo_label, kind, extra_params in ALGORITHMS: + for metric in METRICS: + name = index_name(algo_label, metric) + # Drop all vector indexes first + drop_vector_indexes(collection, config["vector_field"]) + # Create this specific index + create_vector_index( + collection, name, kind, config["vector_field"], + config["dimensions"], metric, extra_params + ) + print(f" Created index '{name}'") + time.sleep(5) # Increased wait time + # Search (no index name needed) + results = vector_search_with_index( + collection, query_embedding, config["vector_field"], top_k + ) + + top1_name = results[0].get("document", results[0]).get("HotelName", "Unknown") if len(results) > 0 else "(no results)" + top1_score = results[0].get("score", 0) if len(results) > 0 else 0 + top2_name = results[1].get("document", results[1]).get("HotelName", "Unknown") if len(results) > 1 else "(no results)" + top2_score = results[1].get("score", 0) if len(results) > 1 else 0 + + table_rows.append([ + algo_label, + metric, + top1_name, + f"{top1_score:.4f}", + top2_name, + f"{top2_score:.4f}", + f"{abs(top1_score - top2_score):.4f}", + ]) + + # Print comparison table + headers = ["Algorithm", "Metric", "Top 1 Result", "Score", + "Top 2 Result", "Score", "Diff"] + print(tabulate(table_rows, headers=headers, tablefmt="grid")) + + finally: + # Cleanup: drop the comparison collection + try: + database = mongo_client[config["database_name"]] + database.drop_collection("hotels") + print("\nCleanup: dropped collection 'hotels'") + except Exception as e: + print(f"Cleanup warning: {e}") + mongo_client.close() + + +if __name__ == "__main__": + main() diff --git a/ai/select-algorithm-python/src/utils.py b/ai/select-algorithm-python/src/utils.py new file mode 100644 index 0000000..ee905f7 --- /dev/null 
+++ b/ai/select-algorithm-python/src/utils.py @@ -0,0 +1,171 @@ +import json +import os +import time +import warnings +from typing import Dict, List, Any, Optional, Tuple + +# Suppress the PyMongo CosmosDB cluster detection warning +warnings.filterwarnings( + "ignore", + message="You appear to be connected to a CosmosDB cluster.*", +) + +from pymongo import MongoClient, InsertOne +from pymongo.collection import Collection +from pymongo.errors import BulkWriteError +from azure.identity import DefaultAzureCredential, get_bearer_token_provider +from pymongo.auth_oidc import OIDCCallback, OIDCCallbackContext, OIDCCallbackResult +from openai import AzureOpenAI + + +class AzureIdentityTokenCallback(OIDCCallback): + def __init__(self, credential): + self.credential = credential + + def fetch(self, context: OIDCCallbackContext) -> OIDCCallbackResult: + token = self.credential.get_token( + "https://ossrdbms-aad.database.windows.net/.default").token + return OIDCCallbackResult(access_token=token) + + +def get_clients_passwordless() -> Tuple[MongoClient, AzureOpenAI]: + """Create MongoDB and Azure OpenAI clients using passwordless auth.""" + cluster_name = os.getenv("MONGO_CLUSTER_NAME") + if not cluster_name: + raise ValueError("MONGO_CLUSTER_NAME environment variable is required") + + credential = DefaultAzureCredential() + + mongo_client = MongoClient( + f"mongodb+srv://{cluster_name}.global.mongocluster.cosmos.azure.com/", + connectTimeoutMS=120000, + tls=True, + retryWrites=False, + authMechanism="MONGODB-OIDC", + authMechanismProperties={"OIDC_CALLBACK": AzureIdentityTokenCallback(credential)} + ) + + azure_openai_endpoint = os.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT") + if not azure_openai_endpoint: + raise ValueError("AZURE_OPENAI_EMBEDDING_ENDPOINT environment variable is required") + + token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default") + + azure_openai_client = AzureOpenAI( + azure_endpoint=azure_openai_endpoint, + 
azure_ad_token_provider=token_provider, + api_version=os.getenv("AZURE_OPENAI_EMBEDDING_API_VERSION", "2023-05-15") + ) + + return mongo_client, azure_openai_client + + +def get_config() -> Dict[str, Any]: + """Load configuration from environment variables.""" + return { + 'database_name': os.getenv('AZURE_DOCUMENTDB_DATABASENAME', 'Hotels'), + 'data_file': os.getenv('DATA_FILE_WITH_VECTORS', 'data/Hotels_Vector.json'), + 'vector_field': os.getenv('EMBEDDED_FIELD', 'DescriptionVector'), + 'model_name': os.getenv('AZURE_OPENAI_EMBEDDING_MODEL', 'text-embedding-3-small'), + 'dimensions': int(os.getenv('EMBEDDING_DIMENSIONS', '1536')), + 'batch_size': int(os.getenv('LOAD_SIZE_BATCH', '100')), + 'similarity': os.getenv('SIMILARITY', ''), + } + + +def read_file_return_json(file_path: str) -> List[Dict[str, Any]]: + """Read a JSON file and return the parsed data.""" + try: + with open(file_path, 'r', encoding='utf-8') as file: + return json.load(file) + except FileNotFoundError: + print(f"Error: File '{file_path}' not found") + raise + + +def insert_data(collection: Collection, data: List[Dict[str, Any]], + batch_size: int = 100) -> Dict[str, Any]: + """Insert data into collection in batches, skipping if already populated.""" + total_documents = len(data) + + existing_count = collection.count_documents({}) + if existing_count >= total_documents: + print(f"Collection already has {existing_count} documents, skipping insert") + return {'total': total_documents, 'inserted': 0, 'skipped': True} + + if existing_count > 0: + collection.delete_many({}) + + inserted_count = 0 + for i in range(0, total_documents, batch_size): + batch = data[i:i + batch_size] + try: + operations = [InsertOne(doc) for doc in batch] + result = collection.bulk_write(operations, ordered=False) + inserted_count += result.inserted_count + except BulkWriteError as e: + inserted_count += e.details.get('nInserted', 0) + time.sleep(0.1) + + print(f"Inserted {inserted_count}/{total_documents} documents") + 
return {'total': total_documents, 'inserted': inserted_count, 'skipped': False} + + +def drop_vector_indexes(collection: Collection, vector_field: str) -> None: + """Drop any existing vector indexes on the specified field.""" + try: + indexes = list(collection.list_indexes()) + for index in indexes: + if 'key' in index and vector_field in index['key']: + if index['key'][vector_field] == 'cosmosSearch': + collection.drop_index(index['name']) + print(f"Dropped existing vector index: {index['name']}") + except Exception as e: + print(f"Warning: Error dropping indexes: {e}") + + +def perform_vector_search(collection: Collection, + azure_openai_client: AzureOpenAI, + query_text: str, + vector_field: str, + model_name: str, + top_k: int = 5) -> List[Dict[str, Any]]: + """Perform vector search using the $search aggregation stage.""" + embedding_response = azure_openai_client.embeddings.create( + input=[query_text], + model=model_name + ) + query_embedding = embedding_response.data[0].embedding + + pipeline = [ + { + "$search": { + "cosmosSearch": { + "vector": query_embedding, + "path": vector_field, + "k": top_k + } + } + }, + { + "$project": { + "document": "$$ROOT", + "score": {"$meta": "searchScore"} + } + } + ] + + return list(collection.aggregate(pipeline)) + + +def print_search_results(results: List[Dict[str, Any]], algorithm: str) -> None: + """Print formatted search results.""" + print(f"\n{'='*60}") + print(f" {algorithm} Search Results ({len(results)} found)") + print(f"{'='*60}") + for i, result in enumerate(results, 1): + doc = result.get('document', result) + name = doc.get('HotelName', doc.get('name', 'Unknown')) + score = result.get('score', 0) + print(f" {i}. 
{name} (score: {score:.4f})") + print() diff --git a/ai/select-algorithm-typescript/.env.example b/ai/select-algorithm-typescript/.env.example new file mode 100644 index 0000000..a88b834 --- /dev/null +++ b/ai/select-algorithm-typescript/.env.example @@ -0,0 +1,11 @@ +MONGO_CLUSTER_NAME=your-cluster-name +AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-resource.openai.azure.com +AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small +AZURE_OPENAI_EMBEDDING_API_VERSION=2023-05-15 +AZURE_DOCUMENTDB_DATABASENAME=Hotels +DATA_FILE_WITH_VECTORS=./Hotels_Vector.json +EMBEDDED_FIELD=DescriptionVector +EMBEDDING_DIMENSIONS=1536 +LOAD_SIZE_BATCH=100 +# SIMILARITY - leave empty to run all similarity types, or set to: COS, L2, IP +SIMILARITY= diff --git a/ai/select-algorithm-typescript/.gitignore b/ai/select-algorithm-typescript/.gitignore new file mode 100644 index 0000000..9a088e4 --- /dev/null +++ b/ai/select-algorithm-typescript/.gitignore @@ -0,0 +1,8 @@ +node_modules/ +dist/ +.env + +# Local data copy (user copies from ai/data/) +data/*.json +!data/README.md +Hotels_Vector.json diff --git a/ai/select-algorithm-typescript/README.md b/ai/select-algorithm-typescript/README.md new file mode 100644 index 0000000..b9140cb --- /dev/null +++ b/ai/select-algorithm-typescript/README.md @@ -0,0 +1,116 @@ +# Select Algorithm — TypeScript + +Compare IVF, HNSW, and DiskANN vector index algorithms in Azure DocumentDB using TypeScript. + +## Prerequisites + +- [Node.js 20+](https://nodejs.org/) +- [Azure CLI](https://learn.microsoft.com/cli/azure/install-azure-cli) (for `az login`) +- An Azure DocumentDB cluster with vector search enabled +- An Azure OpenAI resource with an embedding model deployed + +## Setup + +1. **Install dependencies:** + + ```bash + npm install + ``` + +2. **Sign in to Azure** (for passwordless authentication): + + ```bash + az login + ``` + +3. 
**Configure environment variables:** + + After deploying with `azd up`, create a `.env` file with your provisioned resource values: + + ```bash + azd env get-values > .env + ``` + + This creates a `.env` file in the project folder with the connection strings and endpoints needed to run the sample. + + Alternatively, copy the example and fill in values manually: + + ```bash + cp .env.example .env + ``` + + | Variable | Description | + |---|---| + | `MONGO_CLUSTER_NAME` | Your DocumentDB cluster name | + | `AZURE_OPENAI_EMBEDDING_ENDPOINT` | Azure OpenAI endpoint URL | + | `AZURE_OPENAI_EMBEDDING_MODEL` | Embedding model deployment name | + | `AZURE_OPENAI_EMBEDDING_API_VERSION` | Azure OpenAI API version | + | `AZURE_DOCUMENTDB_DATABASENAME` | Database name (default: `Hotels`) | + | `DATA_FILE_WITH_VECTORS` | Path to JSON data file with vectors | + | `EMBEDDED_FIELD` | Field name containing the vector (default: `DescriptionVector`) | + | `EMBEDDING_DIMENSIONS` | Vector dimensions (default: `1536`) | + | `LOAD_SIZE_BATCH` | Batch size for data insertion | + | `SIMILARITY` | Similarity metric: `COS`, `L2`, or `IP` | + +4. **Copy the shared data file** into this directory: + + ```bash + cp ../data/Hotels_Vector.json . + ``` + + The `DATA_FILE_WITH_VECTORS` env var defaults to `../data/Hotels_Vector.json`. + +5. 
**Build the project:** + + ```bash + npm run build + ``` + +## Run + +Run all 9 combinations (3 algorithms × 3 similarity metrics) in a single invocation and view a formatted comparison table: + +```bash +npm start +``` + +**Environment variables** (optional overrides): + +| Variable | Default | Description | +|---|---|---| +| `QUERY_TEXT` | `luxury hotel near the beach` | Search query text | +| `TOP_K` | `5` | Number of results per combination | +| `VERBOSE` | `false` | When `true`, shows all k results per combo | + +The script creates a single `hotels` collection, loads data once, then for each of the 9 algorithm/metric combinations: creates the index → searches → drops the index. DocumentDB only allows one vector index per kind per field, so indexes are created sequentially. + +**Output:** +``` +==================================================================================================== + COMPARISON RESULTS +==================================================================================================== +Algorithm Similarity #1 Result #1 Score #2 Result #2 Score Diff +---------------------------------------------------------------------------------------------------- +IVF COS Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +IVF L2 Ocean Water Resort &.. 0.8736 Windy Ocean Motel 0.9943 -0.1208 +IVF IP Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +... +==================================================================================================== + KEY INSIGHTS +==================================================================================================== + 🎯 Highest #1 score: IVF/COS (0.6184) + 📊 Biggest separation: IVF/COS (diff: 0.1128) + 🔑 All algorithms return the same top results — algorithm choice + affects performance at scale, not accuracy on small datasets. + 📐 COS and IP produce identical scores (normalized embeddings). + 📏 L2 scores are distances (lower = closer), not similarities. 
+==================================================================================================== +``` + +## Algorithm comparison + +| Algorithm | Index type | Best for | +|---|---|---| +| **IVF** | `vector-ivf` | Smaller datasets, lower memory usage | +| **HNSW** | `vector-hnsw` | Fast approximate search, balanced recall/speed | +| **DiskANN** | `vector-diskann` | Large-scale datasets, disk-based search | diff --git a/ai/select-algorithm-typescript/data/README.md b/ai/select-algorithm-typescript/data/README.md new file mode 100644 index 0000000..c918009 --- /dev/null +++ b/ai/select-algorithm-typescript/data/README.md @@ -0,0 +1,5 @@ +# Data Files + +Copy `Hotels_Vector.json` into this folder before running the sample. + +The file is available in the repository at `ai/data/Hotels_Vector.json`. diff --git a/ai/select-algorithm-typescript/output/compare_all.txt b/ai/select-algorithm-typescript/output/compare_all.txt new file mode 100644 index 0000000..8e34340 --- /dev/null +++ b/ai/select-algorithm-typescript/output/compare_all.txt @@ -0,0 +1,42 @@ +Using Azure OpenAI Embedding Deployment/Model: text-embedding-3-small +Reading JSON file from data/Hotels_Vector.json +Loaded 50 documents +Processing in batches of 50... +Batch 1 complete: 50 inserted + +Query: "luxury hotel near the beach" +Embedding generated (1536 dimensions) + +Running searches (top 5 results)... 
✓ vector_ivf_cos created + ✓ vector_ivf_l2 created + ✓ vector_ivf_ip created + ✓ vector_hnsw_cos created + ✓ vector_hnsw_l2 created + ✓ vector_hnsw_ip created + ✓ vector_diskann_cos created + ✓ vector_diskann_l2 created + ✓ vector_diskann_ip created +┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ +│ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ 
+├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ + +Cleanup: dropped collection "hotels" +Database connection closed diff --git a/ai/select-algorithm-typescript/package-lock.json b/ai/select-algorithm-typescript/package-lock.json new file mode 100644 index 0000000..f0ceb74 --- /dev/null +++ b/ai/select-algorithm-typescript/package-lock.json @@ -0,0 +1,735 @@ +{ + "name": "select-algorithm-typescript", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "select-algorithm-typescript", + "version": "1.0.0", + "dependencies": { + "@azure/identity": "^4.11.1", + "mongodb": "^6.18.0", + "openai": "^5.16.0" + }, + "devDependencies": { + "@types/node": "^24.3.0", + "typescript": "^5.9.2" + } + }, + "node_modules/@azure/abort-controller": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/@azure/abort-controller/-/abort-controller-2.1.2.tgz", + "integrity": "sha512-nBrLsEWm4J2u5LpAPjxADTlq3trDgVZZXHNKabeXZtpq3d3AbN/KGO82R87rdDz5/lYB024rtEf10/q0urNgsA==", + "license": "MIT", + "dependencies": { + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=18.0.0" + } + }, + "node_modules/@azure/core-auth": { + "version": "1.10.1", + "resolved": "https://registry.npmjs.org/@azure/core-auth/-/core-auth-1.10.1.tgz", + "integrity": "sha512-ykRMW8PjVAn+RS6ww5cmK9U2CyH9p4Q88YJwvUslfuMmN98w/2rdGRLPqJYObapBCdzBVeDgYWdJnFPFb7qzpg==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.1.2", + "@azure/core-util": "^1.13.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/core-client": { + "version": "1.10.1", + "resolved": "https://registry.npmjs.org/@azure/core-client/-/core-client-1.10.1.tgz", + "integrity": 
"sha512-Nh5PhEOeY6PrnxNPsEHRr9eimxLwgLlpmguQaHKBinFYA/RU9+kOYVOQqOrTsCL+KSxrLLl1gD8Dk5BFW/7l/w==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.1.2", + "@azure/core-auth": "^1.10.0", + "@azure/core-rest-pipeline": "^1.22.0", + "@azure/core-tracing": "^1.3.0", + "@azure/core-util": "^1.13.0", + "@azure/logger": "^1.3.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/core-rest-pipeline": { + "version": "1.23.0", + "resolved": "https://registry.npmjs.org/@azure/core-rest-pipeline/-/core-rest-pipeline-1.23.0.tgz", + "integrity": "sha512-Evs1INHo+jUjwHi1T6SG6Ua/LHOQBCLuKEEE6efIpt4ZOoNonaT1kP32GoOcdNDbfqsD2445CPri3MubBy5DEQ==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.1.2", + "@azure/core-auth": "^1.10.0", + "@azure/core-tracing": "^1.3.0", + "@azure/core-util": "^1.13.0", + "@azure/logger": "^1.3.0", + "@typespec/ts-http-runtime": "^0.3.4", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/core-tracing": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/@azure/core-tracing/-/core-tracing-1.3.1.tgz", + "integrity": "sha512-9MWKevR7Hz8kNzzPLfX4EAtGM2b8mr50HPDBvio96bURP/9C+HjdH3sBlLSNNrvRAr5/k/svoH457gB5IKpmwQ==", + "license": "MIT", + "dependencies": { + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/core-util": { + "version": "1.13.1", + "resolved": "https://registry.npmjs.org/@azure/core-util/-/core-util-1.13.1.tgz", + "integrity": "sha512-XPArKLzsvl0Hf0CaGyKHUyVgF7oDnhKoP85Xv6M4StF/1AhfORhZudHtOyf2s+FcbuQ9dPRAjB8J2KvRRMUK2A==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.1.2", + "@typespec/ts-http-runtime": "^0.3.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/identity": { + "version": "4.13.1", + "resolved": "https://registry.npmjs.org/@azure/identity/-/identity-4.13.1.tgz", + 
"integrity": "sha512-5C/2WD5Vb1lHnZS16dNQRPMjN6oV/Upba+C9nBIs15PmOi6A3ZGs4Lr2u60zw4S04gi+u3cEXiqTVP7M4Pz3kw==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.0.0", + "@azure/core-auth": "^1.9.0", + "@azure/core-client": "^1.9.2", + "@azure/core-rest-pipeline": "^1.17.0", + "@azure/core-tracing": "^1.0.0", + "@azure/core-util": "^1.11.0", + "@azure/logger": "^1.0.0", + "@azure/msal-browser": "^5.5.0", + "@azure/msal-node": "^5.1.0", + "open": "^10.1.0", + "tslib": "^2.2.0" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/logger": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@azure/logger/-/logger-1.3.0.tgz", + "integrity": "sha512-fCqPIfOcLE+CGqGPd66c8bZpwAji98tZ4JI9i/mlTNTlsIWslCfpg48s/ypyLxZTump5sypjrKn2/kY7q8oAbA==", + "license": "MIT", + "dependencies": { + "@typespec/ts-http-runtime": "^0.3.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/msal-browser": { + "version": "5.9.0", + "resolved": "https://registry.npmjs.org/@azure/msal-browser/-/msal-browser-5.9.0.tgz", + "integrity": "sha512-CzE+4PefDSJWj26zU7G1bKchlGRRHMBFreG4tAlGuzyI8hAPiYGobaJvZBgZBf6L63iphX7VH+ityL8VgEQz9Q==", + "license": "MIT", + "dependencies": { + "@azure/msal-common": "16.5.2" + }, + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/@azure/msal-common": { + "version": "16.5.2", + "resolved": "https://registry.npmjs.org/@azure/msal-common/-/msal-common-16.5.2.tgz", + "integrity": "sha512-GkDEL6TYo3HgT3UuqakdgE9PZfc1hMki6+Hwgy1uddb/EauvAKfu85vVhuofRSo22D1xTnWt8Ucwfg4vSCVwvA==", + "license": "MIT", + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/@azure/msal-node": { + "version": "5.1.5", + "resolved": "https://registry.npmjs.org/@azure/msal-node/-/msal-node-5.1.5.tgz", + "integrity": "sha512-ObTeMoNPmq19X3z40et9Xvs4ZoWVeJg43PZMRLG5iwVL+2nCtAerG3YTDItqPp1CfXNwmCXBbg8jn1DOx65c3g==", + "license": "MIT", + "dependencies": { + "@azure/msal-common": "16.5.2", + 
"jsonwebtoken": "^9.0.0" + }, + "engines": { + "node": ">=20" + } + }, + "node_modules/@mongodb-js/saslprep": { + "version": "1.4.9", + "resolved": "https://registry.npmjs.org/@mongodb-js/saslprep/-/saslprep-1.4.9.tgz", + "integrity": "sha512-RXSxsokhAF/4nWys8An8npsqOI33Ex1Hlzqjw2pZOO+GKtMAR2noGnUdsFiGwsaO/xXI+56mtjTmDA3JXJsvmA==", + "license": "MIT", + "dependencies": { + "sparse-bitfield": "^3.0.3" + } + }, + "node_modules/@types/node": { + "version": "24.12.2", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.2.tgz", + "integrity": "sha512-A1sre26ke7HDIuY/M23nd9gfB+nrmhtYyMINbjI1zHJxYteKR6qSMX56FsmjMcDb3SMcjJg5BiRRgOCC/yBD0g==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, + "node_modules/@types/webidl-conversions": { + "version": "7.0.3", + "resolved": "https://registry.npmjs.org/@types/webidl-conversions/-/webidl-conversions-7.0.3.tgz", + "integrity": "sha512-CiJJvcRtIgzadHCYXw7dqEnMNRjhGZlYK05Mj9OyktqV8uVT8fD2BFOB7S1uwBE3Kj2Z+4UyPmFw/Ixgw/LAlA==", + "license": "MIT" + }, + "node_modules/@types/whatwg-url": { + "version": "11.0.5", + "resolved": "https://registry.npmjs.org/@types/whatwg-url/-/whatwg-url-11.0.5.tgz", + "integrity": "sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ==", + "license": "MIT", + "dependencies": { + "@types/webidl-conversions": "*" + } + }, + "node_modules/@typespec/ts-http-runtime": { + "version": "0.3.5", + "resolved": "https://registry.npmjs.org/@typespec/ts-http-runtime/-/ts-http-runtime-0.3.5.tgz", + "integrity": "sha512-yURCknZhvywvQItHMMmFSo+fq5arCUIyz/CVk7jD89MSai7dkaX8ufjCWp3NttLojoTVbcE72ri+be/TnEbMHw==", + "license": "MIT", + "dependencies": { + "http-proxy-agent": "^7.0.0", + "https-proxy-agent": "^7.0.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/agent-base": { + "version": "7.1.4", + "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz", + 
"integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==", + "license": "MIT", + "engines": { + "node": ">= 14" + } + }, + "node_modules/bson": { + "version": "6.10.4", + "resolved": "https://registry.npmjs.org/bson/-/bson-6.10.4.tgz", + "integrity": "sha512-WIsKqkSC0ABoBJuT1LEX+2HEvNmNKKgnTAyd0fL8qzK4SH2i9NXg+t08YtdZp/V9IZ33cxe3iV4yM0qg8lMQng==", + "license": "Apache-2.0", + "engines": { + "node": ">=16.20.1" + } + }, + "node_modules/buffer-equal-constant-time": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/buffer-equal-constant-time/-/buffer-equal-constant-time-1.0.1.tgz", + "integrity": "sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==", + "license": "BSD-3-Clause" + }, + "node_modules/bundle-name": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/bundle-name/-/bundle-name-4.1.0.tgz", + "integrity": "sha512-tjwM5exMg6BGRI+kNmTntNsvdZS1X8BFYS6tnJ2hdH0kVxM6/eVZ2xy+FqStSWvYmtfFMDLIxurorHwDKfDz5Q==", + "license": "MIT", + "dependencies": { + "run-applescript": "^7.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/debug": { + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", + "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", + "license": "MIT", + "dependencies": { + "ms": "^2.1.3" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/default-browser": { + "version": "5.5.0", + "resolved": "https://registry.npmjs.org/default-browser/-/default-browser-5.5.0.tgz", + "integrity": "sha512-H9LMLr5zwIbSxrmvikGuI/5KGhZ8E2zH3stkMgM5LpOWDutGM2JZaj460Udnf1a+946zc7YBgrqEWwbk7zHvGw==", + "license": "MIT", + "dependencies": { + "bundle-name": "^4.1.0", + 
"default-browser-id": "^5.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/default-browser-id": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/default-browser-id/-/default-browser-id-5.0.1.tgz", + "integrity": "sha512-x1VCxdX4t+8wVfd1so/9w+vQ4vx7lKd2Qp5tDRutErwmR85OgmfX7RlLRMWafRMY7hbEiXIbudNrjOAPa/hL8Q==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/define-lazy-prop": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/define-lazy-prop/-/define-lazy-prop-3.0.0.tgz", + "integrity": "sha512-N+MeXYoqr3pOgn8xfyRPREN7gHakLYjhsHhWGT3fWAiL4IkAt0iDw14QiiEm2bE30c5XX5q0FtAA3CK5f9/BUg==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/ecdsa-sig-formatter": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/ecdsa-sig-formatter/-/ecdsa-sig-formatter-1.0.11.tgz", + "integrity": "sha512-nagl3RYrbNv6kQkeJIpt6NJZy8twLB/2vtz6yN9Z4vRKHN4/QZJIEbqohALSgwKdnksuY3k5Addp5lg8sVoVcQ==", + "license": "Apache-2.0", + "dependencies": { + "safe-buffer": "^5.0.1" + } + }, + "node_modules/http-proxy-agent": { + "version": "7.0.2", + "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz", + "integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==", + "license": "MIT", + "dependencies": { + "agent-base": "^7.1.0", + "debug": "^4.3.4" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/https-proxy-agent": { + "version": "7.0.6", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz", + "integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==", + "license": "MIT", + 
"dependencies": { + "agent-base": "^7.1.2", + "debug": "4" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/is-docker": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-docker/-/is-docker-3.0.0.tgz", + "integrity": "sha512-eljcgEDlEns/7AXFosB5K/2nCM4P7FQPkGc/DWLy5rmFEWvZayGrik1d9/QIY5nJ4f9YsVvBkA6kJpHn9rISdQ==", + "license": "MIT", + "bin": { + "is-docker": "cli.js" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-inside-container": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-inside-container/-/is-inside-container-1.0.0.tgz", + "integrity": "sha512-KIYLCCJghfHZxqjYBE7rEy0OBuTd5xCHS7tHVgvCLkx7StIoaxwNW3hCALgEUjFfeRk+MG/Qxmp/vtETEF3tRA==", + "license": "MIT", + "dependencies": { + "is-docker": "^3.0.0" + }, + "bin": { + "is-inside-container": "cli.js" + }, + "engines": { + "node": ">=14.16" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-wsl": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-3.1.1.tgz", + "integrity": "sha512-e6rvdUCiQCAuumZslxRJWR/Doq4VpPR82kqclvcS0efgt430SlGIk05vdCN58+VrzgtIcfNODjozVielycD4Sw==", + "license": "MIT", + "dependencies": { + "is-inside-container": "^1.0.0" + }, + "engines": { + "node": ">=16" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/jsonwebtoken": { + "version": "9.0.3", + "resolved": "https://registry.npmjs.org/jsonwebtoken/-/jsonwebtoken-9.0.3.tgz", + "integrity": "sha512-MT/xP0CrubFRNLNKvxJ2BYfy53Zkm++5bX9dtuPbqAeQpTVe0MQTFhao8+Cp//EmJp244xt6Drw/GVEGCUj40g==", + "license": "MIT", + "dependencies": { + "jws": "^4.0.1", + "lodash.includes": "^4.3.0", + "lodash.isboolean": "^3.0.3", + "lodash.isinteger": "^4.0.4", + "lodash.isnumber": "^3.0.3", + "lodash.isplainobject": "^4.0.6", + "lodash.isstring": "^4.0.1", + 
"lodash.once": "^4.0.0", + "ms": "^2.1.1", + "semver": "^7.5.4" + }, + "engines": { + "node": ">=12", + "npm": ">=6" + } + }, + "node_modules/jwa": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.1.tgz", + "integrity": "sha512-hRF04fqJIP8Abbkq5NKGN0Bbr3JxlQ+qhZufXVr0DvujKy93ZCbXZMHDL4EOtodSbCWxOqR8MS1tXA5hwqCXDg==", + "license": "MIT", + "dependencies": { + "buffer-equal-constant-time": "^1.0.1", + "ecdsa-sig-formatter": "1.0.11", + "safe-buffer": "^5.0.1" + } + }, + "node_modules/jws": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/jws/-/jws-4.0.1.tgz", + "integrity": "sha512-EKI/M/yqPncGUUh44xz0PxSidXFr/+r0pA70+gIYhjv+et7yxM+s29Y+VGDkovRofQem0fs7Uvf4+YmAdyRduA==", + "license": "MIT", + "dependencies": { + "jwa": "^2.0.1", + "safe-buffer": "^5.0.1" + } + }, + "node_modules/lodash.includes": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/lodash.includes/-/lodash.includes-4.3.0.tgz", + "integrity": "sha512-W3Bx6mdkRTGtlJISOvVD/lbqjTlPPUDTMnlXZFnVwi9NKJ6tiAk6LVdlhZMm17VZisqhKcgzpO5Wz91PCt5b0w==", + "license": "MIT" + }, + "node_modules/lodash.isboolean": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/lodash.isboolean/-/lodash.isboolean-3.0.3.tgz", + "integrity": "sha512-Bz5mupy2SVbPHURB98VAcw+aHh4vRV5IPNhILUCsOzRmsTmSQ17jIuqopAentWoehktxGd9e/hbIXq980/1QJg==", + "license": "MIT" + }, + "node_modules/lodash.isinteger": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/lodash.isinteger/-/lodash.isinteger-4.0.4.tgz", + "integrity": "sha512-DBwtEWN2caHQ9/imiNeEA5ys1JoRtRfY3d7V9wkqtbycnAmTvRRmbHKDV4a0EYc678/dia0jrte4tjYwVBaZUA==", + "license": "MIT" + }, + "node_modules/lodash.isnumber": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/lodash.isnumber/-/lodash.isnumber-3.0.3.tgz", + "integrity": "sha512-QYqzpfwO3/CWf3XP+Z+tkQsfaLL/EnUlXWVkIk5FUPc4sBdTehEqZONuyRt2P67PXAk+NXmTBcc97zw9t1FQrw==", + "license": "MIT" + }, + 
"node_modules/lodash.isplainobject": { + "version": "4.0.6", + "resolved": "https://registry.npmjs.org/lodash.isplainobject/-/lodash.isplainobject-4.0.6.tgz", + "integrity": "sha512-oSXzaWypCMHkPC3NvBEaPHf0KsA5mvPrOPgQWDsbg8n7orZ290M0BmC/jgRZ4vcJ6DTAhjrsSYgdsW/F+MFOBA==", + "license": "MIT" + }, + "node_modules/lodash.isstring": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/lodash.isstring/-/lodash.isstring-4.0.1.tgz", + "integrity": "sha512-0wJxfxH1wgO3GrbuP+dTTk7op+6L41QCXbGINEmD+ny/G/eCqGzxyCsh7159S+mgDDcoarnBw6PC1PS5+wUGgw==", + "license": "MIT" + }, + "node_modules/lodash.once": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/lodash.once/-/lodash.once-4.1.1.tgz", + "integrity": "sha512-Sb487aTOCr9drQVL8pIxOzVhafOjZN9UU54hiN8PU3uAiSV7lx1yYNpbNmex2PK6dSJoNTSJUUswT651yww3Mg==", + "license": "MIT" + }, + "node_modules/memory-pager": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/memory-pager/-/memory-pager-1.5.0.tgz", + "integrity": "sha512-ZS4Bp4r/Zoeq6+NLJpP+0Zzm0pR8whtGPf1XExKLJBAczGMnSi3It14OiNCStjQjM6NU1okjQGSxgEZN8eBYKg==", + "license": "MIT" + }, + "node_modules/mongodb": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/mongodb/-/mongodb-6.21.0.tgz", + "integrity": "sha512-URyb/VXMjJ4da46OeSXg+puO39XH9DeQpWCslifrRn9JWugy0D+DvvBvkm2WxmHe61O/H19JM66p1z7RHVkZ6A==", + "license": "Apache-2.0", + "dependencies": { + "@mongodb-js/saslprep": "^1.3.0", + "bson": "^6.10.4", + "mongodb-connection-string-url": "^3.0.2" + }, + "engines": { + "node": ">=16.20.1" + }, + "peerDependencies": { + "@aws-sdk/credential-providers": "^3.188.0", + "@mongodb-js/zstd": "^1.1.0 || ^2.0.0", + "gcp-metadata": "^5.2.0", + "kerberos": "^2.0.1", + "mongodb-client-encryption": ">=6.0.0 <7", + "snappy": "^7.3.2", + "socks": "^2.7.1" + }, + "peerDependenciesMeta": { + "@aws-sdk/credential-providers": { + "optional": true + }, + "@mongodb-js/zstd": { + "optional": true + }, + "gcp-metadata": { + "optional": true + }, 
+ "kerberos": { + "optional": true + }, + "mongodb-client-encryption": { + "optional": true + }, + "snappy": { + "optional": true + }, + "socks": { + "optional": true + } + } + }, + "node_modules/mongodb-connection-string-url": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/mongodb-connection-string-url/-/mongodb-connection-string-url-3.0.2.tgz", + "integrity": "sha512-rMO7CGo/9BFwyZABcKAWL8UJwH/Kc2x0g72uhDWzG48URRax5TCIcJ7Rc3RZqffZzO/Gwff/jyKwCU9TN8gehA==", + "license": "Apache-2.0", + "dependencies": { + "@types/whatwg-url": "^11.0.2", + "whatwg-url": "^14.1.0 || ^13.0.0" + } + }, + "node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "license": "MIT" + }, + "node_modules/open": { + "version": "10.2.0", + "resolved": "https://registry.npmjs.org/open/-/open-10.2.0.tgz", + "integrity": "sha512-YgBpdJHPyQ2UE5x+hlSXcnejzAvD0b22U2OuAP+8OnlJT+PjWPxtgmGqKKc+RgTM63U9gN0YzrYc71R2WT/hTA==", + "license": "MIT", + "dependencies": { + "default-browser": "^5.2.1", + "define-lazy-prop": "^3.0.0", + "is-inside-container": "^1.0.0", + "wsl-utils": "^0.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/openai": { + "version": "5.23.2", + "resolved": "https://registry.npmjs.org/openai/-/openai-5.23.2.tgz", + "integrity": "sha512-MQBzmTulj+MM5O8SKEk/gL8a7s5mktS9zUtAkU257WjvobGc9nKcBuVwjyEEcb9SI8a8Y2G/mzn3vm9n1Jlleg==", + "license": "Apache-2.0", + "bin": { + "openai": "bin/cli" + }, + "peerDependencies": { + "ws": "^8.18.0", + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "ws": { + "optional": true + }, + "zod": { + "optional": true + } + } + }, + "node_modules/punycode": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", + "integrity": 
"sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/run-applescript": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/run-applescript/-/run-applescript-7.1.0.tgz", + "integrity": "sha512-DPe5pVFaAsinSaV6QjQ6gdiedWDcRCbUuiQfQa2wmWV7+xC9bGulGI8+TdRmoFkAPaBXk8CrAbnlY2ISniJ47Q==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/safe-buffer": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", + "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, + "node_modules/semver": { + "version": "7.7.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.4.tgz", + "integrity": "sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA==", + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/sparse-bitfield": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/sparse-bitfield/-/sparse-bitfield-3.0.3.tgz", + "integrity": "sha512-kvzhi7vqKTfkh0PZU+2D2PIllw2ymqJKujUcyPMd9Y75Nv4nPbGJZXNhxsgdQab2BmlDct1YnfQCguEvHr7VsQ==", + "license": "MIT", + "dependencies": { + "memory-pager": "^1.0.2" + } + }, + "node_modules/tr46": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-5.1.1.tgz", + "integrity": "sha512-hdF5ZgjTqgAntKkklYw0R03MG2x/bSzTtkxmIRw/sTNV8YXsCJ1tfLAX23lhxhHJlEf3CRCOCGGWw3vI3GaSPw==", + "license": "MIT", + "dependencies": 
{ + "punycode": "^2.3.1" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/tslib": { + "version": "2.8.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", + "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", + "license": "0BSD" + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "7.16.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz", + "integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==", + "dev": true, + "license": "MIT" + }, + "node_modules/webidl-conversions": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz", + "integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=12" + } + }, + "node_modules/whatwg-url": { + "version": "14.2.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.2.0.tgz", + "integrity": "sha512-De72GdQZzNTUBBChsXueQUnPKDkg/5A5zp7pFDuQAj5UFoENpiACU0wlCvzpAGnTkj++ihpKwKyYewn/XNUbKw==", + "license": "MIT", + "dependencies": { + "tr46": "^5.1.0", + "webidl-conversions": "^7.0.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/wsl-utils": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/wsl-utils/-/wsl-utils-0.1.0.tgz", + "integrity": "sha512-h3Fbisa2nKGPxCpm89Hk33lBLsnaGBvctQopaBSOW/uIs6FTe1ATyAnKFJrzVs9vpGdsTe73WF3V4lIsk4Gacw==", + "license": 
"MIT", + "dependencies": { + "is-wsl": "^3.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + } + } +} diff --git a/ai/select-algorithm-typescript/package.json b/ai/select-algorithm-typescript/package.json new file mode 100644 index 0000000..df5b82b --- /dev/null +++ b/ai/select-algorithm-typescript/package.json @@ -0,0 +1,20 @@ +{ + "name": "select-algorithm-typescript", + "version": "1.0.0", + "description": "Compare IVF, HNSW, and DiskANN vector index algorithms in Azure DocumentDB", + "type": "module", + "scripts": { + "env:init": "azd env get-values > .env", + "build": "tsc", + "start": "node --env-file .env dist/compare-all.js" + }, + "dependencies": { + "@azure/identity": "^4.11.1", + "mongodb": "^6.18.0", + "openai": "^5.16.0" + }, + "devDependencies": { + "@types/node": "^24.3.0", + "typescript": "^5.9.2" + } +} diff --git a/ai/select-algorithm-typescript/src/compare-all.ts b/ai/select-algorithm-typescript/src/compare-all.ts new file mode 100644 index 0000000..64c9bb8 --- /dev/null +++ b/ai/select-algorithm-typescript/src/compare-all.ts @@ -0,0 +1,232 @@ +import path from 'path'; +import { readFileReturnJson, getClientsPasswordless, getConfig, insertData } from './utils.js'; +import { fileURLToPath } from "node:url"; +import { dirname } from "node:path"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +interface AlgorithmConfig { + name: string; + kind: string; + options: Record; +} + +interface SearchResult { + algorithm: string; + similarity: string; + top1Name: string; + top1Score: number; + top2Name: string; + top2Score: number; +} + +const ALGORITHMS: AlgorithmConfig[] = [ + { name: 'IVF', kind: 'vector-ivf', options: { numLists: 1 } }, + { name: 'HNSW', kind: 'vector-hnsw', options: { m: 16, efConstruction: 64 } }, + { name: 'DiskANN', kind: 'vector-diskann', options: { maxDegree: 20, lBuild: 10 } }, +]; + +const SIMILARITIES = ['COS', 
'L2', 'IP']; + +async function main() { + const baseConfig = getConfig(); + const queryText = process.env.QUERY_TEXT || 'luxury hotel near the beach'; + const topK = parseInt(process.env.TOP_K || '5', 10); + const collectionName = 'hotels'; + + const { aiClient, dbClient } = getClientsPasswordless(); + + try { + if (!aiClient) throw new Error('AI client is not configured.'); + if (!dbClient) throw new Error('Database client is not configured.'); + + await dbClient.connect(); + const db = dbClient.db(baseConfig.dbName); + + // Drop collection if it exists for a clean start + let collections = await db.listCollections({ name: collectionName }).toArray(); + if (collections.length > 0) { + try { + const col = db.collection(collectionName); + const existingIndexes = await col.listIndexes().toArray(); + for (const idx of existingIndexes) { + if (idx.name !== '_id_') { + try { + await col.dropIndex(idx.name); + } catch {} + } + } + await new Promise(r => setTimeout(r, 2000)); + await db.dropCollection(collectionName); + console.log(`Dropped existing collection: ${collectionName}`); + } catch (e: any) { + console.log(`Cleanup note: ${e.message.split('\n')[0]}`); + } + await new Promise(r => setTimeout(r, 10000)); + } + + // Load data once for reuse + const data = await readFileReturnJson(path.join(__dirname, '..', baseConfig.dataFile)); + console.log(`Loaded ${data.length} documents`); + + // Insert data into collection + const collection = db.collection(collectionName); + await insertData(baseConfig, collection, data); + + // Generate one embedding for the query + console.log(`\nQuery: "${queryText}"`); + const embeddingResponse = await aiClient.embeddings.create({ + model: baseConfig.deployment, + input: [queryText] + }); + const queryVector = embeddingResponse.data[0].embedding; + console.log(`Embedding generated (${queryVector.length} dimensions)`); + + // Sequential create→search→drop for each algorithm+similarity combo + // DocumentDB does not allow multiple vector 
indexes of the same kind on the same field + console.log(`\nRunning searches (top ${topK} results)...\n`); + const results: SearchResult[] = []; + + for (const algo of ALGORITHMS) { + for (const sim of SIMILARITIES) { + const indexName = `vector_${algo.kind.replace('vector-', '')}_${sim.toLowerCase()}`; + + // 1. Drop all existing vector indexes + const indexes = await collection.listIndexes().toArray(); + let droppedAny = false; + for (const idx of indexes) { + if (idx.key && idx.key[baseConfig.embeddedField] === 'cosmosSearch') { + try { await collection.dropIndex(idx.name); droppedAny = true; } catch {} + } + } + if (droppedAny) { + await new Promise(r => setTimeout(r, 2000)); + } + + // 2. Create this specific index + const indexOptions = { + createIndexes: collectionName, + indexes: [{ + name: indexName, + key: { [baseConfig.embeddedField]: 'cosmosSearch' }, + cosmosSearchOptions: { + kind: algo.kind, + ...algo.options, + similarity: sim, + dimensions: baseConfig.embeddingDimensions + } + }] + }; + await db.command(indexOptions); + console.log(` ✓ ${indexName} created`); + + // 3. Wait for index to be ready + await new Promise(r => setTimeout(r, 5000)); + + // 4. Search with retry (index may need more time) + let searchResults: any[] = []; + for (let attempt = 0; attempt < 3; attempt++) { + if (attempt > 0) { + await new Promise(r => setTimeout(r, 5000)); + } + try { + searchResults = await collection.aggregate([ + { + $search: { + cosmosSearch: { + vector: queryVector, + path: baseConfig.embeddedField, + k: topK + } + } + }, + { + $project: { + score: { $meta: 'searchScore' }, + document: '$$ROOT' + } + } + ]).toArray(); + if (searchResults.length > 0) break; + } catch (e) { + if (attempt === 2) throw e; + } + } + + // Record top 2 results + const top1 = searchResults[0] as any; + const top2 = searchResults[1] as any; + results.push({ + algorithm: algo.name, + similarity: sim, + top1Name: top1?.document?.HotelName ?? '(none)', + top1Score: top1?.score ?? 
0, + top2Name: top2?.document?.HotelName ?? '(none)', + top2Score: top2?.score ?? 0, + }); + } + } + + // Print comparison table + printComparisonTable(results); + + } catch (error) { + console.error('Compare-all failed:', error); + process.exitCode = 1; + } finally { + // Cleanup: drop the comparison collection + if (dbClient) { + try { + const db = dbClient.db(baseConfig.dbName); + await db.dropCollection(collectionName); + console.log(`\nCleanup: dropped collection "${collectionName}"`); + } catch (cleanupErr) { + console.error('Cleanup warning:', cleanupErr); + } + await dbClient.close(); + console.log('Database connection closed'); + } + } +} + +function printComparisonTable(results: SearchResult[]) { + const algoW = 10; + const simW = 8; + const name1W = 28; + const score1W = 8; + const name2W = 28; + const score2W = 8; + const diffW = 7; + + const pad = (s: string, w: number) => s.length >= w ? s.slice(0, w) : s + ' '.repeat(w - s.length); + + const cols = [algoW, simW, name1W, score1W, name2W, score2W, diffW]; + const topLine = `┌${cols.map(w => '─'.repeat(w)).join('┬')}┐`; + const headerSep = `├${cols.map(w => '─'.repeat(w)).join('┼')}┤`; + const rowSep = `├${cols.map(w => '─'.repeat(w)).join('┼')}┤`; + const bottomLine = `└${cols.map(w => '─'.repeat(w)).join('┴')}┘`; + + console.log(topLine); + console.log( + `│${pad(' Algorithm', algoW)}│${pad(' Metric', simW)}│${pad(' Top 1 Result', name1W)}│${pad(' Score', score1W)}│${pad(' Top 2 Result', name2W)}│${pad(' Score', score2W)}│${pad(' Diff', diffW)}│` + ); + console.log(headerSep); + + results.forEach((r, i) => { + const diff = Math.abs(r.top1Score - r.top2Score).toFixed(4); + console.log( + `│${pad(` ${r.algorithm}`, algoW)}│${pad(` ${r.similarity}`, simW)}│${pad(` ${r.top1Name}`, name1W)}│${pad(` ${r.top1Score.toFixed(4)}`, score1W)}│${pad(` ${r.top2Name}`, name2W)}│${pad(` ${r.top2Score.toFixed(4)}`, score2W)}│${pad(` ${diff}`, diffW)}│` + ); + if (i < results.length - 1) { + console.log(rowSep); + } + 
}); + + console.log(bottomLine); +} + +main().catch(error => { + console.error('Unhandled error:', error); + process.exitCode = 1; +}); diff --git a/ai/select-algorithm-typescript/src/select-algorithm.ts b/ai/select-algorithm-typescript/src/select-algorithm.ts new file mode 100644 index 0000000..fc5a583 --- /dev/null +++ b/ai/select-algorithm-typescript/src/select-algorithm.ts @@ -0,0 +1,287 @@ +import path from 'path'; +import { readFileReturnJson, getClientsPasswordless, insertData, printComparisonTable } from './utils.js'; + +// ESM specific features - create __dirname equivalent +import { fileURLToPath } from "node:url"; +import { dirname } from "node:path"; +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +// Validate required environment variables at startup +const requiredEnvVars = [ + 'MONGO_CLUSTER_NAME', + 'AZURE_OPENAI_EMBEDDING_ENDPOINT', + 'AZURE_OPENAI_EMBEDDING_MODEL', + 'DATA_FILE_WITH_VECTORS' +]; + +const missing = requiredEnvVars.filter(v => !process.env[v]); +if (missing.length > 0) { + console.error(`Missing required environment variables: ${missing.join(', ')}`); + console.error('See .env.example for required values.'); + process.exit(1); +} + +type Algorithm = 'diskann' | 'hnsw' | 'ivf'; +type Similarity = 'COS' | 'L2' | 'IP'; + +const ALGORITHMS: Algorithm[] = ['diskann', 'hnsw', 'ivf']; +const SIMILARITIES: Similarity[] = ['COS', 'L2', 'IP']; + +const ALGORITHM_LABELS: Record = { + diskann: 'DiskANN', + hnsw: 'HNSW', + ivf: 'IVF', +}; + +// Index creation configs per algorithm +function getIndexOptions( + collectionName: string, + indexName: string, + embeddedField: string, + dimensions: number, + algorithm: Algorithm, + similarity: Similarity +) { + const base = { + createIndexes: collectionName, + indexes: [ + { + name: indexName, + key: { [embeddedField]: 'cosmosSearch' }, + cosmosSearchOptions: {} as Record, + }, + ], + }; + + switch (algorithm) { + case 'diskann': + 
base.indexes[0].cosmosSearchOptions = { + kind: 'vector-diskann', + dimensions, + similarity, + maxDegree: 20, + lBuild: 10, + }; + break; + case 'hnsw': + base.indexes[0].cosmosSearchOptions = { + kind: 'vector-hnsw', + dimensions, + similarity, + m: 16, + efConstruction: 64, + }; + break; + case 'ivf': + base.indexes[0].cosmosSearchOptions = { + kind: 'vector-ivf', + dimensions, + similarity, + numLists: 1, + }; + break; + } + + return base; +} + +// Algorithm-specific query params +function getSearchPipeline( + queryEmbedding: number[], + embeddedField: string, + k: number, + algorithm: Algorithm +) { + const cosmosSearch: Record = { + vector: queryEmbedding, + path: embeddedField, + k, + }; + + // Add algorithm-specific search params + switch (algorithm) { + case 'diskann': + cosmosSearch.lSearch = 100; + break; + case 'hnsw': + cosmosSearch.efSearch = 80; + break; + case 'ivf': + cosmosSearch.nProbes = 1; + break; + } + + return [ + { $search: { cosmosSearch } }, + { $project: { score: { $meta: "searchScore" }, document: "$$ROOT" } }, + ]; +} + +/** + * Determine which collections to create/query based on ALGORITHM and SIMILARITY env vars. + * Collection naming: hotels_{algorithm}_{similarity} + */ +function getTargetCollections( + algorithmEnv: string, + similarityEnv: string +): Array<{ collectionName: string; algorithm: Algorithm; similarity: Similarity }> { + const algorithms: Algorithm[] = + !algorithmEnv ? ALGORITHMS : [algorithmEnv as Algorithm]; + const similarities: Similarity[] = + !similarityEnv ? SIMILARITIES : [similarityEnv as Similarity]; + + const targets: Array<{ collectionName: string; algorithm: Algorithm; similarity: Similarity }> = []; + + for (const alg of algorithms) { + if (!ALGORITHMS.includes(alg)) { + throw new Error(`Invalid ALGORITHM '${alg}'. Must be one of: ${ALGORITHMS.join(', ')}`); + } + for (const sim of similarities) { + if (!SIMILARITIES.includes(sim)) { + throw new Error(`Invalid SIMILARITY '${sim}'. 
Must be one of: ${SIMILARITIES.join(', ')}`); + } + targets.push({ + collectionName: `hotels_${alg}_${sim.toLowerCase()}`, + algorithm: alg, + similarity: sim, + }); + } + } + + return targets; +} + +async function main() { + const { aiClient, dbClient } = getClientsPasswordless(); + + try { + if (!aiClient) { + throw new Error('Azure OpenAI client is not configured. Please check your environment variables.'); + } + if (!dbClient) { + throw new Error('Database client is not configured. Please check your environment variables.'); + } + + const dbName = process.env.AZURE_DOCUMENTDB_DATABASENAME || 'Hotels'; + const embeddedField = process.env.EMBEDDED_FIELD || 'DescriptionVector'; + const embeddingDimensions = parseInt(process.env.EMBEDDING_DIMENSIONS || '1536', 10); + const dataFile = process.env.DATA_FILE_WITH_VECTORS || 'data/Hotels_Vector.json'; + const deployment = process.env.AZURE_OPENAI_EMBEDDING_MODEL!; + const batchSize = parseInt(process.env.LOAD_SIZE_BATCH || '100', 10); + const algorithmEnv = (process.env.ALGORITHM || '').trim().toLowerCase(); + const similarityEnv = (process.env.SIMILARITY || '').trim().toUpperCase(); + const searchQuery = 'quintessential lodging near running trails, eateries, retail'; + + const targets = getTargetCollections(algorithmEnv, similarityEnv); + + console.log(`\n🔬 Vector Algorithm Comparison`); + console.log(` Database: ${dbName}`); + console.log(` Algorithms: ${algorithmEnv}`); + console.log(` Similarity: ${similarityEnv}`); + console.log(` Collections to query: ${targets.map(t => t.collectionName).join(', ')}`); + console.log(` Search query: "${searchQuery}"\n`); + + await dbClient.connect(); + const db = dbClient.db(dbName); + + // Load data once (shared across collections) + const data = await readFileReturnJson(path.join(__dirname, '..', dataFile)); + + // Generate query embedding once (reuse across collections) + console.log('Generating query embedding...'); + const embeddingResponse = await 
aiClient.embeddings.create({ + model: deployment, + input: [searchQuery], + }); + const queryEmbedding = embeddingResponse.data[0].embedding; + if (queryEmbedding.length !== embeddingDimensions) { + throw new Error( + `Embedding dimension mismatch: expected ${embeddingDimensions}, got ${queryEmbedding.length}. ` + + `Verify AZURE_OPENAI_EMBEDDING_MODEL matches the configured EMBEDDING_DIMENSIONS.` + ); + } + console.log(`Query embedding: ${queryEmbedding.length} dimensions\n`); + + const config = { batchSize }; + + const comparisonResults: Array<{ + collectionName: string; + algorithm: string; + similarity: string; + searchResults: any[]; + latencyMs: number; + }> = []; + + for (const target of targets) { + console.log(`\n━━━ ${ALGORITHM_LABELS[target.algorithm]} / ${target.similarity} ━━━`); + console.log(`Collection: ${target.collectionName}`); + + try { + // Create collection (drops existing to ensure clean state) + try { + await db.dropCollection(target.collectionName); + } catch { + // Collection may not exist yet + } + const collection = await db.createCollection(target.collectionName); + console.log('Created collection:', target.collectionName); + + // Insert data + const insertSummary = await insertData(config, collection, data); + console.log(`Inserted: ${insertSummary.inserted}/${insertSummary.total}`); + + // Create vector index + const indexName = `vectorIndex_${target.algorithm}_${target.similarity.toLowerCase()}`; + const indexOptions = getIndexOptions( + target.collectionName, + indexName, + embeddedField, + embeddingDimensions, + target.algorithm, + target.similarity + ); + await db.command(indexOptions); + console.log('Created vector index:', indexName); + + // Run vector search + console.log('Executing vector search...'); + const startTime = Date.now(); + + const pipeline = getSearchPipeline(queryEmbedding, embeddedField, 5, target.algorithm); + const searchResults = await collection.aggregate(pipeline).toArray(); + + const latencyMs = Date.now() 
- startTime; + + comparisonResults.push({ + collectionName: target.collectionName, + algorithm: ALGORITHM_LABELS[target.algorithm], + similarity: target.similarity, + searchResults, + latencyMs, + }); + + console.log(`✓ ${searchResults.length} results, ${latencyMs}ms`); + } catch (error) { + console.error(`✗ Error with ${target.collectionName}:`, (error as Error).message); + } + } + + // Print comparison table + if (comparisonResults.length > 0) { + printComparisonTable(comparisonResults); + } + } catch (error) { + console.error('App failed:', error); + process.exitCode = 1; + } finally { + console.log('\nClosing database connection...'); + if (dbClient) await dbClient.close(); + console.log('Database connection closed'); + } +} + +main().catch(error => { + console.error('Unhandled error:', error); + process.exitCode = 1; +}); diff --git a/ai/select-algorithm-typescript/src/utils.ts b/ai/select-algorithm-typescript/src/utils.ts new file mode 100644 index 0000000..f10ea77 --- /dev/null +++ b/ai/select-algorithm-typescript/src/utils.ts @@ -0,0 +1,205 @@ +import { Collection, Document, MongoClient, OIDCResponse, OIDCCallbackParams } from 'mongodb'; +import { AzureOpenAI } from 'openai'; +import { promises as fs } from "fs"; +import { AccessToken, DefaultAzureCredential, TokenCredential, getBearerTokenProvider } from '@azure/identity'; + +// Define a type for JSON data +export type JsonData = Record; + +export function getConfig() { + return { + dbName: process.env.AZURE_DOCUMENTDB_DATABASENAME || 'Hotels', + dataFile: process.env.DATA_FILE_WITH_VECTORS || 'data/Hotels_Vector.json', + embeddedField: process.env.EMBEDDED_FIELD || 'DescriptionVector', + similarity: process.env.SIMILARITY || '', + embeddingDimensions: parseInt(process.env.EMBEDDING_DIMENSIONS || '1536', 10), + deployment: process.env.AZURE_OPENAI_EMBEDDING_MODEL || 'text-embedding-3-small', + batchSize: parseInt(process.env.LOAD_SIZE_BATCH || '100', 10) + }; +} + +export const AzureIdentityTokenCallback = 
async (params: OIDCCallbackParams, credential: TokenCredential): Promise<OIDCResponse> => {
  // OIDC access token for the DocumentDB/vCore endpoint.
  // Return type restored (the shipped source had a bare `Promise`).
  const tokenResponse: AccessToken | null = await credential.getToken(['https://ossrdbms-aad.database.windows.net/.default']);
  return {
    accessToken: tokenResponse?.token || '',
    expiresInSeconds: Math.floor(((tokenResponse?.expiresOnTimestamp || 0) - Date.now()) / 1000)
  };
};

/**
 * Build passwordless (Entra ID) clients for Azure OpenAI and DocumentDB.
 */
export function getClientsPasswordless(): { aiClient: AzureOpenAI | null; dbClient: MongoClient | null } {
  let aiClient: AzureOpenAI | null = null;
  let dbClient: MongoClient | null = null;

  // Validate all required environment variables upfront.
  // NOTE(review): the `!` assertions contradict the null check below; the check
  // still works at runtime because an unset variable yields undefined.
  const endpoint = process.env.AZURE_OPENAI_EMBEDDING_ENDPOINT!;
  const deployment = process.env.AZURE_OPENAI_EMBEDDING_MODEL!;
  const clusterName = process.env.MONGO_CLUSTER_NAME!;

  if (!endpoint || !deployment || !clusterName) {
    throw new Error('Missing required environment variables: AZURE_OPENAI_EMBEDDING_ENDPOINT, AZURE_OPENAI_EMBEDDING_MODEL, MONGO_CLUSTER_NAME');
  }

  console.log(`Using Azure OpenAI Embedding Deployment/Model: ${deployment}`);

  const credential = new DefaultAzureCredential();

  // For Azure OpenAI with DefaultAzureCredential
  {
    const scope = "https://cognitiveservices.azure.com/.default";
    const azureADTokenProvider = getBearerTokenProvider(credential, scope);
    aiClient = new AzureOpenAI({
      apiVersion: process.env.AZURE_OPENAI_EMBEDDING_API_VERSION || "2023-05-15",
      endpoint,
      deployment,
      azureADTokenProvider,
      timeout: 30000,
      maxRetries: 3,
    });
  }

  // For DocumentDB with DefaultAzureCredential (uses signed-in user)
  {
    dbClient = new MongoClient(
      `mongodb+srv://${clusterName}.global.mongocluster.cosmos.azure.com/`, {
        connectTimeoutMS: 120000,
        tls: true,
        retryWrites: false,
        maxIdleTimeMS: 120000,
        authMechanism: 'MONGODB-OIDC',
        authMechanismProperties: {
          OIDC_CALLBACK: (params: OIDCCallbackParams) => AzureIdentityTokenCallback(params, credential),
          ALLOWED_HOSTS: ['*.azure.com']
        }
      }
    );
  }

  return { aiClient, dbClient };
}

/**
 * Read a JSON file from disk and parse it.
 * Return type restored as Promise<any> — the shipped source had a bare
 * `Promise`, which is invalid TypeScript. Callers treat the result as an array.
 */
export async function readFileReturnJson(filePath: string): Promise<any> {

  console.log(`Reading JSON file from ${filePath}`);

  const fileAsString = await fs.readFile(filePath, "utf-8");
  return JSON.parse(fileAsString);
}

/**
 * Insert documents in batches, tolerating per-document failures, then create
 * standard (non-vector) field indexes. Returns {total, inserted, failed}.
 */
export async function insertData(config: { batchSize: number }, collection: Collection, data: Document[]) {
  console.log(`Processing in batches of ${config.batchSize}...`);
  const totalBatches = Math.ceil(data.length / config.batchSize);

  let inserted = 0;
  let failed = 0;

  for (let i = 0; i < totalBatches; i++) {
    const start = i * config.batchSize;
    const end = Math.min(start + config.batchSize, data.length);
    const batch = data.slice(start, end);

    try {
      // ordered:false keeps inserting past individual document failures
      const result = await collection.insertMany(batch, { ordered: false });
      inserted += result.insertedCount || 0;
      console.log(`Batch ${i + 1} complete: ${result.insertedCount} inserted`);
    } catch (error: any) {
      if (error?.writeErrors) {
        // Partial failure: count the failed docs, credit the rest as inserted
        console.error(`Error in batch ${i + 1}: ${error?.writeErrors.length} failures`);
        failed += error?.writeErrors.length;
        inserted += batch.length - error?.writeErrors.length;
      } else {
        console.error(`Error in batch ${i + 1}:`, error);
        failed += batch.length;
      }
    }

    // Small pause between batches to reduce resource contention
    if (i < totalBatches - 1) {
      await new Promise(resolve => setTimeout(resolve, 100));
    }
  }

  // Create standard field indexes
  // (type argument restored — a bare `Record` is invalid TypeScript)
  const indexColumns = ["HotelId", "Category", "Description", "Description_fr"];
  for (const col of indexColumns) {
    const indexSpec: Record<string, number> = {};
    indexSpec[col] = 1;
    await collection.createIndex(indexSpec);
  }

  return { total: data.length, inserted, failed };
}

export function printSearchResults(insertSummary: any, vectorIndexSummary: any, searchResults: Document[]) {
  console.log(`\nInsert summary: ${JSON.stringify(insertSummary)}`);
  console.log(`Vector index: ${JSON.stringify(vectorIndexSummary)}`);
+ if (!searchResults || searchResults.length === 0) { + console.log('No search results found.'); + return; + } + + searchResults.map((result: Document, index: number) => { + const { document, score } = result; + console.log(`${index + 1}. HotelName: ${document.HotelName}, Score: ${score.toFixed(4)}`); + }); +} + +/** + * Print a side-by-side comparison table of vector search results across collections + */ +export function printComparisonTable( + results: Array<{ + collectionName: string; + algorithm: string; + similarity: string; + searchResults: any[]; + latencyMs: number; + }> +): void { + console.log('\n╔══════════════════════════════════════════════════════════════════════════════════╗'); + console.log('║ Vector Algorithm Comparison Results ║'); + console.log('╠══════════════════════════════════════════════════════════════════════════════════╣'); + + // Header + console.log( + '║ ' + + 'Algorithm'.padEnd(12) + + 'Similarity'.padEnd(14) + + 'Top Result'.padEnd(24) + + 'Score'.padEnd(12) + + 'Latency(ms)'.padEnd(14) + + '║' + ); + console.log('╠══════════════════════════════════════════════════════════════════════════════════╣'); + + for (const r of results) { + const topResult = r.searchResults[0]; + const topName = topResult ? (topResult.document.HotelName as string).substring(0, 22) : 'N/A'; + const topScore = topResult ? topResult.score.toFixed(4) : 'N/A'; + + console.log( + '║ ' + + r.algorithm.padEnd(12) + + r.similarity.padEnd(14) + + topName.padEnd(24) + + topScore.padEnd(12) + + r.latencyMs.toFixed(0).padEnd(14) + + '║' + ); + } + + console.log('╚══════════════════════════════════════════════════════════════════════════════════╝'); + + // Detailed results per collection + for (const r of results) { + console.log(`\n--- ${r.algorithm} / ${r.similarity} (${r.collectionName}) ---`); + if (r.searchResults.length === 0) { + console.log(' No results.'); + continue; + } + r.searchResults.forEach((item: Document, i: number) => { + console.log(` ${i + 1}. 
${item.document.HotelName}, Score: ${item.score.toFixed(4)}`); + }); + console.log(` Latency: ${r.latencyMs.toFixed(0)}ms`); + } +} diff --git a/ai/select-algorithm-typescript/tsconfig.json b/ai/select-algorithm-typescript/tsconfig.json new file mode 100644 index 0000000..3cb9aaa --- /dev/null +++ b/ai/select-algorithm-typescript/tsconfig.json @@ -0,0 +1,18 @@ +{ + "compilerOptions": { + "target": "ES2020", + "module": "NodeNext", + "moduleResolution": "nodenext", + "declaration": true, + "outDir": "./dist", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "noImplicitAny": false, + "forceConsistentCasingInFileNames": true, + "sourceMap": true, + "resolveJsonModule": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist"] +}