diff --git a/docs/api/vectorizer.rst b/docs/api/vectorizer.rst index 6bc2d6de..e7167efc 100644 --- a/docs/api/vectorizer.rst +++ b/docs/api/vectorizer.rst @@ -2,6 +2,19 @@ Vectorizers *********** +.. note:: + **Backwards Compatibility:** Several vectorizers have deprecated aliases + available in the ``redisvl.utils.vectorize.text`` module for backwards + compatibility: + + - ``VoyageAITextVectorizer`` → Use ``VoyageAIVectorizer`` instead + - ``VertexAITextVectorizer`` → Use ``VertexAIVectorizer`` instead + - ``BedrockTextVectorizer`` → Use ``BedrockVectorizer`` instead + - ``CustomTextVectorizer`` → Use ``CustomVectorizer`` instead + + These aliases are deprecated as of version 0.13.0 and will be removed + in a future major release. + HFTextVectorizer ================ @@ -38,14 +51,19 @@ AzureOpenAITextVectorizer :members: -VertexAITextVectorizer +VertexAIVectorizer ====================== -.. _vertexaitextvectorizer_api: +.. _vertexaivectorizer_api: + +.. currentmodule:: redisvl.utils.vectorize.vertexai -.. currentmodule:: redisvl.utils.vectorize.text.vertexai +.. note:: + For backwards compatibility, an alias ``VertexAITextVectorizer`` is available + in the ``redisvl.utils.vectorize.text`` module. This alias is deprecated + as of version 0.13.0 and will be removed in a future major release. -.. autoclass:: VertexAITextVectorizer +.. autoclass:: VertexAIVectorizer :show-inheritance: :members: @@ -62,37 +80,64 @@ CohereTextVectorizer :members: -BedrockTextVectorizer +BedrockVectorizer ===================== -.. _bedrocktextvectorizer_api: +.. _bedrockvectorizer_api: -.. currentmodule:: redisvl.utils.vectorize.text.bedrock +.. currentmodule:: redisvl.utils.vectorize.bedrock -.. autoclass:: BedrockTextVectorizer +.. note:: + For backwards compatibility, an alias ``BedrockTextVectorizer`` is available + in the ``redisvl.utils.vectorize.text`` module. This alias is deprecated + as of version 0.13.0 and will be removed in a future major release. + +.. autoclass:: BedrockVectorizer :show-inheritance: :members: -CustomTextVectorizer +CustomVectorizer ==================== -.. _customtextvectorizer_api: +.. _customvectorizer_api: + +.. currentmodule:: redisvl.utils.vectorize.custom -.. currentmodule:: redisvl.utils.vectorize.text.custom +.. note:: + For backwards compatibility, an alias ``CustomTextVectorizer`` is available + in the ``redisvl.utils.vectorize.text`` module. This alias is deprecated + as of version 0.13.0 and will be removed in a future major release. -.. autoclass:: CustomTextVectorizer +.. autoclass:: CustomVectorizer :show-inheritance: :members: -VoyageAITextVectorizer +VoyageAIVectorizer ====================== -.. _voyageaitextvectorizer_api: +.. _voyageaivectorizer_api: + +.. currentmodule:: redisvl.utils.vectorize.voyageai + +.. note:: + For backwards compatibility, an alias ``VoyageAITextVectorizer`` is available + in the ``redisvl.utils.vectorize.text`` module. This alias is deprecated + as of version 0.13.0 and will be removed in a future major release. + +.. autoclass:: VoyageAIVectorizer + :show-inheritance: + :members: + + +MistralAITextVectorizer +======================== + +.. _mistralaitextvectorizer_api: -.. currentmodule:: redisvl.utils.vectorize.text.voyageai +.. currentmodule:: redisvl.utils.vectorize.text.mistral -.. autoclass:: VoyageAITextVectorizer +.. 
autoclass:: MistralAITextVectorizer :show-inheritance: :members: diff --git a/docs/user_guide/04_vectorizers.ipynb b/docs/user_guide/04_vectorizers.ipynb index 03097fad..fbb1b03f 100644 --- a/docs/user_guide/04_vectorizers.ipynb +++ b/docs/user_guide/04_vectorizers.ipynb @@ -1,784 +1,784 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Vectorizers\n", - "\n", - "In this notebook, we will show how to use RedisVL to create embeddings using the built-in text embedding vectorizers. Today RedisVL supports:\n", - "1. OpenAI\n", - "2. HuggingFace\n", - "3. Vertex AI\n", - "4. Cohere\n", - "5. Mistral AI\n", - "6. Amazon Bedrock\n", - "7. Bringing your own vectorizer\n", - "8. VoyageAI\n", - "\n", - "Before running this notebook, be sure to\n", - "1. Have installed ``redisvl`` and have that environment active for this notebook.\n", - "2. Have a running Redis Stack instance with RediSearch > 2.4 active.\n", - "\n", - "For example, you can run Redis Stack locally with Docker:\n", - "\n", - "```bash\n", - "docker run -d -p 6379:6379 -p 8001:8001 redis/redis-stack:latest\n", - "```\n", - "\n", - "This will run Redis on port 6379 and RedisInsight at http://localhost:8001." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# import necessary modules\n", - "import os" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating Text Embeddings\n", - "\n", - "This example will show how to create an embedding from 3 simple sentences with a number of different text vectorizers in RedisVL.\n", - "\n", - "- \"That is a happy dog\"\n", - "- \"That is a happy person\"\n", - "- \"Today is a nice day\"\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### OpenAI\n", - "\n", - "The ``OpenAITextVectorizer`` makes it simple to use RedisVL with the embeddings models at OpenAI. For this you will need to install ``openai``. 
\n", - "\n", - "```bash\n", - "pip install openai\n", - "```\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import getpass\n", - "\n", - "# setup the API Key\n", - "api_key = os.environ.get(\"OPENAI_API_KEY\") or getpass.getpass(\"Enter your OpenAI API key: \")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Vector dimensions: 1536\n" - ] - }, - { - "data": { - "text/plain": [ - "[-0.0011391325388103724,\n", - " -0.003206387162208557,\n", - " 0.002380132209509611,\n", - " -0.004501554183661938,\n", - " -0.010328996926546097,\n", - " 0.012922565452754498,\n", - " -0.005491119809448719,\n", - " -0.0029864837415516376,\n", - " -0.007327961269766092,\n", - " -0.03365817293524742]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from redisvl.utils.vectorize import OpenAITextVectorizer\n", - "\n", - "# create a vectorizer\n", - "oai = OpenAITextVectorizer(\n", - " model=\"text-embedding-ada-002\",\n", - " api_config={\"api_key\": api_key},\n", - ")\n", - "\n", - "test = oai.embed(\"This is a test sentence.\")\n", - "print(\"Vector dimensions: \", len(test))\n", - "test[:10]" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[-0.017466850578784943,\n", - " 1.8471690054866485e-05,\n", - " 0.00129731057677418,\n", - " -0.02555876597762108,\n", - " -0.019842341542243958,\n", - " 0.01603139191865921,\n", - " -0.0037347301840782166,\n", - " 0.0009670283179730177,\n", - " 0.006618348415941,\n", - " -0.02497442066669464]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Create many embeddings at once\n", - "sentences = [\n", - " \"That is a happy dog\",\n", - " \"That is a happy person\",\n", - " \"Today is a sunny day\"\n", - "]\n", - "\n", - "embeddings = oai.embed_many(sentences)\n", - "embeddings[0][:10]" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of Embeddings: 3\n" - ] - } - ], - "source": [ - "# openai also supports asynchronous requests, which we can use to speed up the vectorization process.\n", - "embeddings = await oai.aembed_many(sentences)\n", - "print(\"Number of Embeddings:\", len(embeddings))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Azure OpenAI\n", - "\n", - "The ``AzureOpenAITextVectorizer`` is a variation of the OpenAI vectorizer that calls OpenAI models within Azure. If you've already installed ``openai``, then you're ready to use Azure OpenAI.\n", - "\n", - "The only practical difference between OpenAI and Azure OpenAI is the variables required to call the API." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# additionally to the API Key, setup the API endpoint and version\n", - "api_key = os.environ.get(\"AZURE_OPENAI_API_KEY\") or getpass.getpass(\"Enter your AzureOpenAI API key: \")\n", - "api_version = os.environ.get(\"OPENAI_API_VERSION\") or getpass.getpass(\"Enter your AzureOpenAI API version: \")\n", - "azure_endpoint = os.environ.get(\"AZURE_OPENAI_ENDPOINT\") or getpass.getpass(\"Enter your AzureOpenAI API endpoint: \")\n", - "deployment_name = os.environ.get(\"AZURE_OPENAI_DEPLOYMENT_NAME\", \"text-embedding-ada-002\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "AzureOpenAI API endpoint is required. Provide it in api_config or set the AZURE_OPENAI_ENDPOINT environment variable.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[7], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mredisvl\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mvectorize\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m AzureOpenAITextVectorizer\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# create a vectorizer\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m az_oai \u001b[38;5;241m=\u001b[39m \u001b[43mAzureOpenAITextVectorizer\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdeployment_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Must be your CUSTOM deployment name\u001b[39;49;00m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mapi_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mapi_key\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mapi_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mapi_version\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mapi_version\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mazure_endpoint\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mazure_endpoint\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 13\u001b[0m test \u001b[38;5;241m=\u001b[39m az_oai\u001b[38;5;241m.\u001b[39membed(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThis is a test sentence.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVector dimensions: \u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mlen\u001b[39m(test))\n", - "File \u001b[0;32m~/src/redis-vl-python/redisvl/utils/vectorize/text/azureopenai.py:78\u001b[0m, in 
\u001b[0;36mAzureOpenAITextVectorizer.__init__\u001b[0;34m(self, model, api_config, dtype)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m__init__\u001b[39m(\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 56\u001b[0m model: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext-embedding-ada-002\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 57\u001b[0m api_config: Optional[Dict] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 58\u001b[0m dtype: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfloat32\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 59\u001b[0m ):\n\u001b[1;32m 60\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Initialize the AzureOpenAI vectorizer.\u001b[39;00m\n\u001b[1;32m 61\u001b[0m \n\u001b[1;32m 62\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[38;5;124;03m ValueError: If an invalid dtype is provided.\u001b[39;00m\n\u001b[1;32m 77\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 78\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_initialize_clients\u001b[49m\u001b[43m(\u001b[49m\u001b[43mapi_config\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 79\u001b[0m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m(model\u001b[38;5;241m=\u001b[39mmodel, dims\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_set_model_dims(model), dtype\u001b[38;5;241m=\u001b[39mdtype)\n", - "File \u001b[0;32m~/src/redis-vl-python/redisvl/utils/vectorize/text/azureopenai.py:106\u001b[0m, in \u001b[0;36mAzureOpenAITextVectorizer._initialize_clients\u001b[0;34m(self, api_config)\u001b[0m\n\u001b[1;32m 99\u001b[0m azure_endpoint \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 100\u001b[0m api_config\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mazure_endpoint\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m api_config\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m os\u001b[38;5;241m.\u001b[39mgetenv(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAZURE_OPENAI_ENDPOINT\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 103\u001b[0m )\n\u001b[1;32m 105\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m azure_endpoint:\n\u001b[0;32m--> 106\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 107\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAzureOpenAI API endpoint is required. 
\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mProvide it in api_config or set the AZURE_OPENAI_ENDPOINT\u001b[39m\u001b[38;5;130;01m\\\u001b[39;00m\n\u001b[1;32m 109\u001b[0m \u001b[38;5;124m environment variable.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 110\u001b[0m )\n\u001b[1;32m 112\u001b[0m api_version \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 113\u001b[0m api_config\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mapi_version\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m api_config\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m os\u001b[38;5;241m.\u001b[39mgetenv(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOPENAI_API_VERSION\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 116\u001b[0m )\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m api_version:\n", - "\u001b[0;31mValueError\u001b[0m: AzureOpenAI API endpoint is required. Provide it in api_config or set the AZURE_OPENAI_ENDPOINT environment variable." - ] - } - ], - "source": [ - "from redisvl.utils.vectorize import AzureOpenAITextVectorizer\n", - "\n", - "# create a vectorizer\n", - "az_oai = AzureOpenAITextVectorizer(\n", - " model=deployment_name, # Must be your CUSTOM deployment name\n", - " api_config={\n", - " \"api_key\": api_key,\n", - " \"api_version\": api_version,\n", - " \"azure_endpoint\": azure_endpoint\n", - " },\n", - ")\n", - "\n", - "test = az_oai.embed(\"This is a test sentence.\")\n", - "print(\"Vector dimensions: \", len(test))\n", - "test[:10]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Just like OpenAI, AzureOpenAI supports batching embeddings and asynchronous requests.\n", - "sentences = [\n", - " \"That is a happy dog\",\n", - " \"That is a happy person\",\n", - " \"Today is a sunny day\"\n", - "]\n", - "\n", - "embeddings = await az_oai.aembed_many(sentences)\n", - "embeddings[0][:10]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Huggingface\n", - "\n", - "[Huggingface](https://huggingface.co/models) is a popular NLP platform that has a number of pre-trained models you can use off the shelf. RedisVL supports using Huggingface \"Sentence Transformers\" to create embeddings from text. 
To use Huggingface, you will need to install the ``sentence-transformers`` library.\n", - "\n", - "```bash\n", - "pip install sentence-transformers\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", - "from redisvl.utils.vectorize import HFTextVectorizer\n", - "\n", - "\n", - "# create a vectorizer\n", - "# choose your model from the huggingface website\n", - "hf = HFTextVectorizer(model=\"sentence-transformers/all-mpnet-base-v2\")\n", - "\n", - "# embed a sentence\n", - "test = hf.embed(\"This is a test sentence.\")\n", - "test[:10]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# You can also create many embeddings at once\n", - "embeddings = hf.embed_many(sentences, as_buffer=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### VertexAI\n", - "\n", - "[VertexAI](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings) is GCP's fully-featured AI platform including a number of pretrained LLMs. RedisVL supports using VertexAI to create embeddings from these models. To use VertexAI, you will first need to install the ``google-cloud-aiplatform`` library.\n", - "\n", - "```bash\n", - "pip install google-cloud-aiplatform>=1.26\n", - "```\n", - "\n", - "1. Then you need to gain access to a [Google Cloud Project](https://cloud.google.com/gcp?hl=en) and provide [access to credentials](https://cloud.google.com/docs/authentication/application-default-credentials). This is accomplished by setting the `GOOGLE_APPLICATION_CREDENTIALS` environment variable pointing to the path of a JSON key file downloaded from your service account on GCP.\n", - "2. Lastly, you need to find your [project ID](https://support.google.com/googleapi/answer/7014113?hl=en) and [geographic region for VertexAI](https://cloud.google.com/vertex-ai/docs/general/locations).\n", - "\n", - "\n", - "**Make sure the following env vars are set:**\n", - "\n", - "```\n", - "GOOGLE_APPLICATION_CREDENTIALS=\n", - "GCP_PROJECT_ID=\n", - "GCP_LOCATION=\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from redisvl.utils.vectorize import VertexAITextVectorizer\n", - "\n", - "\n", - "# create a vectorizer\n", - "vtx = VertexAITextVectorizer(api_config={\n", - " \"project_id\": os.environ.get(\"GCP_PROJECT_ID\") or getpass.getpass(\"Enter your GCP Project ID: \"),\n", - " \"location\": os.environ.get(\"GCP_LOCATION\") or getpass.getpass(\"Enter your GCP Location: \"),\n", - " \"google_application_credentials\": os.environ.get(\"GOOGLE_APPLICATION_CREDENTIALS\") or getpass.getpass(\"Enter your Google App Credentials path: \")\n", - "})\n", - "\n", - "# embed a sentence\n", - "test = vtx.embed(\"This is a test sentence.\")\n", - "test[:10]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Cohere\n", - "\n", - "[Cohere](https://dashboard.cohere.ai/) allows you to implement language AI into your product. The `CohereTextVectorizer` makes it simple to use RedisVL with the embeddings models at Cohere. 
For this you will need to install `cohere`.\n", - "\n", - "```bash\n", - "pip install cohere\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "import getpass\n", - "# setup the API Key\n", - "api_key = os.environ.get(\"COHERE_API_KEY\") or getpass.getpass(\"Enter your Cohere API key: \")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "Special attention needs to be paid to the `input_type` parameter for each `embed` call. For example, for embedding \n", - "queries, you should set `input_type='search_query'`; for embedding documents, set `input_type='search_document'`. See\n", - "more information [here](https://docs.cohere.com/reference/embed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from redisvl.utils.vectorize import CohereTextVectorizer\n", - "\n", - "# create a vectorizer\n", - "co = CohereTextVectorizer(\n", - " model=\"embed-english-v3.0\",\n", - " api_config={\"api_key\": api_key},\n", - ")\n", - "\n", - "# embed a search query\n", - "test = co.embed(\"This is a test sentence.\", input_type='search_query')\n", - "print(\"Vector dimensions: \", len(test))\n", - "print(test[:10])\n", - "\n", - "# embed a document\n", - "test = co.embed(\"This is a test sentence.\", input_type='search_document')\n", - "print(\"Vector dimensions: \", len(test))\n", - "print(test[:10])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Learn more about using RedisVL and Cohere together through [this dedicated user guide](https://docs.cohere.com/docs/redis-and-cohere)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### VoyageAI\n", - "\n", - "[VoyageAI](https://dash.voyageai.com/) allows you to implement language AI into your product. The `VoyageAITextVectorizer` makes it simple to use RedisVL with the embeddings models at VoyageAI. For this you will need to install `voyageai`.\n", - "\n", - "```bash\n", - "pip install voyageai\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "import getpass\n", - "# setup the API Key\n", - "api_key = os.environ.get(\"VOYAGE_API_KEY\") or getpass.getpass(\"Enter your VoyageAI API key: \")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "Special attention needs to be paid to the `input_type` parameter for each `embed` call. For example, for embedding \n", - "queries, you should set `input_type='query'`; for embedding documents, set `input_type='document'`. 
See\n", - "more information [here](https://docs.voyageai.com/docs/embeddings)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from redisvl.utils.vectorize import VoyageAITextVectorizer\n", - "\n", - "# create a vectorizer\n", - "vo = VoyageAITextVectorizer(\n", - " model=\"voyage-law-2\", # Please check the available models at https://docs.voyageai.com/docs/embeddings\n", - " api_config={\"api_key\": api_key},\n", - ")\n", - "\n", - "# embed a search query\n", - "test = vo.embed(\"This is a test sentence.\", input_type='query')\n", - "print(\"Vector dimensions: \", len(test))\n", - "print(test[:10])\n", - "\n", - "# embed a document\n", - "test = vo.embed(\"This is a test sentence.\", input_type='document')\n", - "print(\"Vector dimensions: \", len(test))\n", - "print(test[:10])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Mistral AI\n", - "\n", - "[Mistral](https://console.mistral.ai/) offers LLM and embedding APIs for you to implement into your product. The `MistralAITextVectorizer` makes it simple to use RedisVL with their embeddings model.\n", - "You will need to install `mistralai`.\n", - "\n", - "```bash\n", - "pip install mistralai\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from redisvl.utils.vectorize import MistralAITextVectorizer\n", - "\n", - "mistral = MistralAITextVectorizer()\n", - "\n", - "# embed a sentence using their asynchronous method\n", - "test = await mistral.aembed(\"This is a test sentence.\")\n", - "print(\"Vector dimensions: \", len(test))\n", - "print(test[:10])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Amazon Bedrock\n", - "\n", - "Amazon Bedrock provides fully managed foundation models for text embeddings. 
Install the required dependencies:\n", - "\n", - "```bash\n", - "pip install 'redisvl[bedrock]' # Installs boto3\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Configure AWS credentials:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import getpass\n", - "\n", - "if \"AWS_ACCESS_KEY_ID\" not in os.environ:\n", - " os.environ[\"AWS_ACCESS_KEY_ID\"] = getpass.getpass(\"Enter AWS Access Key ID: \")\n", - "if \"AWS_SECRET_ACCESS_KEY\" not in os.environ:\n", - " os.environ[\"AWS_SECRET_ACCESS_KEY\"] = getpass.getpass(\"Enter AWS Secret Key: \")\n", - "\n", - "os.environ[\"AWS_REGION\"] = \"us-east-1\" # Change as needed" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Create embeddings:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from redisvl.utils.vectorize import BedrockTextVectorizer\n", - "\n", - "bedrock = BedrockTextVectorizer(\n", - " model=\"amazon.titan-embed-text-v2:0\"\n", - ")\n", - "\n", - "# Single embedding\n", - "text = \"This is a test sentence.\"\n", - "embedding = bedrock.embed(text)\n", - "print(f\"Vector dimensions: {len(embedding)}\")\n", - "\n", - "# Multiple embeddings\n", - "sentences = [\n", - " \"That is a happy dog\",\n", - " \"That is a happy person\",\n", - " \"Today is a sunny day\"\n", - "]\n", - "embeddings = bedrock.embed_many(sentences)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Custom Vectorizers\n", - "\n", - "RedisVL supports the use of other vectorizers and provides a class to enable compatibility with any function that generates a vector or vectors from string data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from redisvl.utils.vectorize import CustomTextVectorizer\n", - "\n", - "def generate_embeddings(text_input, **kwargs):\n", - " return [0.101] * 768\n", - "\n", - "custom_vectorizer = CustomTextVectorizer(generate_embeddings)\n", - "\n", - "custom_vectorizer.embed(\"This is a test sentence.\")[:10]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This enables the use of custom vectorizers with other RedisVL components" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from redisvl.extensions.cache.llm import SemanticCache\n", - "\n", - "cache = SemanticCache(name=\"custom_cache\", vectorizer=custom_vectorizer)\n", - "\n", - "cache.store(\"this is a test prompt\", \"this is a test response\")\n", - "cache.check(\"this is also a test prompt\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Search with Provider Embeddings\n", - "\n", - "Now that we've created our embeddings, we can use them to search for similar sentences. 
We will use the same 3 sentences from above and search for similar sentences.\n", - "\n", - "First, we need to create the schema for our index.\n", - "\n", - "Here's what the schema for the example looks like in yaml for the HuggingFace vectorizer:\n", - "\n", - "```yaml\n", - "version: '0.1.0'\n", - "\n", - "index:\n", - " name: vectorizers\n", - " prefix: doc\n", - " storage_type: hash\n", - "\n", - "fields:\n", - " - name: sentence\n", - " type: text\n", - " - name: embedding\n", - " type: vector\n", - " attrs:\n", - " dims: 768\n", - " algorithm: flat\n", - " distance_metric: cosine\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "from redisvl.index import SearchIndex\n", - "\n", - "# construct a search index from the schema\n", - "index = SearchIndex.from_yaml(\"./schema.yaml\", redis_url=\"redis://localhost:6379\")\n", - "\n", - "# create the index (no data yet)\n", - "index.create(overwrite=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# use the CLI to see the created index\n", - "!rvl index listall" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Loading data to RedisVL is easy. It expects a list of dictionaries. The vector is stored as bytes." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "from redisvl.redis.utils import array_to_buffer\n", - "\n", - "embeddings = hf.embed_many(sentences)\n", - "\n", - "data = [{\"text\": t,\n", - " \"embedding\": array_to_buffer(v, dtype=\"float32\")}\n", - " for t, v in zip(sentences, embeddings)]\n", - "\n", - "index.load(data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from redisvl.query import VectorQuery\n", - "\n", - "# use the HuggingFace vectorizer again to create a query embedding\n", - "query_embedding = hf.embed(\"That is a happy cat\")\n", - "\n", - "query = VectorQuery(\n", - " vector=query_embedding,\n", - " vector_field_name=\"embedding\",\n", - " return_fields=[\"text\"],\n", - " num_results=3\n", - ")\n", - "\n", - "results = index.query(query)\n", - "for doc in results:\n", - " print(doc[\"text\"], doc[\"vector_distance\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Selecting your float data type\n", - "When embedding text as byte arrays RedisVL supports 4 different floating point data types, `float16`, `float32`, `float64` and `bfloat16`, and 2 integer types, `int8` and `uint8`.\n", - "Your dtype set for your vectorizer must match what is defined in your search index. If one is not explicitly set the default is `float32`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vectorizer = HFTextVectorizer(dtype=\"float16\")\n", - "\n", - "# subsequent calls to embed('', as_buffer=True) and embed_many('', as_buffer=True) will now encode as float16\n", - "float16_bytes = vectorizer.embed('test sentence', as_buffer=True)\n", - "\n", - "# to generate embeddings with different dtype instantiate a new vectorizer\n", - "vectorizer_64 = HFTextVectorizer(dtype='float64')\n", - "float64_bytes = vectorizer_64.embed('test sentence', as_buffer=True)\n", - "\n", - "float16_bytes != float64_bytes" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "# cleanup\n", - "index.delete()" - ] - } + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Vectorizers\n", + "\n", + "In this notebook, we will show how to use RedisVL to create embeddings using the built-in text embedding vectorizers. Today RedisVL supports:\n", + "1. OpenAI\n", + "2. HuggingFace\n", + "3. Vertex AI\n", + "4. Cohere\n", + "5. Mistral AI\n", + "6. Amazon Bedrock\n", + "7. Bringing your own vectorizer\n", + "8. VoyageAI\n", + "\n", + "Before running this notebook, be sure to\n", + "1. Have installed ``redisvl`` and have that environment active for this notebook.\n", + "2. Have a running Redis Stack instance with RediSearch > 2.4 active.\n", + "\n", + "For example, you can run Redis Stack locally with Docker:\n", + "\n", + "```bash\n", + "docker run -d -p 6379:6379 -p 8001:8001 redis/redis-stack:latest\n", + "```\n", + "\n", + "This will run Redis on port 6379 and RedisInsight at http://localhost:8001." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# import necessary modules\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating Text Embeddings\n", + "\n", + "This example will show how to create an embedding from 3 simple sentences with a number of different text vectorizers in RedisVL.\n", + "\n", + "- \"That is a happy dog\"\n", + "- \"That is a happy person\"\n", + "- \"Today is a nice day\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### OpenAI\n", + "\n", + "The ``OpenAITextVectorizer`` makes it simple to use RedisVL with the embeddings models at OpenAI. For this you will need to install ``openai``. 
\n", + "\n", + "```bash\n", + "pip install openai\n", + "```\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "\n", + "# setup the API Key\n", + "api_key = os.environ.get(\"OPENAI_API_KEY\") or getpass.getpass(\"Enter your OpenAI API key: \")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Vector dimensions: 1536\n" + ] + }, + { + "data": { + "text/plain": [ + "[-0.0011391325388103724,\n", + " -0.003206387162208557,\n", + " 0.002380132209509611,\n", + " -0.004501554183661938,\n", + " -0.010328996926546097,\n", + " 0.012922565452754498,\n", + " -0.005491119809448719,\n", + " -0.0029864837415516376,\n", + " -0.007327961269766092,\n", + " -0.03365817293524742]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.2" - }, - "orig_nbformat": 4 + "source": [ + "from redisvl.utils.vectorize import OpenAITextVectorizer\n", + "\n", + "# create a vectorizer\n", + "oai = OpenAITextVectorizer(\n", + " model=\"text-embedding-ada-002\",\n", + " api_config={\"api_key\": api_key},\n", + ")\n", + "\n", + "test = oai.embed(\"This is a test sentence.\")\n", + "print(\"Vector dimensions: \", len(test))\n", + "test[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[-0.017466850578784943,\n", + " 1.8471690054866485e-05,\n", + " 0.00129731057677418,\n", + " -0.02555876597762108,\n", + " -0.019842341542243958,\n", + " 0.01603139191865921,\n", + " -0.0037347301840782166,\n", + " 0.0009670283179730177,\n", + " 0.006618348415941,\n", + " -0.02497442066669464]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create many embeddings at once\n", + "sentences = [\n", + " \"That is a happy dog\",\n", + " \"That is a happy person\",\n", + " \"Today is a sunny day\"\n", + "]\n", + "\n", + "embeddings = oai.embed_many(sentences)\n", + "embeddings[0][:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of Embeddings: 3\n" + ] + } + ], + "source": [ + "# openai also supports asynchronous requests, which we can use to speed up the vectorization process.\n", + "embeddings = await oai.aembed_many(sentences)\n", + "print(\"Number of Embeddings:\", len(embeddings))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Azure OpenAI\n", + "\n", + "The ``AzureOpenAITextVectorizer`` is a variation of the OpenAI vectorizer that calls OpenAI models within Azure. If you've already installed ``openai``, then you're ready to use Azure OpenAI.\n", + "\n", + "The only practical difference between OpenAI and Azure OpenAI is the variables required to call the API." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# additionally to the API Key, setup the API endpoint and version\n", + "api_key = os.environ.get(\"AZURE_OPENAI_API_KEY\") or getpass.getpass(\"Enter your AzureOpenAI API key: \")\n", + "api_version = os.environ.get(\"OPENAI_API_VERSION\") or getpass.getpass(\"Enter your AzureOpenAI API version: \")\n", + "azure_endpoint = os.environ.get(\"AZURE_OPENAI_ENDPOINT\") or getpass.getpass(\"Enter your AzureOpenAI API endpoint: \")\n", + "deployment_name = os.environ.get(\"AZURE_OPENAI_DEPLOYMENT_NAME\", \"text-embedding-ada-002\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "AzureOpenAI API endpoint is required. Provide it in api_config or set the AZURE_OPENAI_ENDPOINT environment variable.", + "output_type": "error", + "traceback": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mValueError\u001B[0m Traceback (most recent call last)", + "Cell \u001B[0;32mIn[7], line 4\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[38;5;21;01mredisvl\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mutils\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mvectorize\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[38;5;28;01mimport\u001B[39;00m AzureOpenAITextVectorizer\n\u001B[1;32m 3\u001B[0m \u001B[38;5;66;03m# create a vectorizer\u001B[39;00m\n\u001B[0;32m----> 4\u001B[0m az_oai \u001B[38;5;241m=\u001B[39m \u001B[43mAzureOpenAITextVectorizer\u001B[49m\u001B[43m(\u001B[49m\n\u001B[1;32m 5\u001B[0m \u001B[43m \u001B[49m\u001B[43mmodel\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mdeployment_name\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;66;43;03m# Must be your CUSTOM deployment name\u001B[39;49;00m\n\u001B[1;32m 6\u001B[0m \u001B[43m \u001B[49m\u001B[43mapi_config\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43m{\u001B[49m\n\u001B[1;32m 7\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mapi_key\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43mapi_key\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 8\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mapi_version\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43mapi_version\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 9\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mazure_endpoint\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m:\u001B[49m\u001B[43m \u001B[49m\u001B[43mazure_endpoint\u001B[49m\n\u001B[1;32m 10\u001B[0m \u001B[43m \u001B[49m\u001B[43m}\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 11\u001B[0m \u001B[43m)\u001B[49m\n\u001B[1;32m 13\u001B[0m test \u001B[38;5;241m=\u001B[39m az_oai\u001B[38;5;241m.\u001B[39membed(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mThis is a test sentence.\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 14\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mVector dimensions: \u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;28mlen\u001B[39m(test))\n", + "File \u001B[0;32m~/src/redis-vl-python/redisvl/utils/vectorize/text/azureopenai.py:78\u001B[0m, in 
\u001B[0;36mAzureOpenAITextVectorizer.__init__\u001B[0;34m(self, model, api_config, dtype)\u001B[0m\n\u001B[1;32m 54\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[38;5;21m__init__\u001B[39m(\n\u001B[1;32m 55\u001B[0m \u001B[38;5;28mself\u001B[39m,\n\u001B[1;32m 56\u001B[0m model: \u001B[38;5;28mstr\u001B[39m \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mtext-embedding-ada-002\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[1;32m 57\u001B[0m api_config: Optional[Dict] \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m,\n\u001B[1;32m 58\u001B[0m dtype: \u001B[38;5;28mstr\u001B[39m \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mfloat32\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[1;32m 59\u001B[0m ):\n\u001B[1;32m 60\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"Initialize the AzureOpenAI vectorizer.\u001B[39;00m\n\u001B[1;32m 61\u001B[0m \n\u001B[1;32m 62\u001B[0m \u001B[38;5;124;03m Args:\u001B[39;00m\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 76\u001B[0m \u001B[38;5;124;03m ValueError: If an invalid dtype is provided.\u001B[39;00m\n\u001B[1;32m 77\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[0;32m---> 78\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_initialize_clients\u001B[49m\u001B[43m(\u001B[49m\u001B[43mapi_config\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 79\u001B[0m \u001B[38;5;28msuper\u001B[39m()\u001B[38;5;241m.\u001B[39m\u001B[38;5;21m__init__\u001B[39m(model\u001B[38;5;241m=\u001B[39mmodel, dims\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_set_model_dims(model), dtype\u001B[38;5;241m=\u001B[39mdtype)\n", + "File \u001B[0;32m~/src/redis-vl-python/redisvl/utils/vectorize/text/azureopenai.py:106\u001B[0m, in \u001B[0;36mAzureOpenAITextVectorizer._initialize_clients\u001B[0;34m(self, api_config)\u001B[0m\n\u001B[1;32m 99\u001B[0m azure_endpoint \u001B[38;5;241m=\u001B[39m (\n\u001B[1;32m 100\u001B[0m api_config\u001B[38;5;241m.\u001B[39mpop(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mazure_endpoint\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 101\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m api_config\n\u001B[1;32m 102\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m os\u001B[38;5;241m.\u001B[39mgetenv(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mAZURE_OPENAI_ENDPOINT\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 103\u001B[0m )\n\u001B[1;32m 105\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m azure_endpoint:\n\u001B[0;32m--> 106\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[1;32m 107\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mAzureOpenAI API endpoint is required. 
\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[1;32m 108\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mProvide it in api_config or set the AZURE_OPENAI_ENDPOINT\u001B[39m\u001B[38;5;130;01m\\\u001B[39;00m\n\u001B[1;32m 109\u001B[0m \u001B[38;5;124m environment variable.\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[1;32m 110\u001B[0m )\n\u001B[1;32m 112\u001B[0m api_version \u001B[38;5;241m=\u001B[39m (\n\u001B[1;32m 113\u001B[0m api_config\u001B[38;5;241m.\u001B[39mpop(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mapi_version\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 114\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m api_config\n\u001B[1;32m 115\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m os\u001B[38;5;241m.\u001B[39mgetenv(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mOPENAI_API_VERSION\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 116\u001B[0m )\n\u001B[1;32m 118\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m api_version:\n", + "\u001B[0;31mValueError\u001B[0m: AzureOpenAI API endpoint is required. Provide it in api_config or set the AZURE_OPENAI_ENDPOINT environment variable." + ] + } + ], + "source": [ + "from redisvl.utils.vectorize import AzureOpenAITextVectorizer\n", + "\n", + "# create a vectorizer\n", + "az_oai = AzureOpenAITextVectorizer(\n", + " model=deployment_name, # Must be your CUSTOM deployment name\n", + " api_config={\n", + " \"api_key\": api_key,\n", + " \"api_version\": api_version,\n", + " \"azure_endpoint\": azure_endpoint\n", + " },\n", + ")\n", + "\n", + "test = az_oai.embed(\"This is a test sentence.\")\n", + "print(\"Vector dimensions: \", len(test))\n", + "test[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Just like OpenAI, AzureOpenAI supports batching embeddings and asynchronous requests.\n", + "sentences = [\n", + " \"That is a happy dog\",\n", + " \"That is a happy person\",\n", + " \"Today is a sunny day\"\n", + "]\n", + "\n", + "embeddings = await az_oai.aembed_many(sentences)\n", + "embeddings[0][:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Huggingface\n", + "\n", + "[Huggingface](https://huggingface.co/models) is a popular NLP platform that has a number of pre-trained models you can use off the shelf. RedisVL supports using Huggingface \"Sentence Transformers\" to create embeddings from text. 
To use Huggingface, you will need to install the ``sentence-transformers`` library.\n", + "\n", + "```bash\n", + "pip install sentence-transformers\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", + "from redisvl.utils.vectorize import HFTextVectorizer\n", + "\n", + "\n", + "# create a vectorizer\n", + "# choose your model from the huggingface website\n", + "hf = HFTextVectorizer(model=\"sentence-transformers/all-mpnet-base-v2\")\n", + "\n", + "# embed a sentence\n", + "test = hf.embed(\"This is a test sentence.\")\n", + "test[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# You can also create many embeddings at once\n", + "embeddings = hf.embed_many(sentences, as_buffer=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### VertexAI\n", + "\n", + "[VertexAI](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings) is GCP's fully-featured AI platform including a number of pretrained LLMs. RedisVL supports using VertexAI to create embeddings from these models. To use VertexAI, you will first need to install the ``google-cloud-aiplatform`` library.\n", + "\n", + "```bash\n", + "pip install google-cloud-aiplatform>=1.26\n", + "```\n", + "\n", + "1. Then you need to gain access to a [Google Cloud Project](https://cloud.google.com/gcp?hl=en) and provide [access to credentials](https://cloud.google.com/docs/authentication/application-default-credentials). This is accomplished by setting the `GOOGLE_APPLICATION_CREDENTIALS` environment variable pointing to the path of a JSON key file downloaded from your service account on GCP.\n", + "2. Lastly, you need to find your [project ID](https://support.google.com/googleapi/answer/7014113?hl=en) and [geographic region for VertexAI](https://cloud.google.com/vertex-ai/docs/general/locations).\n", + "\n", + "\n", + "**Make sure the following env vars are set:**\n", + "\n", + "```\n", + "GOOGLE_APPLICATION_CREDENTIALS=\n", + "GCP_PROJECT_ID=\n", + "GCP_LOCATION=\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from redisvl.utils.vectorize import VertexAIVectorizer\n", + "\n", + "\n", + "# create a vectorizer\n", + "vtx = VertexAIVectorizer(api_config={\n", + " \"project_id\": os.environ.get(\"GCP_PROJECT_ID\") or getpass.getpass(\"Enter your GCP Project ID: \"),\n", + " \"location\": os.environ.get(\"GCP_LOCATION\") or getpass.getpass(\"Enter your GCP Location: \"),\n", + " \"google_application_credentials\": os.environ.get(\"GOOGLE_APPLICATION_CREDENTIALS\") or getpass.getpass(\"Enter your Google App Credentials path: \")\n", + "})\n", + "\n", + "# embed a sentence\n", + "test = vtx.embed(\"This is a test sentence.\")\n", + "test[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cohere\n", + "\n", + "[Cohere](https://dashboard.cohere.ai/) allows you to implement language AI into your product. The `CohereTextVectorizer` makes it simple to use RedisVL with the embeddings models at Cohere. 
For this you will need to install `cohere`.\n", + "\n", + "```bash\n", + "pip install cohere\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "# setup the API Key\n", + "api_key = os.environ.get(\"COHERE_API_KEY\") or getpass.getpass(\"Enter your Cohere API key: \")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Special attention needs to be paid to the `input_type` parameter for each `embed` call. For example, for embedding \n", + "queries, you should set `input_type='search_query'`; for embedding documents, set `input_type='search_document'`. See\n", + "more information [here](https://docs.cohere.com/reference/embed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from redisvl.utils.vectorize import CohereTextVectorizer\n", + "\n", + "# create a vectorizer\n", + "co = CohereTextVectorizer(\n", + " model=\"embed-english-v3.0\",\n", + " api_config={\"api_key\": api_key},\n", + ")\n", + "\n", + "# embed a search query\n", + "test = co.embed(\"This is a test sentence.\", input_type='search_query')\n", + "print(\"Vector dimensions: \", len(test))\n", + "print(test[:10])\n", + "\n", + "# embed a document\n", + "test = co.embed(\"This is a test sentence.\", input_type='search_document')\n", + "print(\"Vector dimensions: \", len(test))\n", + "print(test[:10])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Learn more about using RedisVL and Cohere together through [this dedicated user guide](https://docs.cohere.com/docs/redis-and-cohere)." + ] + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### VoyageAI\n", + "\n", + "[VoyageAI](https://dash.voyageai.com/) allows you to implement language AI into your product. The `VoyageAIVectorizer` makes it simple to use RedisVL with the embeddings models at VoyageAI. For this you will need to install `voyageai`.\n", + "\n", + "```bash\n", + "pip install voyageai\n", + "```" + ] + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "import getpass\n", + "# setup the API Key\n", + "api_key = os.environ.get(\"VOYAGE_API_KEY\") or getpass.getpass(\"Enter your VoyageAI API key: \")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Special attention needs to be paid to the `input_type` parameter for each `embed` call. For example, for embedding \n", + "queries, you should set `input_type='query'`; for embedding documents, set `input_type='document'`. 
See\n", + "more information [here](https://docs.voyageai.com/docs/embeddings)" + ] + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "from redisvl.utils.vectorize import VoyageAIVectorizer\n", + "\n", + "# create a vectorizer\n", + "vo = VoyageAIVectorizer(\n", + " model=\"voyage-law-2\", # Please check the available models at https://docs.voyageai.com/docs/embeddings\n", + " api_config={\"api_key\": api_key},\n", + ")\n", + "\n", + "# embed a search query\n", + "test = vo.embed(\"This is a test sentence.\", input_type='query')\n", + "print(\"Vector dimensions: \", len(test))\n", + "print(test[:10])\n", + "\n", + "# embed a document\n", + "test = vo.embed(\"This is a test sentence.\", input_type='document')\n", + "print(\"Vector dimensions: \", len(test))\n", + "print(test[:10])" + ] + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### Mistral AI\n", + "\n", + "[Mistral](https://console.mistral.ai/) offers LLM and embedding APIs for you to implement into your product. The `MistralAITextVectorizer` makes it simple to use RedisVL with their embeddings model.\n", + "You will need to install `mistralai`.\n", + "\n", + "```bash\n", + "pip install mistralai\n", + "```" + ] + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "from redisvl.utils.vectorize import MistralAITextVectorizer\n", + "\n", + "mistral = MistralAITextVectorizer()\n", + "\n", + "# embed a sentence using their asynchronous method\n", + "test = await mistral.aembed(\"This is a test sentence.\")\n", + "print(\"Vector dimensions: \", len(test))\n", + "print(test[:10])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Amazon Bedrock\n", + "\n", + "Amazon Bedrock provides fully managed foundation models for text embeddings. 
Install the required dependencies:\n", + "\n", + "```bash\n", + "pip install 'redisvl[bedrock]' # Installs boto3\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Configure AWS credentials:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import getpass\n", + "\n", + "if \"AWS_ACCESS_KEY_ID\" not in os.environ:\n", + " os.environ[\"AWS_ACCESS_KEY_ID\"] = getpass.getpass(\"Enter AWS Access Key ID: \")\n", + "if \"AWS_SECRET_ACCESS_KEY\" not in os.environ:\n", + " os.environ[\"AWS_SECRET_ACCESS_KEY\"] = getpass.getpass(\"Enter AWS Secret Key: \")\n", + "\n", + "os.environ[\"AWS_REGION\"] = \"us-east-1\" # Change as needed" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Create embeddings:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from redisvl.utils.vectorize import BedrockVectorizer\n", + "\n", + "bedrock = BedrockVectorizer(\n", + " model=\"amazon.titan-embed-text-v2:0\"\n", + ")\n", + "\n", + "# Single embedding\n", + "text = \"This is a test sentence.\"\n", + "embedding = bedrock.embed(text)\n", + "print(f\"Vector dimensions: {len(embedding)}\")\n", + "\n", + "# Multiple embeddings\n", + "sentences = [\n", + " \"That is a happy dog\",\n", + " \"That is a happy person\",\n", + " \"Today is a sunny day\"\n", + "]\n", + "embeddings = bedrock.embed_many(sentences)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Custom Vectorizers\n", + "\n", + "RedisVL supports the use of other vectorizers and provides a class to enable compatibility with any function that generates a vector or vectors from string data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from redisvl.utils.vectorize import CustomVectorizer\n", + "\n", + "def generate_embeddings(text_input, **kwargs):\n", + " return [0.101] * 768\n", + "\n", + "custom_vectorizer = CustomVectorizer(generate_embeddings)\n", + "\n", + "custom_vectorizer.embed(\"This is a test sentence.\")[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This enables the use of custom vectorizers with other RedisVL components" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from redisvl.extensions.cache.llm import SemanticCache\n", + "\n", + "cache = SemanticCache(name=\"custom_cache\", vectorizer=custom_vectorizer)\n", + "\n", + "cache.store(\"this is a test prompt\", \"this is a test response\")\n", + "cache.check(\"this is also a test prompt\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Search with Provider Embeddings\n", + "\n", + "Now that we've created our embeddings, we can use them to search for similar sentences. 
We will use the same 3 sentences from above and search for similar sentences.\n", + "\n", + "First, we need to create the schema for our index.\n", + "\n", + "Here's what the schema for the example looks like in yaml for the HuggingFace vectorizer:\n", + "\n", + "```yaml\n", + "version: '0.1.0'\n", + "\n", + "index:\n", + " name: vectorizers\n", + " prefix: doc\n", + " storage_type: hash\n", + "\n", + "fields:\n", + " - name: sentence\n", + " type: text\n", + " - name: embedding\n", + " type: vector\n", + " attrs:\n", + " dims: 768\n", + " algorithm: flat\n", + " distance_metric: cosine\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from redisvl.index import SearchIndex\n", + "\n", + "# construct a search index from the schema\n", + "index = SearchIndex.from_yaml(\"./schema.yaml\", redis_url=\"redis://localhost:6379\")\n", + "\n", + "# create the index (no data yet)\n", + "index.create(overwrite=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# use the CLI to see the created index\n", + "!rvl index listall" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Loading data to RedisVL is easy. It expects a list of dictionaries. The vector is stored as bytes." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "from redisvl.redis.utils import array_to_buffer\n", + "\n", + "embeddings = hf.embed_many(sentences)\n", + "\n", + "data = [{\"text\": t,\n", + " \"embedding\": array_to_buffer(v, dtype=\"float32\")}\n", + " for t, v in zip(sentences, embeddings)]\n", + "\n", + "index.load(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from redisvl.query import VectorQuery\n", + "\n", + "# use the HuggingFace vectorizer again to create a query embedding\n", + "query_embedding = hf.embed(\"That is a happy cat\")\n", + "\n", + "query = VectorQuery(\n", + " vector=query_embedding,\n", + " vector_field_name=\"embedding\",\n", + " return_fields=[\"text\"],\n", + " num_results=3\n", + ")\n", + "\n", + "results = index.query(query)\n", + "for doc in results:\n", + " print(doc[\"text\"], doc[\"vector_distance\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Selecting your float data type\n", + "When embedding text as byte arrays RedisVL supports 4 different floating point data types, `float16`, `float32`, `float64` and `bfloat16`, and 2 integer types, `int8` and `uint8`.\n", + "Your dtype set for your vectorizer must match what is defined in your search index. If one is not explicitly set the default is `float32`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer = HFTextVectorizer(dtype=\"float16\")\n", + "\n", + "# subsequent calls to embed('', as_buffer=True) and embed_many('', as_buffer=True) will now encode as float16\n", + "float16_bytes = vectorizer.embed('test sentence', as_buffer=True)\n", + "\n", + "# to generate embeddings with different dtype instantiate a new vectorizer\n", + "vectorizer_64 = HFTextVectorizer(dtype='float64')\n", + "float64_bytes = vectorizer_64.embed('test sentence', as_buffer=True)\n", + "\n", + "float16_bytes != float64_bytes" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# cleanup\n", + "index.delete()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 }, - "nbformat": 4, - "nbformat_minor": 2 + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.2" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/docs/user_guide/10_embeddings_cache.ipynb b/docs/user_guide/10_embeddings_cache.ipynb index 265261df..d5cae6a9 100644 --- a/docs/user_guide/10_embeddings_cache.ipynb +++ b/docs/user_guide/10_embeddings_cache.ipynb @@ -25,10 +25,10 @@ ] }, { - "cell_type": "code", - "execution_count": 1, "metadata": {}, + "cell_type": "code", "outputs": [], + "execution_count": null, "source": [ "import os\n", "import time\n", @@ -43,45 +43,15 @@ ] }, { - "cell_type": "markdown", "metadata": {}, - "source": [ - "Let's create a vectorizer to generate embeddings for our texts:" - ] + "cell_type": "markdown", + "source": "Let's create a vectorizer to generate embeddings for our texts:" }, { + "metadata": {}, "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/tyler.hutcherson/Documents/AppliedAI/redis-vl-python/.venv/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "13:06:09 sentence_transformers.SentenceTransformer INFO Use pytorch device_name: mps\n", - "13:06:09 sentence_transformers.SentenceTransformer INFO Load pretrained SentenceTransformer: redis/langcache-embed-v1\n", - "13:06:09 sentence_transformers.SentenceTransformer WARNING You try to use a model that was created with version 4.1.0, however, your version is 3.4.1. This might cause unexpected behavior or errors. 
In that case, try to update to the latest version.\n", - "\n", - "\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Batches: 100%|██████████| 1/1 [00:00<00:00, 4.09it/s]\n" - ] - } - ], + "outputs": [], + "execution_count": null, "source": [ "# Initialize the vectorizer\n", "vectorizer = HFTextVectorizer(\n", @@ -91,8 +61,8 @@ ] }, { - "cell_type": "markdown", "metadata": {}, + "cell_type": "markdown", "source": [ "## Initializing the EmbeddingsCache\n", "\n", @@ -100,10 +70,10 @@ ] }, { - "cell_type": "code", - "execution_count": 3, "metadata": {}, + "cell_type": "code", "outputs": [], + "execution_count": null, "source": [ "# Initialize the embeddings cache\n", "cache = EmbeddingsCache(\n", @@ -114,8 +84,8 @@ ] }, { - "cell_type": "markdown", "metadata": {}, + "cell_type": "markdown", "source": [ "## Basic Usage\n", "\n", @@ -130,32 +100,10 @@ ] }, { + "metadata": {}, "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Batches: 100%|██████████| 1/1 [00:00<00:00, 3.18it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored with key: embedcache:909f...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], + "execution_count": null, "source": [ "# Text to embed\n", "text = \"What is machine learning?\"\n", @@ -169,7 +117,7 @@ "\n", "# Store in cache\n", "key = cache.set(\n", - " text=text,\n", + " content=text,\n", " model_name=model_name,\n", " embedding=embedding,\n", " metadata=metadata\n", @@ -179,8 +127,8 @@ ] }, { - "cell_type": "markdown", "metadata": {}, + "cell_type": "markdown", "source": [ "### Retrieving Embeddings\n", "\n", @@ -188,26 +136,15 @@ ] }, { + "metadata": {}, "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found in cache: What is machine learning?\n", - "Model: redis/langcache-embed-v1\n", - "Metadata: {'category': 'ai', 'source': 'user_query'}\n", - "Embedding shape: (768,)\n" - ] - } - ], + "outputs": [], + "execution_count": null, "source": [ "# Retrieve from cache\n", "\n", - "if result := cache.get(text=text, model_name=model_name):\n", - " print(f\"Found in cache: {result['text']}\")\n", + "if result := cache.get(content=text, model_name=model_name):\n", + " print(f\"Found in cache: {result['content']}\")\n", " print(f\"Model: {result['model_name']}\")\n", " print(f\"Metadata: {result['metadata']}\")\n", " print(f\"Embedding shape: {np.array(result['embedding']).shape}\")\n", @@ -216,8 +153,8 @@ ] }, { - "cell_type": "markdown", "metadata": {}, + "cell_type": "markdown", "source": [ "### Checking Existence\n", "\n", @@ -225,33 +162,24 @@ ] }, { + "metadata": {}, "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "First query exists in cache: True\n", - "New query exists in cache: False\n" - ] - } - ], + "outputs": [], + "execution_count": null, "source": [ "# Check if existing text is in cache\n", - "exists = cache.exists(text=text, model_name=model_name)\n", + "exists = cache.exists(content=text, model_name=model_name)\n", "print(f\"First query exists in cache: {exists}\")\n", "\n", "# Check if a new text is in cache\n", "new_text = \"What is deep learning?\"\n", - "exists = cache.exists(text=new_text, model_name=model_name)\n", + "exists = 
cache.exists(content=new_text, model_name=model_name)\n", "print(f\"New query exists in cache: {exists}\")" ] }, { - "cell_type": "markdown", "metadata": {}, + "cell_type": "markdown", "source": [ "### Removing Entries\n", "\n", @@ -259,30 +187,22 @@ ] }, { + "metadata": {}, "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "After dropping: False\n" - ] - } - ], + "outputs": [], + "execution_count": null, "source": [ "# Remove from cache\n", - "cache.drop(text=text, model_name=model_name)\n", + "cache.drop(content=text, model_name=model_name)\n", "\n", "# Verify it's gone\n", - "exists = cache.exists(text=text, model_name=model_name)\n", + "exists = cache.exists(content=text, model_name=model_name)\n", "print(f\"After dropping: {exists}\")" ] }, { - "cell_type": "markdown", "metadata": {}, + "cell_type": "markdown", "source": [ "## Advanced Usage\n", "\n", @@ -292,24 +212,14 @@ ] }, { + "metadata": {}, "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored with key: embedcache:909f...\n", - "Exists by key: True\n", - "Retrieved by key: What is machine learning?\n" - ] - } - ], + "outputs": [], + "execution_count": null, "source": [ "# Store an entry again\n", "key = cache.set(\n", - " text=text,\n", + " content=text,\n", " model_name=model_name,\n", " embedding=embedding,\n", " metadata=metadata\n", @@ -322,15 +232,15 @@ "\n", "# Retrieve by key\n", "result_by_key = cache.get_by_key(key)\n", - "print(f\"Retrieved by key: {result_by_key['text']}\")\n", + "print(f\"Retrieved by key: {result_by_key['content']}\")\n", "\n", "# Drop by key\n", "cache.drop_by_key(key)" ] }, { - "cell_type": "markdown", "metadata": {}, + "cell_type": "markdown", "source": [ "### Batch Operations\n", "\n", @@ -338,36 +248,10 @@ ] }, { + "metadata": {}, "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Batches: 100%|██████████| 1/1 [00:00<00:00, 21.37it/s]\n", - "Batches: 100%|██████████| 1/1 [00:00<00:00, 9.04it/s]\n", - "Batches: 100%|██████████| 1/1 [00:00<00:00, 20.84it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored 3 embeddings with batch operation\n", - "All embeddings exist: True\n", - "Retrieved 3 embeddings in one operation\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], + "execution_count": null, "source": [ "# Create multiple embeddings\n", "texts = [\n", @@ -380,19 +264,19 @@ "# Prepare batch items as dictionaries\n", "batch_items = [\n", " {\n", - " \"text\": texts[0],\n", + " \"content\": texts[0],\n", " \"model_name\": model_name,\n", " \"embedding\": embeddings[0],\n", " \"metadata\": {\"category\": \"ai\", \"type\": \"question\"}\n", " },\n", " {\n", - " \"text\": texts[1],\n", + " \"content\": texts[1],\n", " \"model_name\": model_name,\n", " \"embedding\": embeddings[1],\n", " \"metadata\": {\"category\": \"ai\", \"type\": \"question\"}\n", " },\n", " {\n", - " \"text\": texts[2],\n", + " \"content\": texts[2],\n", " \"model_name\": model_name,\n", " \"embedding\": embeddings[2],\n", " \"metadata\": {\"category\": \"ai\", \"type\": \"question\"}\n", @@ -421,8 +305,8 @@ ] }, { - "cell_type": "markdown", "metadata": {}, + "cell_type": "markdown", "source": [ "Batch operations are particularly beneficial when working 
with large numbers of embeddings. They provide the same functionality as individual operations but with better performance by reducing network roundtrips.\n", "\n", @@ -430,8 +314,8 @@ ] }, { - "cell_type": "markdown", "metadata": {}, + "cell_type": "markdown", "source": [ "### Working with TTL (Time-To-Live)\n", "\n", @@ -439,19 +323,10 @@ ] }, { + "metadata": {}, "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Immediately after setting: True\n", - "After waiting: False\n" - ] - } - ], + "outputs": [], + "execution_count": null, "source": [ "# Create a cache with a default 5-second TTL\n", "ttl_cache = EmbeddingsCache(\n", @@ -462,7 +337,7 @@ "\n", "# Store an entry\n", "key = ttl_cache.set(\n", - " text=text,\n", + " content=text,\n", " model_name=model_name,\n", " embedding=embedding\n", ")\n", @@ -480,30 +355,19 @@ ] }, { - "cell_type": "markdown", "metadata": {}, - "source": [ - "You can also override the default TTL for individual entries:" - ] + "cell_type": "markdown", + "source": "You can also override the default TTL for individual entries:" }, { + "metadata": {}, "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Entry with custom TTL after 2 seconds: False\n", - "Entry with default TTL after 2 seconds: True\n" - ] - } - ], + "outputs": [], + "execution_count": null, "source": [ "# Store an entry with a custom 1-second TTL\n", "key1 = ttl_cache.set(\n", - " text=\"Short-lived entry\",\n", + " content=\"Short-lived entry\",\n", " model_name=model_name,\n", " embedding=embedding,\n", " ttl=1 # Override with 1 second TTL\n", @@ -511,7 +375,7 @@ "\n", "# Store another entry with the default TTL (5 seconds)\n", "key2 = ttl_cache.set(\n", - " text=\"Default TTL entry\",\n", + " content=\"Default TTL entry\",\n", " model_name=model_name,\n", " embedding=embedding\n", " # No TTL specified = uses the default 5 seconds\n", @@ -532,8 +396,8 @@ ] }, { - "cell_type": "markdown", "metadata": {}, + "cell_type": "markdown", "source": [ "## Async Support\n", "\n", @@ -541,38 +405,29 @@ ] }, { + "metadata": {}, "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Async set successful? True\n", - "Async get successful? True\n" - ] - } - ], + "outputs": [], + "execution_count": null, "source": [ "async def async_cache_demo():\n", " # Store an entry asynchronously\n", " key = await cache.aset(\n", - " text=\"Async embedding\",\n", + " content=\"Async embedding\",\n", " model_name=model_name,\n", " embedding=embedding,\n", " metadata={\"async\": True}\n", " )\n", - " \n", + "\n", " # Check if it exists\n", " exists = await cache.aexists_by_key(key)\n", " print(f\"Async set successful? {exists}\")\n", - " \n", + "\n", " # Retrieve it\n", " result = await cache.aget_by_key(key)\n", - " success = result is not None and result[\"text\"] == \"Async embedding\"\n", + " success = result is not None and result[\"content\"] == \"Async embedding\"\n", " print(f\"Async get successful? 
{success}\")\n", - " \n", + "\n", " # Remove it\n", " await cache.adrop_by_key(key)\n", "\n", @@ -581,8 +436,8 @@ ] }, { - "cell_type": "markdown", "metadata": {}, + "cell_type": "markdown", "source": [ "## Real-World Example\n", "\n", @@ -590,52 +445,10 @@ ] }, { + "metadata": {}, "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "13:06:20 sentence_transformers.SentenceTransformer INFO Use pytorch device_name: mps\n", - "13:06:20 sentence_transformers.SentenceTransformer INFO Load pretrained SentenceTransformer: redis/langcache-embed-v1\n", - "13:06:20 sentence_transformers.SentenceTransformer WARNING You try to use a model that was created with version 4.1.0, however, your version is 3.4.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.\n", - "\n", - "\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Batches: 100%|██████████| 1/1 [00:00<00:00, 21.84it/s]\n", - "Batches: 100%|██████████| 1/1 [00:00<00:00, 22.04it/s]\n", - "Batches: 100%|██████████| 1/1 [00:00<00:00, 22.62it/s]\n", - "Batches: 100%|██████████| 1/1 [00:00<00:00, 22.71it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Statistics:\n", - "Total queries: 5\n", - "Cache hits: 2\n", - "Cache misses: 3\n", - "Cache hit rate: 40.0%\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], + "execution_count": null, "source": [ "# Create a fresh cache for this example\n", "example_cache = EmbeddingsCache(\n", @@ -665,12 +478,12 @@ "\n", "for query in queries:\n", " total_queries += 1\n", - " \n", + "\n", " # Check cache before computing\n", - " before = example_cache.exists(text=query, model_name=model_name)\n", + " before = example_cache.exists(content=query, model_name=model_name)\n", " if before:\n", " cache_hits += 1\n", - " \n", + "\n", " # Get embedding (will compute or use cache)\n", " embedding = vectorizer.embed(query)\n", "\n", @@ -686,12 +499,12 @@ "\n", "# Cleanup\n", "for query in set(queries): # Use set to get unique queries\n", - " example_cache.drop(text=query, model_name=model_name)" + " example_cache.drop(content=query, model_name=model_name)" ] }, { - "cell_type": "markdown", "metadata": {}, + "cell_type": "markdown", "source": [ "## Performance Benchmark\n", "\n", @@ -699,64 +512,10 @@ ] }, { + "metadata": {}, "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Benchmarking without caching:\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Batches: 100%|██████████| 1/1 [00:00<00:00, 21.51it/s]\n", - "Batches: 100%|██████████| 1/1 [00:00<00:00, 23.21it/s]\n", - "Batches: 100%|██████████| 1/1 [00:00<00:00, 23.96it/s]\n", - "Batches: 100%|██████████| 1/1 [00:00<00:00, 23.28it/s]\n", - "Batches: 100%|██████████| 1/1 [00:00<00:00, 22.69it/s]\n", - "Batches: 100%|██████████| 1/1 [00:00<00:00, 22.98it/s]\n", - "Batches: 100%|██████████| 1/1 [00:00<00:00, 23.17it/s]\n", - "Batches: 100%|██████████| 1/1 [00:00<00:00, 24.12it/s]\n", - "Batches: 100%|██████████| 1/1 [00:00<00:00, 23.37it/s]\n", - "Batches: 100%|██████████| 1/1 [00:00<00:00, 23.24it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Time taken without caching: 0.4549 seconds\n", - "Average time per embedding: 0.0455 seconds\n", - "\n", - 
"Benchmarking with caching:\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Batches: 100%|██████████| 1/1 [00:00<00:00, 23.69it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Time taken with caching: 0.0664 seconds\n", - "Average time per embedding: 0.0066 seconds\n", - "\n", - "Performance comparison:\n", - "Speedup with caching: 6.86x faster\n", - "Time saved: 0.3885 seconds (85.4%)\n", - "Latency reduction: 0.0389 seconds per query\n" - ] - } - ], + "outputs": [], + "execution_count": null, "source": [ "# Text to use for benchmarking\n", "benchmark_text = \"This is a benchmark text to measure the performance of embedding caching.\"\n", @@ -800,8 +559,8 @@ ] }, { - "cell_type": "markdown", "metadata": {}, + "cell_type": "markdown", "source": [ "## Common Use Cases for Embedding Caching\n", "\n", @@ -816,8 +575,8 @@ ] }, { - "cell_type": "markdown", "metadata": {}, + "cell_type": "markdown", "source": [ "## Cleanup\n", "\n", @@ -825,10 +584,10 @@ ] }, { - "cell_type": "code", - "execution_count": 15, "metadata": {}, + "cell_type": "code", "outputs": [], + "execution_count": null, "source": [ "# Clean up all caches\n", "cache.clear()\n", diff --git a/pyproject.toml b/pyproject.toml index 359a3284..8d24466d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "redisvl" -version = "0.12.1" +version = "0.13.0" description = "Python client library and CLI for using Redis as a vector database" authors = [{ name = "Redis Inc.", email = "applied.ai@redis.com" }] requires-python = ">=3.9.2,<3.14" @@ -48,6 +48,9 @@ bedrock = [ "boto3>=1.36.0,<2", "urllib3<2.2.0", ] +pillow = [ + "pillow>=11.3.0", +] [project.urls] Homepage = "https://github.com/redis/redis-vl-python" diff --git a/redisvl/extensions/cache/embeddings/embeddings.py b/redisvl/extensions/cache/embeddings/embeddings.py index 2049ce94..8e98b36e 100644 --- a/redisvl/extensions/cache/embeddings/embeddings.py +++ b/redisvl/extensions/cache/embeddings/embeddings.py @@ -1,6 +1,6 @@ """Embeddings cache implementation for RedisVL.""" -from typing import Any, Awaitable, Dict, List, Optional, Tuple, cast +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union from redisvl.extensions.cache.base import BaseCache from redisvl.extensions.cache.embeddings.schema import CacheEntry @@ -54,34 +54,36 @@ def __init__( connection_kwargs=connection_kwargs, ) - def _make_entry_id(self, text: str, model_name: str) -> str: - """Generate a deterministic entry ID for the given text and model name. + def _make_entry_id(self, content: Union[bytes, str], model_name: str) -> str: + """Generate a deterministic entry ID for the given content and model name. Args: - text (str): The text input that was embedded. + content (bytes | str): The content that was embedded. model_name (str): The name of the embedding model. Returns: - str: A deterministic entry ID based on the text and model name. + str: A deterministic entry ID based on the content and model name. """ - return hashify(f"{text}:{model_name}") + if isinstance(content, bytes): + content = content.hex() + return hashify(f"{content}:{model_name}") - def _make_cache_key(self, text: str, model_name: str) -> str: - """Generate a full Redis key for the given text and model name. + def _make_cache_key(self, content: Union[bytes, str], model_name: str) -> str: + """Generate a full Redis key for the given content and model name. Args: - text (str): The text input that was embedded. 
+ content (bytes | str): The content that was embedded. model_name (str): The name of the embedding model. Returns: str: The full Redis key. """ - entry_id = self._make_entry_id(text, model_name) + entry_id = self._make_entry_id(content, model_name) return self._make_key(entry_id) def _prepare_entry_data( self, - text: str, + content: Union[bytes, str], model_name: str, embedding: List[float], metadata: Optional[Dict[str, Any]] = None, @@ -89,7 +91,7 @@ def _prepare_entry_data( """Prepare data for storage in Redis Args: - text (str): The text input that was embedded. + content (bytes | str): The content that was embedded. model_name (str): The name of the embedding model. embedding (List[float]): The embedding vector. metadata (Optional[Dict[str, Any]]): Optional metadata. @@ -98,11 +100,11 @@ def _prepare_entry_data( Tuple[str, Dict[str, Any]]: A tuple of (key, entry_data) """ # Create cache entry with entry_id - entry_id = self._make_entry_id(text, model_name) + entry_id = self._make_entry_id(content, model_name) key = self._make_key(entry_id) entry = CacheEntry( entry_id=entry_id, - text=text, + content=content, model_name=model_name, embedding=embedding, metadata=metadata, @@ -136,16 +138,16 @@ def _should_warn_for_async_only(self) -> bool: def get( self, - text: str, + content: Union[bytes, str], model_name: str, ) -> Optional[Dict[str, Any]]: - """Get embedding by text and model name. + """Get embedding by content and model name. - Retrieves a cached embedding for the given text and model name. + Retrieves a cached embedding for the given content and model name. If found, refreshes the TTL of the entry. Args: - text (str): The text input that was embedded. + content (bytes | str): The content that was embedded. model_name (str): The name of the embedding model. Returns: @@ -154,11 +156,11 @@ def get( .. code-block:: python embedding_data = cache.get( - text="What is machine learning?", + content="What is machine learning?", model_name="text-embedding-ada-002" ) """ - key = self._make_cache_key(text, model_name) + key = self._make_cache_key(content, model_name) return self.get_by_key(key) def get_by_key(self, key: str) -> Optional[Dict[str, Any]]: @@ -245,48 +247,50 @@ def mget_by_keys(self, keys: List[str]) -> List[Optional[Dict[str, Any]]]: return processed_results - def mget(self, texts: List[str], model_name: str) -> List[Optional[Dict[str, Any]]]: - """Get multiple embeddings by their texts and model name. + def mget( + self, contents: Iterable[Union[bytes, str]], model_name: str + ) -> List[Optional[Dict[str, Any]]]: + """Get multiple embeddings by their content and model name. Efficiently retrieves multiple cached embeddings in a single operation. If found, refreshes the TTL of each entry. Args: - texts (List[str]): List of text inputs that were embedded. + contents (Iterable[bytes | str]): Iterable of content that was embedded. model_name (str): The name of the embedding model. Returns: - List[Optional[Dict[str, Any]]]: List of embedding cache entries or None for texts not found. + List[Optional[Dict[str, Any]]]: List of embedding cache entries or None for contents not found. .. 
code-block:: python # Get multiple embeddings embedding_data = cache.mget( - texts=["What is machine learning?", "What is deep learning?"], + contents=["What is machine learning?", "What is deep learning?"], model_name="text-embedding-ada-002" ) """ - if not texts: - return [] + # Generate keys for each piece of content + keys = [self._make_cache_key(content, model_name) for content in contents] - # Generate keys for each text - keys = [self._make_cache_key(text, model_name) for text in texts] + if not keys: + return [] # Use the key-based batch operation return self.mget_by_keys(keys) def set( self, - text: str, + content: Union[bytes, str], model_name: str, embedding: List[float], metadata: Optional[Dict[str, Any]] = None, ttl: Optional[int] = None, ) -> str: - """Store an embedding with its text and model name. + """Store an embedding with its content and model name. Args: - text (str): The text input that was embedded. + content (Union[bytes, str]): The content to be embedded. model_name (str): The name of the embedding model. embedding (List[float]): The embedding vector to store. metadata (Optional[Dict[str, Any]]): Optional metadata to store with the embedding. @@ -298,7 +302,7 @@ def set( .. code-block:: python key = cache.set( - text="What is machine learning?", + content="What is machine learning?", model_name="text-embedding-ada-002", embedding=[0.1, 0.2, 0.3, ...], metadata={"source": "user_query"} @@ -306,7 +310,7 @@ def set( """ # Prepare data key, cache_entry = self._prepare_entry_data( - text, model_name, embedding, metadata + content, model_name, embedding, metadata ) if self._should_warn_for_async_only(): @@ -334,13 +338,13 @@ def mset( """Store multiple embeddings in a batch operation. Each item in the input list should be a dictionary with the following fields: - - 'text': The text input that was embedded + - 'content': The input that was embedded - 'model_name': The name of the embedding model - 'embedding': The embedding vector - 'metadata': Optional metadata to store with the embedding Args: - items: List of dictionaries, each containing text, model_name, embedding, and optional metadata. + items: List of dictionaries, each containing content, model_name, embedding, and optional metadata. ttl: Optional TTL override for these entries. Returns: @@ -351,13 +355,13 @@ def mset( # Store multiple embeddings keys = cache.mset([ { - "text": "What is ML?", + "content": "What is ML?", "model_name": "text-embedding-ada-002", "embedding": [0.1, 0.2, 0.3], "metadata": {"source": "user"} }, { - "text": "What is AI?", + "content": "What is AI?", "model_name": "text-embedding-ada-002", "embedding": [0.4, 0.5, 0.6], "metadata": {"source": "docs"} @@ -394,11 +398,11 @@ def mset( return keys - def exists(self, text: str, model_name: str) -> bool: - """Check if an embedding exists for the given text and model. + def exists(self, content: Union[bytes, str], model_name: str) -> bool: + """Check if an embedding exists for the given content and model. Args: - text (str): The text input that was embedded. + content (bytes | str): The content that was embedded. model_name (str): The name of the embedding model. 
Returns: @@ -410,7 +414,7 @@ def exists(self, text: str, model_name: str) -> bool: print("Embedding is in cache") """ client = self._get_redis_client() - key = self._make_cache_key(text, model_name) + key = self._make_cache_key(content, model_name) return bool(client.exists(key)) def exists_by_key(self, key: str) -> bool: @@ -461,13 +465,15 @@ def mexists_by_keys(self, keys: List[str]) -> List[bool]: # Convert to boolean values return [bool(result) for result in results] - def mexists(self, texts: List[str], model_name: str) -> List[bool]: - """Check if multiple embeddings exist by their texts and model name. + def mexists( + self, contents: Iterable[Union[bytes, str]], model_name: str + ) -> List[bool]: + """Check if multiple embeddings exist by their contents and model name. Efficiently checks existence of multiple embeddings in a single operation. Args: - texts (List[str]): List of text inputs that were embedded. + contents (Iterable[bytes | str]): Iterable of content that was embedded. model_name (str): The name of the embedding model. Returns: @@ -477,34 +483,34 @@ def mexists(self, texts: List[str], model_name: str) -> List[bool]: # Check if multiple embeddings exist exists_results = cache.mexists( - texts=["What is machine learning?", "What is deep learning?"], + contents=["What is machine learning?", "What is deep learning?"], model_name="text-embedding-ada-002" ) """ - if not texts: - return [] + # Generate keys for each content + keys = [self._make_cache_key(content, model_name) for content in contents] - # Generate keys for each text - keys = [self._make_cache_key(text, model_name) for text in texts] + if not keys: + return [] # Use the key-based batch operation return self.mexists_by_keys(keys) - def drop(self, text: str, model_name: str) -> None: + def drop(self, content: Union[bytes, str], model_name: str) -> None: """Remove an embedding from the cache. Args: - text (str): The text input that was embedded. + content (bytes | str): The content that was embedded. model_name (str): The name of the embedding model. .. code-block:: python cache.drop( - text="What is machine learning?", + content="What is machine learning?", model_name="text-embedding-ada-002" ) """ - key = self._make_cache_key(text, model_name) + key = self._make_cache_key(content, model_name) self.drop_by_key(key) def drop_by_key(self, key: str) -> None: @@ -543,44 +549,44 @@ def mdrop_by_keys(self, keys: List[str]) -> None: pipeline.delete(key) pipeline.execute() - def mdrop(self, texts: List[str], model_name: str) -> None: - """Remove multiple embeddings from the cache by their texts and model name. + def mdrop(self, contents: Iterable[Union[bytes, str]], model_name: str) -> None: + """Remove multiple embeddings from the cache by their contents and model name. Efficiently removes multiple embeddings in a single operation. Args: - texts (List[str]): List of text inputs that were embedded. + contents (Iterable[bytes | str]): Iterable of content that was embedded. model_name (str): The name of the embedding model. .. 
code-block:: python # Remove multiple embeddings cache.mdrop( - texts=["What is machine learning?", "What is deep learning?"], + contents=["What is machine learning?", "What is deep learning?"], model_name="text-embedding-ada-002" ) """ - if not texts: - return + # Generate keys for each content + keys = [self._make_cache_key(content, model_name) for content in contents] - # Generate keys for each text - keys = [self._make_cache_key(text, model_name) for text in texts] + if not keys: + return # Use the key-based batch operation self.mdrop_by_keys(keys) async def aget( self, - text: str, + content: Union[bytes, str], model_name: str, ) -> Optional[Dict[str, Any]]: - """Async get embedding by text and model name. + """Async get embedding by content and model name. - Asynchronously retrieves a cached embedding for the given text and model name. + Asynchronously retrieves a cached embedding for the given content and model name. If found, refreshes the TTL of the entry. Args: - text (str): The text input that was embedded. + content (bytes | str): The content that was embedded. model_name (str): The name of the embedding model. Returns: @@ -589,11 +595,11 @@ async def aget( .. code-block:: python embedding_data = await cache.aget( - text="What is machine learning?", + content="What is machine learning?", model_name="text-embedding-ada-002" ) """ - key = self._make_cache_key(text, model_name) + key = self._make_cache_key(content, model_name) return await self.aget_by_key(key) async def aget_by_key(self, key: str) -> Optional[Dict[str, Any]]: @@ -666,51 +672,51 @@ async def amget_by_keys(self, keys: List[str]) -> List[Optional[Dict[str, Any]]] return processed_results async def amget( - self, texts: List[str], model_name: str + self, contents: Iterable[Union[bytes, str]], model_name: str ) -> List[Optional[Dict[str, Any]]]: - """Async get multiple embeddings by their texts and model name. + """Async get multiple embeddings by their contents and model name. Asynchronously retrieves multiple cached embeddings in a single operation. If found, refreshes the TTL of each entry. Args: - texts (List[str]): List of text inputs that were embedded. + contents (Iterable[bytes | str]): Iterable of content that was embedded. model_name (str): The name of the embedding model. Returns: - List[Optional[Dict[str, Any]]]: List of embedding cache entries or None for texts not found. + List[Optional[Dict[str, Any]]]: List of embedding cache entries or None for contents not found. .. code-block:: python # Get multiple embeddings asynchronously embedding_data = await cache.amget( - texts=["What is machine learning?", "What is deep learning?"], + contents=["What is machine learning?", "What is deep learning?"], model_name="text-embedding-ada-002" ) """ - if not texts: - return [] + # Generate keys for each content + keys = [self._make_cache_key(content, model_name) for content in contents] - # Generate keys for each text - keys = [self._make_cache_key(text, model_name) for text in texts] + if not keys: + return [] # Use the key-based batch operation return await self.amget_by_keys(keys) async def aset( self, - text: str, + content: Union[bytes, str], model_name: str, embedding: List[float], metadata: Optional[Dict[str, Any]] = None, ttl: Optional[int] = None, ) -> str: - """Async store an embedding with its text and model name. + """Async store an embedding with its content and model name. - Asynchronously stores an embedding with its text and model name. 
+ Asynchronously stores an embedding with its content and model name. Args: - text (str): The text input that was embedded. + content (bytes | str): The content that was embedded. model_name (str): The name of the embedding model. embedding (List[float]): The embedding vector to store. metadata (Optional[Dict[str, Any]]): Optional metadata to store with the embedding. @@ -722,7 +728,7 @@ async def aset( .. code-block:: python key = await cache.aset( - text="What is machine learning?", + content="What is machine learning?", model_name="text-embedding-ada-002", embedding=[0.1, 0.2, 0.3, ...], metadata={"source": "user_query"} @@ -730,7 +736,7 @@ async def aset( """ # Prepare data key, cache_entry = self._prepare_entry_data( - text, model_name, embedding, metadata + content, model_name, embedding, metadata ) # Store in Redis @@ -750,13 +756,13 @@ async def amset( """Async store multiple embeddings in a batch operation. Each item in the input list should be a dictionary with the following fields: - - 'text': The text input that was embedded + - 'content': The content that was embedded - 'model_name': The name of the embedding model - 'embedding': The embedding vector - 'metadata': Optional metadata to store with the embedding Args: - items: List of dictionaries, each containing text, model_name, embedding, and optional metadata. + items: List of dictionaries, each containing content, model_name, embedding, and optional metadata. ttl: Optional TTL override for these entries. Returns: @@ -767,13 +773,13 @@ async def amset( # Store multiple embeddings asynchronously keys = await cache.amset([ { - "text": "What is ML?", + "content": "What is ML?", "model_name": "text-embedding-ada-002", "embedding": [0.1, 0.2, 0.3], "metadata": {"source": "user"} }, { - "text": "What is AI?", + "content": "What is AI?", "model_name": "text-embedding-ada-002", "embedding": [0.4, 0.5, 0.6], "metadata": {"source": "docs"} @@ -833,13 +839,15 @@ async def amexists_by_keys(self, keys: List[str]) -> List[bool]: # Convert to boolean values return [bool(result) for result in results] - async def amexists(self, texts: List[str], model_name: str) -> List[bool]: - """Async check if multiple embeddings exist by their texts and model name. + async def amexists( + self, contents: Iterable[Union[bytes, str]], model_name: str + ) -> List[bool]: + """Async check if multiple embeddings exist by their contents and model name. Asynchronously checks existence of multiple embeddings in a single operation. Args: - texts (List[str]): List of text inputs that were embedded. + contents (Iterable[bytes | str]): Iterable of content that was embedded. model_name (str): The name of the embedding model. 
Returns: @@ -849,15 +857,15 @@ async def amexists(self, texts: List[str], model_name: str) -> List[bool]: # Check if multiple embeddings exist asynchronously exists_results = await cache.amexists( - texts=["What is machine learning?", "What is deep learning?"], + contents=["What is machine learning?", "What is deep learning?"], model_name="text-embedding-ada-002" ) """ - if not texts: - return [] + # Generate keys for each content + keys = [self._make_cache_key(content, model_name) for content in contents] - # Generate keys for each text - keys = [self._make_cache_key(text, model_name) for text in texts] + if not keys: + return [] # Use the key-based batch operation return await self.amexists_by_keys(keys) @@ -881,39 +889,41 @@ async def amdrop_by_keys(self, keys: List[str]) -> None: client = await self._get_async_redis_client() await client.delete(*keys) - async def amdrop(self, texts: List[str], model_name: str) -> None: - """Async remove multiple embeddings from the cache by their texts and model name. + async def amdrop( + self, contents: Iterable[Union[bytes, str]], model_name: str + ) -> None: + """Async remove multiple embeddings from the cache by their contents and model name. Asynchronously removes multiple embeddings in a single operation. Args: - texts (List[str]): List of text inputs that were embedded. + contents (Iterable[bytes | str]): Iterable of content that was embedded. model_name (str): The name of the embedding model. .. code-block:: python # Remove multiple embeddings asynchronously await cache.amdrop( - texts=["What is machine learning?", "What is deep learning?"], + contents=["What is machine learning?", "What is deep learning?"], model_name="text-embedding-ada-002" ) """ - if not texts: - return + # Generate keys for each content + keys = [self._make_cache_key(content, model_name) for content in contents] - # Generate keys for each text - keys = [self._make_cache_key(text, model_name) for text in texts] + if not keys: + return # Use the key-based batch operation await self.amdrop_by_keys(keys) - async def aexists(self, text: str, model_name: str) -> bool: + async def aexists(self, content: Union[bytes, str], model_name: str) -> bool: """Async check if an embedding exists. - Asynchronously checks if an embedding exists for the given text and model. + Asynchronously checks if an embedding exists for the given content and model. Args: - text (str): The text input that was embedded. + content (bytes | str): The content that was embedded. model_name (str): The name of the embedding model. Returns: @@ -924,7 +934,7 @@ async def aexists(self, text: str, model_name: str) -> bool: if await cache.aexists("What is machine learning?", "text-embedding-ada-002"): print("Embedding is in cache") """ - key = self._make_cache_key(text, model_name) + key = self._make_cache_key(content, model_name) return await self.aexists_by_key(key) async def aexists_by_key(self, key: str) -> bool: @@ -946,23 +956,23 @@ async def aexists_by_key(self, key: str) -> bool: client = await self._get_async_redis_client() return bool(await client.exists(key)) - async def adrop(self, text: str, model_name: str) -> None: + async def adrop(self, content: Union[bytes, str], model_name: str) -> None: """Async remove an embedding from the cache. Asynchronously removes an embedding from the cache. Args: - text (str): The text input that was embedded. + content (bytes | str): The content that was embedded. model_name (str): The name of the embedding model. .. 
code-block:: python await cache.adrop( - text="What is machine learning?", + content="What is machine learning?", model_name="text-embedding-ada-002" ) """ - key = self._make_cache_key(text, model_name) + key = self._make_cache_key(content, model_name) await self.adrop_by_key(key) async def adrop_by_key(self, key: str) -> None: diff --git a/redisvl/extensions/cache/embeddings/schema.py b/redisvl/extensions/cache/embeddings/schema.py index 2239a3e9..50624271 100644 --- a/redisvl/extensions/cache/embeddings/schema.py +++ b/redisvl/extensions/cache/embeddings/schema.py @@ -4,7 +4,7 @@ related data structures. """ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union from pydantic import BaseModel, ConfigDict, Field, model_validator @@ -19,8 +19,8 @@ class CacheEntry(BaseModel): entry_id: str """Cache entry identifier""" - text: str - """The text input that was embedded""" + content: Union[bytes, str] + """The input that was embedded""" model_name: str """The name of the embedding model used""" embedding: List[float] diff --git a/redisvl/query/__init__.py b/redisvl/query/__init__.py index 6b46b54f..aa84633e 100644 --- a/redisvl/query/__init__.py +++ b/redisvl/query/__init__.py @@ -27,7 +27,6 @@ "TextQuery", "AggregationQuery", "AggregateHybridQuery", - "HybridQuery", "MultiVectorQuery", "Vector", ] diff --git a/redisvl/utils/utils.py b/redisvl/utils/utils.py index e4962dc2..85ce25ea 100644 --- a/redisvl/utils/utils.py +++ b/redisvl/utils/utils.py @@ -173,6 +173,46 @@ def wrapper(*args, **kwargs): return decorator +def deprecated_class(name: Optional[str] = None, replacement: Optional[str] = None): + """ + Decorator to mark a class as deprecated. + + When the decorated class is instantiated, the decorator will emit a + deprecation warning. + + Args: + name: Optional custom name for the class in the warning message. + If not provided, uses the class's __name__. + replacement: Optional message describing what to use instead. + + Example: + @deprecated_class(replacement="Use NewClass instead.") + class OldClass: + pass + """ + + def decorator(cls): + class_name = name or cls.__name__ + warning_message = ( + f"Class {class_name} is deprecated and will be " + "removed in the next major release. 
" + ) + if replacement: + warning_message += replacement + + original_init = cls.__init__ + + @wraps(original_init) + def new_init(self, *args, **kwargs): + warn(warning_message, category=DeprecationWarning, stacklevel=2) + original_init(self, *args, **kwargs) + + cls.__init__ = new_init + return cls + + return decorator + + def sync_wrapper(fn: Callable[[], Coroutine[Any, Any, Any]]) -> Callable[[], None]: def wrapper(): # Check if the interpreter is shutting down diff --git a/redisvl/utils/vectorize/__init__.py b/redisvl/utils/vectorize/__init__.py index 3b05ffd0..e4f99c73 100644 --- a/redisvl/utils/vectorize/__init__.py +++ b/redisvl/utils/vectorize/__init__.py @@ -1,8 +1,9 @@ import os -from typing import Optional from redisvl.extensions.cache.embeddings import EmbeddingsCache from redisvl.utils.vectorize.base import BaseVectorizer, Vectorizers +from redisvl.utils.vectorize.bedrock import BedrockVectorizer +from redisvl.utils.vectorize.custom import CustomVectorizer from redisvl.utils.vectorize.text.azureopenai import AzureOpenAITextVectorizer from redisvl.utils.vectorize.text.bedrock import BedrockTextVectorizer from redisvl.utils.vectorize.text.cohere import CohereTextVectorizer @@ -12,17 +13,23 @@ from redisvl.utils.vectorize.text.openai import OpenAITextVectorizer from redisvl.utils.vectorize.text.vertexai import VertexAITextVectorizer from redisvl.utils.vectorize.text.voyageai import VoyageAITextVectorizer +from redisvl.utils.vectorize.vertexai import VertexAIVectorizer +from redisvl.utils.vectorize.voyageai import VoyageAIVectorizer __all__ = [ "BaseVectorizer", "CohereTextVectorizer", "HFTextVectorizer", "OpenAITextVectorizer", + "VertexAIVectorizer", "VertexAITextVectorizer", "AzureOpenAITextVectorizer", "MistralAITextVectorizer", + "CustomVectorizer", "CustomTextVectorizer", + "BedrockVectorizer", "BedrockTextVectorizer", + "VoyageAIVectorizer", "VoyageAITextVectorizer", ] @@ -51,8 +58,8 @@ def vectorizer_from_dict( elif vectorizer_type == Vectorizers.mistral: return MistralAITextVectorizer(**args) elif vectorizer_type == Vectorizers.vertexai: - return VertexAITextVectorizer(**args) + return VertexAIVectorizer(**args) elif vectorizer_type == Vectorizers.voyageai: - return VoyageAITextVectorizer(**args) + return VoyageAIVectorizer(**args) else: raise ValueError(f"Unsupported vectorizer type: {vectorizer_type}") diff --git a/redisvl/utils/vectorize/base.py b/redisvl/utils/vectorize/base.py index c4ffcd3c..25b6b1d6 100644 --- a/redisvl/utils/vectorize/base.py +++ b/redisvl/utils/vectorize/base.py @@ -1,6 +1,8 @@ +import io import logging from enum import Enum -from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Union from pydantic import BaseModel, ConfigDict, Field, field_validator from typing_extensions import Annotated @@ -8,6 +10,14 @@ from redisvl.extensions.cache.embeddings import EmbeddingsCache from redisvl.redis.utils import array_to_buffer from redisvl.schema.fields import VectorDataType +from redisvl.utils.utils import deprecated_argument + +try: + from PIL.Image import Image +except ImportError: + _PILLOW_INSTALLED = False +else: + _PILLOW_INSTALLED = True logger = logging.getLogger(__name__) @@ -25,7 +35,7 @@ class Vectorizers(Enum): class BaseVectorizer(BaseModel): """Base RedisVL vectorizer interface. 
- This class defines the interface for text vectorization with an optional + This class defines the interface for vectorization with an optional caching layer to improve performance by avoiding redundant API calls. Attributes: @@ -58,19 +68,22 @@ def check_dtype(cls, dtype): ) return dtype + @deprecated_argument("text", "content") def embed( self, - text: str, + content: Any = None, + text: Any = None, preprocess: Optional[Callable] = None, as_buffer: bool = False, skip_cache: bool = False, **kwargs, ) -> Union[List[float], bytes]: - """Generate a vector embedding for a text string. + """Generate a vector embedding for content. Args: - text: The text to convert to a vector embedding - preprocess: Function to apply to the text before embedding + content: The content to convert to a vector embedding + text: The text to convert to a vector embedding (deprecated - use `content` instead) + preprocess: Function to apply to the content before embedding as_buffer: Return the embedding as a binary buffer instead of a list skip_cache: Bypass the cache for this request **kwargs: Additional model-specific parameters @@ -79,18 +92,25 @@ def embed( The vector embedding as either a list of floats or binary buffer Examples: - >>> embedding = vectorizer.embed("Hello world") + >>> embedding = text_vectorizer.embed("Hello world") + >>> embedding = image_vectorizer.embed(Image.open("test.png")) """ + content = content or text + if not content: + raise ValueError("No content provided to embed.") + # Apply preprocessing if provided if preprocess is not None: - text = preprocess(text) + content = preprocess(content) # Check cache if available and not skipped if self.cache is not None and not skip_cache: try: - cache_result = self.cache.get(text=text, model_name=self.model) + cache_result = self.cache.get( + content=self._serialize_for_cache(content), model_name=self.model + ) if cache_result: - logger.debug(f"Cache hit for text with model {self.model}") + logger.debug(f"Cache hit for content with model {self.model}") return self._process_embedding( cache_result["embedding"], as_buffer, self.dtype ) @@ -99,13 +119,13 @@ def embed( # Generate embedding using provider-specific implementation cache_metadata = kwargs.pop("metadata", {}) - embedding = self._embed(text, **kwargs) + embedding = self._embed(content, **kwargs) # Store in cache if available and not skipped if self.cache is not None and not skip_cache: try: self.cache.set( - text=text, + content=self._serialize_for_cache(content), model_name=self.model, embedding=embedding, metadata=cache_metadata, @@ -116,50 +136,54 @@ def embed( # Process and return result return self._process_embedding(embedding, as_buffer, self.dtype) + @deprecated_argument("texts", "contents") def embed_many( self, - texts: List[str], + contents: Optional[List[Any]] = None, + texts: Optional[List[Any]] = None, preprocess: Optional[Callable] = None, batch_size: int = 10, as_buffer: bool = False, skip_cache: bool = False, **kwargs, ) -> Union[List[List[float]], List[bytes]]: - """Generate vector embeddings for multiple texts efficiently. + """Generate vector embeddings for multiple items efficiently. 
Args: - texts: List of texts to convert to vector embeddings - preprocess: Function to apply to each text before embedding - batch_size: Number of texts to process in each API call + contents: List of content to convert to vector embeddings + texts: List of texts to convert to vector embeddings (deprecated - use `contents` instead) + preprocess: Function to apply to each item before embedding + batch_size: Number of items to process in each API call as_buffer: Return embeddings as binary buffers instead of lists skip_cache: Bypass the cache for this request **kwargs: Additional model-specific parameters Returns: - List of vector embeddings in the same order as the input texts + List of vector embeddings in the same order as the inputs Examples: >>> embeddings = vectorizer.embed_many(["Hello", "World"], batch_size=2) """ - if not texts: + contents = contents or texts + if not contents: return [] # Apply preprocessing if provided if preprocess is not None: - processed_texts = [preprocess(text) for text in texts] + processed_contents = [preprocess(item) for item in contents] else: - processed_texts = texts + processed_contents = contents # Get cached embeddings and identify misses results, cache_misses, cache_miss_indices = self._get_from_cache_batch( - processed_texts, skip_cache + processed_contents, skip_cache ) # Generate embeddings for cache misses if cache_misses: cache_metadata = kwargs.pop("metadata", {}) new_embeddings = self._embed_many( - texts=cache_misses, batch_size=batch_size, **kwargs + contents=cache_misses, batch_size=batch_size, **kwargs ) # Store new embeddings in cache @@ -174,19 +198,22 @@ def embed_many( # Process and return results return [self._process_embedding(emb, as_buffer, self.dtype) for emb in results] + @deprecated_argument("text", "content") async def aembed( self, - text: str, + content: Any = None, + text: Any = None, preprocess: Optional[Callable] = None, as_buffer: bool = False, skip_cache: bool = False, **kwargs, ) -> Union[List[float], bytes]: - """Asynchronously generate a vector embedding for a text string. + """Asynchronously generate a vector embedding for an item of content. 
Args: - text: The text to convert to a vector embedding - preprocess: Function to apply to the text before embedding + content: The content to convert to a vector embedding + text: The text to convert to a vector embedding (deprecated - use `content` instead) + preprocess: Function to apply to the content before embedding as_buffer: Return the embedding as a binary buffer instead of a list skip_cache: Bypass the cache for this request **kwargs: Additional model-specific parameters @@ -197,16 +224,22 @@ async def aembed( Examples: >>> embedding = await vectorizer.aembed("Hello world") """ + content = content or text + if not content: + raise ValueError("No content provided to embed.") + # Apply preprocessing if provided if preprocess is not None: - text = preprocess(text) + content = preprocess(content) # Check cache if available and not skipped if self.cache is not None and not skip_cache: try: - cache_result = await self.cache.aget(text=text, model_name=self.model) + cache_result = await self.cache.aget( + content=self._serialize_for_cache(content), model_name=self.model + ) if cache_result: - logger.debug(f"Async cache hit for text with model {self.model}") + logger.debug(f"Async cache hit for content with model {self.model}") return self._process_embedding( cache_result["embedding"], as_buffer, self.dtype ) @@ -217,13 +250,13 @@ async def aembed( # Generate embedding using provider-specific implementation cache_metadata = kwargs.pop("metadata", {}) - embedding = await self._aembed(text, **kwargs) + embedding = await self._aembed(content, **kwargs) # Store in cache if available and not skipped if self.cache is not None and not skip_cache: try: await self.cache.aset( - text=text, + content=self._serialize_for_cache(content), model_name=self.model, embedding=embedding, metadata=cache_metadata, @@ -236,50 +269,54 @@ async def aembed( # Process and return result return self._process_embedding(embedding, as_buffer, self.dtype) + @deprecated_argument("texts", "contents") async def aembed_many( self, - texts: List[str], + contents: Optional[List[Any]] = None, + texts: Optional[List[Any]] = None, preprocess: Optional[Callable] = None, batch_size: int = 10, as_buffer: bool = False, skip_cache: bool = False, **kwargs, ) -> Union[List[List[float]], List[bytes]]: - """Asynchronously generate vector embeddings for multiple texts efficiently. + """Asynchronously generate vector embeddings for multiple items efficiently. 
Args: - texts: List of texts to convert to vector embeddings - preprocess: Function to apply to each text before embedding + contents: List of content to convert to vector embeddings + texts: List of texts to convert to vector embeddings (deprecated - use `contents` instead) + preprocess: Function to apply to each item before embedding batch_size: Number of texts to process in each API call as_buffer: Return embeddings as binary buffers instead of lists skip_cache: Bypass the cache for this request **kwargs: Additional model-specific parameters Returns: - List of vector embeddings in the same order as the input texts + List of vector embeddings in the same order as the inputs Examples: >>> embeddings = await vectorizer.aembed_many(["Hello", "World"], batch_size=2) """ - if not texts: + contents = contents or texts + if not contents: return [] # Apply preprocessing if provided if preprocess is not None: - processed_texts = [preprocess(text) for text in texts] + processed_contents = [preprocess(item) for item in contents] else: - processed_texts = texts + processed_contents = contents # Get cached embeddings and identify misses results, cache_misses, cache_miss_indices = await self._aget_from_cache_batch( - processed_texts, skip_cache + processed_contents, skip_cache ) # Generate embeddings for cache misses if cache_misses: cache_metadata = kwargs.pop("metadata", {}) new_embeddings = await self._aembed_many( - texts=cache_misses, batch_size=batch_size, **kwargs + contents=cache_misses, batch_size=batch_size, **kwargs ) # Store new embeddings in cache @@ -294,123 +331,143 @@ async def aembed_many( # Process and return results return [self._process_embedding(emb, as_buffer, self.dtype) for emb in results] - def _embed(self, text: str, **kwargs) -> List[float]: - """Generate a vector embedding for a single text.""" + @deprecated_argument("text", "content") + def _embed(self, text: Any = "", content: Any = "", **kwargs) -> List[float]: + """Generate a vector embedding for a single item.""" raise NotImplementedError + @deprecated_argument("texts", "contents") def _embed_many( - self, texts: List[str], batch_size: int = 10, **kwargs + self, + contents: Optional[List[Any]] = None, + texts: Optional[List[Any]] = None, + batch_size: int = 10, + **kwargs, ) -> List[List[float]]: - """Generate vector embeddings for a batch of texts.""" + """Generate vector embeddings for a batch of items.""" raise NotImplementedError - async def _aembed(self, text: str, **kwargs) -> List[float]: - """Asynchronously generate a vector embedding for a single text.""" + @deprecated_argument("text", "content") + async def _aembed(self, content: Any = "", text: Any = "", **kwargs) -> List[float]: + """Asynchronously generate a vector embedding for a single item.""" logger.warning( "This vectorizer has no async embed method. Falling back to sync." ) - return self._embed(text, **kwargs) + return self._embed(content=content or text, **kwargs) + @deprecated_argument("texts", "contents") async def _aembed_many( - self, texts: List[str], batch_size: int = 10, **kwargs + self, + contents: Optional[List[Any]] = None, + texts: Optional[List[Any]] = None, + batch_size: int = 10, + **kwargs, ) -> List[List[float]]: - """Asynchronously generate vector embeddings for a batch of texts.""" + """Asynchronously generate vector embeddings for a batch of items.""" logger.warning( "This vectorizer has no async embed_many method. Falling back to sync." 
) - return self._embed_many(texts, batch_size, **kwargs) + return self._embed_many( + contents=contents or texts, batch_size=batch_size, **kwargs + ) def _get_from_cache_batch( - self, texts: List[str], skip_cache: bool + self, contents: List[Any], skip_cache: bool ) -> tuple[List[Optional[List[float]]], List[str], List[int]]: """Get vector embeddings from cache and track cache misses. Args: - texts: List of texts to get from cache + contents: List of content to get from cache skip_cache: Whether to skip cache lookup Returns: Tuple of (results, cache_misses, cache_miss_indices) """ - results = [None] * len(texts) + results = [None] * len(contents) cache_misses = [] cache_miss_indices = [] # Skip cache if requested or no cache available if skip_cache or self.cache is None: - return results, texts, list(range(len(texts))) # type: ignore + return results, contents, list(range(len(contents))) # type: ignore try: # Efficient batch cache lookup - cache_results = self.cache.mget(texts=texts, model_name=self.model) + cache_results = self.cache.mget( + contents=(self._serialize_for_cache(c) for c in contents), + model_name=self.model, + ) # Process cache hits and collect misses - for i, (text, cache_result) in enumerate(zip(texts, cache_results)): + for i, (content, cache_result) in enumerate(zip(contents, cache_results)): if cache_result: results[i] = cache_result["embedding"] else: - cache_misses.append(text) + cache_misses.append(content) cache_miss_indices.append(i) logger.debug( - f"Cache hits: {len(texts) - len(cache_misses)}, misses: {len(cache_misses)}" + f"Cache hits: {len(contents) - len(cache_misses)}, misses: {len(cache_misses)}" ) except Exception as e: logger.warning(f"Error accessing embedding cache in batch: {str(e)}") - # On cache error, process all texts - cache_misses = texts - cache_miss_indices = list(range(len(texts))) + # On cache error, process all data + cache_misses = contents + cache_miss_indices = list(range(len(contents))) return results, cache_misses, cache_miss_indices # type: ignore async def _aget_from_cache_batch( - self, texts: List[str], skip_cache: bool + self, contents: List[Any], skip_cache: bool ) -> tuple[List[Optional[List[float]]], List[str], List[int]]: """Asynchronously get vector embeddings from cache and track cache misses. 
Args: - texts: List of texts to get from cache + contents: List of content to get from cache skip_cache: Whether to skip cache lookup Returns: Tuple of (results, cache_misses, cache_miss_indices) """ - results = [None] * len(texts) + results = [None] * len(contents) cache_misses = [] cache_miss_indices = [] # Skip cache if requested or no cache available if skip_cache or self.cache is None: - return results, texts, list(range(len(texts))) # type: ignore + return results, contents, list(range(len(contents))) # type: ignore try: # Efficient batch cache lookup - cache_results = await self.cache.amget(texts=texts, model_name=self.model) + cache_results = await self.cache.amget( + contents=(self._serialize_for_cache(c) for c in contents), + model_name=self.model, + ) # Process cache hits and collect misses - for i, (text, cache_result) in enumerate(zip(texts, cache_results)): + for i, (content, cache_result) in enumerate(zip(contents, cache_results)): if cache_result: results[i] = cache_result["embedding"] else: - cache_misses.append(text) + cache_misses.append(content) cache_miss_indices.append(i) logger.debug( - f"Async cache hits: {len(texts) - len(cache_misses)}, misses: {len(cache_misses)}" + f"Async cache hits: {len(contents) - len(cache_misses)}, misses: {len(cache_misses)}" ) except Exception as e: logger.warning( f"Error accessing embedding cache in batch asynchronously: {str(e)}" ) - # On cache error, process all texts - cache_misses = texts - cache_miss_indices = list(range(len(texts))) + # On cache error, process all data + cache_misses = contents + cache_miss_indices = list(range(len(contents))) return results, cache_misses, cache_miss_indices # type: ignore def _store_in_cache_batch( self, - texts: List[str], + contents: List[Any], embeddings: List[List[float]], metadata: Dict, skip_cache: bool, @@ -418,7 +475,7 @@ def _store_in_cache_batch( """Store a batch of vector embeddings in the cache. Args: - texts: List of texts that were embedded + contents: List of content that was embedded embeddings: List of vector embeddings metadata: Metadata to store with the embeddings skip_cache: Whether to skip cache storage @@ -430,12 +487,12 @@ def _store_in_cache_batch( # Prepare batch cache storage items cache_items = [ { - "text": text, + "content": self._serialize_for_cache(content), "model_name": self.model, "embedding": emb, "metadata": metadata, } - for text, emb in zip(texts, embeddings) + for content, emb in zip(contents, embeddings) ] self.cache.mset(items=cache_items) except Exception as e: @@ -443,7 +500,7 @@ def _store_in_cache_batch( async def _astore_in_cache_batch( self, - texts: List[str], + contents: List[Any], embeddings: List[List[float]], metadata: Dict, skip_cache: bool, @@ -451,7 +508,7 @@ async def _astore_in_cache_batch( """Asynchronously store a batch of vector embeddings in the cache. 
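At the public API level, the net effect of these changes is that batch embedding accepts a ``contents`` keyword while the old ``texts`` keyword keeps working under a deprecation path. A usage sketch, assuming ``sentence-transformers`` is installed and that ``HFTextVectorizer`` is importable from ``redisvl.utils.vectorize``; the model name is illustrative:

.. code-block:: python

    import asyncio

    from redisvl.utils.vectorize import HFTextVectorizer

    vectorizer = HFTextVectorizer(model="sentence-transformers/all-mpnet-base-v2")

    async def main():
        # New-style keyword, with optional preprocessing and cache bypass
        embeddings = await vectorizer.aembed_many(
            contents=["a happy dog", "a nice day"],
            preprocess=str.lower,   # applied to each item before embedding
            batch_size=2,
            skip_cache=True,        # bypass the embedding cache for this call
        )
        # Old-style keyword is still accepted but deprecated in favor of `contents`
        legacy = await vectorizer.aembed_many(texts=["a happy dog"])
        return embeddings, legacy

    asyncio.run(main())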
Args: - texts: List of texts that were embedded + contents: List of content that was embedded embeddings: List of vector embeddings metadata: Metadata to store with the embeddings skip_cache: Whether to skip cache storage @@ -463,12 +520,12 @@ async def _astore_in_cache_batch( # Prepare batch cache storage items cache_items = [ { - "text": text, + "content": self._serialize_for_cache(content), "model_name": self.model, "embedding": emb, "metadata": metadata, } - for text, emb in zip(texts, embeddings) + for content, emb in zip(contents, embeddings) ] await self.cache.amset(items=cache_items) except Exception as e: @@ -501,3 +558,20 @@ def _process_embedding( if as_buffer: return array_to_buffer(embedding, dtype) return embedding + + def _serialize_for_cache(self, content: Any) -> Union[bytes, str]: + """Convert content to a cacheable format.""" + if isinstance(content, str): + return content + elif isinstance(content, bytes): + return content + elif isinstance(content, Path): + return content.read_bytes() + elif _PILLOW_INSTALLED and isinstance(content, Image): + buffer = io.BytesIO() + content.save(buffer, format="PNG") + return buffer.getvalue() + + raise NotImplementedError( + f"Content type {type(content)} is not supported for caching." + ) diff --git a/redisvl/utils/vectorize/bedrock.py b/redisvl/utils/vectorize/bedrock.py new file mode 100644 index 00000000..1fec88ee --- /dev/null +++ b/redisvl/utils/vectorize/bedrock.py @@ -0,0 +1,322 @@ +import base64 +import io +import json +import os +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union + +from botocore.exceptions import ValidationError +from pydantic import ConfigDict +from tenacity import retry, stop_after_attempt, wait_random_exponential +from tenacity.retry import retry_if_not_exception_type + +if TYPE_CHECKING: + from redisvl.extensions.cache.embeddings.embeddings import EmbeddingsCache + +from redisvl.utils.vectorize.base import BaseVectorizer + +try: + from PIL import Image +except ImportError: + _PILLOW_INSTALLED = False +else: + _PILLOW_INSTALLED = True + + +class BedrockVectorizer(BaseVectorizer): + """The BedrockVectorizer class utilizes Amazon Bedrock's API to generate + embeddings for text or image data. + + This vectorizer is designed to interact with Amazon Bedrock API, + requiring AWS credentials for authentication. The credentials can be provided + directly in the `api_config` dictionary or through environment variables: + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_REGION (defaults to us-east-1) + + The vectorizer supports synchronous operations with batch processing and + preprocessing capabilities. + + You can optionally enable caching to improve performance when generating + embeddings for repeated inputs. + + .. 
code-block:: python + + # Basic usage with explicit credentials + vectorizer = BedrockVectorizer( + model="amazon.titan-embed-text-v2:0", + api_config={ + "aws_access_key_id": "your_access_key", + "aws_secret_access_key": "your_secret_key", + "aws_region": "us-east-1" + } + ) + + # With environment variables and caching + from redisvl.extensions.cache.embeddings import EmbeddingsCache + cache = EmbeddingsCache(name="bedrock_embeddings_cache") + + vectorizer = BedrockVectorizer( + model="amazon.titan-embed-text-v2:0", + cache=cache + ) + + # First call will compute and cache the embedding + embedding1 = vectorizer.embed("Hello, world!") + + # Second call will retrieve from cache + embedding2 = vectorizer.embed("Hello, world!") + + # Generate batch embeddings + embeddings = vectorizer.embed_many(["Hello", "World"], batch_size=2) + + # Multimodal usage + from pathlib import Path + vectorizer = BedrockVectorizer( + model="amazon.titan-embed-image-v1:0", + api_config={ + "aws_access_key_id": "your_access_key", + "aws_secret_access_key": "your_secret_key", + "aws_region": "us-east-1" + } + ) + image_embedding = vectorizer.embed(Path("path/to/your/image.jpg")) + + # Embedding a list of mixed modalities + embeddings = vectorizer.embed_many( + ["Hello", "world!", Path("path/to/your/image.jpg")], + batch_size=2 + ) + + """ + + model_config = ConfigDict(arbitrary_types_allowed=True) + + def __init__( + self, + model: str = "amazon.titan-embed-text-v2:0", + api_config: Optional[Dict[str, str]] = None, + dtype: str = "float32", + cache: Optional["EmbeddingsCache"] = None, + **kwargs, + ) -> None: + """Initialize the AWS Bedrock Vectorizer. + + Args: + model (str): The Bedrock model ID to use. Defaults to amazon.titan-embed-text-v2:0 + api_config (Optional[Dict[str, str]]): AWS credentials and config. + Can include: aws_access_key_id, aws_secret_access_key, aws_region + If not provided, will use environment variables. + dtype (str): the default datatype to use when embedding text as byte arrays. + Used when setting `as_buffer=True` in calls to embed() and embed_many(). + Defaults to 'float32'. + cache (Optional[EmbeddingsCache]): Optional EmbeddingsCache instance to cache embeddings for + better performance with repeated texts. Defaults to None. + + Raises: + ValueError: If credentials are not provided in config or environment. + ImportError: If boto3 is not installed. + ValueError: If an invalid dtype is provided. + """ + super().__init__(model=model, dtype=dtype, cache=cache) + # Initialize client and set up the model + self._setup(api_config, **kwargs) + + def embed_image(self, image_path: str, **kwargs) -> Union[List[float], bytes]: + """Embed an image (from its path on disk) using a Bedrock multimodal model.""" + return self.embed(Path(image_path), **kwargs) + + def _setup(self, api_config: Optional[Dict], **kwargs): + """Set up the Bedrock client and determine the embedding dimensions.""" + # Initialize client + self._initialize_client(api_config, **kwargs) + # Set model dimensions after initialization + self.dims = self._set_model_dims() + + def _initialize_client(self, api_config: Optional[Dict], **kwargs): + """ + Setup the Bedrock client using the provided API keys or + environment variables. 
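The ``_serialize_for_cache`` helper added to the base class normalizes each supported content type into something the ``EmbeddingsCache`` can key on. A standalone sketch of that logic, assuming Pillow is available for the image branch:

.. code-block:: python

    import io
    from pathlib import Path
    from typing import Any, Union

    from PIL import Image  # optional dependency; only needed for the image branch

    def serialize_for_cache(content: Any) -> Union[str, bytes]:
        if isinstance(content, (str, bytes)):
            return content                      # strings and raw bytes pass through
        if isinstance(content, Path):
            return content.read_bytes()         # files are keyed by their raw bytes
        if isinstance(content, Image.Image):
            buffer = io.BytesIO()
            content.save(buffer, format="PNG")  # images are keyed by their PNG bytes
            return buffer.getvalue()
        raise NotImplementedError(
            f"Content type {type(content)} is not supported for caching."
        )

    assert serialize_for_cache("hello") == "hello"
    assert serialize_for_cache(b"\x00\x01") == b"\x00\x01"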
+ + Args: + api_config: Dictionary with AWS credentials and configuration + **kwargs: Additional arguments to pass to boto3 client + + Raises: + ImportError: If boto3 is not installed + ValueError: If AWS credentials are not provided + """ + try: + import boto3 # type: ignore + except ImportError: + raise ImportError( + "Amazon Bedrock vectorizer requires boto3. " + "Please install with `pip install boto3`" + ) + + if api_config is None: + api_config = {} + + aws_access_key_id = api_config.get( + "aws_access_key_id", os.getenv("AWS_ACCESS_KEY_ID") + ) + aws_secret_access_key = api_config.get( + "aws_secret_access_key", os.getenv("AWS_SECRET_ACCESS_KEY") + ) + aws_region = api_config.get("aws_region", os.getenv("AWS_REGION", "us-east-1")) + + if not aws_access_key_id or not aws_secret_access_key: + raise ValueError( + "AWS credentials required. Provide via api_config or environment variables " + "AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY" + ) + + # Store client as a regular attribute instead of PrivateAttr + self._client = boto3.client( + "bedrock-runtime", + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + region_name=aws_region, + **kwargs, + ) + + def _set_model_dims(self) -> int: + """ + Determine the dimensionality of the embedding model by making a test call. + + Returns: + int: Dimensionality of the embedding model + + Raises: + ValueError: If embedding dimensions cannot be determined + """ + try: + # Call the protected _embed method to avoid caching this test embedding + embedding = self._embed("dimension check") + return len(embedding) + except (KeyError, IndexError) as ke: + raise ValueError(f"Unexpected response from the Bedrock API: {str(ke)}") + except Exception as e: # pylint: disable=broad-except + # fall back (TODO get more specific) + raise ValueError(f"Error setting embedding model dimensions: {str(e)}") + + @retry( + wait=wait_random_exponential(min=1, max=60), + stop=stop_after_attempt(6), + retry=retry_if_not_exception_type(TypeError), + ) + def _embed(self, content: Any, **kwargs) -> List[float]: + """ + Generate a vector embedding for a single input using the AWS Bedrock API. + + Args: + content: Text or PIL.Image.Image or Path to image-file to embed + **kwargs: Additional parameters to pass to the AWS Bedrock API + + Returns: + List[float]: Vector embedding as a list of floats + + Raises: + TypeError: If content is not a string, Path, or PIL.Image.Image + ValueError: If attempting to embed an image with a text model + ValueError: If embedding fails + """ + body = self._serialize_request_body(content) + + try: + response = self._client.invoke_model( + modelId=self.model, body=json.dumps(body), **kwargs + ) + response_body = json.loads(response["body"].read()) + return response_body["embedding"] + except ValidationError as e: + if "Malformed input request" in str(e) and "inputImage" in body: + raise ValueError("Attempted to embed image with text model.") from e + raise ValueError(f"Embedding text failed: {e}") + except Exception as e: + raise ValueError(f"Embedding text failed: {e}") + + @retry( + wait=wait_random_exponential(min=1, max=60), + stop=stop_after_attempt(6), + retry=retry_if_not_exception_type(TypeError), + ) + def _embed_many( + self, contents: List[Any], batch_size: int = 10, **kwargs + ) -> List[List[float]]: + """ + Generate vector embeddings for a batch of inputs using the AWS Bedrock API. + + Args: + contents: List of text/images to embed. 
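Under the hood, ``_embed`` issues a single ``invoke_model`` call and reads the ``embedding`` field out of the JSON response. A stripped-down sketch of that round trip using boto3 directly, assuming valid AWS credentials in the environment; the model ID and region are illustrative:

.. code-block:: python

    import json

    import boto3

    client = boto3.client("bedrock-runtime", region_name="us-east-1")

    response = client.invoke_model(
        modelId="amazon.titan-embed-text-v2:0",
        body=json.dumps({"inputText": "a happy dog"}),
    )
    payload = json.loads(response["body"].read())

    embedding = payload["embedding"]  # list of floats; its length becomes vectorizer.dims
    print(len(embedding))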
Images must be Paths to image-file or PIL.Image.Image + batch_size: Number of inputs to process in each API call + **kwargs: Additional parameters to pass to the AWS Bedrock API + + Returns: + List[List[float]]: List of vector embeddings as lists of floats + + Raises: + TypeError: If `contents` is not a list + TypeError: If each item in `contents` is not a string, Path, or PIL.Image.Image + ValueError: If attempting to embed an image with a text model + ValueError: If embedding fails + """ + if not isinstance(contents, list): + raise TypeError("`contents` must be a list") + + try: + embeddings: List[List[float]] = [] + + for batch in self.batchify(contents, batch_size): + # Process each text in the batch individually since Bedrock + # doesn't support batch embedding + batch_embeddings = [] + for content in batch: + body = self._serialize_request_body(content) + + try: + response = self._client.invoke_model( + modelId=self.model, + body=json.dumps(body), + **kwargs, + ) + except ValidationError as e: + if "Malformed input request" in str(e) and "inputImage" in body: + raise ValueError( + "Attempted to embed image with text model." + ) from e + raise e + + response_body = json.loads(response["body"].read()) + batch_embeddings.append(response_body["embedding"]) + + embeddings.extend(batch_embeddings) + + return embeddings + except Exception as e: + raise ValueError(f"Embedding texts failed: {e}") + + def _serialize_request_body( + self, content: Any + ) -> dict[Literal["inputText", "inputImage"], str]: + """Serialize the request body for the Bedrock API.""" + if isinstance(content, str): + return {"inputText": content} + elif isinstance(content, Path): + return {"inputImage": self._b64encode_image(content.read_bytes())} + elif _PILLOW_INSTALLED and isinstance(content, Image.Image): + bytes_data = io.BytesIO() + content.save(bytes_data, format="PNG") + return {"inputImage": self._b64encode_image(bytes_data.getvalue())} + raise TypeError( + "Content must be a string, Path to image-file, or PIL.Image.Image" + ) + + @staticmethod + def _b64encode_image(bytes_data: bytes) -> str: + """Encode an image as a base64 string.""" + return base64.b64encode(bytes_data).decode("utf-8") + + @property + def type(self) -> str: + return "bedrock" diff --git a/redisvl/utils/vectorize/custom.py b/redisvl/utils/vectorize/custom.py new file mode 100644 index 00000000..f42091b8 --- /dev/null +++ b/redisvl/utils/vectorize/custom.py @@ -0,0 +1,267 @@ +from typing import TYPE_CHECKING, Any, Callable, List, Optional + +from pydantic import ConfigDict + +if TYPE_CHECKING: + from redisvl.extensions.cache.embeddings.embeddings import EmbeddingsCache + +from redisvl.utils.vectorize.base import BaseVectorizer + + +def _check_vector(result: list, method_name: str) -> None: + """ + Validates the structure of returned embeddings. + + - For methods named "*_many", expects a list of lists of floats. + - For single methods, expects a list of floats. + + Raises: + ValueError: If the embeddings do not match the expected structure. 
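For image inputs, the request serializer above base64-encodes the raw bytes before placing them under ``inputImage``. A tiny runnable sketch of that encoding step, using an in-memory Pillow image in place of a file on disk:

.. code-block:: python

    import base64
    import io

    from PIL import Image

    # A tiny in-memory image standing in for a real image file.
    img = Image.new("RGB", (2, 2), color="red")
    buffer = io.BytesIO()
    img.save(buffer, format="PNG")

    # The value the vectorizer places under "inputImage" in the request body.
    input_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
    print(input_image[:16], "...")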
+    """
+    if method_name.endswith("_many"):
+        # embed_many / aembed_many → list of lists
+        if not isinstance(result, list) or not result:
+            raise ValueError(f"{method_name} must return a non-empty list of lists.")
+        if not isinstance(result[0], list) or not result[0]:
+            raise ValueError(f"{method_name} must return a list of non-empty lists.")
+        if not isinstance(result[0][0], float):
+            raise ValueError(f"{method_name} must return a list of lists of floats.")
+    else:
+        # embed / aembed → a single list of floats
+        if not isinstance(result, list) or not result:
+            raise ValueError(f"{method_name} must return a non-empty list.")
+        if not isinstance(result[0], float):
+            raise ValueError(f"{method_name} must return a list of floats.")
+
+
+class CustomVectorizer(BaseVectorizer):
+    """The CustomVectorizer class wraps user-defined embedding methods to create
+    embeddings for data.
+
+    This vectorizer is designed to accept a provided callable vectorizer and
+    provides a class definition to allow for compatibility with RedisVL.
+    The vectorizer may support both synchronous and asynchronous operations which
+    allows for batch processing of inputs, but at a minimum only synchronous embedding
+    is required to satisfy the 'embed()' method.
+
+    You can optionally enable caching to improve performance when generating
+    embeddings for repeated inputs.
+
+    .. code-block:: python
+
+        # Basic usage with a custom embedding function
+        vectorizer = CustomVectorizer(
+            embed = my_vectorizer.generate_embedding
+        )
+        embedding = vectorizer.embed("Hello, world!")
+
+        # With caching enabled
+        from redisvl.extensions.cache.embeddings import EmbeddingsCache
+        cache = EmbeddingsCache(name="my_embeddings_cache")
+
+        vectorizer = CustomVectorizer(
+            embed=my_vectorizer.generate_embedding,
+            cache=cache
+        )
+
+        # First call will compute and cache the embedding
+        embedding1 = vectorizer.embed("Hello, world!")
+
+        # Second call will retrieve from cache
+        embedding2 = vectorizer.embed("Hello, world!")
+
+        # Asynchronous batch embedding of multiple texts
+        embeddings = await vectorizer.aembed_many(
+            ["Hello, world!", "How are you?"],
+            batch_size=2
+        )
+
+    """
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    def __init__(
+        self,
+        embed: Callable,
+        embed_many: Optional[Callable] = None,
+        aembed: Optional[Callable] = None,
+        aembed_many: Optional[Callable] = None,
+        dtype: str = "float32",
+        cache: Optional["EmbeddingsCache"] = None,
+    ):
+        """Initialize the Custom vectorizer.
+
+        Args:
+            embed (Callable): a Callable function that accepts an object and returns a list of floats.
+            embed_many (Optional[Callable]): a Callable function that accepts a list of objects and returns a list containing lists of floats. Defaults to None.
+            aembed (Optional[Callable]): an asynchronous Callable function that accepts an object and returns a list of floats. Defaults to None.
+            aembed_many (Optional[Callable]): an asynchronous Callable function that accepts a list of objects and returns a list containing lists of floats. Defaults to None.
+            dtype (str): the default datatype to use when embedding inputs as byte arrays.
+                Used when setting `as_buffer=True` in calls to embed() and embed_many().
+                Defaults to 'float32'.
+            cache (Optional[EmbeddingsCache]): Optional EmbeddingsCache instance to cache embeddings for
+                better performance with repeated inputs. Defaults to None.
+
+        Raises:
+            ValueError: if embedding validation fails.
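To make the validation contract concrete, ``_check_vector`` accepts a flat list of floats for the single-item methods and a list of lists for the ``*_many`` methods; anything else raises ``ValueError``. For example:

.. code-block:: python

    from redisvl.utils.vectorize.custom import _check_vector

    _check_vector([0.1, 0.2, 0.3], "embed")                 # ok: flat list of floats
    _check_vector([[0.1, 0.2], [0.3, 0.4]], "embed_many")   # ok: list of lists of floats

    try:
        _check_vector([[0.1, 0.2]], "embed")                # wrong shape for a single embed
    except ValueError as e:
        print(e)  # "embed must return a list of floats."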
+ """ + # First, determine the dimensions + try: + test_result = embed("dimension test") + _check_vector(test_result, "embed") + dims = len(test_result) + except Exception as e: + raise ValueError(f"Failed to validate embed method: {e}") + + # Initialize parent with known information + super().__init__(model="custom", dtype=dtype, dims=dims, cache=cache) + + # Now setup the functions and validation flags + self._setup_functions(embed, embed_many, aembed, aembed_many) + + def _setup_functions(self, embed, embed_many, aembed, aembed_many): + """Setup the user-provided embedding functions.""" + self._embed_func = embed + self._embed_func_many = embed_many + self._aembed_func = aembed + self._aembed_func_many = aembed_many + + # Initialize validation flags + self._aembed_validated = False + self._aembed_many_validated = False + + # Validate the other functions if provided + self._validate_optional_funcs() + + @property + def type(self) -> str: + return "custom" + + def _validate_optional_funcs(self) -> None: + """ + Optionally validate the other user-provided functions if they exist. + + Raises: + ValueError: If any provided function produces invalid results. + """ + # Check embed_many if provided + if self._embed_func_many: + try: + test_batch = self._embed_func_many(["dimension test (many)"]) + _check_vector(test_batch, "embed_many") + except Exception as e: + raise ValueError(f"Invalid embed_many function: {e}") + + def _embed(self, content: Any, **kwargs) -> List[float]: + """Generate a vector embedding for a single input using the provided user function. + + Args: + content: Input to embed + **kwargs: Additional parameters to pass to the user function + + Returns: + List[float]: Vector embedding as a list of floats + + Raises: + ValueError: If embedding fails + """ + try: + result = self._embed_func(content, **kwargs) + return result + except Exception as e: + raise ValueError(f"Embedding input failed: {e}") + + def _embed_many( + self, contents: List[Any], batch_size: int = 10, **kwargs + ) -> List[List[float]]: + """Generate vector embeddings for a batch of inputs using the provided user function. + + Args: + contents: List of inputs to embed + batch_size: Number of inputs to process in each batch + **kwargs: Additional parameters to pass to the user function + + Returns: + List[List[float]]: List of vector embeddings as lists of floats + + Raises: + TypeError: If contents is not a list + ValueError: If embedding fails + """ + if not isinstance(contents, list): + raise TypeError("Must pass in a list of values to embed.") + + if not self._embed_func_many: + # Fallback: Use _embed for each text if no batch function provided + return [self._embed(content, **kwargs) for content in contents] + + try: + results = self._embed_func_many(contents, **kwargs) + return results + except Exception as e: + raise ValueError(f"Embedding inputs failed: {e}") + + async def _aembed(self, content: Any, **kwargs) -> List[float]: + """Asynchronously generate a vector embedding for a single input. 
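A usage sketch for the synchronous path: the wrapped ``embed`` callable is probed once to infer the embedding dimensions, and when no ``embed_many`` is supplied the batch path simply calls ``embed`` per item. The hash-based toy embedding below is illustrative only:

.. code-block:: python

    import hashlib

    from redisvl.utils.vectorize.custom import CustomVectorizer

    def toy_embed(content) -> list:
        # Deterministic 4-dimensional "embedding" derived from a hash of the input.
        digest = hashlib.sha256(str(content).encode()).digest()
        return [b / 255.0 for b in digest[:4]]

    vectorizer = CustomVectorizer(embed=toy_embed)

    print(vectorizer.dims)                             # 4, inferred from a test call
    single = vectorizer.embed("hello")
    batch = vectorizer.embed_many(["hello", "world"])  # falls back to toy_embed per item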
+ + Args: + content: Input to embed + **kwargs: Additional parameters to pass to the user async function + + Returns: + List[float]: Vector embedding as a list of floats + + Raises: + NotImplementedError: If no aembed function was provided + ValueError: If embedding fails + """ + if not self._aembed_func: + return self._embed(content, **kwargs) + + try: + result = await self._aembed_func(content, **kwargs) + + # Validate result on first call + if not self._aembed_validated: + _check_vector(result, "aembed") + self._aembed_validated = True + + return result + except Exception as e: + raise ValueError(f"Embedding input failed: {e}") + + async def _aembed_many( + self, contents: List[Any], batch_size: int = 10, **kwargs + ) -> List[List[float]]: + """Asynchronously generate vector embeddings for a batch of inputs. + + Args: + contents: List of inputs to embed + batch_size: Number of inputs to process in each batch + **kwargs: Additional parameters to pass to the user async function + + Returns: + List[List[float]]: List of vector embeddings as lists of floats + + Raises: + TypeError: If contents is not a list + NotImplementedError: If no aembed_many function was provided + ValueError: If embedding fails + """ + if not isinstance(contents, list): + raise TypeError("Must pass in a list of values to embed.") + + if not self._aembed_func_many: + return self._embed_many(contents, batch_size, **kwargs) + + try: + results = await self._aembed_func_many(contents, **kwargs) + + # Validate result on first call + if not self._aembed_many_validated: + _check_vector(results, "aembed_many") + self._aembed_many_validated = True + + return results + except Exception as e: + raise ValueError(f"Embedding inputs failed: {e}") diff --git a/redisvl/utils/vectorize/text/azureopenai.py b/redisvl/utils/vectorize/text/azureopenai.py index eaf61a3e..5bc3077e 100644 --- a/redisvl/utils/vectorize/text/azureopenai.py +++ b/redisvl/utils/vectorize/text/azureopenai.py @@ -214,17 +214,19 @@ def _set_model_dims(self) -> int: # fall back (TODO get more specific) raise ValueError(f"Error setting embedding model dimensions: {str(e)}") + @deprecated_argument("text", "content") @retry( wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(TypeError), ) - def _embed(self, text: str, **kwargs) -> List[float]: + def _embed(self, content: str = "", text: str = "", **kwargs) -> List[float]: """ Generate a vector embedding for a single text using the AzureOpenAI API. 
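When asynchronous callables are supplied, the ``aembed``/``aembed_many`` paths use them directly (validating their output shape on first use); otherwise they fall back to the synchronous functions. A short sketch, assuming the public ``aembed``/``aembed_many`` methods mirror the examples elsewhere in this changeset:

.. code-block:: python

    import asyncio

    from redisvl.utils.vectorize.custom import CustomVectorizer

    def sync_embed(content):
        return [0.1, 0.2, 0.3]

    async def async_embed(content):
        return [0.1, 0.2, 0.3]

    async def async_embed_many(contents):
        return [[0.1, 0.2, 0.3] for _ in contents]

    vectorizer = CustomVectorizer(
        embed=sync_embed,               # required; also used to infer dims
        aembed=async_embed,             # optional async single-item path
        aembed_many=async_embed_many,   # optional async batch path
    )

    async def main():
        one = await vectorizer.aembed("hello")
        many = await vectorizer.aembed_many(["hello", "world"], batch_size=2)
        return one, many

    print(asyncio.run(main()))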
Args: - text: Text to embed + content: Text to embed + text: Text to embed (deprecated - use `content` instead) **kwargs: Additional parameters to pass to the AzureOpenAI API Returns: @@ -234,30 +236,37 @@ def _embed(self, text: str, **kwargs) -> List[float]: TypeError: If text is not a string ValueError: If embedding fails """ - if not isinstance(text, str): + content = content or text + if not isinstance(content, str): raise TypeError("Must pass in a str value to embed.") try: result = self._client.embeddings.create( - input=[text], model=self.model, **kwargs + input=[content], model=self.model, **kwargs ) return result.data[0].embedding except Exception as e: raise ValueError(f"Embedding text failed: {e}") + @deprecated_argument("texts", "contents") @retry( wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(TypeError), ) def _embed_many( - self, texts: List[str], batch_size: int = 10, **kwargs + self, + contents: Optional[List[str]] = None, + texts: Optional[List[str]] = None, + batch_size: int = 10, + **kwargs, ) -> List[List[float]]: """ Generate vector embeddings for a batch of texts using the AzureOpenAI API. Args: - texts: List of texts to embed + contents: List of texts to embed + texts: List of texts to embed (deprecated - use `contents` instead) batch_size: Number of texts to process in each API call **kwargs: Additional parameters to pass to the AzureOpenAI API @@ -265,17 +274,18 @@ def _embed_many( List[List[float]]: List of vector embeddings as lists of floats Raises: - TypeError: If texts is not a list of strings + TypeError: If contents is not a list of strings ValueError: If embedding fails """ - if not isinstance(texts, list): + contents = contents or texts + if not isinstance(contents, list): raise TypeError("Must pass in a list of str values to embed.") - if texts and not isinstance(texts[0], str): + if contents and not isinstance(contents[0], str): raise TypeError("Must pass in a list of str values to embed.") try: embeddings: List = [] - for batch in self.batchify(texts, batch_size): + for batch in self.batchify(contents, batch_size): response = self._client.embeddings.create( input=batch, model=self.model, **kwargs ) @@ -284,50 +294,59 @@ def _embed_many( except Exception as e: raise ValueError(f"Embedding texts failed: {e}") + @deprecated_argument("text", "content") @retry( wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(TypeError), ) - async def _aembed(self, text: str, **kwargs) -> List[float]: + async def _aembed(self, content: str = "", text: str = "", **kwargs) -> List[float]: """ Asynchronously generate a vector embedding for a single text using the AzureOpenAI API. 
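The ``content = content or text`` / ``contents = contents or texts`` fallback used throughout these methods relies on Python truthiness, so an explicitly empty value also falls through to the deprecated parameter. A minimal illustration of the pattern:

.. code-block:: python

    def resolve(content="", text=""):
        # Mirrors the coalescing used above: the first truthy value wins.
        return content or text

    assert resolve(content="hello") == "hello"              # new keyword
    assert resolve(text="legacy") == "legacy"               # deprecated keyword still honored
    assert resolve(content="", text="legacy") == "legacy"   # empty new value falls back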
Args: - text: Text to embed + content: Text to embed + text: Text to embed (deprecated - use `content` instead) **kwargs: Additional parameters to pass to the AzureOpenAI API Returns: List[float]: Vector embedding as a list of floats Raises: - TypeError: If text is not a string + TypeError: If content is not a string ValueError: If embedding fails """ - if not isinstance(text, str): + content = content or text + if not isinstance(content, str): raise TypeError("Must pass in a str value to embed.") try: result = await self._aclient.embeddings.create( - input=[text], model=self.model, **kwargs + input=[content], model=self.model, **kwargs ) return result.data[0].embedding except Exception as e: raise ValueError(f"Embedding text failed: {e}") + @deprecated_argument("texts", "contents") @retry( wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(TypeError), ) async def _aembed_many( - self, texts: List[str], batch_size: int = 10, **kwargs + self, + contents: Optional[List[str]] = None, + texts: Optional[List[str]] = None, + batch_size: int = 10, + **kwargs, ) -> List[List[float]]: """ Asynchronously generate vector embeddings for a batch of texts using the AzureOpenAI API. Args: - texts: List of texts to embed + contents: List of texts to embed + texts: List of texts to embed (deprecated - use `contents` instead) batch_size: Number of texts to process in each API call **kwargs: Additional parameters to pass to the AzureOpenAI API @@ -335,17 +354,18 @@ async def _aembed_many( List[List[float]]: List of vector embeddings as lists of floats Raises: - TypeError: If texts is not a list of strings + TypeError: If contents is not a list of strings ValueError: If embedding fails """ - if not isinstance(texts, list): + contents = contents or texts + if not isinstance(contents, list): raise TypeError("Must pass in a list of str values to embed.") - if texts and not isinstance(texts[0], str): + if contents and not isinstance(contents[0], str): raise TypeError("Must pass in a list of str values to embed.") try: embeddings: List = [] - for batch in self.batchify(texts, batch_size): + for batch in self.batchify(contents, batch_size): response = await self._aclient.embeddings.create( input=batch, model=self.model, **kwargs ) diff --git a/redisvl/utils/vectorize/text/bedrock.py b/redisvl/utils/vectorize/text/bedrock.py index ac4bb415..8850c152 100644 --- a/redisvl/utils/vectorize/text/bedrock.py +++ b/redisvl/utils/vectorize/text/bedrock.py @@ -1,254 +1,36 @@ -import json -import os -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +from typing import Any, List, Optional, Union -from pydantic import ConfigDict -from tenacity import retry, stop_after_attempt, wait_random_exponential -from tenacity.retry import retry_if_not_exception_type +from redisvl.utils.utils import deprecated_argument, deprecated_class +from redisvl.utils.vectorize.bedrock import BedrockVectorizer -if TYPE_CHECKING: - from redisvl.extensions.cache.embeddings.embeddings import EmbeddingsCache -from redisvl.utils.utils import deprecated_argument -from redisvl.utils.vectorize.base import BaseVectorizer +@deprecated_class( + name="BedrockTextVectorizer", replacement="Use BedrockVectorizer instead." 
+) +class BedrockTextVectorizer(BedrockVectorizer): + """A backwards-compatible alias for BedrockVectorizer.""" + @deprecated_argument("text", "content") + def embed( + self, content: Any = "", text: Any = "", **kwargs + ) -> Union[List[float], bytes]: + """Generate a vector embedding for a single input using the AWS Bedrock API. -class BedrockTextVectorizer(BaseVectorizer): - """The AmazonBedrockTextVectorizer class utilizes Amazon Bedrock's API to generate - embeddings for text data. - - This vectorizer is designed to interact with Amazon Bedrock API, - requiring AWS credentials for authentication. The credentials can be provided - directly in the `api_config` dictionary or through environment variables: - - AWS_ACCESS_KEY_ID - - AWS_SECRET_ACCESS_KEY - - AWS_REGION (defaults to us-east-1) - - The vectorizer supports synchronous operations with batch processing and - preprocessing capabilities. - - You can optionally enable caching to improve performance when generating - embeddings for repeated text inputs. - - .. code-block:: python - - # Basic usage with explicit credentials - vectorizer = AmazonBedrockTextVectorizer( - model="amazon.titan-embed-text-v2:0", - api_config={ - "aws_access_key_id": "your_access_key", - "aws_secret_access_key": "your_secret_key", - "aws_region": "us-east-1" - } - ) - - # With environment variables and caching - from redisvl.extensions.cache.embeddings import EmbeddingsCache - cache = EmbeddingsCache(name="bedrock_embeddings_cache") - - vectorizer = AmazonBedrockTextVectorizer( - model="amazon.titan-embed-text-v2:0", - cache=cache - ) - - # First call will compute and cache the embedding - embedding1 = vectorizer.embed("Hello, world!") - - # Second call will retrieve from cache - embedding2 = vectorizer.embed("Hello, world!") - - # Generate batch embeddings - embeddings = vectorizer.embed_many(["Hello", "World"], batch_size=2) - """ - - model_config = ConfigDict(arbitrary_types_allowed=True) + Deprecated: Use `BedrockVectorizer.embed` instead. + """ + content = content or text + return super().embed(content=content, **kwargs) - def __init__( + @deprecated_argument("texts", "contents") + def embed_many( self, - model: str = "amazon.titan-embed-text-v2:0", - api_config: Optional[Dict[str, str]] = None, - dtype: str = "float32", - cache: Optional["EmbeddingsCache"] = None, + contents: Optional[List[Any]] = None, + texts: Optional[List[Any]] = None, **kwargs, - ) -> None: - """Initialize the AWS Bedrock Vectorizer. - - Args: - model (str): The Bedrock model ID to use. Defaults to amazon.titan-embed-text-v2:0 - api_config (Optional[Dict[str, str]]): AWS credentials and config. - Can include: aws_access_key_id, aws_secret_access_key, aws_region - If not provided, will use environment variables. - dtype (str): the default datatype to use when embedding text as byte arrays. - Used when setting `as_buffer=True` in calls to embed() and embed_many(). - Defaults to 'float32'. - cache (Optional[EmbeddingsCache]): Optional EmbeddingsCache instance to cache embeddings for - better performance with repeated texts. Defaults to None. - - Raises: - ValueError: If credentials are not provided in config or environment. - ImportError: If boto3 is not installed. - ValueError: If an invalid dtype is provided. 
- """ - super().__init__(model=model, dtype=dtype, cache=cache) - # Initialize client and set up the model - self._setup(api_config, **kwargs) - - def _setup(self, api_config: Optional[Dict], **kwargs): - """Set up the Bedrock client and determine the embedding dimensions.""" - # Initialize client - self._initialize_client(api_config, **kwargs) - # Set model dimensions after initialization - self.dims = self._set_model_dims() - - def _initialize_client(self, api_config: Optional[Dict], **kwargs): - """ - Setup the Bedrock client using the provided API keys or - environment variables. - - Args: - api_config: Dictionary with AWS credentials and configuration - **kwargs: Additional arguments to pass to boto3 client - - Raises: - ImportError: If boto3 is not installed - ValueError: If AWS credentials are not provided - """ - try: - import boto3 # type: ignore - except ImportError: - raise ImportError( - "Amazon Bedrock vectorizer requires boto3. " - "Please install with `pip install boto3`" - ) - - if api_config is None: - api_config = {} - - aws_access_key_id = api_config.get( - "aws_access_key_id", os.getenv("AWS_ACCESS_KEY_ID") - ) - aws_secret_access_key = api_config.get( - "aws_secret_access_key", os.getenv("AWS_SECRET_ACCESS_KEY") - ) - aws_region = api_config.get("aws_region", os.getenv("AWS_REGION", "us-east-1")) - - if not aws_access_key_id or not aws_secret_access_key: - raise ValueError( - "AWS credentials required. Provide via api_config or environment variables " - "AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY" - ) - - # Store client as a regular attribute instead of PrivateAttr - self._client = boto3.client( - "bedrock-runtime", - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - region_name=aws_region, - **kwargs, - ) - - def _set_model_dims(self) -> int: - """ - Determine the dimensionality of the embedding model by making a test call. - - Returns: - int: Dimensionality of the embedding model - - Raises: - ValueError: If embedding dimensions cannot be determined - """ - try: - # Call the protected _embed method to avoid caching this test embedding - embedding = self._embed("dimension check") - return len(embedding) - except (KeyError, IndexError) as ke: - raise ValueError(f"Unexpected response from the Bedrock API: {str(ke)}") - except Exception as e: # pylint: disable=broad-except - # fall back (TODO get more specific) - raise ValueError(f"Error setting embedding model dimensions: {str(e)}") - - @retry( - wait=wait_random_exponential(min=1, max=60), - stop=stop_after_attempt(6), - retry=retry_if_not_exception_type(TypeError), - ) - def _embed(self, text: str, **kwargs) -> List[float]: - """ - Generate a vector embedding for a single text using the AWS Bedrock API. 
- - Args: - text: Text to embed - **kwargs: Additional parameters to pass to the AWS Bedrock API - - Returns: - List[float]: Vector embedding as a list of floats - - Raises: - TypeError: If text is not a string - ValueError: If embedding fails - """ - if not isinstance(text, str): - raise TypeError("Text must be a string") - - try: - response = self._client.invoke_model( - modelId=self.model, body=json.dumps({"inputText": text}), **kwargs - ) - response_body = json.loads(response["body"].read()) - return response_body["embedding"] - except Exception as e: - raise ValueError(f"Embedding text failed: {e}") - - @retry( - wait=wait_random_exponential(min=1, max=60), - stop=stop_after_attempt(6), - retry=retry_if_not_exception_type(TypeError), - ) - def _embed_many( - self, texts: List[str], batch_size: int = 10, **kwargs ) -> List[List[float]]: - """ - Generate vector embeddings for a batch of texts using the AWS Bedrock API. - - Args: - texts: List of texts to embed - batch_size: Number of texts to process in each API call - **kwargs: Additional parameters to pass to the AWS Bedrock API - - Returns: - List[List[float]]: List of vector embeddings as lists of floats + """Generate vector embeddings for a batch of inputs using the AWS Bedrock API. - Raises: - TypeError: If texts is not a list of strings - ValueError: If embedding fails + Deprecated: Use `BedrockVectorizer.embed_many` instead. """ - if not isinstance(texts, list): - raise TypeError("Texts must be a list of strings") - if texts and not isinstance(texts[0], str): - raise TypeError("Texts must be a list of strings") - - try: - embeddings: List[List[float]] = [] - - for batch in self.batchify(texts, batch_size): - # Process each text in the batch individually since Bedrock - # doesn't support batch embedding - batch_embeddings = [] - for text in batch: - response = self._client.invoke_model( - modelId=self.model, - body=json.dumps({"inputText": text}), - **kwargs, - ) - response_body = json.loads(response["body"].read()) - batch_embeddings.append(response_body["embedding"]) - - embeddings.extend(batch_embeddings) - - return embeddings - except Exception as e: - raise ValueError(f"Embedding texts failed: {e}") - - @property - def type(self) -> str: - return "bedrock" + contents = contents or texts + return super().embed_many(contents=contents, **kwargs) diff --git a/redisvl/utils/vectorize/text/cohere.py b/redisvl/utils/vectorize/text/cohere.py index 7e5c0e38..f73a4522 100644 --- a/redisvl/utils/vectorize/text/cohere.py +++ b/redisvl/utils/vectorize/text/cohere.py @@ -205,12 +205,16 @@ def _validate_input_type(self, input_type) -> None: "See https://docs.cohere.com/reference/embed." ) - def _embed(self, text: str, **kwargs) -> List[Union[float, int]]: + @deprecated_argument("text", "content") + def _embed( + self, content: str = "", text: str = "", **kwargs + ) -> List[Union[float, int]]: """ Generate a vector embedding for a single text using the Cohere API. 
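Because the Cohere path requires an ``input_type`` on every call, a typical invocation distinguishes documents from queries. A sketch assuming a valid ``COHERE_API_KEY`` is set; the model name and input types follow Cohere's embed API and are illustrative here:

.. code-block:: python

    from redisvl.utils.vectorize.text.cohere import CohereTextVectorizer

    cohere_vec = CohereTextVectorizer(model="embed-english-v3.0")

    # Documents being indexed use one input type...
    doc_embedding = cohere_vec.embed("a happy dog", input_type="search_document")

    # ...while queries at search time use another.
    query_embedding = cohere_vec.embed("happy animals", input_type="search_query")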
Args: - text: Text to embed + content: Text to embed + text: Text to embed (deprecated - use `content` instead) **kwargs: Additional parameters to pass to the Cohere API, must include 'input_type' @@ -225,7 +229,8 @@ def _embed(self, text: str, **kwargs) -> List[Union[float, int]]: TypeError: If text is not a string or input_type is not provided ValueError: If embedding fails """ - if not isinstance(text, str): + content = content or text + if not isinstance(content, str): raise TypeError("Must pass in a str value to embed.") input_type = kwargs.pop("input_type", None) @@ -246,7 +251,7 @@ def _embed(self, text: str, **kwargs) -> List[Union[float, int]]: try: response = self._client.embed( - texts=[text], + texts=[content], model=self.model, input_type=input_type, embedding_types=embedding_types, @@ -264,19 +269,25 @@ def _embed(self, text: str, **kwargs) -> List[Union[float, int]]: except Exception as e: raise ValueError(f"Embedding text failed: {e}") + @deprecated_argument("texts", "contents") @retry( wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(TypeError), ) def _embed_many( - self, texts: List[str], batch_size: int = 10, **kwargs + self, + contents: Optional[List[str]] = None, + texts: Optional[List[str]] = None, + batch_size: int = 10, + **kwargs, ) -> List[List[Union[float, int]]]: """ Generate vector embeddings for a batch of texts using the Cohere API. Args: - texts: List of texts to embed + contents: List of texts to embed + texts: List of texts to embed (deprecated - use `contents` instead) batch_size: Number of texts to process in each API call **kwargs: Additional parameters to pass to the Cohere API, must include 'input_type' @@ -285,12 +296,13 @@ def _embed_many( List[List[Union[float, int]]]: List of vector embeddings Raises: - TypeError: If texts is not a list of strings or input_type is not provided + TypeError: If contents is not a list of strings or input_type is not provided ValueError: If embedding fails """ - if not isinstance(texts, list): + contents = contents or texts + if not isinstance(contents, list): raise TypeError("Must pass in a list of str values to embed.") - if texts and not isinstance(texts[0], str): + if contents and not isinstance(contents[0], str): raise TypeError("Must pass in a list of str values to embed.") input_type = kwargs.pop("input_type", None) @@ -310,7 +322,7 @@ def _embed_many( embedding_types = self._get_cohere_embedding_type(self.dtype) embeddings: List = [] - for batch in self.batchify(texts, batch_size): + for batch in self.batchify(contents, batch_size): try: response = self._client.embed( texts=batch, diff --git a/redisvl/utils/vectorize/text/custom.py b/redisvl/utils/vectorize/text/custom.py index 0b8f425e..3f197d5f 100644 --- a/redisvl/utils/vectorize/text/custom.py +++ b/redisvl/utils/vectorize/text/custom.py @@ -1,279 +1,57 @@ -from typing import TYPE_CHECKING, Callable, List, Optional +from typing import Any, List, Optional -from pydantic import ConfigDict +from redisvl.utils.utils import deprecated_argument, deprecated_class +from redisvl.utils.vectorize.custom import CustomVectorizer -if TYPE_CHECKING: - from redisvl.extensions.cache.embeddings.embeddings import EmbeddingsCache -from redisvl.utils.vectorize.base import BaseVectorizer +@deprecated_class( + name="CustomTextVectorizer", replacement="Use CustomVectorizer instead." 
+) +class CustomTextVectorizer(CustomVectorizer): + """A backwards-compatible alias for CustomVectorizer.""" + @deprecated_argument("text", "content") + def embed(self, content: Any = "", text: Any = "", **kwargs) -> List[float]: + """Generate a vector embedding for a single input using the custom function. -def _check_vector(result: list, method_name: str) -> None: - """ - Validates the structure of returned embeddings. - - - For methods named "*_many", expects a list of lists of floats. - - For single methods, expects a list of floats. - - Raises: - ValueError: If the embeddings do not match the expected structure. - """ - if method_name.endswith("_many"): - # embed_many / aembed_many → list of lists - if not isinstance(result, list) or not result: - raise ValueError(f"{method_name} must return a non-empty list of lists.") - if not isinstance(result[0], list) or not result[0]: - raise ValueError(f"{method_name} must return a list of non-empty lists.") - if not isinstance(result[0][0], float): - raise ValueError(f"{method_name} must return a list of lists of floats.") - else: - # embed / aembed → a single list of floats - if not isinstance(result, list) or not result: - raise ValueError(f"{method_name} must return a non-empty list.") - if not isinstance(result[0], float): - raise ValueError(f"{method_name} must return a list of floats.") - - -class CustomTextVectorizer(BaseVectorizer): - """The CustomTextVectorizer class wraps user-defined embedding methods to create - embeddings for text data. - - This vectorizer is designed to accept a provided callable text vectorizer and - provides a class definition to allow for compatibility with RedisVL. - The vectorizer may support both synchronous and asynchronous operations which - allows for batch processing of texts, but at a minimum only synchronous embedding - is required to satisfy the 'embed()' method. - - You can optionally enable caching to improve performance when generating - embeddings for repeated text inputs. - - .. code-block:: python - - # Basic usage with a custom embedding function - vectorizer = CustomTextVectorizer( - embed = my_vectorizer.generate_embedding - ) - embedding = vectorizer.embed("Hello, world!") - - # With caching enabled - from redisvl.extensions.cache.embeddings import EmbeddingsCache - cache = EmbeddingsCache(name="my_embeddings_cache") - - vectorizer = CustomTextVectorizer( - embed=my_vectorizer.generate_embedding, - cache=cache - ) - - # First call will compute and cache the embedding - embedding1 = vectorizer.embed("Hello, world!") - - # Second call will retrieve from cache - embedding2 = vectorizer.embed("Hello, world!") - - # Asynchronous batch embedding of multiple texts - embeddings = await vectorizer.aembed_many( - ["Hello, world!", "How are you?"], - batch_size=2 - ) - - """ - - model_config = ConfigDict(arbitrary_types_allowed=True) - - def __init__( - self, - embed: Callable, - embed_many: Optional[Callable] = None, - aembed: Optional[Callable] = None, - aembed_many: Optional[Callable] = None, - dtype: str = "float32", - cache: Optional["EmbeddingsCache"] = None, - ): - """Initialize the Custom vectorizer. - - Args: - embed (Callable): a Callable function that accepts a string object and returns a list of floats. - embed_many (Optional[Callable]): a Callable function that accepts a list of string objects and returns a list containing lists of floats. Defaults to None. - aembed (Optional[Callable]): an asynchronous Callable function that accepts a string object and returns a lists of floats. 
Defaults to None. - aembed_many (Optional[Callable]): an asynchronous Callable function that accepts a list of string objects and returns a list containing lists of floats. Defaults to None. - dtype (str): the default datatype to use when embedding text as byte arrays. - Used when setting `as_buffer=True` in calls to embed() and embed_many(). - Defaults to 'float32'. - cache (Optional[EmbeddingsCache]): Optional EmbeddingsCache instance to cache embeddings for - better performance with repeated texts. Defaults to None. - - Raises: - ValueError: if embedding validation fails. - """ - # First, determine the dimensions - try: - test_result = embed("dimension test") - _check_vector(test_result, "embed") - dims = len(test_result) - except Exception as e: - raise ValueError(f"Failed to validate embed method: {e}") - - # Initialize parent with known information - super().__init__(model="custom", dtype=dtype, dims=dims, cache=cache) - - # Now setup the functions and validation flags - self._setup_functions(embed, embed_many, aembed, aembed_many) - - def _setup_functions(self, embed, embed_many, aembed, aembed_many): - """Setup the user-provided embedding functions.""" - self._embed_func = embed - self._embed_func_many = embed_many - self._aembed_func = aembed - self._aembed_func_many = aembed_many - - # Initialize validation flags - self._aembed_validated = False - self._aembed_many_validated = False - - # Validate the other functions if provided - self._validate_optional_funcs() - - @property - def type(self) -> str: - return "custom" - - def _validate_optional_funcs(self) -> None: + Deprecated: Use `CustomVectorizer.embed` instead. """ - Optionally validate the other user-provided functions if they exist. + content = content or text + return super().embed(content=content, **kwargs) - Raises: - ValueError: If any provided function produces invalid results. - """ - # Check embed_many if provided - if self._embed_func_many: - try: - test_batch = self._embed_func_many(["dimension test (many)"]) - _check_vector(test_batch, "embed_many") - except Exception as e: - raise ValueError(f"Invalid embed_many function: {e}") - - def _embed(self, text: str, **kwargs) -> List[float]: - """Generate a vector embedding for a single text using the provided user function. - - Args: - text: Text to embed - **kwargs: Additional parameters to pass to the user function - - Returns: - List[float]: Vector embedding as a list of floats - - Raises: - TypeError: If text is not a string - ValueError: If embedding fails - """ - if not isinstance(text, str): - raise TypeError("Must pass in a str value to embed.") - - try: - result = self._embed_func(text, **kwargs) - return result - except Exception as e: - raise ValueError(f"Embedding text failed: {e}") - - def _embed_many( - self, texts: List[str], batch_size: int = 10, **kwargs + @deprecated_argument("texts", "contents") + def embed_many( + self, + contents: Optional[List[Any]] = None, + texts: Optional[List[Any]] = None, + **kwargs, ) -> List[List[float]]: - """Generate vector embeddings for a batch of texts using the provided user function. - - Args: - texts: List of texts to embed - batch_size: Number of texts to process in each batch - **kwargs: Additional parameters to pass to the user function + """Generate vector embeddings for a batch of inputs using the custom function. 
- Returns: - List[List[float]]: List of vector embeddings as lists of floats - - Raises: - TypeError: If texts is not a list of strings - ValueError: If embedding fails + Deprecated: Use `CustomVectorizer.embed_many` instead. """ - if not isinstance(texts, list): - raise TypeError("Must pass in a list of str values to embed.") - if texts and not isinstance(texts[0], str): - raise TypeError("Must pass in a list of str values to embed.") - - if not self._embed_func_many: - # Fallback: Use _embed for each text if no batch function provided - return [self._embed(text, **kwargs) for text in texts] - - try: - results = self._embed_func_many(texts, **kwargs) - return results - except Exception as e: - raise ValueError(f"Embedding texts failed: {e}") + contents = contents or texts + return super().embed_many(contents=contents, **kwargs) - async def _aembed(self, text: str, **kwargs) -> List[float]: - """Asynchronously generate a vector embedding for a single text. + @deprecated_argument("text", "content") + async def aembed(self, content: Any = "", text: Any = "", **kwargs) -> List[float]: + """Asynchronously generate a vector embedding for a single input using the custom function. - Args: - text: Text to embed - **kwargs: Additional parameters to pass to the user async function - - Returns: - List[float]: Vector embedding as a list of floats - - Raises: - TypeError: If text is not a string - NotImplementedError: If no aembed function was provided - ValueError: If embedding fails + Deprecated: Use `CustomVectorizer.aembed` instead. """ - if not isinstance(text, str): - raise TypeError("Must pass in a str value to embed.") - - if not self._aembed_func: - return self._embed(text, **kwargs) - - try: - result = await self._aembed_func(text, **kwargs) + content = content or text + return await super().aembed(content=content, **kwargs) - # Validate result on first call - if not self._aembed_validated: - _check_vector(result, "aembed") - self._aembed_validated = True - - return result - except Exception as e: - raise ValueError(f"Embedding text failed: {e}") - - async def _aembed_many( - self, texts: List[str], batch_size: int = 10, **kwargs + @deprecated_argument("texts", "contents") + async def aembed_many( + self, + contents: Optional[List[Any]] = None, + texts: Optional[List[Any]] = None, + **kwargs, ) -> List[List[float]]: - """Asynchronously generate vector embeddings for a batch of texts. + """Asynchronously generate vector embeddings for a batch of inputs using the custom function. - Args: - texts: List of texts to embed - batch_size: Number of texts to process in each batch - **kwargs: Additional parameters to pass to the user async function - - Returns: - List[List[float]]: List of vector embeddings as lists of floats - - Raises: - TypeError: If texts is not a list of strings - NotImplementedError: If no aembed_many function was provided - ValueError: If embedding fails + Deprecated: Use `CustomVectorizer.aembed_many` instead. 
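The alias classes keep the old import paths working while steering callers to the new names. A migration sketch, assuming the ``deprecated_class``/``deprecated_argument`` helpers emit standard Python warnings (the class names below are the ones used in this changeset):

.. code-block:: python

    import warnings

    from redisvl.utils.vectorize.custom import CustomVectorizer           # new name
    from redisvl.utils.vectorize.text.custom import CustomTextVectorizer  # deprecated alias

    def embed_fn(content):
        return [0.0, 1.0]

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        legacy = CustomTextVectorizer(embed=embed_fn)
        legacy.embed(text="hello")  # old keyword, routed to `content`
        print([str(w.message) for w in caught])

    # Preferred going forward
    current = CustomVectorizer(embed=embed_fn)
    current.embed("hello")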
""" - if not isinstance(texts, list): - raise TypeError("Must pass in a list of str values to embed.") - if texts and not isinstance(texts[0], str): - raise TypeError("Must pass in a list of str values to embed.") - - if not self._aembed_func_many: - return self._embed_many(texts, batch_size, **kwargs) - - try: - results = await self._aembed_func_many(texts, **kwargs) - - # Validate result on first call - if not self._aembed_many_validated: - _check_vector(results, "aembed_many") - self._aembed_many_validated = True - - return results - except Exception as e: - raise ValueError(f"Embedding texts failed: {e}") + contents = contents or texts + return await super().aembed_many(contents=contents, **kwargs) diff --git a/redisvl/utils/vectorize/text/huggingface.py b/redisvl/utils/vectorize/text/huggingface.py index ac13b08e..adff4df7 100644 --- a/redisvl/utils/vectorize/text/huggingface.py +++ b/redisvl/utils/vectorize/text/huggingface.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union +from typing import TYPE_CHECKING, Any, List, Optional from pydantic.v1 import PrivateAttr @@ -25,6 +25,12 @@ class HFTextVectorizer(BaseVectorizer): trained on a variety of datasets and tasks, ensuring versatility and robust performance across different embedding needs. + Note: + Some multimodal models can make use of sentence-transformers by passing + PIL Image objects in place of strings (e.g. CLIP). To enable those use + cases, this class follows the SentenceTransformer convention of hinting + that it expects string inputs, but never enforcing it. + Requirements: - The `sentence-transformers` library must be installed with pip. @@ -54,6 +60,13 @@ class HFTextVectorizer(BaseVectorizer): ["Hello, world!", "How are you?"], batch_size=2 ) + + # Multimodal usage + from PIL import Image + vectorizer = HFTextVectorizer(model="sentence-transformers/clip-ViT-L-14") + embeddings1 = vectorizer.embed("Hello, world!") + embeddings2 = vectorizer.embed(Image.open("path/to/your/image.jpg")) + """ _client: Any = PrivateAttr() @@ -112,55 +125,54 @@ def _set_model_dims(self): raise ValueError(f"Error setting embedding model dimensions: {str(e)}") return len(embedding) - def _embed(self, text: str, **kwargs) -> List[float]: + @deprecated_argument("text", "content") + def _embed(self, content: str = "", text: str = "", **kwargs) -> List[float]: """Generate a vector embedding for a single text using the Hugging Face model. Args: - text: Text to embed + content: Text to embed + text: Text to embed (deprecated - use `content` instead) **kwargs: Additional model-specific parameters Returns: List[float]: Vector embedding as a list of floats - - Raises: - TypeError: If the input is not a string """ - if not isinstance(text, str): - raise TypeError("Must pass in a str value to embed.") - + content = content or text if "show_progress_bar" not in kwargs: # disable annoying tqdm by default kwargs["show_progress_bar"] = False - embedding = self._client.encode([text], **kwargs)[0] + embedding = self._client.encode([content], **kwargs)[0] return embedding.tolist() + @deprecated_argument("texts", "contents") def _embed_many( - self, texts: List[str], batch_size: int = 10, **kwargs + self, + contents: Optional[List[str]] = None, + texts: Optional[List[str]] = None, + batch_size: int = 10, + **kwargs, ) -> List[List[float]]: """Generate vector embeddings for a batch of texts using the Hugging Face model. 
Args: - texts: List of texts to embed + contents: List of texts to embed + texts: List of texts to embed (deprecated - use `contents` instead) batch_size: Number of texts to process in each batch **kwargs: Additional model-specific parameters Returns: List[List[float]]: List of vector embeddings as lists of floats - - Raises: - TypeError: If the input is not a list of strings """ - if not isinstance(texts, list): - raise TypeError("Must pass in a list of str values to embed.") - if len(texts) > 0 and not isinstance(texts[0], str): - raise TypeError("Must pass in a list of str values to embed.") + contents = contents or texts + if not isinstance(contents, list): + raise TypeError("Must pass in a list of values to embed.") if "show_progress_bar" not in kwargs: # disable annoying tqdm by default kwargs["show_progress_bar"] = False embeddings: List = [] - for batch in self.batchify(texts, batch_size, None): + for batch in self.batchify(contents, batch_size, None): batch_embeddings = self._client.encode(batch, **kwargs) embeddings.extend([embedding.tolist() for embedding in batch_embeddings]) return embeddings diff --git a/redisvl/utils/vectorize/text/mistral.py b/redisvl/utils/vectorize/text/mistral.py index 576acd87..5ef9e0a4 100644 --- a/redisvl/utils/vectorize/text/mistral.py +++ b/redisvl/utils/vectorize/text/mistral.py @@ -163,50 +163,59 @@ def _set_model_dims(self) -> int: # fall back (TODO get more specific) raise ValueError(f"Error setting embedding model dimensions: {str(e)}") + @deprecated_argument("text", "content") @retry( wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(TypeError), ) - def _embed(self, text: str, **kwargs) -> List[float]: + def _embed(self, content: str = "", text: str = "", **kwargs) -> List[float]: """ Generate a vector embedding for a single text using the MistralAI API. Args: - text: Text to embed + content: Text to embed + text: Text to embed (deprecated - use `content` instead) **kwargs: Additional parameters to pass to the MistralAI API Returns: List[float]: Vector embedding as a list of floats Raises: - TypeError: If text is not a string + TypeError: If content is not a string ValueError: If embedding fails """ - if not isinstance(text, str): + content = content or text + if not isinstance(content, str): raise TypeError("Must pass in a str value to embed.") try: result = self._client.embeddings.create( - model=self.model, inputs=[text], **kwargs + model=self.model, inputs=[content], **kwargs ) return result.data[0].embedding # type: ignore except Exception as e: raise ValueError(f"Embedding text failed: {e}") + @deprecated_argument("texts", "contents") @retry( wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(TypeError), ) def _embed_many( - self, texts: List[str], batch_size: int = 10, **kwargs + self, + contents: Optional[List[str]] = None, + texts: Optional[List[str]] = None, + batch_size: int = 10, + **kwargs, ) -> List[List[float]]: """ Generate vector embeddings for a batch of texts using the MistralAI API. 
Args: - texts: List of texts to embed + contents: List of texts to embed + texts: List of texts to embed (deprecated - use `contents` instead) batch_size: Number of texts to process in each API call **kwargs: Additional parameters to pass to the MistralAI API @@ -214,17 +223,18 @@ def _embed_many( List[List[float]]: List of vector embeddings as lists of floats Raises: - TypeError: If texts is not a list of strings + TypeError: If contents is not a list of strings ValueError: If embedding fails """ - if not isinstance(texts, list): + contents = contents or texts + if not isinstance(contents, list): raise TypeError("Must pass in a list of str values to embed.") - if texts and not isinstance(texts[0], str): + if contents and not isinstance(contents[0], str): raise TypeError("Must pass in a list of str values to embed.") try: embeddings: List = [] - for batch in self.batchify(texts, batch_size): + for batch in self.batchify(contents, batch_size): response = self._client.embeddings.create( model=self.model, inputs=batch, **kwargs ) @@ -233,50 +243,59 @@ def _embed_many( except Exception as e: raise ValueError(f"Embedding texts failed: {e}") + @deprecated_argument("text", "content") @retry( wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(TypeError), ) - async def _aembed(self, text: str, **kwargs) -> List[float]: + async def _aembed(self, content: str = "", text: str = "", **kwargs) -> List[float]: """ Asynchronously generate a vector embedding for a single text using the MistralAI API. Args: - text: Text to embed + content: Text to embed + text: Text to embed (deprecated - use `content` instead) **kwargs: Additional parameters to pass to the MistralAI API Returns: List[float]: Vector embedding as a list of floats Raises: - TypeError: If text is not a string + TypeError: If `content` is not a string ValueError: If embedding fails """ - if not isinstance(text, str): + content = content or text + if not isinstance(content, str): raise TypeError("Must pass in a str value to embed.") try: result = await self._client.embeddings.create_async( - model=self.model, inputs=[text], **kwargs + model=self.model, inputs=[content], **kwargs ) return result.data[0].embedding # type: ignore except Exception as e: - raise ValueError(f"Embedding text failed: {e}") + raise ValueError(f"Embedding content failed: {e}") + @deprecated_argument("texts", "contents") @retry( wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(TypeError), ) async def _aembed_many( - self, texts: List[str], batch_size: int = 10, **kwargs + self, + contents: Optional[List[str]] = None, + texts: Optional[List[str]] = None, + batch_size: int = 10, + **kwargs, ) -> List[List[float]]: """ Asynchronously generate vector embeddings for a batch of texts using the MistralAI API. 
Args: - texts: List of texts to embed + contents: List of texts to embed + texts: List of texts to embed (deprecated - use `contents` instead) batch_size: Number of texts to process in each API call **kwargs: Additional parameters to pass to the MistralAI API @@ -287,21 +306,22 @@ async def _aembed_many( TypeError: If texts is not a list of strings ValueError: If embedding fails """ - if not isinstance(texts, list): + contents = contents or texts + if not isinstance(contents, list): raise TypeError("Must pass in a list of str values to embed.") - if texts and not isinstance(texts[0], str): + if contents and not isinstance(contents[0], str): raise TypeError("Must pass in a list of str values to embed.") try: embeddings: List = [] - for batch in self.batchify(texts, batch_size): + for batch in self.batchify(contents, batch_size): response = await self._client.embeddings.create_async( model=self.model, inputs=batch, **kwargs ) embeddings.extend([r.embedding for r in response.data]) return embeddings except Exception as e: - raise ValueError(f"Embedding texts failed: {e}") + raise ValueError(f"Embedding contents failed: {e}") @property def type(self) -> str: diff --git a/redisvl/utils/vectorize/text/openai.py b/redisvl/utils/vectorize/text/openai.py index 6ff1f99c..ada9e5a5 100644 --- a/redisvl/utils/vectorize/text/openai.py +++ b/redisvl/utils/vectorize/text/openai.py @@ -162,16 +162,18 @@ def _set_model_dims(self) -> int: # fall back (TODO get more specific) raise ValueError(f"Error setting embedding model dimensions: {str(e)}") + @deprecated_argument("text", "content") @retry( wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(TypeError), ) - def _embed(self, text: str, **kwargs) -> List[float]: + def _embed(self, content: str = "", text: str = "", **kwargs) -> List[float]: """Generate a vector embedding for a single text using the OpenAI API. Args: - text: Text to embed + content: Text to embed + text: Text to embed (deprecated - use `content` instead) **kwargs: Additional parameters to pass to the OpenAI API Returns: @@ -181,29 +183,36 @@ def _embed(self, text: str, **kwargs) -> List[float]: TypeError: If text is not a string ValueError: If embedding fails """ - if not isinstance(text, str): + content = content or text + if not isinstance(content, str): raise TypeError("Must pass in a str value to embed.") try: result = self._client.embeddings.create( - input=[text], model=self.model, **kwargs + input=[content], model=self.model, **kwargs ) return result.data[0].embedding except Exception as e: raise ValueError(f"Embedding text failed: {e}") + @deprecated_argument("texts", "contents") @retry( wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(TypeError), ) def _embed_many( - self, texts: List[str], batch_size: int = 10, **kwargs + self, + contents: Optional[List[str]] = None, + texts: Optional[List[str]] = None, + batch_size: int = 10, + **kwargs, ) -> List[List[float]]: """Generate vector embeddings for a batch of texts using the OpenAI API. 
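Editor's note: each provider call in this changeset keeps the same tenacity retry policy visible in the decorators — random exponential backoff between 1 and 60 seconds, at most six attempts, and no retries on `TypeError` so that invalid input fails immediately. The policy in isolation:

```python
from tenacity import retry, stop_after_attempt, wait_random_exponential
from tenacity.retry import retry_if_not_exception_type


@retry(
    wait=wait_random_exponential(min=1, max=60),   # back off 1-60s between attempts
    stop=stop_after_attempt(6),                    # give up after six attempts
    retry=retry_if_not_exception_type(TypeError),  # invalid input is never retried
)
def call_embedding_api() -> list:
    # Placeholder for any of the provider calls shown in this diff.
    return []
```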
Args: - texts: List of texts to embed + contents: List of texts to embed + texts: List of texts to embed (deprecated - use `contents` instead) batch_size: Number of texts to process in each API call **kwargs: Additional parameters to pass to the OpenAI API @@ -211,16 +220,17 @@ def _embed_many( List[List[float]]: List of vector embeddings as lists of floats Raises: - TypeError: If texts is not a list of strings + TypeError: If contents is not a list of strings ValueError: If embedding fails """ - if not isinstance(texts, list): + contents = contents or texts + if not isinstance(contents, list): raise TypeError("Must pass in a list of str values to embed.") - if texts and not isinstance(texts[0], str): + if contents and not isinstance(contents[0], str): raise TypeError("Must pass in a list of str values to embed.") embeddings: List = [] - for batch in self.batchify(texts, batch_size): + for batch in self.batchify(contents, batch_size): try: response = self._client.embeddings.create( input=batch, model=self.model, **kwargs @@ -230,48 +240,57 @@ def _embed_many( raise ValueError(f"Embedding texts failed: {e}") return embeddings + @deprecated_argument("text", "content") @retry( wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(TypeError), ) - async def _aembed(self, text: str, **kwargs) -> List[float]: + async def _aembed(self, content: str = "", text: str = "", **kwargs) -> List[float]: """Asynchronously generate a vector embedding for a single text using the OpenAI API. Args: - text: Text to embed + content: Text to embed + text: Text to embed (deprecated - use `content` instead) **kwargs: Additional parameters to pass to the OpenAI API Returns: List[float]: Vector embedding as a list of floats Raises: - TypeError: If text is not a string + TypeError: If content is not a string ValueError: If embedding fails """ - if not isinstance(text, str): + content = content or text + if not isinstance(content, str): raise TypeError("Must pass in a str value to embed.") try: result = await self._aclient.embeddings.create( - input=[text], model=self.model, **kwargs + input=[content], model=self.model, **kwargs ) return result.data[0].embedding except Exception as e: raise ValueError(f"Embedding text failed: {e}") + @deprecated_argument("texts", "contents") @retry( wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(TypeError), ) async def _aembed_many( - self, texts: List[str], batch_size: int = 10, **kwargs + self, + contents: Optional[List[str]] = None, + texts: Optional[List[str]] = None, + batch_size: int = 10, + **kwargs, ) -> List[List[float]]: """Asynchronously generate vector embeddings for a batch of texts using the OpenAI API. 
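Editor's note: the async variants (`_aembed`, `_aembed_many`) receive the same rename. A small usage sketch, assuming the public `aembed`/`aembed_many` wrappers expose the new keywords and that `OPENAI_API_KEY` is set in the environment:

```python
# Sketch only; assumes the public async API mirrors the renamed parameters.
import asyncio

from redisvl.utils.vectorize import OpenAITextVectorizer


async def main() -> None:
    vectorizer = OpenAITextVectorizer(model="text-embedding-ada-002")
    single = await vectorizer.aembed(content="That is a happy dog")
    batch = await vectorizer.aembed_many(
        contents=["That is a happy person", "Today is a nice day"],
        batch_size=10,
    )
    print(len(single), len(batch))


asyncio.run(main())
```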
Args: - texts: List of texts to embed + contents: List of texts to embed + texts: List of texts to embed (deprecated - use `contents` instead) batch_size: Number of texts to process in each API call **kwargs: Additional parameters to pass to the OpenAI API @@ -279,16 +298,17 @@ async def _aembed_many( List[List[float]]: List of vector embeddings as lists of floats Raises: - TypeError: If texts is not a list of strings + TypeError: If contents is not a list of strings ValueError: If embedding fails """ - if not isinstance(texts, list): + contents = contents or texts + if not isinstance(contents, list): raise TypeError("Must pass in a list of str values to embed.") - if texts and not isinstance(texts[0], str): + if contents and not isinstance(contents[0], str): raise TypeError("Must pass in a list of str values to embed.") embeddings: List = [] - for batch in self.batchify(texts, batch_size): + for batch in self.batchify(contents, batch_size): try: response = await self._aclient.embeddings.create( input=batch, model=self.model, **kwargs diff --git a/redisvl/utils/vectorize/text/vertexai.py b/redisvl/utils/vectorize/text/vertexai.py index f20f4866..b96aafc1 100644 --- a/redisvl/utils/vectorize/text/vertexai.py +++ b/redisvl/utils/vectorize/text/vertexai.py @@ -1,251 +1,34 @@ -import os -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +from typing import Any, List, Optional -from pydantic import ConfigDict -from tenacity import retry, stop_after_attempt, wait_random_exponential -from tenacity.retry import retry_if_not_exception_type +from redisvl.utils.utils import deprecated_argument, deprecated_class +from redisvl.utils.vectorize.vertexai import VertexAIVectorizer -if TYPE_CHECKING: - from redisvl.extensions.cache.embeddings.embeddings import EmbeddingsCache -from redisvl.utils.utils import deprecated_argument -from redisvl.utils.vectorize.base import BaseVectorizer +@deprecated_class( + name="VertexAITextVectorizer", replacement="Use VertexAIVectorizer instead." +) +class VertexAITextVectorizer(VertexAIVectorizer): + """A backwards-compatible alias for VertexAIVectorizer.""" + @deprecated_argument("text", "content") + def embed(self, content: str = "", text: Any = "", **kwargs) -> List[float]: + """Generate a vector embedding for a single input using the VertexAI API. -class VertexAITextVectorizer(BaseVectorizer): - """The VertexAITextVectorizer uses Google's VertexAI Palm 2 embedding model - API to create text embeddings. - - This vectorizer is tailored for use in - environments where integration with Google Cloud Platform (GCP) services is - a key requirement. - - Utilizing this vectorizer requires an active GCP project and location - (region), along with appropriate application credentials. These can be - provided through the `api_config` dictionary or set the GOOGLE_APPLICATION_CREDENTIALS - env var. Additionally, the vertexai python client must be - installed with `pip install google-cloud-aiplatform>=1.26`. - - You can optionally enable caching to improve performance when generating - embeddings for repeated text inputs. - - .. 
code-block:: python - - # Basic usage - vectorizer = VertexAITextVectorizer( - model="textembedding-gecko", - api_config={ - "project_id": "your_gcp_project_id", # OR set GCP_PROJECT_ID - "location": "your_gcp_location", # OR set GCP_LOCATION - }) - embedding = vectorizer.embed("Hello, world!") - - # With caching enabled - from redisvl.extensions.cache.embeddings import EmbeddingsCache - cache = EmbeddingsCache(name="vertexai_embeddings_cache") - - vectorizer = VertexAITextVectorizer( - model="textembedding-gecko", - api_config={ - "project_id": "your_gcp_project_id", - "location": "your_gcp_location", - }, - cache=cache - ) - - # First call will compute and cache the embedding - embedding1 = vectorizer.embed("Hello, world!") - - # Second call will retrieve from cache - embedding2 = vectorizer.embed("Hello, world!") - - # Batch embedding of multiple texts - embeddings = vectorizer.embed_many( - ["Hello, world!", "Goodbye, world!"], - batch_size=2 - ) - - """ - - model_config = ConfigDict(arbitrary_types_allowed=True) + Deprecated: Use `VertexAIVectorizer.embed` instead. + """ + content = content or text + return super().embed(content=content, **kwargs) - def __init__( + @deprecated_argument("texts", "contents") + def embed_many( self, - model: str = "textembedding-gecko", - api_config: Optional[Dict] = None, - dtype: str = "float32", - cache: Optional["EmbeddingsCache"] = None, + contents: Optional[List[str]] = None, + texts: Optional[List[Any]] = None, **kwargs, - ): - """Initialize the VertexAI vectorizer. - - Args: - model (str): Model to use for embedding. Defaults to - 'textembedding-gecko'. - api_config (Optional[Dict], optional): Dictionary containing the - API config details. Defaults to None. - dtype (str): the default datatype to use when embedding text as byte arrays. - Used when setting `as_buffer=True` in calls to embed() and embed_many(). - Defaults to 'float32'. - cache (Optional[EmbeddingsCache]): Optional EmbeddingsCache instance to cache embeddings for - better performance with repeated texts. Defaults to None. - - Raises: - ImportError: If the google-cloud-aiplatform library is not installed. - ValueError: If the API key is not provided. - ValueError: If an invalid dtype is provided. - """ - super().__init__(model=model, dtype=dtype, cache=cache) - # Initialize client and set up the model - self._setup(api_config, **kwargs) - - def _setup(self, api_config: Optional[Dict], **kwargs): - """Set up the VertexAI client and determine the embedding dimensions.""" - # Initialize client - self._initialize_client(api_config, **kwargs) - # Set model dimensions after initialization - self.dims = self._set_model_dims() - - def _initialize_client(self, api_config: Optional[Dict], **kwargs): - """ - Setup the VertexAI client using the provided config options or - environment variables. - - Args: - api_config: Dictionary with GCP configuration options - **kwargs: Additional arguments for initialization - - Raises: - ImportError: If the google-cloud-aiplatform library is not installed - ValueError: If required parameters are not provided - """ - # Fetch the project_id and location from api_config or environment variables - project_id = ( - api_config.get("project_id") if api_config else os.getenv("GCP_PROJECT_ID") - ) - location = ( - api_config.get("location") if api_config else os.getenv("GCP_LOCATION") - ) - - if not project_id: - raise ValueError( - "Missing project_id. " - "Provide the id in the api_config with key 'project_id' " - "or set the GCP_PROJECT_ID environment variable." 
- ) - - if not location: - raise ValueError( - "Missing location. " - "Provide the location (region) in the api_config with key 'location' " - "or set the GCP_LOCATION environment variable." - ) - - # Check for credentials - credentials = api_config.get("credentials") if api_config else None - - try: - import vertexai - from vertexai.language_models import TextEmbeddingModel - - vertexai.init( - project=project_id, location=location, credentials=credentials - ) - except ImportError: - raise ImportError( - "VertexAI vectorizer requires the google-cloud-aiplatform library. " - "Please install with `pip install google-cloud-aiplatform>=1.26`" - ) - - # Store client as a regular attribute instead of PrivateAttr - self._client = TextEmbeddingModel.from_pretrained(self.model) - - def _set_model_dims(self) -> int: - """ - Determine the dimensionality of the embedding model by making a test call. - - Returns: - int: Dimensionality of the embedding model - - Raises: - ValueError: If embedding dimensions cannot be determined - """ - try: - # Call the protected _embed method to avoid caching this test embedding - embedding = self._embed("dimension check") - return len(embedding) - except (KeyError, IndexError) as ke: - raise ValueError(f"Unexpected response from the VertexAI API: {str(ke)}") - except Exception as e: # pylint: disable=broad-except - # fall back (TODO get more specific) - raise ValueError(f"Error setting embedding model dimensions: {str(e)}") - - @retry( - wait=wait_random_exponential(min=1, max=60), - stop=stop_after_attempt(6), - retry=retry_if_not_exception_type(TypeError), - ) - def _embed(self, text: str, **kwargs) -> List[float]: - """ - Generate a vector embedding for a single text using the VertexAI API. - - Args: - text: Text to embed - **kwargs: Additional parameters to pass to the VertexAI API - - Returns: - List[float]: Vector embedding as a list of floats - - Raises: - TypeError: If text is not a string - ValueError: If embedding fails - """ - if not isinstance(text, str): - raise TypeError("Must pass in a str value to embed.") - - try: - result = self._client.get_embeddings([text], **kwargs) - return result[0].values - except Exception as e: - raise ValueError(f"Embedding text failed: {e}") - - @retry( - wait=wait_random_exponential(min=1, max=60), - stop=stop_after_attempt(6), - retry=retry_if_not_exception_type(TypeError), - ) - def _embed_many( - self, texts: List[str], batch_size: int = 10, **kwargs ) -> List[List[float]]: - """ - Generate vector embeddings for a batch of texts using the VertexAI API. - - Args: - texts: List of texts to embed - batch_size: Number of texts to process in each API call - **kwargs: Additional parameters to pass to the VertexAI API - - Returns: - List[List[float]]: List of vector embeddings as lists of floats + """Generate vector embeddings for a batch of inputs using the VertexAI API. - Raises: - TypeError: If texts is not a list of strings - ValueError: If embedding fails + Deprecated: Use `VertexAIVectorizer.embed_many` instead. 
""" - if not isinstance(texts, list): - raise TypeError("Must pass in a list of str values to embed.") - if texts and not isinstance(texts[0], str): - raise TypeError("Must pass in a list of str values to embed.") - - try: - embeddings: List = [] - for batch in self.batchify(texts, batch_size): - response = self._client.get_embeddings(batch, **kwargs) - embeddings.extend([r.values for r in response]) - return embeddings - except Exception as e: - raise ValueError(f"Embedding texts failed: {e}") - - @property - def type(self) -> str: - return "vertexai" + contents = contents or texts + return super().embed_many(contents=contents, **kwargs) diff --git a/redisvl/utils/vectorize/text/voyageai.py b/redisvl/utils/vectorize/text/voyageai.py index 1936bd97..465d27b8 100644 --- a/redisvl/utils/vectorize/text/voyageai.py +++ b/redisvl/utils/vectorize/text/voyageai.py @@ -1,353 +1,57 @@ -import os -from typing import TYPE_CHECKING, Dict, List, Optional +from typing import Any, List, Optional -from pydantic import ConfigDict -from tenacity import retry, stop_after_attempt, wait_random_exponential -from tenacity.retry import retry_if_not_exception_type +from redisvl.utils.utils import deprecated_argument, deprecated_class +from redisvl.utils.vectorize.voyageai import VoyageAIVectorizer -if TYPE_CHECKING: - from redisvl.extensions.cache.embeddings.embeddings import EmbeddingsCache -from redisvl.utils.utils import deprecated_argument -from redisvl.utils.vectorize.base import BaseVectorizer +@deprecated_class( + name="VoyageAITextVectorizer", replacement="Use VoyageAIVectorizer instead." +) +class VoyageAITextVectorizer(VoyageAIVectorizer): + """A backwards-compatible alias for VoyageAIVectorizer.""" -# ignore that voyageai isn't imported -# mypy: disable-error-code="name-defined" + @deprecated_argument("text", "content") + def embed(self, content: Any = "", text: Any = "", **kwargs) -> List[float]: + """Generate a vector embedding for a single text using the VoyageAI API. + Deprecated: Use `VoyageAIVectorizer.embed` instead. + """ + content = content or text + return super().embed(content=content, **kwargs) -class VoyageAITextVectorizer(BaseVectorizer): - """The VoyageAITextVectorizer class utilizes VoyageAI's API to generate - embeddings for text data. - - This vectorizer is designed to interact with VoyageAI's /embed API, - requiring an API key for authentication. The key can be provided - directly in the `api_config` dictionary or through the `VOYAGE_API_KEY` - environment variable. User must obtain an API key from VoyageAI's website - (https://dash.voyageai.com/). Additionally, the `voyageai` python - client must be installed with `pip install voyageai`. - - The vectorizer supports both synchronous and asynchronous operations, allows for batch - processing of texts and flexibility in handling preprocessing tasks. - - You can optionally enable caching to improve performance when generating - embeddings for repeated text inputs. - - .. 
code-block:: python - - from redisvl.utils.vectorize import VoyageAITextVectorizer - - # Basic usage - vectorizer = VoyageAITextVectorizer( - model="voyage-large-2", - api_config={"api_key": "your-voyageai-api-key"} # OR set VOYAGE_API_KEY in your env - ) - query_embedding = vectorizer.embed( - text="your input query text here", - input_type="query" - ) - doc_embeddings = vectorizer.embed_many( - texts=["your document text", "more document text"], - input_type="document" - ) - - # With caching enabled - from redisvl.extensions.cache.embeddings import EmbeddingsCache - cache = EmbeddingsCache(name="voyageai_embeddings_cache") - - vectorizer = VoyageAITextVectorizer( - model="voyage-large-2", - api_config={"api_key": "your-voyageai-api-key"}, - cache=cache - ) - - # First call will compute and cache the embedding - embedding1 = vectorizer.embed( - text="your input query text here", - input_type="query" - ) - - # Second call will retrieve from cache - embedding2 = vectorizer.embed( - text="your input query text here", - input_type="query" - ) - - """ - - model_config = ConfigDict(arbitrary_types_allowed=True) - - def __init__( + @deprecated_argument("texts", "contents") + def embed_many( self, - model: str = "voyage-large-2", - api_config: Optional[Dict] = None, - dtype: str = "float32", - cache: Optional["EmbeddingsCache"] = None, + contents: Optional[List[Any]] = None, + texts: Optional[List[Any]] = None, **kwargs, - ): - """Initialize the VoyageAI vectorizer. - - Visit https://docs.voyageai.com/docs/embeddings to learn about embeddings and check the available models. - - Args: - model (str): Model to use for embedding. Defaults to "voyage-large-2". - api_config (Optional[Dict], optional): Dictionary containing the API key. - Defaults to None. - dtype (str): the default datatype to use when embedding text as byte arrays. - Used when setting `as_buffer=True` in calls to embed() and embed_many(). - Defaults to 'float32'. - cache (Optional[EmbeddingsCache]): Optional EmbeddingsCache instance to cache embeddings for - better performance with repeated texts. Defaults to None. - - Raises: - ImportError: If the voyageai library is not installed. - ValueError: If the API key is not provided. - - """ - super().__init__(model=model, dtype=dtype, cache=cache) - # Initialize client and set up the model - self._setup(api_config, **kwargs) - - def _setup(self, api_config: Optional[Dict], **kwargs): - """Set up the VoyageAI client and determine the embedding dimensions.""" - # Initialize client - self._initialize_client(api_config, **kwargs) - # Set model dimensions after initialization - self.dims = self._set_model_dims() - - def _initialize_client(self, api_config: Optional[Dict], **kwargs): - """ - Setup the VoyageAI clients using the provided API key or an - environment variable. - - Args: - api_config: Dictionary with API configuration options - **kwargs: Additional arguments to pass to VoyageAI clients - - Raises: - ImportError: If the voyageai library is not installed - ValueError: If no API key is provided - """ - if api_config is None: - api_config = {} - - # Dynamic import of the voyageai module - try: - from voyageai import AsyncClient, Client - except ImportError: - raise ImportError( - "VoyageAI vectorizer requires the voyageai library. 
" - "Please install with `pip install voyageai`" - ) - - # Fetch the API key from api_config or environment variable - api_key = ( - api_config.get("api_key") if api_config else os.getenv("VOYAGE_API_KEY") - ) - if not api_key: - raise ValueError( - "VoyageAI API key is required. " - "Provide it in api_config or set the VOYAGE_API_KEY environment variable." - ) - - self._client = Client(api_key=api_key, **kwargs) - self._aclient = AsyncClient(api_key=api_key, **kwargs) - - def _set_model_dims(self) -> int: - """ - Determine the dimensionality of the embedding model by making a test call. - - Returns: - int: Dimensionality of the embedding model - - Raises: - ValueError: If embedding dimensions cannot be determined - """ - try: - # Call the protected _embed method to avoid caching this test embedding - embedding = self._embed("dimension check", input_type="document") - return len(embedding) - except (KeyError, IndexError) as ke: - raise ValueError(f"Unexpected response from the VoyageAI API: {str(ke)}") - except Exception as e: # pylint: disable=broad-except - # fall back (TODO get more specific) - raise ValueError(f"Error setting embedding model dimensions: {str(e)}") - - def _get_batch_size(self) -> int: - """ - Determine the appropriate batch size based on the model being used. - - Returns: - int: Recommended batch size for the current model - """ - if self.model in ["voyage-2", "voyage-02"]: - return 72 - elif self.model in ["voyage-3-lite", "voyage-3.5-lite"]: - return 30 - elif self.model in ["voyage-3", "voyage-3.5"]: - return 10 - else: - return 7 # Default for other models - - def _validate_input( - self, texts: List[str], input_type: Optional[str], truncation: Optional[bool] - ): - """ - Validate the inputs to the embedding methods. - - Args: - texts: List of texts to embed - input_type: Type of input (document or query) - truncation: Whether to truncate long texts - - Raises: - TypeError: If inputs are invalid - """ - if not isinstance(texts, list): - raise TypeError("Must pass in a list of str values to embed.") - if texts and not isinstance(texts[0], str): - raise TypeError("Must pass in a list of str values to embed.") - if input_type is not None and input_type not in ["document", "query"]: - raise TypeError( - "Must pass in a allowed value for voyageai embedding input_type. " - "See https://docs.voyageai.com/docs/embeddings." - ) - if truncation is not None and not isinstance(truncation, bool): - raise TypeError("Truncation (optional) parameter is a bool.") - - def _embed(self, text: str, **kwargs) -> List[float]: - """ - Generate a vector embedding for a single text using the VoyageAI API. - - Args: - text: Text to embed - **kwargs: Additional parameters to pass to the VoyageAI API - - Returns: - List[float]: Vector embedding as a list of floats - - Raises: - TypeError: If text is not a string or parameters are invalid - ValueError: If embedding fails - """ - # Simply call _embed_many with a single text and return the first result - result = self._embed_many([text], **kwargs) - return result[0] - - @retry( - wait=wait_random_exponential(min=1, max=60), - stop=stop_after_attempt(6), - retry=retry_if_not_exception_type(TypeError), - ) - def _embed_many( - self, texts: List[str], batch_size: Optional[int] = None, **kwargs ) -> List[List[float]]: - """ - Generate vector embeddings for a batch of texts using the VoyageAI API. 
- - Args: - texts: List of texts to embed - batch_size: Number of texts to process in each API call - **kwargs: Additional parameters to pass to the VoyageAI API + """Generate vector embeddings for a batch of texts using the VoyageAI API. - Returns: - List[List[float]]: List of vector embeddings as lists of floats - - Raises: - TypeError: If texts is not a list of strings or parameters are invalid - ValueError: If embedding fails - """ - input_type = kwargs.pop("input_type", None) - truncation = kwargs.pop("truncation", None) - - # Validate inputs - self._validate_input(texts, input_type, truncation) - - # Determine batch size if not provided - if batch_size is None: - batch_size = self._get_batch_size() - - try: - embeddings: List = [] - for batch in self.batchify(texts, batch_size): - response = self._client.embed( - texts=batch, - model=self.model, - input_type=input_type, - truncation=truncation, - **kwargs, - ) - embeddings.extend(response.embeddings) - return embeddings - except Exception as e: - raise ValueError(f"Embedding texts failed: {e}") - - async def _aembed(self, text: str, **kwargs) -> List[float]: + Deprecated: Use `VoyageAIVectorizer.embed_many` instead. """ - Asynchronously generate a vector embedding for a single text using the VoyageAI API. + contents = contents or texts + return super().embed_many(contents=contents, **kwargs) - Args: - text: Text to embed - **kwargs: Additional parameters to pass to the VoyageAI API + @deprecated_argument("text", "content") + async def aembed(self, content: Any = "", text: Any = "", **kwargs) -> List[float]: + """Asynchronously generate a vector embedding for a single text using the VoyageAI API. - Returns: - List[float]: Vector embedding as a list of floats - - Raises: - TypeError: If text is not a string or parameters are invalid - ValueError: If embedding fails + Deprecated: Use `VoyageAIVectorizer.aembed` instead. """ - # Simply call _aembed_many with a single text and return the first result - result = await self._aembed_many([text], **kwargs) - return result[0] + content = content or text + return await super().aembed(content=content, **kwargs) - @retry( - wait=wait_random_exponential(min=1, max=60), - stop=stop_after_attempt(6), - retry=retry_if_not_exception_type(TypeError), - ) - async def _aembed_many( - self, texts: List[str], batch_size: Optional[int] = None, **kwargs + @deprecated_argument("texts", "contents") + async def aembed_many( + self, + contents: Optional[List[Any]] = None, + texts: Optional[List[Any]] = None, + **kwargs, ) -> List[List[float]]: - """ - Asynchronously generate vector embeddings for a batch of texts using the VoyageAI API. - - Args: - texts: List of texts to embed - batch_size: Number of texts to process in each API call - **kwargs: Additional parameters to pass to the VoyageAI API - - Returns: - List[List[float]]: List of vector embeddings as lists of floats + """Asynchronously generate vector embeddings for a batch of texts using the VoyageAI API. - Raises: - TypeError: If texts is not a list of strings or parameters are invalid - ValueError: If embedding fails + Deprecated: Use `VoyageAIVectorizer.aembed_many` instead. 
""" - input_type = kwargs.pop("input_type", None) - truncation = kwargs.pop("truncation", None) - - # Validate inputs - self._validate_input(texts, input_type, truncation) - - # Determine batch size if not provided - if batch_size is None: - batch_size = self._get_batch_size() - - try: - embeddings: List = [] - for batch in self.batchify(texts, batch_size): - response = await self._aclient.embed( - texts=batch, - model=self.model, - input_type=input_type, - truncation=truncation, - **kwargs, - ) - embeddings.extend(response.embeddings) - return embeddings - except Exception as e: - raise ValueError(f"Embedding texts failed: {e}") - - @property - def type(self) -> str: - return "voyageai" + contents = contents or texts + return await super().aembed_many(contents=contents, **kwargs) diff --git a/redisvl/utils/vectorize/vertexai.py b/redisvl/utils/vectorize/vertexai.py new file mode 100644 index 00000000..b6940743 --- /dev/null +++ b/redisvl/utils/vectorize/vertexai.py @@ -0,0 +1,349 @@ +import os +from functools import cached_property +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +from pydantic import ConfigDict +from tenacity import retry, stop_after_attempt, wait_random_exponential +from tenacity.retry import retry_if_not_exception_type + +from redisvl.utils.utils import lazy_import + +if TYPE_CHECKING: + from redisvl.extensions.cache.embeddings.embeddings import EmbeddingsCache + +from redisvl.utils.vectorize.base import BaseVectorizer + +InvalidArgument = lazy_import("google.api_core.exceptions.InvalidArgument") + + +class VertexAIVectorizer(BaseVectorizer): + """The VertexAIVectorizer uses Google's VertexAI embedding model + API to create embeddings. + + This vectorizer is tailored for use in + environments where integration with Google Cloud Platform (GCP) services is + a key requirement. + + Utilizing this vectorizer requires an active GCP project and location + (region), along with appropriate application credentials. These can be + provided through the `api_config` dictionary or set the GOOGLE_APPLICATION_CREDENTIALS + env var. Additionally, the vertexai python client must be + installed with `pip install google-cloud-aiplatform>=1.26`. + + You can optionally enable caching to improve performance when generating + embeddings for repeated inputs. + + .. 
code-block:: python + + # Basic usage + vectorizer = VertexAIVectorizer( + model="textembedding-gecko", + api_config={ + "project_id": "your_gcp_project_id", # OR set GCP_PROJECT_ID + "location": "your_gcp_location", # OR set GCP_LOCATION + }) + embedding = vectorizer.embed("Hello, world!") + + # With caching enabled + from redisvl.extensions.cache.embeddings import EmbeddingsCache + cache = EmbeddingsCache(name="vertexai_embeddings_cache") + + vectorizer = VertexAIVectorizer( + model="textembedding-gecko", + api_config={ + "project_id": "your_gcp_project_id", + "location": "your_gcp_location", + }, + cache=cache + ) + + # First call will compute and cache the embedding + embedding1 = vectorizer.embed("Hello, world!") + + # Second call will retrieve from cache + embedding2 = vectorizer.embed("Hello, world!") + + # Batch embedding of multiple texts + embeddings = vectorizer.embed_many( + ["Hello, world!", "Goodbye, world!"], + batch_size=2 + ) + + # Multimodal usage + from vertexai.vision_models import Image, Video + + vectorizer = VertexAIVectorizer( + model="multimodalembedding@001", + api_config={ + "project_id": "your_gcp_project_id", # OR set GCP_PROJECT_ID + "location": "your_gcp_location", # OR set GCP_LOCATION + } + ) + text_embedding = vectorizer.embed("Hello, world!") + image_embedding = vectorizer.embed(Image.load_from_file("path/to/your/image.jpg")) + video_embedding = vectorizer.embed(Video.load_from_file("path/to/your/video.mp4")) + + """ + + model_config = ConfigDict(arbitrary_types_allowed=True) + + def __init__( + self, + model: str = "textembedding-gecko", + api_config: Optional[Dict] = None, + dtype: str = "float32", + cache: Optional["EmbeddingsCache"] = None, + **kwargs, + ): + """Initialize the VertexAI vectorizer. + + Args: + model (str): Model to use for embedding. Defaults to + 'textembedding-gecko'. + api_config (Optional[Dict], optional): Dictionary containing the + API config details. Defaults to None. + dtype (str): the default datatype to use when embedding text as byte arrays. + Used when setting `as_buffer=True` in calls to embed() and embed_many(). + Defaults to 'float32'. + cache (Optional[EmbeddingsCache]): Optional EmbeddingsCache instance to cache embeddings for + better performance with repeated texts. Defaults to None. + + Raises: + ImportError: If the google-cloud-aiplatform library is not installed. + ValueError: If the API key is not provided. + ValueError: If an invalid dtype is provided. 
+ """ + super().__init__(model=model, dtype=dtype, cache=cache) + # Initialize client and set up the model + self._setup(api_config, **kwargs) + + @property + def is_multimodal(self) -> bool: + """Whether a multimodal model has been configured.""" + return "multimodal" in self.model + + @cached_property + def _client(self): + """Get the appropriate client based on the model type.""" + if self.is_multimodal: + from vertexai.vision_models import MultiModalEmbeddingModel + + return MultiModalEmbeddingModel.from_pretrained(self.model) + + from vertexai.language_models import TextEmbeddingModel + + return TextEmbeddingModel.from_pretrained(self.model) + + def embed_image(self, image_path: str, **kwargs) -> Union[List[float], bytes]: + """Embed an image (from its path on disk) using a VertexAI multimodal model.""" + if not self.is_multimodal: + raise ValueError("Cannot embed image with a non-multimodal model.") + + from vertexai.vision_models import Image + + return self.embed(Image.load_from_file(image_path), **kwargs) + + def embed_video(self, video_path: str, **kwargs) -> Union[List[float], bytes]: + """Embed a video (from its path on disk) using a VertexAI multimodal model.""" + if not self.is_multimodal: + raise ValueError("Cannot embed video with a non-multimodal model.") + + from vertexai.vision_models import Video + + return self.embed(Video.load_from_file(video_path), **kwargs) + + def _setup(self, api_config: Optional[Dict], **kwargs): + """Set up the VertexAI client and determine the embedding dimensions.""" + # Initialize client + self._initialize_client(api_config, **kwargs) + # Set model dimensions after initialization + self.dims = self._set_model_dims() + + def _initialize_client(self, api_config: Optional[Dict], **kwargs): + """ + Setup the VertexAI client using the provided config options or + environment variables. + + Args: + api_config: Dictionary with GCP configuration options + **kwargs: Additional arguments for initialization + + Raises: + ImportError: If the google-cloud-aiplatform library is not installed + ValueError: If required parameters are not provided + """ + # Fetch the project_id and location from api_config or environment variables + project_id = ( + api_config.get("project_id") if api_config else os.getenv("GCP_PROJECT_ID") + ) + location = ( + api_config.get("location") if api_config else os.getenv("GCP_LOCATION") + ) + + if not project_id: + raise ValueError( + "Missing project_id. " + "Provide the id in the api_config with key 'project_id' " + "or set the GCP_PROJECT_ID environment variable." + ) + + if not location: + raise ValueError( + "Missing location. " + "Provide the location (region) in the api_config with key 'location' " + "or set the GCP_LOCATION environment variable." + ) + + # Check for credentials + credentials = api_config.get("credentials") if api_config else None + + try: + import vertexai + + vertexai.init( + project=project_id, location=location, credentials=credentials + ) + + except ImportError: + raise ImportError( + "VertexAI vectorizer requires the google-cloud-aiplatform library. " + "Please install with `pip install google-cloud-aiplatform>=1.26`" + ) + + def _set_model_dims(self) -> int: + """ + Determine the dimensionality of the embedding model by making a test call. 
+ + Returns: + int: Dimensionality of the embedding model + + Raises: + ValueError: If embedding dimensions cannot be determined + """ + try: + # Call the protected _embed method to avoid caching this test embedding + embedding = self._embed("dimension check") + return len(embedding) + except (KeyError, IndexError) as ke: + raise ValueError(f"Unexpected response from the VertexAI API: {str(ke)}") + except Exception as e: # pylint: disable=broad-except + # fall back (TODO get more specific) + raise ValueError(f"Error setting embedding model dimensions: {str(e)}") + + @retry( + wait=wait_random_exponential(min=1, max=60), + stop=stop_after_attempt(6), + retry=retry_if_not_exception_type(TypeError), + ) + def _embed(self, content: Any, **kwargs) -> List[float]: + """ + Generate a vector embedding for a single input using the VertexAI API. + + Args: + content: Input to embed + **kwargs: Additional parameters to pass to the VertexAI API + + Returns: + List[float]: Vector embedding as a list of floats + + Raises: + ValueError: If embedding fails + """ + try: + if self.is_multimodal: + from vertexai.vision_models import Image, Video + + if isinstance(content, str): + result = self._client.get_embeddings( + contextual_text=content, + **kwargs, + ) + if result.text_embedding is None: + raise ValueError("No text embedding returned from VertexAI.") + return result.text_embedding + elif isinstance(content, Image): + result = self._client.get_embeddings( + image=content, + **kwargs, + ) + if result.image_embedding is None: + raise ValueError("No image embedding returned from VertexAI.") + return result.image_embedding + elif isinstance(content, Video): + result = self._client.get_embeddings( + video=content, + **kwargs, + ) + if result.video_embeddings is None: + raise ValueError("No video embedding returned from VertexAI.") + return result.video_embeddings[0].embedding + else: + raise TypeError( + "Invalid input type for multimodal embedding. " + "Must be str, Image, or Video." + ) + + else: + return self._client.get_embeddings([content], **kwargs)[0].values + + except InvalidArgument as e: + raise TypeError(f"Invalid input for embedding: {str(e)}") from e + except Exception as e: + raise ValueError(f"Embedding input failed: {e}") + + @retry( + wait=wait_random_exponential(min=1, max=60), + stop=stop_after_attempt(6), + retry=retry_if_not_exception_type(TypeError), + ) + def _embed_many( + self, contents: List[str], batch_size: int = 10, **kwargs + ) -> List[List[float]]: + """ + Generate vector embeddings for a batch of texts using the VertexAI API. + + Args: + contents: List of texts to embed + batch_size: Number of texts to process in each API call + **kwargs: Additional parameters to pass to the VertexAI API + + Returns: + List[List[float]]: List of vector embeddings as lists of floats + + Raises: + TypeError: If contents is not a list of strings + ValueError: If embedding fails + """ + if self.is_multimodal: + raise NotImplementedError( + "Batch embedding is not supported for multimodal models with VertexAI." 
+ ) + if not isinstance(contents, list): + raise TypeError("Must pass in a list of str values to embed.") + if contents and not isinstance(contents[0], str): + raise TypeError("Must pass in a list of str values to embed.") + + try: + embeddings: List = [] + for batch in self.batchify(contents, batch_size): + response = self._client.get_embeddings(batch, **kwargs) + embeddings.extend([r.values for r in response]) + return embeddings + except InvalidArgument as e: + raise TypeError(f"Invalid input for embedding: {str(e)}") from e + except Exception as e: + raise ValueError(f"Embedding texts failed: {e}") + + def _serialize_for_cache(self, content: Any) -> Union[bytes, str]: + """Convert content to a cacheable format.""" + from vertexai.vision_models import Image, Video + + if isinstance(content, Image): + return content._image_bytes + elif isinstance(content, Video): + return content._video_bytes + return super()._serialize_for_cache(content) + + @property + def type(self) -> str: + return "vertexai" diff --git a/redisvl/utils/vectorize/voyageai.py b/redisvl/utils/vectorize/voyageai.py new file mode 100644 index 00000000..73a00ecf --- /dev/null +++ b/redisvl/utils/vectorize/voyageai.py @@ -0,0 +1,453 @@ +import os +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +from pydantic import ConfigDict +from tenacity import retry, stop_after_attempt, wait_random_exponential +from tenacity.retry import retry_if_not_exception_type + +if TYPE_CHECKING: + from redisvl.extensions.cache.embeddings.embeddings import EmbeddingsCache + +from redisvl.utils.vectorize.base import BaseVectorizer + +# ignore that voyageai isn't imported +# mypy: disable-error-code="name-defined" + + +class VoyageAIVectorizer(BaseVectorizer): + """The VoyageAIVectorizer class utilizes VoyageAI's API to generate + embeddings for text and multimodal (text / image / video) data. + + This vectorizer is designed to interact with VoyageAI's /embed and /multimodal_embed APIs, + requiring an API key for authentication. The key can be provided + directly in the `api_config` dictionary or through the `VOYAGE_API_KEY` + environment variable. User must obtain an API key from VoyageAI's website + (https://dash.voyageai.com/). Additionally, the `voyageai` python + client must be installed with `pip install voyageai`. For image embeddings, the Pillow + library must also be installed with `pip install pillow`. + + The vectorizer supports both synchronous and asynchronous operations, allows for batch + processing of content and flexibility in handling preprocessing tasks. + + You can optionally enable caching to improve performance when generating + embeddings for repeated text inputs. + + .. 
code-block:: python + + from redisvl.utils.vectorize import VoyageAIVectorizer + + # Basic usage + vectorizer = VoyageAIVectorizer( + model="voyage-3-large", + api_config={"api_key": "your-voyageai-api-key"} # OR set VOYAGE_API_KEY in your env + ) + query_embedding = vectorizer.embed( + content="your input query text here", + input_type="query" + ) + doc_embeddings = vectorizer.embed_many( + contents=["your document text", "more document text"], + input_type="document" + ) + + # Multimodal usage - requires Pillow and voyageai>=0.3.6 + + vectorizer = VoyageAIVectorizer( + model="voyage-multimodal-3.5", + api_config={"api_key": "your-voyageai-api-key"} # OR set VOYAGE_API_KEY in your env + ) + image_embedding = vectorizer.embed_image( + "path/to/your/image.jpg", + input_type="query" + ) + video_embedding = vectorizer.embed_video( + "path/to/your/video.mp4", + input_type="document" + ) + + # With caching enabled + from redisvl.extensions.cache.embeddings import EmbeddingsCache + cache = EmbeddingsCache(name="voyageai_embeddings_cache") + + vectorizer = VoyageAIVectorizer( + model="voyage-3-large", + api_config={"api_key": "your-voyageai-api-key"}, + cache=cache + ) + + # First call will compute and cache the embedding + embedding1 = vectorizer.embed( + content="your input query text here", + input_type="query" + ) + + # Second call will retrieve from cache + embedding2 = vectorizer.embed( + content="your input query text here", + input_type="query" + ) + + """ + + model_config = ConfigDict(arbitrary_types_allowed=True) + + def __init__( + self, + model: str = "voyage-3-large", + api_config: Optional[Dict] = None, + dtype: str = "float32", + cache: Optional["EmbeddingsCache"] = None, + **kwargs, + ): + """Initialize the VoyageAI vectorizer. + + Visit https://docs.voyageai.com/docs/embeddings to learn about embeddings and check the available models. + + Args: + model (str): Model to use for embedding. Defaults to "voyage-3-large". + api_config (Optional[Dict], optional): Dictionary containing the API key. + Defaults to None. + dtype (str): the default datatype to use when embedding content as byte arrays. + Used when setting `as_buffer=True` in calls to embed() and embed_many(). + Defaults to 'float32'. + cache (Optional[EmbeddingsCache]): Optional EmbeddingsCache instance to cache embeddings for + better performance with repeated items. Defaults to None. + + Raises: + ImportError: If the voyageai library is not installed. + ValueError: If the API key is not provided. + + Notes: + - Multimodal models require voyageai>=0.3.6 to be installed for video embeddings, as well as + ffmpeg installed on the system. Image embeddings require pillow to be installed. + + """ + super().__init__(model=model, dtype=dtype, cache=cache) + # Initialize client and set up the model + self._setup(api_config, **kwargs) + + @property + def is_multimodal(self) -> bool: + """Whether a multimodal model has been configured.""" + return "multimodal" in self.model + + def embed_image(self, image_path: str, **kwargs) -> Union[List[float], bytes]: + """Embed an image (from its path on disk) using VoyageAI's multimodal API. Requires pillow to be installed.""" + if not self.is_multimodal: + raise ValueError("Cannot embed image with a non-multimodal model.") + + try: + from PIL import Image + except ImportError: + raise ImportError( + "Pillow library is required for image embedding. 
" + "Please install with `pip install pillow`" + ) + return self.embed(Image.open(image_path), **kwargs) + + def embed_video(self, video_path: str, **kwargs) -> Union[List[float], bytes]: + """Embed a video (from its path on disk) using VoyageAI's multimodal API. + + Requires voyageai>=0.3.6 to be installed, as well as ffmpeg to be installed on the system. + """ + if not self.is_multimodal: + raise ValueError("Cannot embed video with a non-multimodal model.") + + try: + from voyageai.video_utils import Video + except ModuleNotFoundError: + raise ModuleNotFoundError( + "voyageai>=0.3.6 is required for video embedding. " + "Please install with `pip install voyageai>=0.3.6`" + ) + + video = Video.from_path( + video_path, + model=self.model, + ) + return self.embed(video, **kwargs) + + def _setup(self, api_config: Optional[Dict], **kwargs): + """Set up the VoyageAI client and determine the embedding dimensions.""" + # Initialize client + self._initialize_client(api_config, **kwargs) + + if self.is_multimodal: + self._embed_fn = self._client.multimodal_embed + self._aembed_fn = self._aclient.multimodal_embed + else: + self._embed_fn = self._client.embed # type: ignore[assignment] + self._aembed_fn = self._aclient.embed # type: ignore[assignment] + + # Set model dimensions after initialization + self.dims = self._set_model_dims() + + def _initialize_client(self, api_config: Optional[Dict], **kwargs): + """ + Setup the VoyageAI clients using the provided API key or an + environment variable. + + Args: + api_config: Dictionary with API configuration options + **kwargs: Additional arguments to pass to VoyageAI clients + + Raises: + ImportError: If the voyageai library is not installed + ValueError: If no API key is provided + """ + if api_config is None: + api_config = {} + + # Dynamic import of the voyageai module + try: + from voyageai import AsyncClient, Client + except ImportError: + raise ImportError( + "VoyageAI vectorizer requires the voyageai library. " + "Please install with `pip install voyageai`" + ) + + # Fetch the API key from api_config or environment variable + api_key = ( + api_config.get("api_key") if api_config else os.getenv("VOYAGE_API_KEY") + ) + if not api_key: + raise ValueError( + "VoyageAI API key is required. " + "Provide it in api_config or set the VOYAGE_API_KEY environment variable." + ) + + self._client = Client(api_key=api_key, **kwargs) + self._aclient = AsyncClient(api_key=api_key, **kwargs) + + def _set_model_dims(self) -> int: + """ + Determine the dimensionality of the embedding model by making a test call. + + Returns: + int: Dimensionality of the embedding model + + Raises: + ValueError: If embedding dimensions cannot be determined + """ + try: + # Call the protected _embed method to avoid caching this test embedding + embedding = self._embed("dimension check", input_type="document") + return len(embedding) + except (KeyError, IndexError) as ke: + raise ValueError(f"Unexpected response from the VoyageAI API: {str(ke)}") + except Exception as e: # pylint: disable=broad-except + # fall back (TODO get more specific) + raise ValueError(f"Error setting embedding model dimensions: {str(e)}") + + def _get_batch_size(self) -> int: + """ + Determine the appropriate batch size based on the model being used. 
+ + Returns: + int: Recommended batch size for the current model + """ + if self.model in ["voyage-2", "voyage-02"]: + return 72 + elif self.model in ["voyage-3-lite", "voyage-3.5-lite"]: + return 30 + elif self.model in ["voyage-3", "voyage-3.5"]: + return 10 + else: + return 7 # Default for other models + + def _validate_input( + self, contents: List[Any], input_type: Optional[str], truncation: Optional[bool] + ): + """ + Validate the inputs to the embedding methods. + + Args: + contents: List of items to embed + input_type: Type of input (document or query) + truncation: Whether to truncate long inputs + + Raises: + TypeError: If inputs are invalid + """ + if not isinstance(contents, list): + raise TypeError( + "Must pass in a list of str, PIL.Image.Image, or voyageai.video_utils.Video values to embed.", + ) + if not self.is_multimodal and contents and not isinstance(contents[0], str): + raise TypeError("Must pass in a list of str values to embed.") + if input_type is not None and input_type not in ["document", "query"]: + raise TypeError( + "Must pass in a allowed value for voyageai embedding input_type. " + "See https://docs.voyageai.com/docs/embeddings." + ) + if truncation is not None and not isinstance(truncation, bool): + raise TypeError("Truncation (optional) parameter is a bool.") + + def _embed(self, content: Any, **kwargs) -> List[float]: + """ + Generate a vector embedding for a single item using the VoyageAI API. + + Args: + content: Item to embed - must be one of str, PIL.Image.Image, or voyageai.video_utils.Video. Images and + video require a multimodal model to be configured. + **kwargs: Additional parameters to pass to the VoyageAI API + + Returns: + List[float]: Vector embedding as a list of floats + + Raises: + TypeError: If parameters are invalid + ValueError: If embedding fails + """ + # Simply call _embed_many with a single input and return the first result + result = self._embed_many([content], **kwargs) + return result[0] + + @retry( + wait=wait_random_exponential(min=1, max=60), + stop=stop_after_attempt(6), + retry=retry_if_not_exception_type(TypeError), + ) + def _embed_many( + self, contents: List[Any], batch_size: Optional[int] = None, **kwargs + ) -> List[List[float]]: + """ + Generate vector embeddings for a batch of items using the VoyageAI API. + + Args: + contents: List of items to embed - each item must be one of str, PIL.Image.Image, or + voyageai.video_utils.Video. Images and video require a multimodal model to be configured. 
+ batch_size: Number of items to process in each API call + **kwargs: Additional parameters to pass to the VoyageAI API + + Returns: + List[List[float]]: List of vector embeddings as lists of floats + + Raises: + TypeError: If `contents` is not a list, or parameters are invalid + ValueError: If embedding fails + """ + from voyageai.error import InvalidRequestError + + input_type = kwargs.pop("input_type", None) + truncation = kwargs.pop("truncation", True) + + # Validate inputs + self._validate_input(contents, input_type, truncation) + + # Determine batch size if not provided + if batch_size is None: + batch_size = self._get_batch_size() + + try: + embeddings: List = [] + for batch in self.batchify(contents, batch_size): + response = self._embed_fn( + ( + [batch] if self.is_multimodal else batch + ), # Multimodal requires a list of lists/dicts + model=self.model, + input_type=input_type, + truncation=truncation, + **kwargs, # type: ignore + ) + embeddings.extend(response.embeddings) + return embeddings + except InvalidRequestError as e: + raise TypeError(f"Invalid input for embedding: {str(e)}") from e + except Exception as e: + raise ValueError(f"Embedding texts failed: {e}") + + async def _aembed(self, content: Any, **kwargs) -> List[float]: + """ + Asynchronously generate a vector embedding for a single item using the VoyageAI API. + + Args: + content: Item to embed - must be one of str, PIL.Image.Image, or voyageai.video_utils.Video. Images and + video require a multimodal model to be configured. + **kwargs: Additional parameters to pass to the VoyageAI API + + Returns: + List[float]: Vector embedding as a list of floats + + Raises: + TypeError: If parameters are invalid + ValueError: If embedding fails + """ + # Simply call _aembed_many with a single item and return the first result + result = await self._aembed_many([content], **kwargs) + return result[0] + + @retry( + wait=wait_random_exponential(min=1, max=60), + stop=stop_after_attempt(6), + retry=retry_if_not_exception_type(TypeError), + ) + async def _aembed_many( + self, contents: List[Any], batch_size: Optional[int] = None, **kwargs + ) -> List[List[float]]: + """ + Asynchronously generate vector embeddings for a batch of items using the VoyageAI API. + + Args: + contents: List of items to embed - each item must be one of str, PIL.Image.Image, or + voyageai.video_utils.Video. Images and video require a multimodal model to be configured. 
+ batch_size: Number of texts to process in each API call + **kwargs: Additional parameters to pass to the VoyageAI API + + Returns: + List[List[float]]: List of vector embeddings as lists of floats + + Raises: + TypeError: If `contents` is not a list, or parameters are invalid + ValueError: If embedding fails + """ + from voyageai.error import InvalidRequestError + + input_type = kwargs.pop("input_type", None) + truncation = kwargs.pop("truncation", True) + + # Validate inputs + self._validate_input(contents, input_type, truncation) + + # Determine batch size if not provided + if batch_size is None: + batch_size = self._get_batch_size() + + try: + embeddings: List = [] + for batch in self.batchify(contents, batch_size): + response = await self._aembed_fn( + ( + [batch] if self.is_multimodal else batch + ), # Multimodal requires a list of lists/dicts + model=self.model, + input_type=input_type, + truncation=truncation, + **kwargs, # type: ignore + ) + embeddings.extend(response.embeddings) + return embeddings + except InvalidRequestError as e: + raise TypeError(f"Invalid input for embedding: {str(e)}") from e + except Exception as e: + raise ValueError(f"Embedding texts failed: {e}") + + def _serialize_for_cache(self, content: Any) -> Union[bytes, str]: + """Convert content to a cacheable format.""" + try: + from voyageai.video_utils import Video + except ModuleNotFoundError: + raise ModuleNotFoundError( + "voyageai>=0.3.6 is required for video embedding. " + "Please install with `pip install voyageai>=0.3.6`" + ) + + if isinstance(content, Video): + return content.to_bytes() + return super()._serialize_for_cache(content) + + @property + def type(self) -> str: + return "voyageai" diff --git a/tests/integration/test_embedcache.py b/tests/integration/test_embedcache.py index ca2ed301..fd20204e 100644 --- a/tests/integration/test_embedcache.py +++ b/tests/integration/test_embedcache.py @@ -52,19 +52,19 @@ def sample_embedding_data(): """Sample data for embedding cache tests.""" return [ { - "text": "What is machine learning?", + "content": "What is machine learning?", "model_name": "text-embedding-ada-002", "embedding": [0.1, 0.2, 0.3, 0.4, 0.5], "metadata": {"source": "user_query", "category": "ai"}, }, { - "text": "How do neural networks work?", + "content": "How do neural networks work?", "model_name": "text-embedding-ada-002", "embedding": [0.2, 0.3, 0.4, 0.5, 0.6], "metadata": {"source": "documentation", "category": "ai"}, }, { - "text": "What's the weather like today?", + "content": "What's the weather like today?", "model_name": "text-embedding-ada-002", "embedding": [0.5, 0.6, 0.7, 0.8, 0.9], "metadata": {"source": "user_query", "category": "weather"}, @@ -94,12 +94,12 @@ def test_cache_initialization(redis_url): def test_make_entry_id(): """Test that entry IDs are generated consistently.""" cache = EmbeddingsCache() - text = "Hello world" + content = "Hello world" model_name = "text-embedding-ada-002" # Test deterministic ID generation - entry_id1 = cache._make_entry_id(text, model_name) - entry_id2 = cache._make_entry_id(text, model_name) + entry_id1 = cache._make_entry_id(content, model_name) + entry_id2 = cache._make_entry_id(content, model_name) assert entry_id1 == entry_id2 # Test different inputs produce different IDs @@ -108,7 +108,7 @@ def test_make_entry_id(): # Test ID format assert isinstance(entry_id1, str) - expected_id = hashify(f"{text}:{model_name}") + expected_id = hashify(f"{content}:{model_name}") assert entry_id1 == expected_id @@ -140,18 +140,18 @@ def 
test_set_and_get(cache, sample_embedding_data): # Set the entry key = cache.set( - text=sample["text"], + content=sample["content"], model_name=sample["model_name"], embedding=sample["embedding"], metadata=sample["metadata"], ) # Get the entry - result = cache.get(sample["text"], sample["model_name"]) + result = cache.get(sample["content"], sample["model_name"]) # Verify the result assert result is not None - assert result["text"] == sample["text"] + assert result["content"] == sample["content"] assert result["model_name"] == sample["model_name"] assert "embedding" in result assert result["metadata"] == sample["metadata"] @@ -159,7 +159,7 @@ def test_set_and_get(cache, sample_embedding_data): # Test get_by_key key_result = cache.get_by_key(key) assert key_result is not None - assert key_result["text"] == sample["text"] + assert key_result["content"] == sample["content"] # Test non-existent entry missing = cache.get("NonexistentText", sample["model_name"]) @@ -175,17 +175,17 @@ def test_exists(cache, sample_embedding_data): sample = sample_embedding_data[0] # Entry shouldn't exist yet - assert not cache.exists(sample["text"], sample["model_name"]) + assert not cache.exists(sample["content"], sample["model_name"]) # Add the entry key = cache.set( - text=sample["text"], + content=sample["content"], model_name=sample["model_name"], embedding=sample["embedding"], ) # Now it should exist - assert cache.exists(sample["text"], sample["model_name"]) + assert cache.exists(sample["content"], sample["model_name"]) # Test exists_by_key assert cache.exists_by_key(key) @@ -201,7 +201,7 @@ def test_drop(cache, sample_embedding_data): # Add the entry key = cache.set( - text=sample["text"], + content=sample["content"], model_name=sample["model_name"], embedding=sample["embedding"], ) @@ -210,14 +210,14 @@ def test_drop(cache, sample_embedding_data): assert cache.exists_by_key(key) # Remove it - cache.drop(sample["text"], sample["model_name"]) + cache.drop(sample["content"], sample["model_name"]) # Verify it's gone assert not cache.exists_by_key(key) # Test drop_by_key key = cache.set( - text=sample["text"], + content=sample["content"], model_name=sample["model_name"], embedding=sample["embedding"], ) @@ -231,7 +231,7 @@ def test_ttl_expiration(cache_with_ttl, sample_embedding_data): # Add the entry key = cache_with_ttl.set( - text=sample["text"], + content=sample["content"], model_name=sample["model_name"], embedding=sample["embedding"], ) @@ -252,7 +252,7 @@ def test_custom_ttl(cache, sample_embedding_data): # Add the entry with a 1 second TTL key = cache.set( - text=sample["text"], + content=sample["content"], model_name=sample["model_name"], embedding=sample["embedding"], ttl=1, @@ -274,7 +274,7 @@ def test_multiple_entries(cache, sample_embedding_data): keys = [] for sample in sample_embedding_data: key = cache.set( - text=sample["text"], + content=sample["content"], model_name=sample["model_name"], embedding=sample["embedding"], metadata=sample.get("metadata"), @@ -285,7 +285,7 @@ def test_multiple_entries(cache, sample_embedding_data): for i, key in enumerate(keys): assert cache.exists_by_key(key) result = cache.get_by_key(key) - assert result["text"] == sample_embedding_data[i]["text"] + assert result["content"] == sample_embedding_data[i]["content"] # Drop one entry cache.drop_by_key(keys[0]) @@ -300,18 +300,18 @@ async def test_async_set_and_get(cache, sample_embedding_data): # Set the entry key = await cache.aset( - text=sample["text"], + content=sample["content"], model_name=sample["model_name"], 
embedding=sample["embedding"], metadata=sample["metadata"], ) # Get the entry - result = await cache.aget(sample["text"], sample["model_name"]) + result = await cache.aget(sample["content"], sample["model_name"]) # Verify the result assert result is not None - assert result["text"] == sample["text"] + assert result["content"] == sample["content"] assert result["model_name"] == sample["model_name"] assert "embedding" in result assert result["metadata"] == sample["metadata"] @@ -319,7 +319,7 @@ async def test_async_set_and_get(cache, sample_embedding_data): # Test aget_by_key key_result = await cache.aget_by_key(key) assert key_result is not None - assert key_result["text"] == sample["text"] + assert key_result["content"] == sample["content"] @pytest.mark.asyncio @@ -328,17 +328,17 @@ async def test_async_exists(cache, sample_embedding_data): sample = sample_embedding_data[0] # Entry shouldn't exist yet - assert not await cache.aexists(sample["text"], sample["model_name"]) + assert not await cache.aexists(sample["content"], sample["model_name"]) # Add the entry key = await cache.aset( - text=sample["text"], + content=sample["content"], model_name=sample["model_name"], embedding=sample["embedding"], ) # Now it should exist - assert await cache.aexists(sample["text"], sample["model_name"]) + assert await cache.aexists(sample["content"], sample["model_name"]) # Test aexists_by_key assert await cache.aexists_by_key(key) @@ -351,7 +351,7 @@ async def test_async_drop(cache, sample_embedding_data): # Add the entry key = await cache.aset( - text=sample["text"], + content=sample["content"], model_name=sample["model_name"], embedding=sample["embedding"], ) @@ -360,14 +360,14 @@ async def test_async_drop(cache, sample_embedding_data): assert await cache.aexists_by_key(key) # Remove it - await cache.adrop(sample["text"], sample["model_name"]) + await cache.adrop(sample["content"], sample["model_name"]) # Verify it's gone assert not await cache.aexists_by_key(key) # Test adrop_by_key key = await cache.aset( - text=sample["text"], + content=sample["content"], model_name=sample["model_name"], embedding=sample["embedding"], ) @@ -382,7 +382,7 @@ async def test_async_ttl_expiration(cache_with_ttl, sample_embedding_data): # Add the entry key = await cache_with_ttl.aset( - text=sample["text"], + content=sample["content"], model_name=sample["model_name"], embedding=sample["embedding"], ) @@ -402,11 +402,11 @@ def test_entry_id_consistency(cache, sample_embedding_data): sample = sample_embedding_data[0] # Generate an entry ID directly - expected_id = cache._make_entry_id(sample["text"], sample["model_name"]) + expected_id = cache._make_entry_id(sample["content"], sample["model_name"]) # Set an entry and extract its ID from the key key = cache.set( - text=sample["text"], + content=sample["content"], model_name=sample["model_name"], embedding=sample["embedding"], ) @@ -429,14 +429,14 @@ def test_redis_client_reuse(cache_with_redis_client, sample_embedding_data): # Set and get an entry key = cache_with_redis_client.set( - text=sample["text"], + content=sample["content"], model_name=sample["model_name"], embedding=sample["embedding"], ) result = cache_with_redis_client.get_by_key(key) assert result is not None - assert result["text"] == sample["text"] + assert result["content"] == sample["content"] def test_mset_and_mget(cache, sample_embedding_data): @@ -446,7 +446,7 @@ def test_mset_and_mget(cache, sample_embedding_data): for sample in sample_embedding_data: batch_items.append( { - "text": sample["text"], + 
"content": sample["content"], "model_name": sample["model_name"], "embedding": sample["embedding"], "metadata": sample.get("metadata"), @@ -457,18 +457,18 @@ def test_mset_and_mget(cache, sample_embedding_data): keys = cache.mset(batch_items) assert len(keys) == len(batch_items) - # Get texts and model name for mget - texts = [item["text"] for item in batch_items] + # Get contents and model name for mget + contents = [item["content"] for item in batch_items] model_name = batch_items[0]["model_name"] # Assuming same model # Test mget - results = cache.mget(texts, model_name) - assert len(results) == len(texts) + results = cache.mget(contents, model_name) + assert len(results) == len(contents) # Verify all results are returned and in correct order for i, result in enumerate(results): assert result is not None - assert result["text"] == texts[i] + assert result["content"] == contents[i] assert result["model_name"] == model_name @@ -478,7 +478,7 @@ def test_mget_by_keys(cache, sample_embedding_data): keys = [] for sample in sample_embedding_data: key = cache.set( - text=sample["text"], + content=sample["content"], model_name=sample["model_name"], embedding=sample["embedding"], metadata=sample.get("metadata"), @@ -492,7 +492,7 @@ def test_mget_by_keys(cache, sample_embedding_data): # Verify all results match the original samples for i, result in enumerate(results): assert result is not None - assert result["text"] == sample_embedding_data[i]["text"] + assert result["content"] == sample_embedding_data[i]["content"] assert result["model_name"] == sample_embedding_data[i]["model_name"] # Test with mix of existing and non-existing keys @@ -510,31 +510,31 @@ def test_mexists_and_mexists_by_keys(cache, sample_embedding_data): """Test batch existence checks for embeddings.""" # Set embeddings individually and collect data keys = [] - texts = [] + contents = [] for sample in sample_embedding_data: key = cache.set( - text=sample["text"], + content=sample["content"], model_name=sample["model_name"], embedding=sample["embedding"], ) keys.append(key) - texts.append(sample["text"]) + contents.append(sample["content"]) model_name = sample_embedding_data[0]["model_name"] # Assuming same model # Test mexists - exist_results = cache.mexists(texts, model_name) - assert len(exist_results) == len(texts) + exist_results = cache.mexists(contents, model_name) + assert len(exist_results) == len(contents) assert all(exist_results) # All should exist - # Test with mix of existing and non-existing texts - non_existent_text = "This text does not exist" - mixed_texts = texts[:1] + [non_existent_text] + texts[1:] - mixed_results = cache.mexists(mixed_texts, model_name) + # Test with mix of existing and non-existing contents + non_existent_content = "This content does not exist" + mixed_contents = contents[:1] + [non_existent_content] + contents[1:] + mixed_results = cache.mexists(mixed_contents, model_name) - assert len(mixed_results) == len(mixed_texts) + assert len(mixed_results) == len(mixed_contents) assert mixed_results[0] is True - assert mixed_results[1] is False # Non-existent text should return False + assert mixed_results[1] is False # Non-existent content should return False assert mixed_results[2] is True # Test mexists_by_keys @@ -557,15 +557,15 @@ def test_mdrop_and_mdrop_by_keys(cache, sample_embedding_data): """Test batch deletion of embeddings.""" # Set embeddings and collect data keys = [] - texts = [] + contents = [] for sample in sample_embedding_data: key = cache.set( - text=sample["text"], + 
content=sample["content"], model_name=sample["model_name"], embedding=sample["embedding"], ) keys.append(key) - texts.append(sample["text"]) + contents.append(sample["content"]) model_name = sample_embedding_data[0]["model_name"] # Assuming same model @@ -583,26 +583,26 @@ def test_mdrop_and_mdrop_by_keys(cache, sample_embedding_data): # Reset for mdrop test cache.clear() keys = [] - texts = [] + contents = [] for sample in sample_embedding_data: key = cache.set( - text=sample["text"], + content=sample["content"], model_name=sample["model_name"], embedding=sample["embedding"], ) keys.append(key) - texts.append(sample["text"]) + contents.append(sample["content"]) - # Test mdrop with subset of texts - subset_texts = texts[:2] - cache.mdrop(subset_texts, model_name) + # Test mdrop with subset of contents + subset_contents = contents[:2] + cache.mdrop(subset_contents, model_name) - # Verify only selected texts were dropped - for i, text in enumerate(texts): + # Verify only selected contents were dropped + for i, content in enumerate(contents): if i < 2: - assert not cache.exists(text, model_name) # Should be dropped + assert not cache.exists(content, model_name) # Should be dropped else: - assert cache.exists(text, model_name) # Should still exist + assert cache.exists(content, model_name) # Should still exist @pytest.mark.asyncio @@ -613,7 +613,7 @@ async def test_async_batch_operations(cache, sample_embedding_data): for sample in sample_embedding_data: batch_items.append( { - "text": sample["text"], + "content": sample["content"], "model_name": sample["model_name"], "embedding": sample["embedding"], "metadata": sample.get("metadata"), @@ -624,16 +624,16 @@ async def test_async_batch_operations(cache, sample_embedding_data): keys = await cache.amset(batch_items) assert len(keys) == len(batch_items) - # Get texts and model name for amget - texts = [item["text"] for item in batch_items] + # Get contents and model name for amget + contents = [item["content"] for item in batch_items] model_name = batch_items[0]["model_name"] # Assuming same model # Test amget - results = await cache.amget(texts, model_name) - assert len(results) == len(texts) + results = await cache.amget(contents, model_name) + assert len(results) == len(contents) for i, result in enumerate(results): assert result is not None - assert result["text"] == texts[i] + assert result["content"] == contents[i] # Test amget_by_keys key_results = await cache.amget_by_keys(keys) @@ -642,8 +642,8 @@ async def test_async_batch_operations(cache, sample_embedding_data): assert result is not None # Test amexists - exist_results = await cache.amexists(texts, model_name) - assert len(exist_results) == len(texts) + exist_results = await cache.amexists(contents, model_name) + assert len(exist_results) == len(contents) assert all(exist_results) # All should exist # Test amexists_by_keys @@ -651,9 +651,9 @@ async def test_async_batch_operations(cache, sample_embedding_data): assert len(key_exist_results) == len(keys) assert all(key_exist_results) # All should exist - # Test amdrop with first text - await cache.amdrop([texts[0]], model_name) - updated_exists = await cache.aexists(texts[0], model_name) + # Test amdrop with first content + await cache.amdrop([contents[0]], model_name) + updated_exists = await cache.aexists(contents[0], model_name) assert not updated_exists # Should be dropped # Test amdrop_by_keys with second key @@ -697,7 +697,7 @@ def test_batch_with_ttl(cache_with_ttl, sample_embedding_data): for sample in sample_embedding_data: 
batch_items.append( { - "text": sample["text"], + "content": sample["content"], "model_name": sample["model_name"], "embedding": sample["embedding"], "metadata": sample.get("metadata"), @@ -736,7 +736,7 @@ def test_large_batch_operations(cache): for i in range(100): large_batch.append( { - "text": f"Sample text {i}", + "content": f"Sample text {i}", "model_name": "test-model", "embedding": [float(i) / 100] * 5, "metadata": {"index": i}, @@ -752,11 +752,11 @@ def test_large_batch_operations(cache): assert len(results) == 100 assert all(result is not None for result in results) - # Get texts for batch retrieval - texts = [item["text"] for item in large_batch] + # Get contents for batch retrieval + contents = [item["content"] for item in large_batch] - # Test retrieving by texts - results = cache.mget(texts, "test-model") + # Test retrieving by contents + results = cache.mget(contents, "test-model") assert len(results) == 100 assert all(result is not None for result in results) diff --git a/tests/integration/test_embedcache_warnings.py b/tests/integration/test_embedcache_warnings.py index 059d6c4c..de2f9b18 100644 --- a/tests/integration/test_embedcache_warnings.py +++ b/tests/integration/test_embedcache_warnings.py @@ -47,7 +47,7 @@ async def test_sync_methods_warn_with_async_only_client(async_client, caplog): caplog.clear() # Second sync method call should NOT warn (flag prevents spam) - _ = cache.set(text="test", model_name="model", embedding=[0.1, 0.2]) + _ = cache.set(content="test", model_name="model", embedding=[0.1, 0.2]) # Should not have logged another warning assert len(caplog.records) == 0 @@ -65,7 +65,7 @@ def test_no_warning_with_sync_client(redis_url): with patch("redisvl.utils.log.get_logger") as mock_logger: # Sync methods should not warn _ = cache.get_by_key("test_key") - _ = cache.set(text="test", model_name="model", embedding=[0.1, 0.2]) + _ = cache.set(content="test", model_name="model", embedding=[0.1, 0.2]) # No warnings should have been logged mock_logger.return_value.warning.assert_not_called() @@ -82,7 +82,7 @@ async def test_async_methods_no_warning(async_client): with patch("redisvl.utils.log.get_logger") as mock_logger: # Async methods should not warn _ = await cache.aget_by_key("test_key") - _ = await cache.aset(text="test", model_name="model", embedding=[0.1, 0.2]) + _ = await cache.aset(content="test", model_name="model", embedding=[0.1, 0.2]) # No warnings should have been logged mock_logger.return_value.warning.assert_not_called() diff --git a/tests/integration/test_vectorizers.py b/tests/integration/test_vectorizers.py index d5727664..ff7e860b 100644 --- a/tests/integration/test_vectorizers.py +++ b/tests/integration/test_vectorizers.py @@ -1,4 +1,5 @@ import os +import warnings import numpy as np import pytest @@ -7,14 +8,14 @@ from redisvl.utils.utils import create_ulid from redisvl.utils.vectorize import ( AzureOpenAITextVectorizer, - BedrockTextVectorizer, + BedrockVectorizer, CohereTextVectorizer, - CustomTextVectorizer, + CustomVectorizer, HFTextVectorizer, MistralAITextVectorizer, OpenAITextVectorizer, - VertexAITextVectorizer, - VoyageAITextVectorizer, + VertexAIVectorizer, + VoyageAIVectorizer, ) # Constants for testing @@ -41,13 +42,13 @@ def embeddings_cache(client): params=[ HFTextVectorizer, OpenAITextVectorizer, - VertexAITextVectorizer, + VertexAIVectorizer, CohereTextVectorizer, AzureOpenAITextVectorizer, - BedrockTextVectorizer, + BedrockVectorizer, MistralAITextVectorizer, - CustomTextVectorizer, - VoyageAITextVectorizer, + 
CustomVectorizer, + VoyageAIVectorizer, ] ) def vectorizer(request): @@ -55,35 +56,35 @@ def vectorizer(request): return request.param() elif request.param == OpenAITextVectorizer: return request.param() - elif request.param == VertexAITextVectorizer: + elif request.param == VertexAIVectorizer: return request.param() elif request.param == CohereTextVectorizer: return request.param() elif request.param == MistralAITextVectorizer: return request.param() - elif request.param == VoyageAITextVectorizer: + elif request.param == VoyageAIVectorizer: return request.param(model="voyage-large-2") elif request.param == AzureOpenAITextVectorizer: return request.param( model=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME", "text-embedding-ada-002") ) - elif request.param == BedrockTextVectorizer: + elif request.param == BedrockVectorizer: return request.param( model=os.getenv("BEDROCK_MODEL_ID", "amazon.titan-embed-text-v2:0") ) - elif request.param == CustomTextVectorizer: + elif request.param == CustomVectorizer: - def embed(text): + def embed(content): return TEST_VECTOR - def embed_many(texts): - return [TEST_VECTOR] * len(texts) + def embed_many(contents): + return [TEST_VECTOR] * len(contents) - async def aembed_func(text): + async def aembed_func(content): return TEST_VECTOR - async def aembed_many_func(texts): - return [TEST_VECTOR] * len(texts) + async def aembed_many_func(contents): + return [TEST_VECTOR] * len(contents) return request.param(embed=embed, embed_many=embed_many) @@ -92,19 +93,19 @@ async def aembed_many_func(texts): def cached_vectorizer(embeddings_cache): """Create a simple custom vectorizer for testing.""" - def embed(text): + def embed(content): return TEST_VECTOR - def embed_many(texts): - return [TEST_VECTOR] * len(texts) + def embed_many(contents): + return [TEST_VECTOR] * len(contents) - async def aembed(text): + async def aembed(content): return TEST_VECTOR - async def aembed_many(texts): - return [TEST_VECTOR] * len(texts) + async def aembed_many(contents): + return [TEST_VECTOR] * len(contents) - return CustomTextVectorizer( + return CustomVectorizer( embed=embed, embed_many=embed_many, aembed=aembed, @@ -115,7 +116,7 @@ async def aembed_many(texts): @pytest.fixture def custom_embed_func(): - def embed(text: str): + def embed(content: str): return TEST_VECTOR return embed @@ -124,16 +125,16 @@ def embed(text: str): @pytest.fixture def custom_embed_class(): class MyEmbedder: - def embed(self, text: str): + def embed(self, content: str): return TEST_VECTOR - def embed_with_args(self, text: str, max_len=None): + def embed_with_args(self, content: str, max_len=None): return TEST_VECTOR[0:max_len] - def embed_many(self, text_list): + def embed_many(self, contents): return [[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]] - def embed_many_with_args(self, texts, param=True): + def embed_many_with_args(self, contents, param=True): if param: return [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]] else: @@ -147,7 +148,7 @@ def test_vectorizer_embed(vectorizer): text = TEST_TEXT if isinstance(vectorizer, CohereTextVectorizer): embedding = vectorizer.embed(text, input_type="search_document") - elif isinstance(vectorizer, VoyageAITextVectorizer): + elif isinstance(vectorizer, VoyageAIVectorizer): embedding = vectorizer.embed(text, input_type="document") else: embedding = vectorizer.embed(text) @@ -161,7 +162,7 @@ def test_vectorizer_embed_many(vectorizer): texts = TEST_TEXTS if isinstance(vectorizer, CohereTextVectorizer): embeddings = vectorizer.embed_many(texts, input_type="search_document") - elif 
isinstance(vectorizer, VoyageAITextVectorizer): + elif isinstance(vectorizer, VoyageAIVectorizer): embeddings = vectorizer.embed_many(texts, input_type="document") else: embeddings = vectorizer.embed_many(texts) @@ -173,18 +174,6 @@ def test_vectorizer_embed_many(vectorizer): ) -@pytest.mark.requires_api_keys -def test_vectorizer_bad_input(vectorizer): - with pytest.raises(TypeError): - vectorizer.embed(1) - - with pytest.raises(TypeError): - vectorizer.embed({"foo": "bar"}) - - with pytest.raises(TypeError): - vectorizer.embed_many(42) - - def test_vectorizer_with_cache(cached_vectorizer): """Test the complete cache flow - miss, store, hit.""" # First call - should be a cache miss @@ -197,7 +186,7 @@ def test_vectorizer_with_cache(cached_vectorizer): # Verify it's actually using the cache by checking the cached value exists cached_entry = cached_vectorizer.cache.get( - text=TEST_TEXT, model_name=cached_vectorizer.model + content=TEST_TEXT, model_name=cached_vectorizer.model ) assert cached_entry is not None assert cached_entry["embedding"] == TEST_VECTOR @@ -209,11 +198,11 @@ def test_vectorizer_with_cache_skip(cached_vectorizer): cached_vectorizer.embed(TEST_TEXT) # Call embed with skip_cache=True - should bypass cache - cached_vectorizer.cache.drop(text=TEST_TEXT, model_name=cached_vectorizer.model) + cached_vectorizer.cache.drop(content=TEST_TEXT, model_name=cached_vectorizer.model) # Store a deliberately different value in the cache cached_vectorizer.cache.set( - text=TEST_TEXT, + content=TEST_TEXT, model_name=cached_vectorizer.model, embedding=[9.9, 8.8, 7.7, 6.6], ) @@ -226,7 +215,7 @@ def test_vectorizer_with_cache_skip(cached_vectorizer): # Cache should still have the original value cached_entry = cached_vectorizer.cache.get( - text=TEST_TEXT, model_name=cached_vectorizer.model + content=TEST_TEXT, model_name=cached_vectorizer.model ) assert cached_entry["embedding"] == [9.9, 8.8, 7.7, 6.6] @@ -235,7 +224,7 @@ def test_vectorizer_with_cache_many(cached_vectorizer): """Test embedding many texts with partial cache hits/misses.""" # Store an embedding for the first text only cached_vectorizer.cache.set( - text=TEST_TEXTS[0], + content=TEST_TEXTS[0], model_name=cached_vectorizer.model, embedding=[0.1, 0.2, 0.3, 0.4], ) @@ -250,7 +239,7 @@ def test_vectorizer_with_cache_many(cached_vectorizer): # Both should now be in cache for text in TEST_TEXTS: assert cached_vectorizer.cache.exists( - text=text, model_name=cached_vectorizer.model + content=text, model_name=cached_vectorizer.model ) @@ -262,7 +251,7 @@ def test_vectorizer_with_cached_metadata(cached_vectorizer): # Verify metadata was stored in cache cached_entry = cached_vectorizer.cache.get( - text=TEST_TEXT, model_name=cached_vectorizer.model + content=TEST_TEXT, model_name=cached_vectorizer.model ) assert cached_entry["metadata"] == test_metadata @@ -280,7 +269,7 @@ async def test_vectorizer_with_cache_async(cached_vectorizer): # Verify it's actually using the cache cached_entry = await cached_vectorizer.cache.aget( - text=TEST_TEXT, model_name=cached_vectorizer.model + content=TEST_TEXT, model_name=cached_vectorizer.model ) assert cached_entry is not None assert cached_entry["embedding"] == TEST_VECTOR @@ -291,7 +280,7 @@ async def test_vectorizer_with_cache_async_many(cached_vectorizer): """Test async embedding many texts with partial cache hits/misses.""" # Store an embedding for the first text only await cached_vectorizer.cache.aset( - text=TEST_TEXTS[0], + content=TEST_TEXTS[0], model_name=cached_vectorizer.model, 
embedding=[0.1, 0.2, 0.3, 0.4], ) @@ -306,14 +295,14 @@ async def test_vectorizer_with_cache_async_many(cached_vectorizer): # Both should now be in cache for text in TEST_TEXTS: assert await cached_vectorizer.cache.aexists( - text=text, model_name=cached_vectorizer.model + content=text, model_name=cached_vectorizer.model ) @pytest.mark.requires_api_keys def test_bedrock_bad_credentials(): with pytest.raises(ValueError): - BedrockTextVectorizer( + BedrockVectorizer( api_config={ "aws_access_key_id": "invalid", "aws_secret_access_key": "invalid", @@ -324,61 +313,61 @@ def test_bedrock_bad_credentials(): @pytest.mark.requires_api_keys def test_bedrock_invalid_model(): with pytest.raises(ValueError): - bedrock = BedrockTextVectorizer(model="invalid-model") + bedrock = BedrockVectorizer(model="invalid-model") bedrock.embed("test") def test_custom_vectorizer_embed(custom_embed_class, custom_embed_func): - custom_wrapper = CustomTextVectorizer(embed=custom_embed_func) + custom_wrapper = CustomVectorizer(embed=custom_embed_func) embedding = custom_wrapper.embed("This is a test sentence.") assert embedding == TEST_VECTOR - custom_wrapper = CustomTextVectorizer(embed=custom_embed_class().embed) + custom_wrapper = CustomVectorizer(embed=custom_embed_class().embed) embedding = custom_wrapper.embed("This is a test sentence.") assert embedding == TEST_VECTOR - custom_wrapper = CustomTextVectorizer(embed=custom_embed_class().embed_with_args) + custom_wrapper = CustomVectorizer(embed=custom_embed_class().embed_with_args) embedding = custom_wrapper.embed("This is a test sentence.", max_len=4) assert embedding == TEST_VECTOR embedding = custom_wrapper.embed("This is a test sentence.", max_len=2) assert embedding == [1.1, 2.2] with pytest.raises(ValueError): - invalid_vectorizer = CustomTextVectorizer(embed="hello") + invalid_vectorizer = CustomVectorizer(embed="hello") with pytest.raises(ValueError): - invalid_vectorizer = CustomTextVectorizer(embed=42) + invalid_vectorizer = CustomVectorizer(embed=42) with pytest.raises(ValueError): - invalid_vectorizer = CustomTextVectorizer(embed={"foo": "bar"}) + invalid_vectorizer = CustomVectorizer(embed={"foo": "bar"}) def bad_arg_type(value: int): return [value] with pytest.raises(ValueError): - invalid_vectorizer = CustomTextVectorizer(embed=bad_arg_type) + invalid_vectorizer = CustomVectorizer(embed=bad_arg_type) def bad_return_type(text: str) -> str: return text with pytest.raises(ValueError): - invalid_vectorizer = CustomTextVectorizer(embed=bad_return_type) + invalid_vectorizer = CustomVectorizer(embed=bad_return_type) def test_custom_vectorizer_embed_many(custom_embed_class, custom_embed_func): - custom_wrapper = CustomTextVectorizer( + custom_wrapper = CustomVectorizer( custom_embed_func, embed_many=custom_embed_class().embed_many ) embeddings = custom_wrapper.embed_many(["test one.", "test two"]) assert embeddings == [[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]] - custom_wrapper = CustomTextVectorizer( + custom_wrapper = CustomVectorizer( custom_embed_func, embed_many=custom_embed_class().embed_many ) embeddings = custom_wrapper.embed_many(["test one.", "test two"]) assert embeddings == [[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]] - custom_wrapper = CustomTextVectorizer( + custom_wrapper = CustomVectorizer( custom_embed_func, embed_many=custom_embed_class().embed_many_with_args ) embeddings = custom_wrapper.embed_many(["test one.", "test two"], param=True) @@ -387,13 +376,13 @@ def test_custom_vectorizer_embed_many(custom_embed_class, custom_embed_func): assert embeddings == 
[[6.0, 5.0, 4.0], [3.0, 2.0, 1.0]] with pytest.raises(ValueError): - invalid_vectorizer = CustomTextVectorizer(custom_embed_func, embed_many="hello") + invalid_vectorizer = CustomVectorizer(custom_embed_func, embed_many="hello") with pytest.raises(ValueError): - invalid_vectorizer = CustomTextVectorizer(custom_embed_func, embed_many=42) + invalid_vectorizer = CustomVectorizer(custom_embed_func, embed_many=42) with pytest.raises(ValueError): - invalid_vectorizer = CustomTextVectorizer( + invalid_vectorizer = CustomVectorizer( custom_embed_func, embed_many={"foo": "bar"} ) @@ -401,7 +390,7 @@ def bad_arg_type(value: int): return [value] with pytest.raises(ValueError): - invalid_vectorizer = CustomTextVectorizer( + invalid_vectorizer = CustomVectorizer( custom_embed_func, embed_many=bad_arg_type ) @@ -409,7 +398,7 @@ def bad_return_type(text: str) -> str: return text with pytest.raises(ValueError): - invalid_vectorizer = CustomTextVectorizer( + invalid_vectorizer = CustomVectorizer( custom_embed_func, embed_many=bad_return_type ) @@ -419,19 +408,19 @@ def bad_return_type(text: str) -> str: "vectorizer_", [ AzureOpenAITextVectorizer, - BedrockTextVectorizer, + BedrockVectorizer, CohereTextVectorizer, - CustomTextVectorizer, + CustomVectorizer, HFTextVectorizer, MistralAITextVectorizer, OpenAITextVectorizer, - VertexAITextVectorizer, - VoyageAITextVectorizer, + VertexAIVectorizer, + VoyageAIVectorizer, ], ) def test_default_dtype(vectorizer_): # test dtype defaults to float32 - if issubclass(vectorizer_, CustomTextVectorizer): + if issubclass(vectorizer_, CustomVectorizer): vectorizer = vectorizer_(embed=lambda x, input_type=None: [1.0, 2.0, 3.0]) elif issubclass(vectorizer_, AzureOpenAITextVectorizer): vectorizer = vectorizer_( @@ -448,20 +437,20 @@ def test_default_dtype(vectorizer_): "vectorizer_", [ AzureOpenAITextVectorizer, - BedrockTextVectorizer, + BedrockVectorizer, CohereTextVectorizer, - CustomTextVectorizer, + CustomVectorizer, HFTextVectorizer, MistralAITextVectorizer, OpenAITextVectorizer, - VertexAITextVectorizer, - VoyageAITextVectorizer, + VertexAIVectorizer, + VoyageAIVectorizer, ], ) def test_vectorizer_dtype_assignment(vectorizer_): # test initializing dtype in constructor for dtype in ["float16", "float32", "float64", "bfloat16", "int8", "uint8"]: - if issubclass(vectorizer_, CustomTextVectorizer): + if issubclass(vectorizer_, CustomVectorizer): vectorizer = vectorizer_(embed=lambda x: [1.0, 2.0, 3.0], dtype=dtype) elif issubclass(vectorizer_, AzureOpenAITextVectorizer): vectorizer = vectorizer_( @@ -481,13 +470,13 @@ def test_vectorizer_dtype_assignment(vectorizer_): "vectorizer_", [ AzureOpenAITextVectorizer, - BedrockTextVectorizer, + BedrockVectorizer, CohereTextVectorizer, HFTextVectorizer, MistralAITextVectorizer, OpenAITextVectorizer, - VertexAITextVectorizer, - VoyageAITextVectorizer, + VertexAIVectorizer, + VoyageAIVectorizer, ], ) def test_non_supported_dtypes(vectorizer_): @@ -507,7 +496,7 @@ async def test_vectorizer_aembed(vectorizer): text = TEST_TEXT if isinstance(vectorizer, CohereTextVectorizer): embedding = await vectorizer.aembed(text, input_type="search_document") - elif isinstance(vectorizer, VoyageAITextVectorizer): + elif isinstance(vectorizer, VoyageAIVectorizer): embedding = await vectorizer.aembed(text, input_type="document") else: embedding = await vectorizer.aembed(text) @@ -521,7 +510,7 @@ async def test_vectorizer_aembed_many(vectorizer): texts = TEST_TEXTS if isinstance(vectorizer, CohereTextVectorizer): embeddings = await 
vectorizer.aembed_many(texts, input_type="search_document") - elif isinstance(vectorizer, VoyageAITextVectorizer): + elif isinstance(vectorizer, VoyageAIVectorizer): embeddings = await vectorizer.aembed_many(texts, input_type="document") else: embeddings = await vectorizer.aembed_many(texts) @@ -623,3 +612,20 @@ def test_cohere_embedding_types_warning(): ) assert isinstance(embeddings, list) assert len(embeddings) == len(texts) + + +def test_deprecated_text_parameter_warning(): + """Test that using deprecated 'text' and 'texts' parameters emits deprecation warnings.""" + vectorizer = HFTextVectorizer(model="sentence-transformers/all-MiniLM-L6-v2") + + # Test single embed with deprecated 'text' parameter emits warning + with pytest.warns(DeprecationWarning, match="Argument text is deprecated"): + embedding = vectorizer.embed(text=TEST_TEXT) + assert isinstance(embedding, list) + assert len(embedding) == vectorizer.dims + + # Test embed_many with deprecated 'texts' parameter emits warning + with pytest.warns(DeprecationWarning, match="Argument texts is deprecated"): + embeddings = vectorizer.embed_many(texts=TEST_TEXTS) + assert isinstance(embeddings, list) + assert len(embeddings) == len(TEST_TEXTS) diff --git a/tests/unit/test_base_vectorizer.py b/tests/unit/test_base_vectorizer.py index b5f4ea74..ede4978b 100644 --- a/tests/unit/test_base_vectorizer.py +++ b/tests/unit/test_base_vectorizer.py @@ -1,4 +1,4 @@ -from typing import List +from typing import Any, List, Optional from redisvl.utils.vectorize.base import BaseVectorizer @@ -15,17 +15,29 @@ class SimpleVectorizer(BaseVectorizer): model: str = "simple" dims: int = 10 - def embed(self, text: str, **kwargs) -> List[float]: + def embed(self, content: Any = "", text: Any = "", **kwargs) -> List[float]: return [0.0] * self.dims - async def aembed(self, text: str, **kwargs) -> List[float]: + async def aembed( + self, content: Any = "", text: Any = "", **kwargs + ) -> List[float]: return [0.0] * self.dims - async def aembed_many(self, texts: List[str], **kwargs) -> List[List[float]]: - return [[0.0] * self.dims] * len(texts) - - def embed_many(self, texts: List[str], **kwargs) -> List[List[float]]: - return [[0.0] * self.dims] * len(texts) + async def aembed_many( + self, + contents: Optional[List[Any]] = None, + texts: Optional[List[Any]] = None, + **kwargs, + ) -> List[List[float]]: + return [[0.0] * self.dims] * len(contents) + + def embed_many( + self, + contents: Optional[List[Any]] = None, + texts: Optional[List[Any]] = None, + **kwargs, + ) -> List[List[float]]: + return [[0.0] * self.dims] * len(contents) vectorizer = SimpleVectorizer() assert vectorizer.model == "simple" diff --git a/tests/unit/test_embedcache_schema.py b/tests/unit/test_embedcache_schema.py index a3d109da..8296f286 100644 --- a/tests/unit/test_embedcache_schema.py +++ b/tests/unit/test_embedcache_schema.py @@ -12,12 +12,12 @@ def test_valid_cache_entry_creation(): entry_id = hashify(f"What is AI?:text-embedding-ada-002") entry = CacheEntry( entry_id=entry_id, - text="What is AI?", + content="What is AI?", model_name="text-embedding-ada-002", embedding=[0.1, 0.2, 0.3], ) assert entry.entry_id == entry_id - assert entry.text == "What is AI?" + assert entry.content == "What is AI?" 
assert entry.model_name == "text-embedding-ada-002" assert entry.embedding == [0.1, 0.2, 0.3] @@ -25,7 +25,7 @@ def test_valid_cache_entry_creation(): def test_cache_entry_with_given_entry_id(): entry = CacheEntry( entry_id="custom_id", - text="What is AI?", + content="What is AI?", model_name="text-embedding-ada-002", embedding=[0.1, 0.2, 0.3], ) @@ -36,7 +36,7 @@ def test_cache_entry_with_invalid_metadata(): with pytest.raises(ValidationError): CacheEntry( entry_id="test_id", - text="What is AI?", + content="What is AI?", model_name="text-embedding-ada-002", embedding=[0.1, 0.2, 0.3], metadata="invalid_metadata", @@ -47,14 +47,14 @@ def test_cache_entry_to_dict(): entry_id = hashify(f"What is AI?:text-embedding-ada-002") entry = CacheEntry( entry_id=entry_id, - text="What is AI?", + content="What is AI?", model_name="text-embedding-ada-002", embedding=[0.1, 0.2, 0.3], metadata={"author": "John"}, ) result = entry.to_dict() assert result["entry_id"] == entry_id - assert result["text"] == "What is AI?" + assert result["content"] == "What is AI?" assert result["model_name"] == "text-embedding-ada-002" assert isinstance("embedding", str) assert isinstance("metadata", str) @@ -65,7 +65,7 @@ def test_cache_entry_deserialization(): """Test that a CacheEntry properly deserializes data from Redis format.""" serialized_data = { "entry_id": "test_id", - "text": "What is AI?", + "content": "What is AI?", "model_name": "text-embedding-ada-002", "embedding": json.dumps([0.1, 0.2, 0.3]), # Serialized embedding "metadata": json.dumps({"source": "user_query"}), # Serialized metadata @@ -74,7 +74,7 @@ def test_cache_entry_deserialization(): entry = CacheEntry(**serialized_data) assert entry.entry_id == "test_id" - assert entry.text == "What is AI?" + assert entry.content == "What is AI?" assert entry.model_name == "text-embedding-ada-002" assert entry.embedding == [0.1, 0.2, 0.3] # Should be deserialized assert entry.metadata == {"source": "user_query"} # Should be deserialized @@ -84,7 +84,7 @@ def test_cache_entry_deserialization(): def test_cache_entry_with_empty_optional_fields(): entry = CacheEntry( entry_id="test_id", - text="What is AI?", + content="What is AI?", model_name="text-embedding-ada-002", embedding=[0.1, 0.2, 0.3], ) @@ -96,7 +96,7 @@ def test_cache_entry_timestamp_generation(): """Test that inserted_at timestamp is automatically generated.""" entry = CacheEntry( entry_id="test_id", - text="What is AI?", + content="What is AI?", model_name="text-embedding-ada-002", embedding=[0.1, 0.2, 0.3], ) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 85a23c72..d7c1e2f0 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -16,6 +16,7 @@ assert_no_warnings, denorm_cosine_distance, deprecated_argument, + deprecated_class, deprecated_function, lazy_import, norm_cosine_distance, @@ -534,6 +535,94 @@ def test_logging_configuration_not_overridden(self): ), f"Date format changed: was present before: {has_date_pre}, present after: {has_date_post}" +class TestDeprecatedClass: + def test_deprecated_class_warning_with_replacement(self): + @deprecated_class(replacement="Use NewClass instead.") + class OldClass: + def __init__(self, value): + self.value = value + + with pytest.warns(DeprecationWarning) as record: + obj = OldClass(42) + + assert len(record) == 1 + assert str(record[0].message) == ( + "Class OldClass is deprecated and will be removed in the next major release. " + "Use NewClass instead." 
+ ) + assert obj.value == 42 + + def test_deprecated_class_warning_without_replacement(self): + @deprecated_class() + class OldClass: + def __init__(self, value): + self.value = value + + with pytest.warns(DeprecationWarning) as record: + obj = OldClass(42) + + assert len(record) == 1 + assert str(record[0].message) == ( + "Class OldClass is deprecated and will be removed in the next major release. " + ) + assert obj.value == 42 + + def test_deprecated_class_with_custom_name(self): + @deprecated_class(name="CustomOldClass", replacement="Use NewClass instead.") + class OldClass: + pass + + with pytest.warns(DeprecationWarning) as record: + OldClass() + + assert len(record) == 1 + assert str(record[0].message) == ( + "Class CustomOldClass is deprecated and will be removed in the next major release. " + "Use NewClass instead." + ) + + def test_deprecated_class_preserves_functionality(self): + @deprecated_class(replacement="Use NewClass instead.") + class OldClass: + def __init__(self, x, y): + self.x = x + self.y = y + + def add(self): + return self.x + self.y + + with pytest.warns(DeprecationWarning): + obj = OldClass(10, 20) + + assert obj.x == 10 + assert obj.y == 20 + assert obj.add() == 30 + + def test_deprecated_class_with_inheritance(self): + @deprecated_class(replacement="Use NewBase instead.") + class OldBase: + def __init__(self, value): + self.value = value + + class Derived(OldBase): + def __init__(self, value, extra): + super().__init__(value) + self.extra = extra + + # Creating an instance of the deprecated base class should warn + with pytest.warns(DeprecationWarning): + base_obj = OldBase(42) + + # Creating an instance of the derived class should also warn + # because it calls the deprecated __init__ + with pytest.warns(DeprecationWarning): + derived_obj = Derived(42, "extra") + + assert base_obj.value == 42 + assert derived_obj.value == 42 + assert derived_obj.extra == "extra" + + class TestLazyImport: def test_import_standard_library(self): """Test lazy importing of a standard library module""" diff --git a/uv.lock b/uv.lock index 59993000..1e414bf4 100644 --- a/uv.lock +++ b/uv.lock @@ -4255,7 +4255,7 @@ wheels = [ [[package]] name = "redisvl" -version = "0.12.1" +version = "0.13.0" source = { editable = "." 
} dependencies = [ { name = "jsonpath-ng" }, @@ -4292,6 +4292,10 @@ nltk = [ openai = [ { name = "openai" }, ] +pillow = [ + { name = "pillow", version = "11.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "pillow", version = "12.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, +] sentence-transformers = [ { name = "sentence-transformers" }, ] @@ -4344,6 +4348,7 @@ requires-dist = [ { name = "nltk", marker = "extra == 'nltk'", specifier = ">=3.8.1,<4" }, { name = "numpy", specifier = ">=1.26.0,<3" }, { name = "openai", marker = "extra == 'openai'", specifier = ">=1.1.0" }, + { name = "pillow", marker = "extra == 'pillow'", specifier = ">=11.3.0" }, { name = "protobuf", marker = "extra == 'vertexai'", specifier = ">=5.28.0,<6.0.0" }, { name = "pydantic", specifier = ">=2,<3" }, { name = "python-ulid", specifier = ">=3.0.0" }, @@ -4354,7 +4359,7 @@ requires-dist = [ { name = "urllib3", marker = "extra == 'bedrock'", specifier = "<2.2.0" }, { name = "voyageai", marker = "extra == 'voyageai'", specifier = ">=0.2.2" }, ] -provides-extras = ["mistralai", "openai", "nltk", "cohere", "voyageai", "sentence-transformers", "langcache", "vertexai", "bedrock"] +provides-extras = ["mistralai", "openai", "nltk", "cohere", "voyageai", "sentence-transformers", "langcache", "vertexai", "bedrock", "pillow"] [package.metadata.requires-dev] dev = [