theroyallab · thakurvivek · Nov 15, 2025 · Nov 15, 2025
diff --git a/backends/infinity/model.py b/backends/infinity/model.py
@@ -25,20 +25,93 @@ def __init__(self, model_directory: pathlib.Path):
     async def load(self, **kwargs):
+        """
+        Load the embedding model and start the infinity_emb engine.
+
+        Reads embeddings_device (defaults to "cpu") and embeddings_device_id
+        (defaults to an empty list) from kwargs. With "cpu", device IDs are
+        ignored with a warning. With "cuda", IDs that torch does not report
+        are logged and dropped, and only the first remaining ID is used.
+
+        Raises:
+            RuntimeError: Logged and re-raised if creating or starting the
+                engine fails.
+        """
+
         # Use cpu by default
         device = unwrap(kwargs.get("embeddings_device"), "cpu")
+
+        # Extract device IDs if specified. unwrap is used (as for the device
+        # above) so that an explicit None also falls back to an empty list.
+        device_id = unwrap(kwargs.get("embeddings_device_id"), [])
+
+        # Handle mixed device types (CPU/CUDA conflict)
+        if device == "cpu" and device_id:
+            logger.warning(
+                "embeddings_device is set to 'cpu' but embeddings_device_id "
+                "is specified. Ignoring device_id and using CPU."
+            )
+            device_id = []
+
+        # Validate device IDs if using CUDA
+        if device == "cuda" and device_id:
+            if not isinstance(device_id, list):
+                device_id = [device_id]
+
+            # Validate GPU exists
+            available_gpus = torch.cuda.device_count()
+            gpu_range = f"0-{available_gpus - 1}" if available_gpus else "none"
+            valid_device_ids = []
+
+            for gpu_id in device_id:
+                # Reject negative IDs as well as IDs past the last GPU
+                if 0 <= gpu_id < available_gpus:
+                    valid_device_ids.append(gpu_id)
+                else:
+                    # Skip the invalid GPU but continue checking others
+                    logger.error(
+                        f"GPU {gpu_id} not found. Available GPUs: {gpu_range}"
+                    )
+
+            # Use only valid device IDs
+            device_id = valid_device_ids
+
+            # Handle multiple device IDs with infinity_emb compatibility
+            if len(device_id) > 1:
+                # The second literal is an f-string so the GPU ID is interpolated
+                logger.warning(
+                    "infinity_emb may not support multiple GPU IDs. "
+                    f"Using first valid GPU: {device_id[0]}"
+                )
+                device_id = [device_id[0]]  # Use only first GPU
 
-        engine_args = EngineArgs(
-            model_name_or_path=str(self.model_dir),
-            engine="torch",
-            device=device,
-            bettertransformer=False,
-            model_warmup=False,
-        )
+        try:
+            engine_args = EngineArgs(
+                model_name_or_path=str(self.model_dir),
+                engine="torch",
+                device=device,
+                device_id=device_id,  # Pass device ID to infinity_emb
+                bettertransformer=False,
+                model_warmup=False,
+            )
 
-        self.engine = AsyncEmbeddingEngine.from_args(engine_args)
-        await self.engine.astart()
+            self.engine = AsyncEmbeddingEngine.from_args(engine_args)
+            await self.engine.astart()
 
-        self.loaded = True
-        logger.info("Embedding model successfully loaded.")
+            self.loaded = True
+            gpu_info = f" on GPU {device_id}" if device_id else ""
+            logger.info(f"Embedding model successfully loaded{gpu_info}.")
+
+        except RuntimeError as e:
+            if "out of memory" in str(e).lower():
+                logger.error(f"GPU {device_id} has insufficient memory for embedding model. Error: {str(e)}")
+                logger.error("Try using a different GPU or loading the model on CPU.")
+                raise
+            elif "cuda" in str(e).lower() or "device" in str(e).lower():
+                logger.error(f"Failed to load embedding model on GPU {device_id}. Error: {str(e)}")
+                logger.error("The GPU may be busy or unavailable. Try using a different GPU or CPU.")
+                raise
+            else:
+                logger.error(f"Unexpected error loading embedding model: {str(e)}")
+                raise
 
     async def unload(self):
         await self.engine.astop()

diff --git a/common/config_models.py b/common/config_models.py
@@ -420,11 +420,57 @@ class EmbeddingsConfig(BaseConfigModel):
             "If using an AMD GPU, set this value to 'cuda'."
         ),
     )
+    embeddings_device_id: Optional[List[int]] = Field(
+        [],
+        description=(
+            "Specific GPU device IDs for embedding models (default: []).\n"
+            "Empty list for auto-select.\n"
+            "Only applies when embeddings_device is 'cuda'."
+        ),
+    )
     embedding_model_name: Optional[str] = Field(
         None,
         description=("An initial embedding model to load on the infinity backend."),
     )
 
+    @field_validator("embeddings_device_id", mode="after")
+    @classmethod
+    def validate_embeddings_device_id(cls, v, info):
+        """
+        Check configured GPU IDs against the GPUs torch reports.
+
+        Runs after type validation, so v is already None or a list of ints.
+        The check is skipped unless embeddings_device is 'cuda', and also
+        when torch cannot be imported.
+
+        Raises:
+            ValueError: If an ID is negative or not below
+                torch.cuda.device_count().
+        """
+
+        # Only validate if CUDA is selected
+        if info.data.get("embeddings_device") != "cuda" or not v:
+            return v
+
+        # Keep the try body to the import alone so only a missing torch is
+        # swallowed; the ValueError below must propagate.
+        try:
+            import torch
+        except ImportError:
+            # If torch is not available, we can't validate now
+            # This will be caught later during model loading
+            return v
+
+        available_gpus = torch.cuda.device_count()
+        gpu_range = f"0-{available_gpus - 1}" if available_gpus else "none"
+        for gpu_id in v:
+            if not 0 <= gpu_id < available_gpus:
+                raise ValueError(
+                    f"GPU {gpu_id} not found. Available GPUs: {gpu_range}"
+                )
+
+        return v
+
 
 class DeveloperConfig(BaseConfigModel):
     """Options for development and experimentation"""

diff --git a/config_sample.yml b/config_sample.yml
@@ -218,6 +218,12 @@ embeddings:
   # If using an AMD GPU, set this value to 'cuda'.
   embeddings_device: cpu
 
+  # Specific GPU device IDs for embedding models (default: []).
+  # Empty list for auto-select.
+  # Only applies when embeddings_device is 'cuda'.
+  # Example: [0] for first GPU, [1] for second GPU
+  embeddings_device_id: []
+
   # An initial embedding model to load on the infinity backend.
   embedding_model_name:
 

diff --git a/docs/02.-Server-options.md b/docs/02.-Server-options.md
@@ -107,4 +107,12 @@ Note: Most of the options here will only apply on initial embedding model load/s
 | -------------------- | ----------------- | ---------------------------------------------------------------------------------------------------------------------------- |
 | embedding_model_dir  | String ("models") | Directory to look for embedding models.<br><br>Note: Persisted across subsequent load requests                               |
 | embeddings_device    | String ("cpu")    | Device to load an embedding model on.<br><br>Options: cpu, cuda, auto<br><br>Note: Persisted across subsequent load requests |
+| embeddings_device_id | List[int] ([])    | Specific GPU device IDs for embedding models.<br><br>Empty list for auto-select.<br><br>Only applies when embeddings_device is "cuda".<br><br>Note: If multiple GPUs are specified, only the first valid GPU will be used. |
 | embedding_model_name | String (None)     | Folder name of an embedding model to load using infinity-emb.                                                                |
+
+#### Troubleshooting
+
+- **GPU not found**: If you see a "GPU X not found" error, check your GPU IDs against the `nvidia-smi` output
+- **Out of memory**: If the GPU runs out of memory, try using a different GPU or set `embeddings_device` to "cpu"
+- **GPU busy**: If model loading fails with CUDA errors, the GPU may be busy with other processes
+- **Mixed device types**: If `embeddings_device` is "cpu" but `embeddings_device_id` is set, the device ID will be ignored
diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py
@@ -123,6 +123,7 @@ class EmbeddingModelLoadRequest(BaseModel):
 
     # Set default from the config
     embeddings_device: Optional[str] = Field(config.embeddings.embeddings_device)
+    embeddings_device_id: Optional[List[int]] = Field(config.embeddings.embeddings_device_id)
 
 
 class ModelLoadResponse(BaseModel):