Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 59 additions & 11 deletions backends/infinity/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,68 @@ def __init__(self, model_directory: pathlib.Path):
async def load(self, **kwargs):
    """Load the embedding model and start the infinity engine.

    Keyword Args:
        embeddings_device: Device to load the model on. Falls back to
            "cpu" when unset.
        embeddings_device_id: A GPU index or a list of GPU indices.
            Ignored on "cpu". On "cuda" each index is checked against
            the GPUs torch reports and invalid ones are dropped. When
            several indices remain, only the first is used.

    Raises:
        RuntimeError: Re-raised unchanged after logging when building or
            starting the engine fails.
    """

    # Use cpu by default
    device = unwrap(kwargs.get("embeddings_device"), "cpu")

    # A missing or null device ID means auto-select. A bare integer is
    # accepted as shorthand for a single-element list.
    device_id = unwrap(kwargs.get("embeddings_device_id"), [])
    if isinstance(device_id, (list, tuple)):
        device_id = list(device_id)
    else:
        device_id = [device_id]

    # Handle mixed device types (CPU/CUDA conflict)
    if device == "cpu" and device_id:
        logger.warning(
            "embeddings_device is set to 'cpu' but embeddings_device_id is "
            "specified. Ignoring device_id and using CPU."
        )
        device_id = []

    # Validate the requested GPU indices when using CUDA
    if device == "cuda" and device_id:
        available_gpus = torch.cuda.device_count()
        # Avoid printing the nonsensical range "0--1" when no GPU exists
        gpu_range = f"0-{available_gpus - 1}" if available_gpus > 0 else "none"

        valid_device_ids = []
        for gpu_id in device_id:
            # Negative and non-integer entries are invalid as well
            if isinstance(gpu_id, int) and 0 <= gpu_id < available_gpus:
                valid_device_ids.append(gpu_id)
            else:
                logger.error(
                    f"GPU {gpu_id} not found. Available GPUs: {gpu_range}"
                )

        if not valid_device_ids:
            logger.warning(
                "None of the requested embeddings_device_id values are "
                "valid. Falling back to automatic GPU selection."
            )

        # Use only valid device IDs
        device_id = valid_device_ids

    # Handle multiple device IDs with infinity_emb compatibility
    if len(device_id) > 1:
        logger.warning(
            "infinity_emb may not support multiple GPU IDs. "
            f"Using first valid GPU: {device_id[0]}"
        )
        device_id = [device_id[0]]

    try:
        engine_args = EngineArgs(
            model_name_or_path=str(self.model_dir),
            engine="torch",
            device=device,
            device_id=device_id,  # Pass device ID to infinity_emb
            bettertransformer=False,
            model_warmup=False,
        )

        self.engine = AsyncEmbeddingEngine.from_args(engine_args)
        await self.engine.astart()
    except RuntimeError as e:
        # Pick the most helpful hint from the error text, then re-raise
        reason = str(e).lower()
        if "out of memory" in reason:
            logger.error(
                f"GPU {device_id} has insufficient memory for embedding "
                f"model. Error: {str(e)}"
            )
            logger.error(
                "Try using a different GPU or loading the model on CPU."
            )
        elif "cuda" in reason or "device" in reason:
            logger.error(
                f"Failed to load embedding model on GPU {device_id}. "
                f"Error: {str(e)}"
            )
            logger.error(
                "The GPU may be busy or unavailable. "
                "Try using a different GPU or CPU."
            )
        else:
            logger.error(f"Unexpected error loading embedding model: {str(e)}")
        raise
    else:
        self.loaded = True
        gpu_info = f" on GPU {device_id}" if device_id else ""
        logger.info(f"Embedding model successfully loaded{gpu_info}.")

async def unload(self):
await self.engine.astop()
Expand Down
28 changes: 28 additions & 0 deletions common/config_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,11 +420,39 @@ class EmbeddingsConfig(BaseConfigModel):
"If using an AMD GPU, set this value to 'cuda'."
),
)
embeddings_device_id: Optional[List[int]] = Field(
[],
description=(
"Specific GPU device IDs for embedding models (default: []).\n"
"Empty list for auto-select.\n"
"Only applies when embeddings_device is 'cuda'."
),
)
embedding_model_name: Optional[str] = Field(
None,
description=("An initial embedding model to load on the infinity backend."),
)

@field_validator("embeddings_device_id", mode="before")
@classmethod
def validate_embeddings_device_id(cls, v, info):
    """Reject GPU IDs that do not exist when the device is 'cuda'.

    This runs in "before" mode, so `v` is the raw configured value and may
    be a bare scalar or contain non-integer entries. The value is returned
    unchanged; only out-of-range indices raise.

    Raises:
        ValueError: If an integer-like entry is negative or not below the
            GPU count reported by torch.
    """
    # Only validate if CUDA is selected and at least one ID was given
    if info.data.get("embeddings_device") != "cuda" or not v:
        return v

    try:
        import torch
    except ImportError:
        # If torch is not available, we can't validate now
        # This will be caught later during model loading
        return v

    available_gpus = torch.cuda.device_count()
    # Avoid printing the nonsensical range "0--1" when no GPU exists
    gpu_range = f"0-{available_gpus - 1}" if available_gpus > 0 else "none"

    # Raw input may be a single value rather than a list
    candidates = v if isinstance(v, (list, tuple)) else [v]
    for gpu_id in candidates:
        try:
            index = int(gpu_id)
        except (TypeError, ValueError):
            # Not integer-like: leave the type error to the field's own
            # type validation instead of crashing on the comparison below
            continue
        if not 0 <= index < available_gpus:
            raise ValueError(
                f"GPU {gpu_id} not found. Available GPUs: {gpu_range}"
            )
    return v


class DeveloperConfig(BaseConfigModel):
"""Options for development and experimentation"""
Expand Down
6 changes: 6 additions & 0 deletions config_sample.yml
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,12 @@ embeddings:
# If using an AMD GPU, set this value to 'cuda'.
embeddings_device: cpu

# Specific GPU device IDs for embedding models (default: []).
# Empty list for auto-select.
# Only applies when embeddings_device is 'cuda'.
# Example: [0] for first GPU, [1] for second GPU
embeddings_device_id: []

# An initial embedding model to load on the infinity backend.
embedding_model_name:

Expand Down
8 changes: 8 additions & 0 deletions docs/02.-Server-options.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,4 +107,12 @@ Note: Most of the options here will only apply on initial embedding model load/s
| -------------------- | ----------------- | ---------------------------------------------------------------------------------------------------------------------------- |
| embedding_model_dir | String ("models") | Directory to look for embedding models.<br><br>Note: Persisted across subsequent load requests |
| embeddings_device | String ("cpu") | Device to load an embedding model on.<br><br>Options: cpu, cuda, auto<br><br>Note: Persisted across subsequent load requests |
| embeddings_device_id | List[int] ([]) | Specific GPU device IDs for embedding models.<br><br>Empty list for auto-select.<br><br>Only applies when embeddings_device is "cuda".<br><br>Note: If multiple GPUs are specified, only the first valid GPU will be used. |
| embedding_model_name | String (None) | Folder name of an embedding model to load using infinity-emb. |

#### Troubleshooting

- **GPU not found**: If you see a "GPU X not found" error, check your GPU IDs against the output of `nvidia-smi`.
- **Out of memory**: If the GPU runs out of memory, try using a different GPU or set `embeddings_device` to "cpu".
- **GPU busy**: If model loading fails with CUDA errors, the GPU may be busy with other processes.
- **Mixed device types**: If `embeddings_device` is "cpu" but `embeddings_device_id` is set, the device ID will be ignored.
1 change: 1 addition & 0 deletions endpoints/core/types/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ class EmbeddingModelLoadRequest(BaseModel):

# Set default from the config
embeddings_device: Optional[str] = Field(config.embeddings.embeddings_device)
embeddings_device_id: Optional[List[int]] = Field(config.embeddings.embeddings_device_id)


class ModelLoadResponse(BaseModel):
Expand Down