From d5e71ff860e926cc1877a5f1bba1a00617a23e7a Mon Sep 17 00:00:00 2001
From: Yvonne Yu
Date: Tue, 5 May 2026 13:38:52 -0700
Subject: [PATCH] feat: migrate model garden to agentplatform

PiperOrigin-RevId: 910885525
---
 .kokoro/docker/docs/Dockerfile                |    5 +-
 agentplatform/__init__.py                     |   15 +
 agentplatform/model_garden/README.md          |  214 ++
 agentplatform/model_garden/__init__.py        |   27 +
 agentplatform/model_garden/_model_garden.py   | 1537 +++++++++++++
 agentplatform/preview/__init__.py             |   15 +
 agentplatform/preview/model_garden.py         |   31 +
 .../model_garden/test_model_garden.py         |    8 +-
 .../test_vertexai_model_garden.py             | 1999 +++++++++++++++++
 tests/unit/vertexai/test_rubric_based_eval.py |    3 +-
 10 files changed, 3848 insertions(+), 6 deletions(-)
 create mode 100644 agentplatform/model_garden/README.md
 create mode 100644 agentplatform/model_garden/__init__.py
 create mode 100644 agentplatform/model_garden/_model_garden.py
 create mode 100644 agentplatform/preview/__init__.py
 create mode 100644 agentplatform/preview/model_garden.py
 rename tests/unit/{vertexai => agentplatform}/model_garden/test_model_garden.py (99%)
 create mode 100644 tests/unit/vertexai/model_garden/test_vertexai_model_garden.py

diff --git a/.kokoro/docker/docs/Dockerfile b/.kokoro/docker/docs/Dockerfile
index 93abeb68b7..a3668cbb93 100644
--- a/.kokoro/docker/docs/Dockerfile
+++ b/.kokoro/docker/docs/Dockerfile
@@ -21,7 +21,10 @@ ENV PATH /usr/local/bin:$PATH
 
 # Install dependencies.
 RUN apt-get update \
-  && apt-get install -y --no-install-recommends \
+  && apt-get install -y ca-certificates --fix-missing \
+  && update-ca-certificates
+
+RUN apt-get install -y --no-install-recommends \
   apt-transport-https \
   build-essential \
   ca-certificates \
diff --git a/agentplatform/__init__.py b/agentplatform/__init__.py
index 913bdcffb7..9756768002 100644
--- a/agentplatform/__init__.py
+++ b/agentplatform/__init__.py
@@ -14,11 +14,26 @@
 #
 """The agentplatform module."""
 
+import importlib
 from google.cloud.aiplatform import init
 from google.cloud.aiplatform import version as aiplatform_version
 
 __version__ = aiplatform_version.__version__
 
+
+def __getattr__(name):  # type: ignore[no-untyped-def]
+    if name == "preview":
+        # We need to import carefully to avoid `RecursionError`.
+        # This won't work since it causes `RecursionError`:
+        #   `from agentplatform import preview`
+        # This won't work due to Copybara lacking a transform:
+        #   `import google.cloud.aiplatform.agentplatform.preview as`
+        #   `agentplatform_preview`
+        return importlib.import_module(".preview", __name__)
+    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
+
+
 __all__ = [
     "init",
+    "preview",
 ]
diff --git a/agentplatform/model_garden/README.md b/agentplatform/model_garden/README.md
new file mode 100644
index 0000000000..5fc942a5d1
--- /dev/null
+++ b/agentplatform/model_garden/README.md
@@ -0,0 +1,214 @@
+# Gemini Enterprise Agent Platform Model Garden SDK for Python
+
+The Gemini Enterprise Agent Platform Model Garden SDK helps developers use [Model Garden](https://cloud.google.com/model-garden) open models to build AI-powered features and applications.
+The SDK supports use cases like the following:
+
+- Deploy an open model
+- Export open model weights
+
+## Installation
+
+To install the
+[google-cloud-aiplatform](https://pypi.org/project/google-cloud-aiplatform/)
+Python package, run the following command:
+
+```shell
+pip3 install --upgrade --user "google-cloud-aiplatform>=1.84"
+```
+
+## Usage
+
+For detailed instructions, see [deploy an open model](https://cloud.google.com/vertex-ai/generative-ai/docs/model-garden/use-models#deploy_an_open_model) and the [deployment notebook tutorial](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_deployment_tutorial.ipynb).
+
+## Quick Start: Default Deployment
+
+This is the simplest way to deploy a model. If you provide just a model name, the SDK uses the default deployment configuration.
+
+```python
+from agentplatform import model_garden
+
+model = model_garden.OpenModel("google/paligemma@paligemma-224-float32")
+endpoint = model.deploy()
+```
+
+**Use case:** Fast prototyping, first-time users evaluating model outputs.
+
+## List Deployable Models
+
+You can list all models that are currently deployable via Model Garden:
+
+```python
+from agentplatform import model_garden
+
+models = model_garden.list_deployable_models()
+```
+
+To list only Hugging Face models, or to filter by keyword:
+
+```python
+models = model_garden.list_deployable_models(list_hf_models=True, model_filter="stable-diffusion")
+```
+
+**Use case:** Discover available models before deciding which one to deploy.
+
+## Hugging Face Model Deployment
+
+Deploy a model directly from Hugging Face using the model ID.
+
+```python
+model = model_garden.OpenModel("Qwen/Qwen2-1.5B-Instruct")
+endpoint = model.deploy()
+```
+
+**Use case:** Leverage community or third-party models without custom container setup. If the model is gated, you may need to provide a Hugging Face access token:
+
+```python
+endpoint = model.deploy(hugging_face_access_token="your_hf_token")
+```
+
+**Use case:** Deploy gated Hugging Face models requiring authentication.
+
+## List Deployment Configurations
+
+You can inspect the available deployment configurations for a model:
+
+```python
+model = model_garden.OpenModel("google/paligemma@paligemma-224-float32")
+deploy_options = model.list_deploy_options()
+```
+
+**Use case:** Evaluate compatible machine specs and containers before deployment.
+
+## Select a Verified Deployment: By Container Image
+
+Specify a container image from the list of verified deployment configurations.
+
+```python
+endpoint = model.deploy(
+    serving_container_image_uri="us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20250430_0916_RC00_maas",
+)
+```
+
+## Select a Verified Deployment: By Hardware
+
+Specify a hardware configuration from the list of verified deployment configurations.
+
+```python
+endpoint = model.deploy(
+    machine_type="a3-highgpu-1g",
+    accelerator_type="NVIDIA_H100_80GB",
+    accelerator_count=1,
+)
+```
+
+## Select a Verified Deployment: By Container and Hardware
+
+Specify both a container image and a hardware configuration from the list of verified deployment configurations.
+
+```python
+endpoint = model.deploy(
+    serving_container_image_uri="us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20250430_0916_RC00_maas",
+    machine_type="a3-highgpu-1g",
+    accelerator_type="NVIDIA_H100_80GB",
+    accelerator_count=1,
+)
+```
+
+**Use case:** Production configuration, performance tuning, scaling.
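+
+### Putting It Together
+
+A typical flow (illustrative; the exact option values vary by model) is to print the verified configurations with `list_deploy_options(concise=True)` and then pass one of them to `deploy()`:
+
+```python
+model = model_garden.OpenModel("google/gemma2@gemma-2-27b-it")
+print(model.list_deploy_options(concise=True))
+
+# Pick one of the printed options; the values below are placeholders.
+endpoint = model.deploy(
+    machine_type="g2-standard-12",
+    accelerator_type="NVIDIA_L4",
+    accelerator_count=1,
+)
+```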
+
+## EULA Acceptance
+
+Some models require acceptance of a license agreement. Pass `accept_eula=True` if prompted.
+
+```python
+model = model_garden.OpenModel("google/gemma2@gemma-2-27b-it")
+endpoint = model.deploy(accept_eula=True)
+```
+
+**Use case:** First-time deployment of EULA-protected models.
+
+## Spot VM Deployment
+
+Schedule workloads on Spot VMs for lower cost.
+
+```python
+endpoint = model.deploy(spot=True)
+```
+
+**Use case:** Cost-sensitive development and batch workloads.
+
+## Fast Tryout Deployment
+
+Enable the experimental fast-deploy path for popular models.
+
+```python
+endpoint = model.deploy(fast_tryout_enabled=True)
+```
+
+**Use case:** Interactive experimentation without full production setup.
+
+## Dedicated Endpoints
+
+Create a dedicated DNS-isolated endpoint.
+
+```python
+endpoint = model.deploy(use_dedicated_endpoint=True)
+```
+
+**Use case:** Traffic isolation for enterprise or regulated workloads.
+
+## Reservation Affinity
+
+Use shared or specific Compute Engine reservations.
+
+```python
+endpoint = model.deploy(
+    reservation_affinity_type="SPECIFIC_RESERVATION",
+    reservation_affinity_key="compute.googleapis.com/reservation-name",
+    reservation_affinity_values=["projects/YOUR_PROJECT/zones/YOUR_ZONE/reservations/YOUR_RESERVATION"],
+)
+```
+
+**Use case:** Optimized resource usage with pre-reserved capacity.
+
+## Custom Container Image
+
+Override the default container with a custom image.
+
+```python
+endpoint = model.deploy(
+    serving_container_image_uri="us-docker.pkg.dev/vertex-ai/custom-container:latest"
+)
+```
+
+**Use case:** Use of custom inference servers or fine-tuned environments.
+
+## Advanced Full Container Configuration
+
+Further customize startup probes, health checks, shared memory, and gRPC ports.
+
+```python
+endpoint = model.deploy(
+    serving_container_image_uri="us-docker.pkg.dev/vertex-ai/custom-container:latest",
+    serving_container_command=["python3"],
+    serving_container_args=["serve.py"],
+    serving_container_ports=[8888],
+    serving_container_environment_variables={"ENV": "prod"},
+    serving_container_predict_route="/predict",
+    serving_container_health_route="/health",
+    serving_container_shared_memory_size_mb=512,
+    serving_container_grpc_ports=[9000],
+    serving_container_startup_probe_exec=["/bin/check-start.sh"],
+    serving_container_health_probe_exec=["/bin/health-check.sh"],
+)
+```
+
+**Use case:** Production-grade deployments requiring deep customization of runtime behavior and monitoring.
+
+## Contributing
+
+See [Contributing](https://github.com/googleapis/python-aiplatform/blob/main/CONTRIBUTING.rst) for more information on contributing to the Gemini Enterprise Agent Platform Python SDK.
+
+## License
+
+The contents of this repository are licensed under the [Apache License, version 2.0](http://www.apache.org/licenses/LICENSE-2.0).
\ No newline at end of file
diff --git a/agentplatform/model_garden/__init__.py b/agentplatform/model_garden/__init__.py
new file mode 100644
index 0000000000..589f3f77fa
--- /dev/null
+++ b/agentplatform/model_garden/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Classes and functions for working with Model Garden."""
+
+# We just want to re-export certain classes
+# pylint: disable=g-multiple-import,g-importing-member
+from agentplatform.model_garden import _model_garden
+
+OpenModel = _model_garden.OpenModel
+PartnerModel = _model_garden.PartnerModel
+list_deployable_models = _model_garden.list_deployable_models
+list_models = _model_garden.list_models
+
+__all__ = ("OpenModel", "PartnerModel", "list_deployable_models", "list_models")
diff --git a/agentplatform/model_garden/_model_garden.py b/agentplatform/model_garden/_model_garden.py
new file mode 100644
index 0000000000..5b80f7e71d
--- /dev/null
+++ b/agentplatform/model_garden/_model_garden.py
@@ -0,0 +1,1537 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# pylint: disable=bad-continuation, line-too-long, protected-access
+"""Class for interacting with Model Garden OSS models."""
+
+import datetime
+import functools
+import re
+from typing import Dict, List, Optional, Sequence, Union
+
+from google.cloud import aiplatform
+from google.cloud.aiplatform import base
+from google.cloud.aiplatform import compat
+from google.cloud.aiplatform import initializer
+from google.cloud.aiplatform import models as aiplatform_models
+from google.cloud.aiplatform import utils
+from google.cloud.aiplatform_v1beta1 import types
+from google.cloud.aiplatform_v1beta1.services import model_garden_service
+from google.cloud.aiplatform_v1beta1.services import model_service
+from agentplatform import batch_prediction
+
+
+from google.protobuf import duration_pb2
+
+
+_LOGGER = base.Logger(__name__)
+_DEFAULT_VERSION = compat.V1BETA1
+_DEFAULT_TIMEOUT = 2 * 60 * 60  # 2 hours, same as UI one-click deployment.
+_DEFAULT_RECOMMEND_SPEC_TIMEOUT = 1 * 60  # 1 minute.
+_DEFAULT_EXPORT_TIMEOUT = 1 * 60 * 60  # 1 hour.
+_HF_WILDCARD_FILTER = "is_hf_wildcard(true)"
+_NATIVE_MODEL_FILTER = "is_hf_wildcard(false)"
+_VERIFIED_DEPLOYMENT_FILTER = (
+    "labels.VERIFIED_DEPLOYMENT_CONFIG=VERIFIED_DEPLOYMENT_SUCCEED"
+)
+
+
+def list_deployable_models(
+    *, list_hf_models: bool = False, model_filter: Optional[str] = None
+) -> List[str]:
+    """Lists the deployable models in Model Garden.
+
+    Args:
+        list_hf_models: Whether to list the Hugging Face models.
+        model_filter: Optional. A string to filter the models by.
+
+    Returns:
+        The names of the deployable models in Model Garden in the format of
+        `{publisher}/{model}@{version}` or Hugging Face model ID in the format
+        of `{organization}/{model}`.
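+
+    Example (illustrative; the returned names depend on the current Model
+    Garden catalog):
+
+    ```
+    from agentplatform import model_garden
+
+    names = model_garden.list_deployable_models(model_filter="gemma")
+    ```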
+    """
+
+    filter_str = _NATIVE_MODEL_FILTER
+    if list_hf_models:
+        filter_str = " AND ".join([_HF_WILDCARD_FILTER, _VERIFIED_DEPLOYMENT_FILTER])
+    if model_filter:
+        filter_str = (
+            f'{filter_str} AND (model_user_id=~"(?i).*{model_filter}.*" OR'
+            f' display_name=~"(?i).*{model_filter}.*")'
+        )
+
+    request = types.ListPublisherModelsRequest(
+        parent="publishers/*",
+        list_all_versions=True,
+        filter=filter_str,
+    )
+    client = initializer.global_config.create_client(
+        client_class=_ModelGardenClientWithOverride,
+        credentials=initializer.global_config.credentials,
+        location_override="us-central1",
+    )
+    response = client.list_publisher_models(request)
+    output = []
+    for page in response.pages:
+        for model in page.publisher_models:
+            if model.supported_actions.multi_deploy_vertex.multi_deploy_vertex:
+                output.append(
+                    re.sub(r"publishers/(hf-|)|models/", "", model.name)
+                    + ("" if list_hf_models else ("@" + model.version_id))
+                )
+    return output
+
+
+def list_models(
+    *, list_hf_models: bool = False, model_filter: Optional[str] = None
+) -> List[str]:
+    """Lists the models in Model Garden.
+
+    Args:
+        list_hf_models: Whether to list the Hugging Face models.
+        model_filter: Optional. A string to filter the models by.
+
+    Returns:
+        The names of the models in Model Garden in the format of
+        `{publisher}/{model}@{version}` or Hugging Face model ID in the format
+        of `{organization}/{model}`.
+    """
+    filter_str = _NATIVE_MODEL_FILTER
+    if list_hf_models:
+        filter_str = _HF_WILDCARD_FILTER
+    if model_filter:
+        filter_str = (
+            f'{filter_str} AND (model_user_id=~"(?i).*{model_filter}.*" OR'
+            f' display_name=~"(?i).*{model_filter}.*")'
+        )
+
+    request = types.ListPublisherModelsRequest(
+        parent="publishers/*",
+        list_all_versions=True,
+        filter=filter_str,
+    )
+    client = initializer.global_config.create_client(
+        client_class=_ModelGardenClientWithOverride,
+        credentials=initializer.global_config.credentials,
+        location_override="us-central1",
+    )
+    response = client.list_publisher_models(request)
+    output = []
+    for page in response.pages:
+        for model in page.publisher_models:
+            output.append(
+                re.sub(r"publishers/(hf-|)|models/", "", model.name)
+                + ("" if list_hf_models else ("@" + model.version_id))
+            )
+    return output
+
+
+def _is_hugging_face_model(model_name: str) -> bool:
+    """Returns whether the model is a Hugging Face model."""
+    return (
+        re.match(r"^(?P<organization>[^/]+)/(?P<model>[^/@]+)$", model_name)
+        is not None
+    )
+
+
+def _get_publisher_model_resource_name(publisher: str, model: str) -> str:
+    """Returns the resource name.
+
+    Args:
+        publisher: Publisher of the model.
+        model: Model name, may or may not include version.
+
+    Returns:
+        The resource name in the format of
+        `publishers/{publisher}/models/{model_user_id}@{version_id}`.
+    """
+    return f"publishers/{publisher}/models/{model}"
+
+
+def _reconcile_model_name(model_name: str) -> str:
+    """Returns the resource name from the model name.
+
+    Args:
+        model_name: Model Garden model resource name in the format of
+          `publishers/{publisher}/models/{model}@{version}`, or a simplified
+          resource name in the format of `{publisher}/{model}@{version}`, or a
+          Hugging Face model ID in the format of `{organization}/{model}`.
+
+    Returns:
+        The resource name in the format of
+        `publishers/{publisher}/models/{model}@{version}`.
+    """
+    model_name = model_name.lower()  # Use lower case for Hugging Face.
+    full_resource_name_match = re.match(
+        r"^publishers/(?P<publisher>[^/]+)/models/(?P<model>[^@]+)@(?P<version>[^@]+)$",
+        model_name,
+    )
+    if full_resource_name_match:
+        return _get_publisher_model_resource_name(
+            full_resource_name_match.group("publisher"),
+            full_resource_name_match.group("model")
+            + "@"
+            + full_resource_name_match.group("version"),
+        )
+    else:
+        simplified_name_match = re.match(
+            r"^(?P<publisher>[^/]+)/(?P<model>[^@]+)(?:@(?P<version>.+))?$",
+            model_name,
+        )
+        if simplified_name_match:
+            if simplified_name_match.group("version"):
+                return _get_publisher_model_resource_name(
+                    publisher=simplified_name_match.group("publisher"),
+                    model=simplified_name_match.group("model")
+                    + "@"
+                    + simplified_name_match.group("version"),
+                )
+            else:
+                return _get_publisher_model_resource_name(
+                    publisher=simplified_name_match.group("publisher"),
+                    model=simplified_name_match.group("model"),
+                )
+        else:
+            raise ValueError(f"`{model_name}` is not a valid Open Model name")
+
+
+def _construct_serving_container_spec(
+    serving_container_image_uri: Optional[str] = None,
+    serving_container_predict_route: Optional[str] = None,
+    serving_container_health_route: Optional[str] = None,
+    serving_container_command: Optional[Sequence[str]] = None,
+    serving_container_args: Optional[Sequence[str]] = None,
+    serving_container_environment_variables: Optional[Dict[str, str]] = None,
+    serving_container_ports: Optional[Sequence[int]] = None,
+    serving_container_grpc_ports: Optional[Sequence[int]] = None,
+    serving_container_deployment_timeout: Optional[int] = None,
+    serving_container_shared_memory_size_mb: Optional[int] = None,
+    serving_container_startup_probe_exec: Optional[Sequence[str]] = None,
+    serving_container_startup_probe_period_seconds: Optional[int] = None,
+    serving_container_startup_probe_timeout_seconds: Optional[int] = None,
+    serving_container_health_probe_exec: Optional[Sequence[str]] = None,
+    serving_container_health_probe_period_seconds: Optional[int] = None,
+    serving_container_health_probe_timeout_seconds: Optional[int] = None,
+) -> types.ModelContainerSpec:
+    """Constructs a ModelContainerSpec from the serving container parameters."""
+    env = None
+    ports = None
+    grpc_ports = None
+    deployment_timeout = (
+        duration_pb2.Duration(seconds=serving_container_deployment_timeout)
+        if serving_container_deployment_timeout
+        else None
+    )
+    startup_probe = None
+    health_probe = None
+
+    if serving_container_environment_variables:
+        env = [
+            types.EnvVar(name=str(key), value=str(value))
+            for key, value in serving_container_environment_variables.items()
+        ]
+    if serving_container_ports:
+        ports = [types.Port(container_port=port) for port in serving_container_ports]
+    if serving_container_grpc_ports:
+        grpc_ports = [
+            types.Port(container_port=port) for port in serving_container_grpc_ports
+        ]
+    if (
+        serving_container_startup_probe_exec
+        or serving_container_startup_probe_period_seconds
+        or serving_container_startup_probe_timeout_seconds
+    ):
+        startup_probe_exec = None
+        if serving_container_startup_probe_exec:
+            startup_probe_exec = types.Probe.ExecAction(
+                command=serving_container_startup_probe_exec
+            )
+        startup_probe = types.Probe(
+            exec=startup_probe_exec,
+            period_seconds=serving_container_startup_probe_period_seconds,
+            timeout_seconds=serving_container_startup_probe_timeout_seconds,
+        )
+    if (
+        serving_container_health_probe_exec
+        or serving_container_health_probe_period_seconds
+        or serving_container_health_probe_timeout_seconds
+    ):
+        health_probe_exec = None
+        if serving_container_health_probe_exec:
+            health_probe_exec = types.Probe.ExecAction(
+                command=serving_container_health_probe_exec
+            )
+        health_probe = types.Probe(
+            exec=health_probe_exec,
+            period_seconds=serving_container_health_probe_period_seconds,
+            timeout_seconds=serving_container_health_probe_timeout_seconds,
+        )
+
+    return types.ModelContainerSpec(
+        image_uri=serving_container_image_uri,
+        command=serving_container_command,
+        args=serving_container_args,
+        env=env,
+        ports=ports,
+        grpc_ports=grpc_ports,
+        predict_route=serving_container_predict_route,
+        health_route=serving_container_health_route,
+        deployment_timeout=deployment_timeout,
+        shared_memory_size_mb=serving_container_shared_memory_size_mb,
+        startup_probe=startup_probe,
+        health_probe=health_probe,
+    )
+
+
+class _ModelGardenClientWithOverride(utils.ClientWithOverride):
+    _is_temporary = True
+    _default_version = _DEFAULT_VERSION
+    _version_map = (
+        (
+            _DEFAULT_VERSION,
+            model_garden_service.ModelGardenServiceClient,
+        ),
+    )
+
+
+class _ModelServiceClientWithOverride(utils.ClientWithOverride):
+    _is_temporary = True
+    _default_version = _DEFAULT_VERSION
+    _version_map = (
+        (
+            _DEFAULT_VERSION,
+            model_service.ModelServiceClient,
+        ),
+    )
+
+
+class OpenModel:
+    """Represents a Model Garden Open model.
+
+    Attributes:
+        model_name: Model Garden model resource name in the format of
+          `publishers/{publisher}/models/{model}@{version}`, or a simplified
+          resource name in the format of `{publisher}/{model}@{version}`, or a
+          Hugging Face model ID in the format of `{organization}/{model}`.
+    """
+
+    __module__ = "agentplatform.model_garden"
+
+    def __init__(
+        self,
+        model_name: str,
+    ):
+        r"""Initializes a Model Garden model.
+
+        Usage:
+
+        ```
+        model = OpenModel("publishers/google/models/gemma2@gemma-2-2b-it")
+        ```
+
+        Args:
+            model_name: Model Garden model resource name in the format of
+              `publishers/{publisher}/models/{model}@{version}`, or a simplified
+              resource name in the format of `{publisher}/{model}@{version}`, or
+              a Hugging Face model ID in the format of `{organization}/{model}`.
+        """
+        project = initializer.global_config.project
+        location = initializer.global_config.location
+        credentials = initializer.global_config.credentials
+
+        self._model_name = model_name
+        self._is_hugging_face_model = _is_hugging_face_model(model_name)
+        self._publisher_model_name = _reconcile_model_name(model_name)
+        self._project = project
+        self._location = location
+        self._credentials = credentials
+
+    @functools.cached_property
+    def _model_garden_client(
+        self,
+    ) -> model_garden_service.ModelGardenServiceClient:
+        """Returns the Model Garden client."""
+        return initializer.global_config.create_client(
+            client_class=_ModelGardenClientWithOverride,
+            credentials=self._credentials,
+            location_override=self._location,
+        )
+
+    @functools.cached_property
+    def _us_central1_model_garden_client(
+        self,
+    ) -> model_garden_service.ModelGardenServiceClient:
+        """Returns the Model Garden client in us-central1."""
+        return initializer.global_config.create_client(
+            client_class=_ModelGardenClientWithOverride,
+            credentials=self._credentials,
+            location_override="us-central1",
+        )
+
+    def export(
+        self,
+        target_gcs_path: str = "",
+        export_request_timeout: Optional[float] = None,
+    ) -> str:
+        """Exports an Open Model to a Google Cloud Storage bucket.
+
+        Args:
+            target_gcs_path: The target Cloud Storage path to export the model
+              weights to.
+            export_request_timeout: The timeout for the export request. Default
+              is 1 hour.
+ + Returns: + str: the target gcs bucket where the model weights are downloaded to + + + Raises: + ValueError: If ``target_gcs_path`` is not specified + """ + if not target_gcs_path: + raise ValueError("target_gcs_path is required.") + + request = types.ExportPublisherModelRequest( + parent=f"projects/{self._project}/locations/{self._location}", + name=self._publisher_model_name, + destination=types.GcsDestination(output_uri_prefix=target_gcs_path), + ) + request_headers = [ + ("x-goog-user-project", "{}".format(initializer.global_config.project)), + ] + + _LOGGER.info(f"Exporting model weights: {self._model_name}") + + operation_future = self._model_garden_client.export_publisher_model( + request, metadata=request_headers + ) + _LOGGER.info(f"LRO: {operation_future.operation.name}") + + _LOGGER.info(f"Start time: {datetime.datetime.now()}") + export_publisher_model_response = operation_future.result( + timeout=export_request_timeout or _DEFAULT_EXPORT_TIMEOUT + ) + _LOGGER.info(f"End time: {datetime.datetime.now()}") + _LOGGER.info(f"Response: {export_publisher_model_response}") + + return export_publisher_model_response.destination_uri + + def deploy( + self, + accept_eula: bool = False, + hugging_face_access_token: Optional[str] = None, + machine_type: Optional[str] = None, + min_replica_count: int = 1, + max_replica_count: int = 1, + accelerator_type: Optional[str] = None, + accelerator_count: Optional[int] = None, + spot: bool = False, + reservation_affinity_type: Optional[str] = None, + reservation_affinity_key: Optional[str] = None, + reservation_affinity_values: Optional[List[str]] = None, + use_dedicated_endpoint: Optional[bool] = False, + dedicated_endpoint_disabled: Optional[bool] = False, + fast_tryout_enabled: Optional[bool] = False, + system_labels: Optional[Dict[str, str]] = None, + endpoint_display_name: Optional[str] = None, + model_display_name: Optional[str] = None, + deploy_request_timeout: Optional[float] = None, + serving_container_spec: Optional[types.ModelContainerSpec] = None, + serving_container_image_uri: Optional[str] = None, + serving_container_predict_route: Optional[str] = None, + serving_container_health_route: Optional[str] = None, + serving_container_command: Optional[Sequence[str]] = None, + serving_container_args: Optional[Sequence[str]] = None, + serving_container_environment_variables: Optional[Dict[str, str]] = None, + serving_container_ports: Optional[Sequence[int]] = None, + serving_container_grpc_ports: Optional[Sequence[int]] = None, + serving_container_deployment_timeout: Optional[int] = None, + serving_container_shared_memory_size_mb: Optional[int] = None, + serving_container_startup_probe_exec: Optional[Sequence[str]] = None, + serving_container_startup_probe_period_seconds: Optional[int] = None, + serving_container_startup_probe_timeout_seconds: Optional[int] = None, + serving_container_health_probe_exec: Optional[Sequence[str]] = None, + serving_container_health_probe_period_seconds: Optional[int] = None, + serving_container_health_probe_timeout_seconds: Optional[int] = None, + enable_private_service_connect: bool = False, + psc_project_allow_list: Optional[Sequence[str]] = None, + ) -> aiplatform.Endpoint: + """Deploys an Open Model to an endpoint. + + Args: + accept_eula (bool): Whether to accept the End User License Agreement. + hugging_face_access_token (str): The access token to access Hugging Face + models. Reference: https://huggingface.co/docs/hub/en/security-tokens + machine_type (str): Optional. The type of machine. 
Not specifying + machine type will result in model to be deployed with automatic + resources. + min_replica_count (int): Optional. The minimum number of machine + replicas this deployed model will be always deployed on. If traffic + against it increases, it may dynamically be deployed onto more + replicas, and as traffic decreases, some of these extra replicas may + be freed. + max_replica_count (int): Optional. The maximum number of replicas this + deployed model may be deployed on when the traffic against it + increases. If requested value is too large, the deployment will error, + but if deployment succeeds then the ability to scale the model to that + many replicas is guaranteed (barring service outages). If traffic + against the deployed model increases beyond what its replicas at + maximum may handle, a portion of the traffic will be dropped. If this + value is not provided, the larger value of min_replica_count or 1 will + be used. If value provided is smaller than min_replica_count, it will + automatically be increased to be min_replica_count. + accelerator_type (str): Optional. Hardware accelerator type. Must also + set accelerator_count if used. One of ACCELERATOR_TYPE_UNSPECIFIED, + NVIDIA_TESLA_K80, NVIDIA_TESLA_P100, NVIDIA_TESLA_V100, + NVIDIA_TESLA_P4, NVIDIA_TESLA_T4 + accelerator_count (int): Optional. The number of accelerators to attach + to a worker replica. + spot (bool): Optional. Whether to schedule the deployment workload on + spot VMs. + reservation_affinity_type (str): Optional. The type of reservation + affinity. One of NO_RESERVATION, ANY_RESERVATION, + SPECIFIC_RESERVATION, SPECIFIC_THEN_ANY_RESERVATION, + SPECIFIC_THEN_NO_RESERVATION + reservation_affinity_key (str): Optional. Corresponds to the label key + of a reservation resource. To target a SPECIFIC_RESERVATION by name, + use `compute.googleapis.com/reservation-name` as the key and specify + the name of your reservation as its value. + reservation_affinity_values (List[str]): Optional. Corresponds to the + label values of a reservation resource. This must be the full resource + name of the reservation. + Format: + 'projects/{project_id_or_number}/zones/{zone}/reservations/{reservation_name}' + use_dedicated_endpoint (bool): Optional. Default value is False. If set + to True, the underlying prediction call will be made using the + dedicated endpoint dns. + dedicated_endpoint_disabled (bool): Optional. Default value is False. If set + to False, the underlying prediction call will be made using the + dedicated endpoint dns. Otherwise, the prediction call will be made + using the shared endpoint dns. + fast_tryout_enabled (bool): Optional. Defaults to False. If True, model + will be deployed using faster deployment path. Useful for quick + experiments. Not for production workloads. Only available for most + popular models with certain machine types. + system_labels (Dict[str, str]): Optional. System labels for Model Garden + deployments. These labels are managed by Google and for tracking + purposes only. + endpoint_display_name: The display name of the created endpoint. + model_display_name: The display name of the uploaded model. + deploy_request_timeout: The timeout for the deploy request. Default is 2 + hours. + serving_container_spec (types.ModelContainerSpec): Optional. The + container specification for the model instance. This specification + overrides the default container specification and other serving + container parameters. + serving_container_image_uri (str): Optional. 
The URI of the Model + serving container. This parameter is required if the parameter + `local_model` is not specified. + serving_container_predict_route (str): Optional. An HTTP path to send + prediction requests to the container, and which must be supported by + it. If not specified a default HTTP path will be used by Gemini Enterprise Agent Platform. + serving_container_health_route (str): Optional. An HTTP path to send + health check requests to the container, and which must be supported by + it. If not specified a standard HTTP path will be used by Gemini Enterprise Agent Platform. + serving_container_command: Optional[Sequence[str]]=None, The command + with which the container is run. Not executed within a shell. The + Docker image's ENTRYPOINT is used if this is not provided. Variable + references $(VAR_NAME) are expanded using the container's environment. + If a variable cannot be resolved, the reference in the input string + will be unchanged. The $(VAR_NAME) syntax can be escaped with a double + $$, ie: $$(VAR_NAME). Escaped references will never be expanded, + regardless of whether the variable exists or not. + serving_container_args: Optional[Sequence[str]]=None, The arguments to + the command. The Docker image's CMD is used if this is not provided. + Variable references $(VAR_NAME) are expanded using the container's + environment. If a variable cannot be resolved, the reference in the + input string will be unchanged. The $(VAR_NAME) syntax can be escaped + with a double $$, ie: $$(VAR_NAME). Escaped references will never be + expanded, regardless of whether the variable exists or not. + serving_container_environment_variables: Optional[Dict[str, str]]=None, + The environment variables that are to be present in the container. + Should be a dictionary where keys are environment variable names and + values are environment variable values for those names. + serving_container_ports: Optional[Sequence[int]]=None, Declaration of + ports that are exposed by the container. This field is primarily + informational, it gives Gemini Enterprise Agent Platform information about the network + connections the container uses. Listing or not a port here has no + impact on whether the port is actually exposed, any port listening on + the default "0.0.0.0" address inside a container will be accessible + from the network. + serving_container_grpc_ports: Optional[Sequence[int]]=None, Declaration + of ports that are exposed by the container. Gemini Enterprise Agent + Platform sends gRPC + prediction requests that it receives to the first port on this list. + Gemini Enterprise Agent Platform also sends liveness and health checks to this port. If you + do not specify this field, gRPC requests to the container will be + disabled. Gemini Enterprise Agent Platform does not use ports other than the first one + listed. This field corresponds to the `ports` field of the Kubernetes + Containers v1 core API. + serving_container_deployment_timeout (int): Optional. Deployment timeout + in seconds. + serving_container_shared_memory_size_mb (int): Optional. The amount of + the VM memory to reserve as the shared memory for the model in + megabytes. + serving_container_startup_probe_exec (Sequence[str]): Optional. Exec + specifies the action to take. Used by startup probe. An example of + this argument would be ["cat", "/tmp/healthy"] + serving_container_startup_probe_period_seconds (int): Optional. How + often (in seconds) to perform the startup probe. Default to 10 + seconds. Minimum value is 1. 
+ serving_container_startup_probe_timeout_seconds (int): Optional. Number + of seconds after which the startup probe times out. Defaults to 1 + second. Minimum value is 1. + serving_container_health_probe_exec (Sequence[str]): Optional. Exec + specifies the action to take. Used by health probe. An example of this + argument would be ["cat", "/tmp/healthy"] + serving_container_health_probe_period_seconds (int): Optional. How often + (in seconds) to perform the health probe. Default to 10 seconds. + Minimum value is 1. + serving_container_health_probe_timeout_seconds (int): Optional. Number + of seconds after which the health probe times out. Defaults to 1 + second. Minimum value is 1. + enable_private_service_connect (bool): Whether to enable private service + connect. + psc_project_allow_list (Sequence[str]): The list of projects that are + allowed to access the endpoint over private service connect. + + Returns: + endpoint (aiplatform.Endpoint): + Created endpoint. + + Raises: + ValueError: If ``serving_container_spec`` is specified but + ``serving_container_spec.image_uri`` + is ``None``, or if ``serving_container_spec`` is specified but other + serving container parameters are specified. + """ + request = types.DeployRequest( + destination=f"projects/{self._project}/locations/{self._location}", + ) + if self._is_hugging_face_model: + request.hugging_face_model_id = self._model_name.lower() + else: + request.publisher_model_name = self._publisher_model_name + + if endpoint_display_name: + request.endpoint_config.endpoint_display_name = endpoint_display_name + if model_display_name: + request.model_config.model_display_name = model_display_name + + if accept_eula: + request.model_config.accept_eula = accept_eula + + if hugging_face_access_token: + request.model_config.hugging_face_access_token = hugging_face_access_token + + provided_custom_machine_spec = ( + machine_type or accelerator_type or accelerator_count + ) + if provided_custom_machine_spec: + dedicated_resources = types.DedicatedResources( + machine_spec=types.MachineSpec( + machine_type=machine_type, + accelerator_type=accelerator_type, + accelerator_count=accelerator_count, + ), + min_replica_count=min_replica_count, + max_replica_count=max_replica_count, + ) + request.deploy_config.dedicated_resources = dedicated_resources + if spot: + request.deploy_config.dedicated_resources.spot = True + + if reservation_affinity_type: + request.deploy_config.dedicated_resources.machine_spec.reservation_affinity.reservation_affinity_type = ( + reservation_affinity_type + ) + if reservation_affinity_key and reservation_affinity_values: + request.deploy_config.dedicated_resources.machine_spec.reservation_affinity.key = ( + reservation_affinity_key + ) + request.deploy_config.dedicated_resources.machine_spec.reservation_affinity.values = ( + reservation_affinity_values + ) + + # TODO(b/417560875): Remove this once notebooks are migrated to use dedicated_endpoint_disabled. 
+ if use_dedicated_endpoint: + request.endpoint_config.dedicated_endpoint_enabled = use_dedicated_endpoint + + if dedicated_endpoint_disabled: + request.endpoint_config.dedicated_endpoint_disabled = ( + dedicated_endpoint_disabled + ) + + if enable_private_service_connect and psc_project_allow_list: + request.endpoint_config.private_service_connect_config = ( + types.PrivateServiceConnectConfig( + enable_private_service_connect=enable_private_service_connect, + project_allowlist=psc_project_allow_list, + ) + ) + + if fast_tryout_enabled: + request.deploy_config.fast_tryout_enabled = fast_tryout_enabled + + if system_labels: + request.deploy_config.system_labels = system_labels + + if serving_container_spec: + if not serving_container_spec.image_uri: + raise ValueError( + "Serving container image uri is required for the serving container" + " spec." + ) + if serving_container_image_uri: + raise ValueError( + "Serving container image uri is already set in the serving" + " container spec." + ) + request.model_config.container_spec = serving_container_spec + + if serving_container_image_uri: + request.model_config.container_spec = _construct_serving_container_spec( + serving_container_image_uri, + serving_container_predict_route, + serving_container_health_route, + serving_container_command, + serving_container_args, + serving_container_environment_variables, + serving_container_ports, + serving_container_grpc_ports, + serving_container_deployment_timeout, + serving_container_shared_memory_size_mb, + serving_container_startup_probe_exec, + serving_container_startup_probe_period_seconds, + serving_container_startup_probe_timeout_seconds, + serving_container_health_probe_exec, + serving_container_health_probe_period_seconds, + serving_container_health_probe_timeout_seconds, + ) + + _LOGGER.info(f"Deploying model: {self._model_name}") + + operation_future = self._model_garden_client.deploy(request) + _LOGGER.info(f"LRO: {operation_future.operation.name}") + + _LOGGER.info(f"Start time: {datetime.datetime.now()}") + deploy_response = operation_future.result( + timeout=deploy_request_timeout or _DEFAULT_TIMEOUT + ) + _LOGGER.info(f"End time: {datetime.datetime.now()}") + + self._endpoint_name = deploy_response.endpoint + _LOGGER.info(f"Endpoint: {self._endpoint_name}") + endpoint = aiplatform.Endpoint._construct_sdk_resource_from_gapic( + aiplatform_models.gca_endpoint_compat.Endpoint(name=self._endpoint_name), + ) + return endpoint + + def list_deploy_options( + self, + concise: bool = False, + serving_container_image_uri_filter: Optional[Union[str, List[str]]] = None, + machine_type_filter: Optional[str] = None, + accelerator_type_filter: Optional[str] = None, + ) -> Union[str, Sequence[types.PublisherModel.CallToAction.Deploy]]: + """Lists the verified deploy options for the model. + + Args: + concise: If true, returns a human-readable string with container and + machine specs. + serving_container_image_uri_filter: If specified, only return the + deploy options where the serving container image URI contains one of + the specified keyword(s) (e.g., "vllm" or ["vllm", "tgi"]). The + filter is case-insensitive. + machine_type_filter: If specified, only return the deploy options + where the machine type contains one of the specified keyword(s) + (e.g., "n1" or ["n1", "g2"]). The filter is case-insensitive. + accelerator_type_filter: If specified, only return the deploy options + where the accelerator type contains one of the specified keyword(s) + (e.g., "T4" or ["T4", "L4"]). 
The filter is case-insensitive. + + Returns: + A list of deploy options or a concise formatted string. + """ + request = types.GetPublisherModelRequest( + name=self._publisher_model_name, + is_hugging_face_model=bool(self._is_hugging_face_model), + include_equivalent_model_garden_model_deployment_configs=True, + ) + response = self._us_central1_model_garden_client.get_publisher_model(request) + deploy_options = ( + response.supported_actions.multi_deploy_vertex.multi_deploy_vertex + ) + + if not deploy_options: + raise ValueError( + "Model does not support deployment. " + "Use `list_deployable_models()` to find supported models." + ) + + if serving_container_image_uri_filter: + if isinstance(serving_container_image_uri_filter, str): + serving_container_image_uri_filter = [ + serving_container_image_uri_filter + ] + serving_container_image_uri_filter = [ + f.lower() for f in serving_container_image_uri_filter + ] + deploy_options = [ + option + for option in deploy_options + if option.container_spec + and any( + f in option.container_spec.image_uri.lower() + for f in serving_container_image_uri_filter + ) + ] + + if machine_type_filter: + filters = ( + [machine_type_filter] + if isinstance(machine_type_filter, str) + else machine_type_filter + ) + deploy_options = [ + option + for option in deploy_options + if option.dedicated_resources + and option.dedicated_resources.machine_spec + and any( + f.lower() + in option.dedicated_resources.machine_spec.machine_type.lower() + for f in filters + ) + ] + + if accelerator_type_filter: + filters = ( + [accelerator_type_filter] + if isinstance(accelerator_type_filter, str) + else accelerator_type_filter + ) + deploy_options = [ + option + for option in deploy_options + if option.dedicated_resources + and option.dedicated_resources.machine_spec + and option.dedicated_resources.machine_spec.accelerator_type + and any( + f.lower() + in option.dedicated_resources.machine_spec.accelerator_type.name.lower() + for f in filters + ) + ] + + if not deploy_options: + raise ValueError("No deploy options found.") + + if not concise: + return deploy_options + + def _extract_config(option): + container = ( + option.container_spec.image_uri if option.container_spec else None + ) + machine = ( + option.dedicated_resources.machine_spec + if option.dedicated_resources + else None + ) + option_name = getattr(option, "deploy_task_name", None) + + return { + "option_name": option_name, + "serving_container_image_uri": container, + "machine_type": getattr(machine, "machine_type", None), + "accelerator_type": getattr( + getattr(machine, "accelerator_type", None), "name", None + ), + "accelerator_count": getattr(machine, "accelerator_count", None), + } + + concise_deploy_options = [_extract_config(opt) for opt in deploy_options] + return "\n\n".join( + ( + f"[Option {i + 1}: {config['option_name']}]\n" + if config.get("option_name") + else f"[Option {i + 1}]\n" + ) + + "\n".join( + f' {k}="{v}",' if k != "accelerator_count" else f" {k}={v}," + for k, v in config.items() + if v is not None and k != "option_name" + ) + for i, config in enumerate(concise_deploy_options) + ) + + def batch_predict( + self, + input_dataset: Union[str, List[str]], + *, + output_uri_prefix: Optional[str] = None, + job_display_name: Optional[str] = None, + machine_type: Optional[str] = None, + accelerator_type: Optional[str] = None, + accelerator_count: Optional[int] = None, + starting_replica_count: Optional[int] = None, + max_replica_count: Optional[int] = None, + ) -> 
batch_prediction.BatchPredictionJob: + """Perform batch prediction on the model. + + Args: + input_dataset (Union[str, List[str]]): GCS URI(-s) or BigQuery URI to + your input data to run batch prediction on. Example: + "gs://path/to/input/data.jsonl" or + "bq://projectId.bqDatasetId.bqTableId" + output_uri_prefix (Optional[str]): GCS or BigQuery URI prefix for the + output predictions. Example: "gs://path/to/output/data" or + "bq://projectId.bqDatasetId" If not specified, + f"{STAGING_BUCKET}/gen-ai-batch-prediction" will be used for GCS + source and + f"bq://projectId.gen_ai_batch_prediction.predictions_{TIMESTAMP}" will + be used for BigQuery source. + job_display_name (Optional[str]): The user-defined name of the + BatchPredictionJob. The name can be up to 128 characters long and can + be consist of any UTF-8 characters. + machine_type (Optional[str]): The machine type for the batch prediction + job. + accelerator_type (Optional[str]): The accelerator type for the batch + prediction job. + accelerator_count (Optional[int]): The accelerator count for the batch + prediction job. + starting_replica_count (Optional[int]): The starting replica count for + the batch prediction job. + max_replica_count (Optional[int]): The maximum replica count for the + batch prediction job. + + Returns: + batch_prediction.BatchPredictionJob: + The batch prediction job. + """ + return batch_prediction.BatchPredictionJob.submit( + source_model=self._publisher_model_name, + input_dataset=input_dataset, + output_uri_prefix=output_uri_prefix, + job_display_name=job_display_name, + machine_type=machine_type, + accelerator_type=accelerator_type, + accelerator_count=accelerator_count, + starting_replica_count=starting_replica_count, + max_replica_count=max_replica_count, + ) + + def check_license_agreement_status(self) -> bool: + """Check whether the project has accepted the license agreement of the model. + + EULA (End User License Agreement) is a legal document that the user must + accept before using the model. For Models having license restrictions, + the user must accept the EULA before using the model. You can check the + details of the License in Model Garden. + + Returns: + bool : True if the project has accepted the End User License + Agreement, False otherwise. + """ + request = types.CheckPublisherModelEulaAcceptanceRequest( + parent=f"projects/{self._project}", + publisher_model=self._publisher_model_name, + ) + response = self._model_garden_client.check_publisher_model_eula_acceptance( + request + ) + return response.publisher_model_eula_acked + + def accept_model_license_agreement( + self, + ) -> types.model_garden_service.PublisherModelEulaAcceptance: + """Accepts the EULA(End User License Agreement) of the model for the project. + + For Models having license restrictions, the user must accept the EULA + before using the model. Calling this method will mark the EULA as accepted + for the project. + + Returns: + types.model_garden_service.PublisherModelEulaAcceptance: + The response of the accept_eula call, containing project number, + model name and acceptance status. + """ + request = types.AcceptPublisherModelEulaRequest( + parent=f"projects/{self._project}", + publisher_model=self._publisher_model_name, + ) + return self._model_garden_client.accept_publisher_model_eula(request) + + +class CustomModel: + """Represents a Model Garden Custom model.""" + + def __init__( + self, + gcs_uri: Optional[str] = None, + ): + r"""Initializes a Model Garden Custom model. 
+ + Usage: + + ``` + model = agentplatform.CustomModel( + gcs_uri = 'gs://tuning-job-output/node-0/checkpoints/final') + ``` + + Args: + gcs_uri: The GCS URI of the custom model, storing weights and config + files + """ + if not gcs_uri: + raise ValueError("gcs_uri must be specified.") + + project = initializer.global_config.project + location = initializer.global_config.location + credentials = initializer.global_config.credentials + + self._gcs_uri = gcs_uri + self._project = project + self._location = location + self._credentials = credentials + + @functools.cached_property + def _model_garden_client( + self, + ) -> model_garden_service.ModelGardenServiceClient: + """Returns the Model Garden client.""" + return initializer.global_config.create_client( + client_class=_ModelGardenClientWithOverride, + credentials=self._credentials, + location_override=self._location, + ) + + @functools.cached_property + def _model_service_client( + self, + ) -> model_service.ModelServiceClient: + """Returns the Model Service client.""" + return initializer.global_config.create_client( + client_class=_ModelServiceClientWithOverride, + credentials=self._credentials, + location_override=self._location, + ) + + def list_deploy_options( + self, + available_machines: bool = True, + filter_by_user_quota: bool = True, + request_timeout: Optional[float] = None, + ) -> str: + """Lists the deploy options for the model. + + Args: + available_machines: If true, only return the deploy options for + available machines. + filter_by_user_quota: If true, only return the deploy options for + machines that the user has quota for. + request_timeout: The timeout for the recommend spec request. + Default is 60 seconds. + + Returns: + str: A string of the deploy options represented by + machine spec and container spec. 
+ + """ + + def _extract_spec(spec): + machine_spec = spec.machine_spec + return { + "machine_type": getattr(machine_spec, "machine_type", None), + "accelerator_type": getattr( + getattr(machine_spec, "accelerator_type", None), "name", None + ), + "accelerator_count": getattr(machine_spec, "accelerator_count", None), + } + + def _extract_recommendation(recommendation): + extracted_spec = _extract_spec(recommendation.spec) + extracted_spec["region"] = getattr(recommendation, "region", None) + if ( + recommendation.user_quota_state + and recommendation.user_quota_state + != types.RecommendSpecResponse.Recommendation.QuotaState.QUOTA_STATE_UNSPECIFIED + ): + extracted_spec["user_quota_state"] = getattr( + getattr(recommendation, "user_quota_state", None), "name", None + ) + return extracted_spec + + request = types.RecommendSpecRequest( + gcs_uri=self._gcs_uri, + parent=f"projects/{self._project}/locations/{self._location}", + check_machine_availability=available_machines, + check_user_quota=filter_by_user_quota, + ) + try: + response = self._model_service_client.recommend_spec( + request, timeout=request_timeout or _DEFAULT_RECOMMEND_SPEC_TIMEOUT + ) + options = [] + if response.recommendations: + options = [ + _extract_recommendation(recommendation) + for recommendation in response.recommendations + if recommendation.spec + ] + if filter_by_user_quota: + options = [ + option + for option in options + if option.get("user_quota_state") + == "QUOTA_STATE_USER_HAS_QUOTA" + ] + elif response.specs: + options = [_extract_spec(spec) for spec in response.specs if spec] + return "\n\n".join( + f"[Option {i + 1}]\n" + + ",\n".join( + f' {k}="{v}"' if k != "accelerator_count" else f" {k}={v}" + for k, v in config.items() + if v is not None + ) + for i, config in enumerate(options) + ) + + except Exception as e: + _LOGGER.error(f"Failed to list deploy options: {e}") + raise e + + def deploy( + self, + machine_type: Optional[str] = None, + min_replica_count: int = 1, + max_replica_count: int = 1, + accelerator_type: Optional[str] = None, + accelerator_count: Optional[int] = None, + reservation_affinity_type: Optional[str] = None, + reservation_affinity_key: Optional[str] = None, + reservation_affinity_values: Optional[List[str]] = None, + system_labels: Optional[Dict[str, str]] = None, + endpoint_display_name: Optional[str] = None, + model_display_name: Optional[str] = None, + enable_private_service_connect: bool = False, + psc_project_allow_list: Optional[List[str]] = None, + deploy_request_timeout: Optional[float] = None, + ) -> aiplatform.Endpoint: + """Deploys a Custom Model to an endpoint. + + Args: + machine_type (str): Optional. The type of machine. Not specifying + machine type will result in model to be deployed with automatic + resources. + min_replica_count (int): Optional. The minimum number of machine + replicas this deployed model will be always deployed on. If traffic + against it increases, it may dynamically be deployed onto more + replicas, and as traffic decreases, some of these extra replicas may + be freed. + max_replica_count (int): Optional. The maximum number of replicas this + deployed model may be deployed on when the traffic against it + increases. If requested value is too large, the deployment will error, + but if deployment succeeds then the ability to scale the model to that + many replicas is guaranteed (barring service outages). If traffic + against the deployed model increases beyond what its replicas at + maximum may handle, a portion of the traffic will be dropped. 
If this + value is not provided, the larger value of min_replica_count or 1 will + be used. If value provided is smaller than min_replica_count, it will + automatically be increased to be min_replica_count. + accelerator_type (str): Optional. Hardware accelerator type. Must also + set accelerator_count if used. + accelerator_count (int): Optional. The number of accelerators to attach + to a worker replica. + reservation_affinity_type (str): Optional. The type of reservation + affinity. One of NO_RESERVATION, ANY_RESERVATION, + SPECIFIC_RESERVATION, SPECIFIC_THEN_ANY_RESERVATION, + SPECIFIC_THEN_NO_RESERVATION + reservation_affinity_key (str): Optional. Corresponds to the label key + of a reservation resource. To target a SPECIFIC_RESERVATION by name, + use `compute.googleapis.com/reservation-name` as the key and specify + the name of your reservation as its value. + reservation_affinity_values (List[str]): Optional. Corresponds to the + label values of a reservation resource. This must be the full resource + name of the reservation. + Format: + 'projects/{project_id_or_number}/zones/{zone}/reservations/{reservation_name}' + system_labels (Dict[str, str]): Optional. System labels for Model Garden deployments. + endpoint_display_name: The display name of the created endpoint. + model_display_name: The display name of the custom model. + enable_private_service_connect (bool): Whether to enable private service + connect. + psc_project_allow_list (List[str]): The list of projects that are allowed to + access the endpoint over private service connect. + deploy_request_timeout: The timeout for the deploy request. Default is 2 + hours. + + Returns: + endpoint (aiplatform.Endpoint): + Created endpoint. + """ + return self._deploy_gcs_uri( + machine_type=machine_type, + min_replica_count=min_replica_count, + max_replica_count=max_replica_count, + accelerator_type=accelerator_type, + accelerator_count=accelerator_count, + reservation_affinity_type=reservation_affinity_type, + reservation_affinity_key=reservation_affinity_key, + reservation_affinity_values=reservation_affinity_values, + system_labels=system_labels, + endpoint_display_name=endpoint_display_name, + model_display_name=model_display_name, + enable_private_service_connect=enable_private_service_connect, + psc_project_allow_list=psc_project_allow_list, + deploy_request_timeout=deploy_request_timeout, + ) + + def _deploy_model_registry_model(self) -> aiplatform.Endpoint: + """Deploys a Model Registry model to an endpoint.""" + raise NotImplementedError( + "Not implemented yet. Please provide gcs_uri in CustomModel constructor." + ) + + def _deploy_gcs_uri( + self, + machine_type: Optional[str] = None, + min_replica_count: int = 1, + max_replica_count: int = 1, + accelerator_type: Optional[str] = None, + accelerator_count: Optional[int] = None, + enable_private_service_connect: bool = False, + psc_project_allow_list: Optional[List[str]] = None, + reservation_affinity_type: Optional[str] = None, + reservation_affinity_key: Optional[str] = None, + reservation_affinity_values: Optional[List[str]] = None, + system_labels: Optional[Dict[str, str]] = None, + endpoint_display_name: Optional[str] = None, + model_display_name: Optional[str] = None, + deploy_request_timeout: Optional[float] = None, + ) -> aiplatform.Endpoint: + """Deploys a Custom Model to an endpoint. + + Args: + machine_type (str): Optional. The type of machine. Not specifying + machine type will result in model to be deployed with automatic + resources. 
+ min_replica_count (int): Optional. The minimum number of machine + replicas this deployed model will be always deployed on. If traffic + against it increases, it may dynamically be deployed onto more + replicas, and as traffic decreases, some of these extra replicas may + be freed. + max_replica_count (int): Optional. The maximum number of replicas this + deployed model may be deployed on when the traffic against it + increases. If requested value is too large, the deployment will error, + but if deployment succeeds then the ability to scale the model to that + many replicas is guaranteed (barring service outages). If traffic + against the deployed model increases beyond what its replicas at + maximum may handle, a portion of the traffic will be dropped. If this + value is not provided, the larger value of min_replica_count or 1 will + be used. If value provided is smaller than min_replica_count, it will + automatically be increased to be min_replica_count. + accelerator_type (str): Optional. Hardware accelerator type. Must also + set accelerator_count if used. One of ACCELERATOR_TYPE_UNSPECIFIED, + NVIDIA_TESLA_K80, NVIDIA_TESLA_P100, NVIDIA_TESLA_V100, + NVIDIA_TESLA_P4, NVIDIA_TESLA_T4 + accelerator_count (int): Optional. The number of accelerators to attach + to a worker replica. + reservation_affinity_type (str): Optional. The type of reservation + affinity. One of NO_RESERVATION, ANY_RESERVATION, + SPECIFIC_RESERVATION, SPECIFIC_THEN_ANY_RESERVATION, + SPECIFIC_THEN_NO_RESERVATION + reservation_affinity_key (str): Optional. Corresponds to the label key + of a reservation resource. To target a SPECIFIC_RESERVATION by name, + use `compute.googleapis.com/reservation-name` as the key and specify + the name of your reservation as its value. + reservation_affinity_values (List[str]): Optional. Corresponds to the + label values of a reservation resource. This must be the full resource + name of the reservation. + Format: + 'projects/{project_id_or_number}/zones/{zone}/reservations/{reservation_name}' + system_labels (Dict[str, str]): Optional. System labels for Model Garden deployments. + endpoint_display_name: The display name of the created endpoint. + model_display_name: The display name of the custom model. + enable_private_service_connect (bool): Whether to enable private service + connect. + psc_project_allow_list (List[str]): The list of projects that are allowed to + access the endpoint over private service connect. + deploy_request_timeout: The timeout for the deploy request. Default is 2 + hours. + + Returns: + endpoint (aiplatform.Endpoint): + Created endpoint. + """ + + # Validation on machine type, accelerator type and count. + # Return true if all three of them have value or are None. + def has_all_or_none_values(var1, var2, var3) -> bool: + return (var1 and var2 and var3) or (not var1 and not var2 and not var3) + + if not has_all_or_none_values( + machine_type, accelerator_type, accelerator_count + ): + raise ValueError( + "machine_type, accelerator_type and accelerator_count must all be provided or not provided." 
+ ) + + request = types.DeployRequest( + destination=f"projects/{self._project}/locations/{self._location}", + ) + request.custom_model = types.DeployRequest.CustomModel(gcs_uri=self._gcs_uri) + if endpoint_display_name: + request.endpoint_config.endpoint_display_name = endpoint_display_name + if model_display_name: + request.model_config.model_display_name = model_display_name + if system_labels: + request.deploy_config.system_labels = system_labels + + if enable_private_service_connect and psc_project_allow_list: + request.endpoint_config.private_service_connect_config = ( + types.PrivateServiceConnectConfig( + enable_private_service_connect=enable_private_service_connect, + project_allowlist=psc_project_allow_list, + ) + ) + + if machine_type and accelerator_type and accelerator_count: + request.deploy_config.dedicated_resources = types.DedicatedResources( + machine_spec=types.MachineSpec( + machine_type=machine_type, + accelerator_type=accelerator_type, + accelerator_count=accelerator_count, + ) + ) + if min_replica_count: + request.deploy_config.dedicated_resources.min_replica_count = ( + min_replica_count + ) + if max_replica_count: + request.deploy_config.dedicated_resources.max_replica_count = ( + max_replica_count + ) + + if reservation_affinity_type: + request.deploy_config.dedicated_resources.machine_spec.reservation_affinity.reservation_affinity_type = ( + reservation_affinity_type + ) + if reservation_affinity_key and reservation_affinity_values: + request.deploy_config.dedicated_resources.machine_spec.reservation_affinity.key = ( + reservation_affinity_key + ) + request.deploy_config.dedicated_resources.machine_spec.reservation_affinity.values = ( + reservation_affinity_values + ) + + _LOGGER.info(f"Deploying custom model: {self._gcs_uri}") + + try: + operation_future = self._model_garden_client.deploy(request) + _LOGGER.info(f"LRO: {operation_future.operation.name}") + deploy_response = operation_future.result( + timeout=deploy_request_timeout or _DEFAULT_TIMEOUT + ) + _LOGGER.info(f"End time: {datetime.datetime.now()}") + self._endpoint_name = deploy_response.endpoint + _LOGGER.info(f"Endpoint: {self._endpoint_name}") + endpoint = aiplatform.Endpoint._construct_sdk_resource_from_gapic( + aiplatform_models.gca_endpoint_compat.Endpoint( + name=self._endpoint_name + ), + ) + return endpoint + except ValueError as e: + _LOGGER.error(f"Failed to deploy custom model: {e}") + raise e + + +class PartnerModel: + """Represents a Model Garden Partner model.""" + + def __init__( + self, + model_name: str, + ): + r"""Initializes a Model Garden partner model. + + Usage: + + ``` + model = PartnerModel("publishers/ai21/models/jamba-large-1.6@001") + ``` + + Args: + model_name: Model Garden model resource name in the format of + `publishers/{publisher}/models/{model}@{version}`, or a simplified + resource name in the format of `{publisher}/{model}@{version}`. 
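+
+            For example, `ai21/jamba-large-1.6@001` is the simplified form of
+            `publishers/ai21/models/jamba-large-1.6@001`; both refer to the same
+            partner model.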
+        """
+        project = initializer.global_config.project
+        location = initializer.global_config.location
+        credentials = initializer.global_config.credentials
+
+        self._model_name = model_name
+        self._publisher_model_name = _reconcile_model_name(model_name)
+        self._project = project
+        self._location = location
+        self._credentials = credentials
+
+    @functools.cached_property
+    def _model_garden_client(
+        self,
+    ) -> model_garden_service.ModelGardenServiceClient:
+        """Returns the Model Garden client."""
+        return initializer.global_config.create_client(
+            client_class=_ModelGardenClientWithOverride,
+            credentials=self._credentials,
+            location_override=self._location,
+        )
+
+    def deploy(
+        self,
+        machine_type: Optional[str] = None,
+        min_replica_count: int = 1,
+        max_replica_count: int = 1,
+        accelerator_type: Optional[str] = None,
+        accelerator_count: Optional[int] = None,
+        endpoint_display_name: Optional[str] = None,
+        model_display_name: Optional[str] = None,
+        deploy_request_timeout: Optional[float] = None,
+    ) -> aiplatform.Endpoint:
+        """Deploys a partner model to an endpoint.
+
+        Args:
+          machine_type (str): Optional. The type of machine. If no machine type is
+            specified, the model is deployed with automatic resources.
+          min_replica_count (int): Optional. The minimum number of machine
+            replicas that this deployed model will always be deployed on. If
+            traffic against it increases, it may dynamically be deployed onto more
+            replicas, and as traffic decreases, some of these extra replicas may
+            be freed.
+          max_replica_count (int): Optional. The maximum number of replicas that
+            this deployed model may be deployed on when the traffic against it
+            increases. If the requested value is too large, the deployment will
+            error, but if the deployment succeeds then the ability to scale the
+            model to that many replicas is guaranteed (barring service outages).
+            If traffic against the deployed model increases beyond what its
+            replicas at maximum may handle, a portion of the traffic will be
+            dropped. If this value is not provided, the larger value of
+            min_replica_count or 1 will be used. If the provided value is smaller
+            than min_replica_count, it will automatically be increased to
+            min_replica_count.
+          accelerator_type (str): Optional. Hardware accelerator type. Must also
+            set accelerator_count if used. One of ACCELERATOR_TYPE_UNSPECIFIED,
+            NVIDIA_TESLA_K80, NVIDIA_TESLA_P100, NVIDIA_TESLA_V100,
+            NVIDIA_TESLA_P4, NVIDIA_TESLA_T4
+          accelerator_count (int): Optional. The number of accelerators to attach
+            to a worker replica.
+          endpoint_display_name: The display name of the created endpoint.
+          model_display_name: The display name of the uploaded model.
+          deploy_request_timeout: The timeout for the deploy request, in seconds.
+            Default is 2 hours.
+
+        Returns:
+            endpoint (aiplatform.Endpoint):
+                Created endpoint.
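+
+        Example (the machine spec below is illustrative, not a verified
+        configuration for any particular partner model):
+
+        ```
+        model = PartnerModel("ai21/jamba-large-1.6@001")
+        endpoint = model.deploy(
+            machine_type="n1-standard-4",
+            accelerator_type="NVIDIA_TESLA_T4",
+            accelerator_count=1,
+        )
+        ```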
+        """
+        request = types.DeployRequest(
+            destination=f"projects/{self._project}/locations/{self._location}",
+        )
+        request.publisher_model_name = self._publisher_model_name
+
+        if endpoint_display_name:
+            request.endpoint_config.endpoint_display_name = endpoint_display_name
+        if model_display_name:
+            request.model_config.model_display_name = model_display_name
+
+        provided_custom_machine_spec = (
+            machine_type or accelerator_type or accelerator_count
+        )
+        if provided_custom_machine_spec:
+            dedicated_resources = types.DedicatedResources(
+                machine_spec=types.MachineSpec(
+                    machine_type=machine_type,
+                    accelerator_type=accelerator_type,
+                    accelerator_count=accelerator_count,
+                ),
+                min_replica_count=min_replica_count,
+                max_replica_count=max_replica_count,
+            )
+            request.deploy_config.dedicated_resources = dedicated_resources
+
+        _LOGGER.info(f"Deploying model: {self._model_name}")
+
+        operation_future = self._model_garden_client.deploy(request)
+        _LOGGER.info(f"LRO: {operation_future.operation.name}")
+
+        _LOGGER.info(f"Start time: {datetime.datetime.now()}")
+        deploy_response = operation_future.result(
+            timeout=deploy_request_timeout or _DEFAULT_TIMEOUT
+        )
+        _LOGGER.info(f"End time: {datetime.datetime.now()}")
+
+        self._endpoint_name = deploy_response.endpoint
+        _LOGGER.info(f"Endpoint: {self._endpoint_name}")
+        endpoint = aiplatform.Endpoint._construct_sdk_resource_from_gapic(
+            aiplatform_models.gca_endpoint_compat.Endpoint(name=self._endpoint_name),
+        )
+        return endpoint
+
+
+class Model:
+    """Represents a Model Garden model."""
+
+    def __init__(
+        self,
+        model_name: Optional[str] = None,
+    ):
+        r"""Initializes a Model Garden model.
+
+        Usage:
+
+        ```
+        model = Model("publishers/google/models/gemma3@gemma-3-27b-it")
+        model = Model("google/gemma3@gemma-3-27b-it")
+        model = Model("deepseek-ai/DeepSeek-V3-0324")
+        model = Model("gs://fine-tuning-output/node-0/checkpoints/final")
+        model = Model("projects/123/locations/us-central1/models/456")
+        ```
+
+        Args:
+          model_name: Name of the model artifact.
+
+            It can be:
+            1. A pretrained model:
+              1.1 a Model Garden model resource name in the format of
+              `publishers/{publisher}/models/{model}@{version}`, or
+              1.2 a simplified resource name in the format of
+              `{publisher}/{model}@{version}`, or
+              1.3 a Hugging Face model ID in the format of
+              `{organization}/{model}`.
+            2. Custom model weights, like
+              gs://fine-tuning-output/node-0/checkpoints/final.
+            3. A Model Registry model, like
+              projects/123/locations/us-central1/models/456 (not supported yet).
+        """
+        if not model_name:
+            raise ValueError("model_name must be specified.")
+
+        if re.match(r"^gs://", model_name):
+            self._model = CustomModel(gcs_uri=model_name)
+        elif re.match(r"^projects/.*/locations/.*/models/.*", model_name):
+            raise NotImplementedError("Model Registry models are not supported yet.")
+        else:
+            self._model = OpenModel(model_name)
+
+    def deploy(
+        self,
+        **kwargs,
+    ) -> aiplatform.Endpoint:
+        """Deploys the model to an endpoint."""
+        return self._model.deploy(**kwargs)
diff --git a/agentplatform/preview/__init__.py b/agentplatform/preview/__init__.py
new file mode 100644
index 0000000000..bb17a41dfd
--- /dev/null
+++ b/agentplatform/preview/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""The agentplatform.preview module.""" diff --git a/agentplatform/preview/model_garden.py b/agentplatform/preview/model_garden.py new file mode 100644 index 0000000000..870104ec73 --- /dev/null +++ b/agentplatform/preview/model_garden.py @@ -0,0 +1,31 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Classes and functions for working with Model Garden.""" + +# pylint: disable=g-multiple-import,g-importing-member +from agentplatform.model_garden._model_garden import ( + Model, + CustomModel, + OpenModel, + list_deployable_models, +) + + +__all__ = ( + "Model", + "CustomModel", + "OpenModel", + "list_deployable_models", +) diff --git a/tests/unit/vertexai/model_garden/test_model_garden.py b/tests/unit/agentplatform/model_garden/test_model_garden.py similarity index 99% rename from tests/unit/vertexai/model_garden/test_model_garden.py rename to tests/unit/agentplatform/model_garden/test_model_garden.py index 0fa7cb565e..23c18d095f 100644 --- a/tests/unit/vertexai/model_garden/test_model_garden.py +++ b/tests/unit/agentplatform/model_garden/test_model_garden.py @@ -1,4 +1,4 @@ -# Copyright 2025 Google LLC +# Copyright 2026 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -33,9 +33,9 @@ from google.cloud.aiplatform_v1.types import manual_batch_tuning_parameters from google.cloud.aiplatform_v1beta1 import types from google.cloud.aiplatform_v1beta1.services import model_garden_service -from vertexai import batch_prediction -from vertexai import model_garden -from vertexai.preview import ( +from agentplatform import batch_prediction +from agentplatform import model_garden +from agentplatform.preview import ( model_garden as model_garden_preview, ) import pytest diff --git a/tests/unit/vertexai/model_garden/test_vertexai_model_garden.py b/tests/unit/vertexai/model_garden/test_vertexai_model_garden.py new file mode 100644 index 0000000000..7150043f08 --- /dev/null +++ b/tests/unit/vertexai/model_garden/test_vertexai_model_garden.py @@ -0,0 +1,1999 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit tests for ModelGarden class.""" + +import importlib +import textwrap +from unittest import mock + +from google import auth +from google.api_core import operation as ga_operation +from google.auth import credentials as auth_credentials +from google.cloud import aiplatform +from google.cloud.aiplatform.compat.services import job_service_client +from google.cloud.aiplatform.compat.types import ( + batch_prediction_job as gca_batch_prediction_job_compat, +) +from google.cloud.aiplatform.compat.types import io as gca_io_compat +from google.cloud.aiplatform.compat.types import ( + job_state as gca_job_state_compat, +) +from google.cloud.aiplatform_v1.types import machine_resources +from google.cloud.aiplatform_v1.types import manual_batch_tuning_parameters +from google.cloud.aiplatform_v1beta1 import types +from google.cloud.aiplatform_v1beta1.services import model_garden_service +from vertexai import batch_prediction +from vertexai import model_garden +from vertexai.preview import ( + model_garden as model_garden_preview, +) +import pytest + +from google.protobuf import duration_pb2 + + +_TEST_PROJECT = "test-project" +_TEST_LOCATION = "us-central1" +_TEST_PROJECT_NUMBER = "1234567890" + +_TEST_MODEL_FULL_RESOURCE_NAME = ( + "publishers/google/models/paligemma@paligemma-224-float32" +) +_TEST_HUGGING_FACE_MODEL_FULL_RESOURCE_NAME = ( + "publishers/meta-llama/models/llama-3.3-70b-instruct" +) +_TEST_PUBLISHER_MODEL_NAME = "publishers/google/models/paligemma" +_TEST_HUGGING_FACE_PUBLISHER_MODEL_NAME = "publishers/hf-google/models/gemma-2-2b" +_TEST_MODEL_SIMPLIFIED_RESOURCE_NAME = "google/paligemma@paligemma-224-float32" +_TEST_MODEL_HUGGING_FACE_ID = "meta-llama/Llama-3.3-70B-Instruct" +_TEST_MODEL_HUGGING_FACE_RESOURCE_NAME = ( + "publishers/hf-meta-llama/models/llama-3.3-70b-instruct" +) +# Note: The full resource name is in lower case. 
+_TEST_MODEL_HUGGING_FACE_FULL_RESOURCE_NAME = ( + "publishers/hf-meta-llama/models/llama-3.3-70b-instruct@001" +) +_TEST_HUGGING_FACE_ACCESS_TOKEN = "test-access-token" + +_TEST_PARTNER_MODEL_FULL_RESOURCE_NAME = "publishers/ai21/models/jamba-large-1.6@001" +_TEST_PARTNER_MODEL_SIMPLIFIED_RESOURCE_NAME = "ai21/jamba-large-1.6@001" + +_TEST_GCS_URI = "gs://some-bucket/some-model" +_TEST_ENDPOINT_NAME = "projects/test-project/locations/us-central1/endpoints/1234567890" +_TEST_MODEL_NAME = "projects/test-project/locations/us-central1/models/9876543210" +_TEST_IMAGE_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20241202_0916_RC00" +_TEST_MODEL_CONTAINER_SPEC = types.ModelContainerSpec( + image_uri=_TEST_IMAGE_URI, + command=["python", "main.py"], + args=["--model-id=gemma-2b"], + env=[types.EnvVar(name="MODEL_ID", value="gemma-2b")], + ports=[types.Port(container_port=7080)], + grpc_ports=[types.Port(container_port=7081)], + predict_route="/predictions/v1/predict", + health_route="/ping", + deployment_timeout=duration_pb2.Duration(seconds=1800), + shared_memory_size_mb=256, + startup_probe=types.Probe( + exec_=types.Probe.ExecAction(command=["python", "main.py"]), + period_seconds=10, + timeout_seconds=10, + ), + health_probe=types.Probe( + exec_=types.Probe.ExecAction(command=["python", "health_check.py"]), + period_seconds=10, + timeout_seconds=10, + ), +) +_TEST_BATCH_PREDICTION_JOB_ID = "123456789" +_TEST_PARENT = f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}" +_TEST_BATCH_PREDICTION_JOB_NAME = ( + f"{_TEST_PARENT}/batchPredictionJobs/{_TEST_BATCH_PREDICTION_JOB_ID}" +) +_TEST_BATCH_PREDICTION_MODEL_FULL_RESOURCE_NAME = ( + "publishers/google/models/gemma@gemma-2b-it" +) +_TEST_BATCH_PREDICTION_JOB_DISPLAY_NAME = "test-batch-prediction-job" +_TEST_JOB_STATE_RUNNING = gca_job_state_compat.JobState(3) +_TEST_GAPIC_BATCH_PREDICTION_JOB = gca_batch_prediction_job_compat.BatchPredictionJob( + name=_TEST_BATCH_PREDICTION_JOB_NAME, + display_name=_TEST_BATCH_PREDICTION_JOB_DISPLAY_NAME, + model=_TEST_BATCH_PREDICTION_MODEL_FULL_RESOURCE_NAME, + state=_TEST_JOB_STATE_RUNNING, +) +_TEST_BQ_INPUT_URI = "bq://test-project.test-dataset.test-input" +_TEST_BQ_OUTPUT_PREFIX = "bq://test-project.test-dataset.test-output" + + +@pytest.fixture(scope="module") +def google_auth_mock(): + with mock.patch.object(auth, "default") as google_auth_mock: + google_auth_mock.return_value = ( + auth_credentials.AnonymousCredentials(), + _TEST_PROJECT, + ) + yield google_auth_mock + + +@pytest.fixture +def export_publisher_model_mock(): + """Mocks the export_publisher_model method.""" + with mock.patch.object( + model_garden_service.ModelGardenServiceClient, + "export_publisher_model", + ) as export_publisher_model: + mock_export_lro = mock.Mock(ga_operation.Operation) + mock_export_lro.result.return_value = types.ExportPublisherModelResponse( + publisher_model=_TEST_MODEL_FULL_RESOURCE_NAME, + destination_uri=_TEST_GCS_URI, + ) + export_publisher_model.return_value = mock_export_lro + yield export_publisher_model + + +@pytest.fixture +def deploy_mock(): + """Mocks the deploy method.""" + with mock.patch.object( + model_garden_service.ModelGardenServiceClient, + "deploy", + ) as deploy: + mock_lro = mock.Mock(ga_operation.Operation) + mock_lro.result.return_value = types.DeployResponse( + endpoint=_TEST_ENDPOINT_NAME, + model=_TEST_MODEL_FULL_RESOURCE_NAME, + ) + deploy.return_value = mock_lro + yield deploy + + +@pytest.fixture +def batch_prediction_mock(): + """Mocks 
the create_batch_prediction_job method.""" + with mock.patch.object( + job_service_client.JobServiceClient, "create_batch_prediction_job" + ) as create_batch_prediction_job_mock: + create_batch_prediction_job_mock.return_value = _TEST_GAPIC_BATCH_PREDICTION_JOB + yield create_batch_prediction_job_mock + + +@pytest.fixture +def complete_bq_uri_mock(): + with mock.patch.object( + batch_prediction.BatchPredictionJob, "_complete_bq_uri" + ) as complete_bq_uri_mock: + complete_bq_uri_mock.return_value = _TEST_BQ_OUTPUT_PREFIX + yield complete_bq_uri_mock + + +@pytest.fixture +def get_publisher_model_mock(): + with mock.patch.object( + model_garden_service.ModelGardenServiceClient, "get_publisher_model" + ) as get_publisher_model_mock: + error_response = types.PublisherModel(name=_TEST_PUBLISHER_MODEL_NAME) + success_response = types.PublisherModel( + name=_TEST_PUBLISHER_MODEL_NAME, + supported_actions=types.PublisherModel.CallToAction( + multi_deploy_vertex=types.PublisherModel.CallToAction.DeployVertex( + multi_deploy_vertex=[ + types.PublisherModel.CallToAction.Deploy( + deploy_task_name="vLLM 32K context", + container_spec=types.ModelContainerSpec( + image_uri=_TEST_IMAGE_URI, + command=["python", "main.py"], + args=["--model-id=gemma-2b"], + env=[types.EnvVar(name="MODEL_ID", value="gemma-2b")], + ), + dedicated_resources=types.DedicatedResources( + machine_spec=types.MachineSpec( + machine_type="g2-standard-16", + accelerator_type="NVIDIA_L4", + accelerator_count=1, + ) + ), + ), + types.PublisherModel.CallToAction.Deploy( + deploy_task_name="vLLM 128K context", + container_spec=types.ModelContainerSpec( + image_uri="us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/text-generation-inference-cu121.2-1.py310:latest", + command=["python", "main.py"], + args=["--model-id=gemma-2b"], + env=[types.EnvVar(name="MODEL_ID", value="gemma-2b")], + ), + dedicated_resources=types.DedicatedResources( + machine_spec=types.MachineSpec( + machine_type="g2-standard-32", + accelerator_type="NVIDIA_L4", + accelerator_count=4, + ) + ), + ), + ] + ) + ), + ) + hf_success_response = types.PublisherModel( + name=_TEST_MODEL_HUGGING_FACE_RESOURCE_NAME, + supported_actions=types.PublisherModel.CallToAction( + multi_deploy_vertex=types.PublisherModel.CallToAction.DeployVertex( + multi_deploy_vertex=[ + types.PublisherModel.CallToAction.Deploy( + container_spec=types.ModelContainerSpec( + image_uri=_TEST_IMAGE_URI, + command=["python", "main.py"], + args=["--model-id=gemma-2b"], + env=[types.EnvVar(name="MODEL_ID", value="gemma-2b")], + ), + dedicated_resources=types.DedicatedResources( + machine_spec=types.MachineSpec( + machine_type="g2-standard-16", + accelerator_type="NVIDIA_L4", + accelerator_count=1, + ) + ), + ), + types.PublisherModel.CallToAction.Deploy( + container_spec=types.ModelContainerSpec( + image_uri="us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/text-generation-inference-cu121.2-1.py310:latest", + command=["python", "main.py"], + args=["--model-id=gemma-2b"], + env=[types.EnvVar(name="MODEL_ID", value="gemma-2b")], + ), + dedicated_resources=types.DedicatedResources( + machine_spec=types.MachineSpec( + machine_type="g2-standard-32", + accelerator_type="NVIDIA_L4", + accelerator_count=4, + ) + ), + ), + ] + ) + ), + ) + + call_counts = {} + + def side_effect_func(request, *args, **kwargs): + model_name = request.name + if model_name not in call_counts: + call_counts[model_name] = 0 + + call_counts[model_name] += 1 + + if model_name == 
_TEST_HUGGING_FACE_MODEL_FULL_RESOURCE_NAME: + return hf_success_response + + if call_counts[model_name] == 1: + return error_response + else: + return success_response + + get_publisher_model_mock.side_effect = side_effect_func + yield get_publisher_model_mock + + +@pytest.fixture +def list_publisher_models_mock(): + """Mocks the list_publisher_models method.""" + with mock.patch.object( + model_garden_service.ModelGardenServiceClient, + "list_publisher_models", + ) as list_publisher_models: + pager_mg = mock.Mock() + pager_mg.pages = [ + types.ListPublisherModelsResponse( + publisher_models=[ + types.PublisherModel( + name=_TEST_PUBLISHER_MODEL_NAME, + version_id="001", + supported_actions=types.PublisherModel.CallToAction( + multi_deploy_vertex=types.PublisherModel.CallToAction.DeployVertex( + multi_deploy_vertex=[ + types.PublisherModel.CallToAction.Deploy( + dedicated_resources=types.DedicatedResources( + machine_spec=types.MachineSpec( + machine_type="g2-standard-16", + accelerator_type="NVIDIA_L4", + accelerator_count=1, + ) + ) + ) + ] + ) + ), + ), + types.PublisherModel( + name=_TEST_PUBLISHER_MODEL_NAME, + version_id="002", + supported_actions=types.PublisherModel.CallToAction( + multi_deploy_vertex=types.PublisherModel.CallToAction.DeployVertex( + multi_deploy_vertex=[ + types.PublisherModel.CallToAction.Deploy( + dedicated_resources=types.DedicatedResources( + machine_spec=types.MachineSpec( + machine_type="g2-standard-16", + accelerator_type="NVIDIA_L4", + accelerator_count=1, + ) + ) + ) + ] + ) + ), + ), + ], + ), + types.ListPublisherModelsResponse( + publisher_models=[ + types.PublisherModel( + name=_TEST_PUBLISHER_MODEL_NAME, + version_id="003", + supported_actions=types.PublisherModel.CallToAction( + multi_deploy_vertex=types.PublisherModel.CallToAction.DeployVertex( + multi_deploy_vertex=[ + types.PublisherModel.CallToAction.Deploy( + dedicated_resources=types.DedicatedResources( + machine_spec=types.MachineSpec( + machine_type="g2-standard-16", + accelerator_type="NVIDIA_L4", + accelerator_count=1, + ) + ) + ) + ] + ) + ), + ), + types.PublisherModel( + name=_TEST_PUBLISHER_MODEL_NAME, + version_id="004", + supported_actions=types.PublisherModel.CallToAction( + multi_deploy_vertex=types.PublisherModel.CallToAction.DeployVertex( + multi_deploy_vertex=[ + types.PublisherModel.CallToAction.Deploy( + dedicated_resources=types.DedicatedResources( + machine_spec=types.MachineSpec( + machine_type="g2-standard-16", + accelerator_type="NVIDIA_L4", + accelerator_count=1, + ) + ) + ) + ] + ) + ), + ), + ], + ), + ] + pager_hf = mock.Mock() + pager_hf.pages = [ + types.ListPublisherModelsResponse( + publisher_models=[ + types.PublisherModel( + name=_TEST_HUGGING_FACE_PUBLISHER_MODEL_NAME, + version_id="001", + supported_actions=types.PublisherModel.CallToAction( + multi_deploy_vertex=types.PublisherModel.CallToAction.DeployVertex( + multi_deploy_vertex=[ + types.PublisherModel.CallToAction.Deploy( + dedicated_resources=types.DedicatedResources( + machine_spec=types.MachineSpec( + machine_type="g2-standard-16", + accelerator_type="NVIDIA_L4", + accelerator_count=1, + ) + ) + ) + ] + ) + ), + ), + types.PublisherModel( + name=_TEST_HUGGING_FACE_PUBLISHER_MODEL_NAME, + version_id="002", + supported_actions=types.PublisherModel.CallToAction( + multi_deploy_vertex=types.PublisherModel.CallToAction.DeployVertex( + multi_deploy_vertex=[ + types.PublisherModel.CallToAction.Deploy( + dedicated_resources=types.DedicatedResources( + machine_spec=types.MachineSpec( + 
machine_type="g2-standard-16", + accelerator_type="NVIDIA_L4", + accelerator_count=1, + ) + ) + ) + ] + ) + ), + ), + ], + ), + types.ListPublisherModelsResponse( + publisher_models=[ + types.PublisherModel( + name=_TEST_HUGGING_FACE_PUBLISHER_MODEL_NAME, + version_id="003", + supported_actions=types.PublisherModel.CallToAction( + multi_deploy_vertex=types.PublisherModel.CallToAction.DeployVertex( + multi_deploy_vertex=[ + types.PublisherModel.CallToAction.Deploy( + dedicated_resources=types.DedicatedResources( + machine_spec=types.MachineSpec( + machine_type="g2-standard-16", + accelerator_type="NVIDIA_L4", + accelerator_count=1, + ) + ) + ) + ] + ) + ), + ), + types.PublisherModel( + name=_TEST_HUGGING_FACE_PUBLISHER_MODEL_NAME, + version_id="004", + supported_actions=types.PublisherModel.CallToAction( + multi_deploy_vertex=types.PublisherModel.CallToAction.DeployVertex( + multi_deploy_vertex=[ + types.PublisherModel.CallToAction.Deploy( + dedicated_resources=types.DedicatedResources( + machine_spec=types.MachineSpec( + machine_type="g2-standard-16", + accelerator_type="NVIDIA_L4", + accelerator_count=1, + ) + ) + ) + ] + ) + ), + ), + ], + ), + ] + list_publisher_models.side_effect = [pager_mg, pager_hf] + yield list_publisher_models + + +@pytest.fixture +def check_license_agreement_status_mock(): + """Mocks the check_license_agreement_status method.""" + with mock.patch.object( + model_garden_service.ModelGardenServiceClient, + "check_publisher_model_eula_acceptance", + ) as check_license_agreement_status: + check_license_agreement_status.return_value = ( + types.PublisherModelEulaAcceptance( + project_number=_TEST_PROJECT_NUMBER, + publisher_model=_TEST_MODEL_FULL_RESOURCE_NAME, + publisher_model_eula_acked=True, + ) + ) + yield check_license_agreement_status + + +@pytest.fixture +def accept_model_license_agreement_mock(): + """Mocks the accept_model_license_agreement method.""" + with mock.patch.object( + model_garden_service.ModelGardenServiceClient, + "accept_publisher_model_eula", + ) as accept_model_license_agreement: + accept_model_license_agreement.return_value = ( + types.PublisherModelEulaAcceptance( + project_number=_TEST_PROJECT_NUMBER, + publisher_model=_TEST_MODEL_FULL_RESOURCE_NAME, + publisher_model_eula_acked=True, + ) + ) + yield accept_model_license_agreement + + +@pytest.mark.usefixtures( + "google_auth_mock", + "deploy_mock", +) +class TestVertexAIModelGardenPartnerModel: + """Test cases for Model Garden PartnerModel class.""" + + def setup_method(self): + importlib.reload(aiplatform.initializer) + importlib.reload(aiplatform) + aiplatform.init(project=_TEST_PROJECT) + + def teardown_method(self): + aiplatform.initializer.global_pool.shutdown(wait=True) + + def test_deploy_full_resource_name_success(self, deploy_mock): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + model = model_garden.PartnerModel( + model_name=_TEST_PARTNER_MODEL_FULL_RESOURCE_NAME + ) + model.deploy() + deploy_mock.assert_called_once_with( + types.DeployRequest( + publisher_model_name=_TEST_PARTNER_MODEL_FULL_RESOURCE_NAME, + destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}", + ) + ) + + def test_deploy_simplified_resource_name_success(self, deploy_mock): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + model = model_garden.PartnerModel( + model_name=_TEST_PARTNER_MODEL_SIMPLIFIED_RESOURCE_NAME + ) + model.deploy() + deploy_mock.assert_called_once_with( + types.DeployRequest( + 
publisher_model_name=_TEST_PARTNER_MODEL_FULL_RESOURCE_NAME, + destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}", + ) + ) + + def test_deploy_specify_machine_spec_success(self, deploy_mock): + """Tests deploying a model with specified machine spec.""" + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + model = model_garden.PartnerModel( + model_name=_TEST_PARTNER_MODEL_FULL_RESOURCE_NAME + ) + model.deploy( + machine_type="n1-standard-4", + accelerator_type="NVIDIA_TESLA_T4", + accelerator_count=1, + min_replica_count=1, + max_replica_count=1, + ) + deploy_mock.assert_called_once_with( + types.DeployRequest( + publisher_model_name=_TEST_PARTNER_MODEL_FULL_RESOURCE_NAME, + destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}", + deploy_config=types.DeployRequest.DeployConfig( + dedicated_resources=types.DedicatedResources( + machine_spec=types.MachineSpec( + machine_type="n1-standard-4", + accelerator_type="NVIDIA_TESLA_T4", + accelerator_count=1, + ), + min_replica_count=1, + max_replica_count=1, + ) + ), + ) + ) + + def test_deploy_specify_partial_machine_spec_success(self, deploy_mock): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + model = model_garden.PartnerModel( + model_name=_TEST_PARTNER_MODEL_FULL_RESOURCE_NAME + ) + model.deploy( + accelerator_type="NVIDIA_TESLA_T4", + ) + deploy_mock.assert_called_once_with( + types.DeployRequest( + publisher_model_name=_TEST_PARTNER_MODEL_FULL_RESOURCE_NAME, + destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}", + deploy_config=types.DeployRequest.DeployConfig( + dedicated_resources=types.DedicatedResources( + machine_spec=types.MachineSpec( + accelerator_type="NVIDIA_TESLA_T4", + ), + min_replica_count=1, + max_replica_count=1, + ) + ), + ) + ) + + def test_deploy_with_timeout_success(self, deploy_mock): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + model = model_garden.PartnerModel( + model_name=_TEST_PARTNER_MODEL_FULL_RESOURCE_NAME + ) + model.deploy(deploy_request_timeout=10) + deploy_mock.assert_called_once_with( + types.DeployRequest( + publisher_model_name=_TEST_PARTNER_MODEL_FULL_RESOURCE_NAME, + destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}", + ), + ) + + def test_deploy_with_display_names_success(self, deploy_mock): + """Tests deploying a model with display names.""" + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + model = model_garden.PartnerModel( + model_name=_TEST_PARTNER_MODEL_FULL_RESOURCE_NAME + ) + model.deploy( + endpoint_display_name="test-endpoint", + model_display_name="test-model", + ) + deploy_mock.assert_called_once_with( + types.DeployRequest( + publisher_model_name=_TEST_PARTNER_MODEL_FULL_RESOURCE_NAME, + destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}", + model_config=types.DeployRequest.ModelConfig( + model_display_name="test-model", + ), + endpoint_config=types.DeployRequest.EndpointConfig( + endpoint_display_name="test-endpoint", + ), + ) + ) + + +@pytest.mark.usefixtures( + "google_auth_mock", + "deploy_mock", + "get_publisher_model_mock", + "list_publisher_models_mock", + "export_publisher_model_mock", + "batch_prediction_mock", + "complete_bq_uri_mock", + "check_license_agreement_status_mock", + "accept_model_license_agreement_mock", +) +class TestVertexAIModelGardenOpenModel: + """Test cases for Model Garden OpenModel class.""" + + def setup_method(self): + importlib.reload(aiplatform.initializer) + 
importlib.reload(aiplatform) + aiplatform.init(project=_TEST_PROJECT) + + def teardown_method(self): + aiplatform.initializer.global_pool.shutdown(wait=True) + + def test_export_full_resource_name_success(self, export_publisher_model_mock): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME) + model.export(_TEST_GCS_URI) + export_publisher_model_mock.assert_called_once_with( + types.ExportPublisherModelRequest( + parent=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}", + name=_TEST_MODEL_FULL_RESOURCE_NAME, + destination=types.GcsDestination(output_uri_prefix=_TEST_GCS_URI), + ), + metadata=[("x-goog-user-project", "test-project")], + ) + + def test_export_simplified_resource_name_success(self, export_publisher_model_mock): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + model = model_garden.OpenModel(model_name=_TEST_MODEL_SIMPLIFIED_RESOURCE_NAME) + model.export(_TEST_GCS_URI) + export_publisher_model_mock.assert_called_once_with( + types.ExportPublisherModelRequest( + parent=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}", + name=_TEST_MODEL_FULL_RESOURCE_NAME, + destination=types.GcsDestination(output_uri_prefix=_TEST_GCS_URI), + ), + metadata=[("x-goog-user-project", "test-project")], + ) + + def test_export_hugging_face_id_success(self, export_publisher_model_mock): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + model = model_garden.OpenModel(model_name=_TEST_MODEL_HUGGING_FACE_ID) + model.export(_TEST_GCS_URI) + export_publisher_model_mock.assert_called_once_with( + types.ExportPublisherModelRequest( + parent=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}", + name=_TEST_HUGGING_FACE_MODEL_FULL_RESOURCE_NAME, + destination=types.GcsDestination(output_uri_prefix=_TEST_GCS_URI), + ), + metadata=[("x-goog-user-project", "test-project")], + ) + + def test_deploy_full_resource_name_success(self, deploy_mock): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME) + model.deploy() + deploy_mock.assert_called_once_with( + types.DeployRequest( + publisher_model_name=_TEST_MODEL_FULL_RESOURCE_NAME, + destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}", + ) + ) + + def test_deploy_simplified_resource_name_success(self, deploy_mock): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + model = model_garden.OpenModel(model_name=_TEST_MODEL_SIMPLIFIED_RESOURCE_NAME) + model.deploy() + deploy_mock.assert_called_once_with( + types.DeployRequest( + publisher_model_name=_TEST_MODEL_FULL_RESOURCE_NAME, + destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}", + ) + ) + + def test_deploy_hugging_face_id_success(self, deploy_mock): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + model = model_garden.OpenModel(model_name=_TEST_MODEL_HUGGING_FACE_ID) + model.deploy() + deploy_mock.assert_called_once_with( + types.DeployRequest( + hugging_face_model_id=_TEST_MODEL_HUGGING_FACE_ID.lower(), + destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}", + ) + ) + + def test_deploy_specify_machine_spec_success(self, deploy_mock): + """Tests deploying a model with specified machine spec.""" + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME) + model.deploy( + 
machine_type="n1-standard-4", + accelerator_type="NVIDIA_TESLA_T4", + accelerator_count=1, + min_replica_count=1, + max_replica_count=1, + ) + deploy_mock.assert_called_once_with( + types.DeployRequest( + publisher_model_name=_TEST_MODEL_FULL_RESOURCE_NAME, + destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}", + deploy_config=types.DeployRequest.DeployConfig( + dedicated_resources=types.DedicatedResources( + machine_spec=types.MachineSpec( + machine_type="n1-standard-4", + accelerator_type="NVIDIA_TESLA_T4", + accelerator_count=1, + ), + min_replica_count=1, + max_replica_count=1, + ) + ), + ) + ) + + def test_deploy_specify_partial_machine_spec_success(self, deploy_mock): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME) + model.deploy( + accelerator_type="NVIDIA_TESLA_T4", + ) + deploy_mock.assert_called_once_with( + types.DeployRequest( + publisher_model_name=_TEST_MODEL_FULL_RESOURCE_NAME, + destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}", + deploy_config=types.DeployRequest.DeployConfig( + dedicated_resources=types.DedicatedResources( + machine_spec=types.MachineSpec( + accelerator_type="NVIDIA_TESLA_T4", + ), + min_replica_count=1, + max_replica_count=1, + ) + ), + ) + ) + + def test_deploy_with_timeout_success(self, deploy_mock): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME) + model.deploy(deploy_request_timeout=10) + deploy_mock.assert_called_once_with( + types.DeployRequest( + publisher_model_name=_TEST_MODEL_FULL_RESOURCE_NAME, + destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}", + ), + ) + + def test_deploy_with_display_names_success(self, deploy_mock): + """Tests deploying a model with display names.""" + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME) + model.deploy( + endpoint_display_name="test-endpoint", + model_display_name="test-model", + ) + deploy_mock.assert_called_once_with( + types.DeployRequest( + publisher_model_name=_TEST_MODEL_FULL_RESOURCE_NAME, + destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}", + model_config=types.DeployRequest.ModelConfig( + model_display_name="test-model", + ), + endpoint_config=types.DeployRequest.EndpointConfig( + endpoint_display_name="test-endpoint", + ), + ) + ) + + def test_deploy_with_eula_success(self, deploy_mock): + """Tests deploying a model with EULA.""" + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME) + model.deploy(accept_eula=True) + deploy_mock.assert_called_once_with( + types.DeployRequest( + publisher_model_name=_TEST_MODEL_FULL_RESOURCE_NAME, + destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}", + model_config=types.DeployRequest.ModelConfig( + accept_eula=True, + ), + ) + ) + + def test_deploy_with_hugging_face_access_token_success(self, deploy_mock): + """Tests deploying a model with Hugging Face access token.""" + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + model = model_garden.OpenModel(model_name=_TEST_MODEL_HUGGING_FACE_ID) + model.deploy(hugging_face_access_token=_TEST_HUGGING_FACE_ACCESS_TOKEN) + deploy_mock.assert_called_once_with( + types.DeployRequest( + 
hugging_face_model_id=_TEST_MODEL_HUGGING_FACE_ID.lower(),
+                destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}",
+                model_config=types.DeployRequest.ModelConfig(
+                    hugging_face_access_token=_TEST_HUGGING_FACE_ACCESS_TOKEN,
+                ),
+            )
+        )
+
+    def test_deploy_with_spot_vm_success(self, deploy_mock):
+        """Tests deploying a model with spot VM."""
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME)
+        model.deploy(spot=True)
+        deploy_mock.assert_called_once_with(
+            types.DeployRequest(
+                publisher_model_name=_TEST_MODEL_FULL_RESOURCE_NAME,
+                destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}",
+                deploy_config=types.DeployRequest.DeployConfig(
+                    dedicated_resources=types.DedicatedResources(spot=True),
+                ),
+            )
+        )
+
+    def test_deploy_with_reservation_success(self, deploy_mock):
+        """Tests deploying a model with a specific reservation."""
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME)
+        model.deploy(
+            reservation_affinity_type="SPECIFIC_RESERVATION",
+            reservation_affinity_key="compute.googleapis.com/reservation-name",
+            reservation_affinity_values=[
+                "projects/test-project/zones/us-central1-a/reservations/test-reservation"
+            ],
+        )
+        deploy_mock.assert_called_once_with(
+            types.DeployRequest(
+                publisher_model_name=_TEST_MODEL_FULL_RESOURCE_NAME,
+                destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}",
+                deploy_config=types.DeployRequest.DeployConfig(
+                    dedicated_resources=types.DedicatedResources(
+                        machine_spec=types.MachineSpec(
+                            reservation_affinity=types.ReservationAffinity(
+                                reservation_affinity_type="SPECIFIC_RESERVATION",
+                                key="compute.googleapis.com/reservation-name",
+                                values=[
+                                    "projects/test-project/zones/us-central1-a/reservations/test-reservation"
+                                ],
+                            )
+                        )
+                    )
+                ),
+            )
+        )
+
+    def test_deploy_with_dedicated_endpoint_success(self, deploy_mock):
+        """Tests deploying a model with the dedicated endpoint disabled."""
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME)
+        model.deploy(dedicated_endpoint_disabled=True)
+        deploy_mock.assert_called_once_with(
+            types.DeployRequest(
+                publisher_model_name=_TEST_MODEL_FULL_RESOURCE_NAME,
+                destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}",
+                endpoint_config=types.DeployRequest.EndpointConfig(
+                    dedicated_endpoint_disabled=True
+                ),
+            )
+        )
+
+    def test_deploy_with_system_labels_success(self, deploy_mock):
+        """Tests deploying a model with system labels."""
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME)
+        model.deploy(system_labels={"test-key": "test-value"})
+        deploy_mock.assert_called_once_with(
+            types.DeployRequest(
+                publisher_model_name=_TEST_MODEL_FULL_RESOURCE_NAME,
+                destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}",
+                deploy_config=types.DeployRequest.DeployConfig(
+                    system_labels={"test-key": "test-value"}
+                ),
+            )
+        )
+
+    def test_deploy_with_fast_tryout_enabled_success(self, deploy_mock):
+        """Tests deploying a model with fast tryout enabled."""
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME)
+        model.deploy(fast_tryout_enabled=True)
+        deploy_mock.assert_called_once_with(
+            types.DeployRequest(
+                publisher_model_name=_TEST_MODEL_FULL_RESOURCE_NAME,
+                destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}",
+                deploy_config=types.DeployRequest.DeployConfig(
+                    fast_tryout_enabled=True
+                ),
+            )
+        )
+
+    def test_deploy_with_serving_container_image_success(self, deploy_mock):
+        """Tests deploying a model with a serving container image URI."""
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME)
+        model.deploy(
+            serving_container_image_uri=_TEST_IMAGE_URI,
+        )
+        deploy_mock.assert_called_once_with(
+            types.DeployRequest(
+                publisher_model_name=_TEST_MODEL_FULL_RESOURCE_NAME,
+                destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}",
+                model_config=types.DeployRequest.ModelConfig(
+                    container_spec=types.ModelContainerSpec(
+                        image_uri=_TEST_IMAGE_URI,
+                    )
+                ),
+            )
+        )
+
+    def test_deploy_with_serving_container_spec_success(self, deploy_mock):
+        """Tests deploying a model with a full serving container spec."""
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME)
+        model.deploy(serving_container_spec=_TEST_MODEL_CONTAINER_SPEC)
+        deploy_mock.assert_called_once_with(
+            types.DeployRequest(
+                publisher_model_name=_TEST_MODEL_FULL_RESOURCE_NAME,
+                destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}",
+                model_config=types.DeployRequest.ModelConfig(
+                    container_spec=_TEST_MODEL_CONTAINER_SPEC
+                ),
+            )
+        )
+
+    def test_deploy_with_serving_container_spec_no_image_uri_raises_error(self):
+        """Tests that a serving container spec without an image URI raises an error."""
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+
+        expected_message = (
+            "Serving container image uri is required for the serving container" " spec."
+        )
+        with pytest.raises(ValueError) as exception:
+            model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME)
+            model.deploy(
+                serving_container_spec=types.ModelContainerSpec(
+                    predict_route="/predictions/v1/predict",
+                    health_route="/ping",
+                )
+            )
+        assert str(exception.value) == expected_message
+
+    def test_deploy_with_serving_container_spec_with_both_image_uri_raises_error(
+        self,
+    ):
+        """Tests that setting the container image URI both ways raises an error."""
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+
+        expected_message = (
+            "Serving container image uri is already set in the serving container"
+            " spec."
+ ) + with pytest.raises(ValueError) as exception: + model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME) + model.deploy( + serving_container_spec=types.ModelContainerSpec( + image_uri=_TEST_IMAGE_URI, + predict_route="/predictions/v1/predict", + health_route="/ping", + ), + serving_container_image_uri=_TEST_IMAGE_URI, + ) + assert str(exception.value) == expected_message + + def test_deploy_with_serving_container_spec_individual_fields_success( + self, deploy_mock + ): + """Tests deploying a model with serving container spec.""" + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME) + model.deploy( + serving_container_image_uri=_TEST_IMAGE_URI, + serving_container_predict_route="/predictions/v1/predict", + serving_container_health_route="/ping", + serving_container_command=["python", "main.py"], + serving_container_args=["--model-id=gemma-2b"], + serving_container_environment_variables={"MODEL_ID": "gemma-2b"}, + serving_container_ports=[7080], + serving_container_grpc_ports=[7081], + serving_container_deployment_timeout=1800, + serving_container_shared_memory_size_mb=256, + serving_container_startup_probe_exec=["python", "main.py"], + serving_container_startup_probe_period_seconds=10, + serving_container_startup_probe_timeout_seconds=10, + serving_container_health_probe_exec=["python", "health_check.py"], + serving_container_health_probe_period_seconds=10, + serving_container_health_probe_timeout_seconds=10, + ) + deploy_mock.assert_called_once_with( + types.DeployRequest( + publisher_model_name=_TEST_MODEL_FULL_RESOURCE_NAME, + destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}", + model_config=types.DeployRequest.ModelConfig( + container_spec=types.ModelContainerSpec( + image_uri=_TEST_IMAGE_URI, + command=["python", "main.py"], + args=["--model-id=gemma-2b"], + env=[types.EnvVar(name="MODEL_ID", value="gemma-2b")], + ports=[types.Port(container_port=7080)], + grpc_ports=[types.Port(container_port=7081)], + predict_route="/predictions/v1/predict", + health_route="/ping", + deployment_timeout=duration_pb2.Duration(seconds=1800), + shared_memory_size_mb=256, + startup_probe=types.Probe( + exec_=types.Probe.ExecAction(command=["python", "main.py"]), + period_seconds=10, + timeout_seconds=10, + ), + health_probe=types.Probe( + exec_=types.Probe.ExecAction( + command=["python", "health_check.py"] + ), + period_seconds=10, + timeout_seconds=10, + ), + ) + ), + ) + ) + + def test_list_deploy_options(self, get_publisher_model_mock): + """Tests getting the supported deploy options for a model.""" + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + + expected_message = ( + "Model does not support deployment. " + "Use `list_deployable_models()` to find supported models." 
+ ) + with pytest.raises(ValueError) as exception: + model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME) + _ = model.list_deploy_options() + assert str(exception.value) == expected_message + + model.list_deploy_options() + get_publisher_model_mock.assert_called_with( + types.GetPublisherModelRequest( + name=_TEST_MODEL_FULL_RESOURCE_NAME, + is_hugging_face_model=False, + include_equivalent_model_garden_model_deployment_configs=True, + ) + ) + + hf_model = model_garden.OpenModel(_TEST_MODEL_HUGGING_FACE_ID) + hf_model.list_deploy_options() + get_publisher_model_mock.assert_called_with( + types.GetPublisherModelRequest( + name=_TEST_HUGGING_FACE_MODEL_FULL_RESOURCE_NAME, + is_hugging_face_model=True, + include_equivalent_model_garden_model_deployment_configs=True, + ) + ) + + def test_list_deploy_options_concise(self, get_publisher_model_mock): + """Tests getting the supported deploy options for a model.""" + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + + expected_message = ( + "Model does not support deployment. " + "Use `list_deployable_models()` to find supported models." + ) + with pytest.raises(ValueError) as exception: + model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME) + _ = model.list_deploy_options(concise=True) + assert str(exception.value) == expected_message + + result = model.list_deploy_options(concise=True) + expected_result = textwrap.dedent( + """\ + [Option 1: vLLM 32K context] + serving_container_image_uri="us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20241202_0916_RC00", + machine_type="g2-standard-16", + accelerator_type="NVIDIA_L4", + accelerator_count=1, + + [Option 2: vLLM 128K context] + serving_container_image_uri="us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/text-generation-inference-cu121.2-1.py310:latest", + machine_type="g2-standard-32", + accelerator_type="NVIDIA_L4", + accelerator_count=4,""" + ) + assert result == expected_result + get_publisher_model_mock.assert_called_with( + types.GetPublisherModelRequest( + name=_TEST_MODEL_FULL_RESOURCE_NAME, + is_hugging_face_model=False, + include_equivalent_model_garden_model_deployment_configs=True, + ) + ) + + hf_model = model_garden.OpenModel(_TEST_MODEL_HUGGING_FACE_ID) + hf_result = hf_model.list_deploy_options(concise=True) + expected_hf_result = textwrap.dedent( + """\ + [Option 1] + serving_container_image_uri="us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20241202_0916_RC00", + machine_type="g2-standard-16", + accelerator_type="NVIDIA_L4", + accelerator_count=1, + + [Option 2] + serving_container_image_uri="us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/text-generation-inference-cu121.2-1.py310:latest", + machine_type="g2-standard-32", + accelerator_type="NVIDIA_L4", + accelerator_count=4,""" + ) + assert hf_result == expected_hf_result + get_publisher_model_mock.assert_called_with( + types.GetPublisherModelRequest( + name=_TEST_HUGGING_FACE_MODEL_FULL_RESOURCE_NAME, + is_hugging_face_model=True, + include_equivalent_model_garden_model_deployment_configs=True, + ) + ) + + def test_list_deploy_options_with_filters(self, get_publisher_model_mock): + """Tests getting the supported deploy options for a model with filters.""" + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME) + + expected_message = ( + "Model does not support 
deployment. " + "Use `list_deployable_models()` to find supported models." + ) + with pytest.raises(ValueError) as exception: + _ = model.list_deploy_options() + assert str(exception.value) == expected_message + + # Test serving_container_image_uri_filter + result = model.list_deploy_options(serving_container_image_uri_filter="vllm") + assert len(result) == 1 + assert "vllm" in result[0].container_spec.image_uri + + # Test case-insensitivity for serving_container_image_uri_filter + result = model.list_deploy_options(serving_container_image_uri_filter="VLLM") + assert len(result) == 1 + assert "vllm" in result[0].container_spec.image_uri + + # Test list of strings for serving_container_image_uri_filter + result = model.list_deploy_options( + serving_container_image_uri_filter=["vllm", "text-generation-inference"] + ) + assert len(result) == 2 + + # Test machine_type_filter + result = model.list_deploy_options(machine_type_filter="g2-standard-16") + assert len(result) == 1 + assert ( + "g2-standard-16" == result[0].dedicated_resources.machine_spec.machine_type + ) + + # Test case-insensitivity for machine_type_filter + result = model.list_deploy_options(machine_type_filter="G2-STANDARD-16") + assert len(result) == 1 + assert ( + "g2-standard-16" == result[0].dedicated_resources.machine_spec.machine_type + ) + + # Test accelerator_type_filter + result = model.list_deploy_options(accelerator_type_filter="L4") + assert len(result) == 2 + + # Test case-insensitivity for accelerator_type_filter + result = model.list_deploy_options(accelerator_type_filter="l4") + assert len(result) == 2 + + # Test combination of filters + result = model.list_deploy_options( + serving_container_image_uri_filter="vllm", + machine_type_filter="g2-standard-16", + accelerator_type_filter="L4", + ) + assert len(result) == 1 + + # Test with no match + with pytest.raises(ValueError): + model.list_deploy_options(machine_type_filter="non-existent") + + def test_list_deployable_models(self, list_publisher_models_mock): + """Tests getting the supported deploy options for a model.""" + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + + mg_models = model_garden.list_deployable_models() + list_publisher_models_mock.assert_called_with( + types.ListPublisherModelsRequest( + parent="publishers/*", + list_all_versions=True, + filter="is_hf_wildcard(false)", + ) + ) + + assert mg_models == [ + "google/paligemma@001", + "google/paligemma@002", + "google/paligemma@003", + "google/paligemma@004", + ] + + hf_models = model_garden.list_deployable_models(list_hf_models=True) + list_publisher_models_mock.assert_called_with( + types.ListPublisherModelsRequest( + parent="publishers/*", + list_all_versions=True, + filter=( + "is_hf_wildcard(true) AND " + "labels.VERIFIED_DEPLOYMENT_CONFIG=VERIFIED_DEPLOYMENT_SUCCEED" + ), + ) + ) + assert hf_models == [ + "google/gemma-2-2b", + "google/gemma-2-2b", + "google/gemma-2-2b", + "google/gemma-2-2b", + ] + + def test_list_models(self, list_publisher_models_mock): + """Tests listing models.""" + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + + mg_models = model_garden.list_models() + list_publisher_models_mock.assert_called_with( + types.ListPublisherModelsRequest( + parent="publishers/*", + list_all_versions=True, + filter="is_hf_wildcard(false)", + ) + ) + + assert mg_models == [ + "google/paligemma@001", + "google/paligemma@002", + "google/paligemma@003", + "google/paligemma@004", + ] + + hf_models = model_garden.list_models(list_hf_models=True) 
+        list_publisher_models_mock.assert_called_with(
+            types.ListPublisherModelsRequest(
+                parent="publishers/*",
+                list_all_versions=True,
+                filter="is_hf_wildcard(true)",
+            )
+        )
+        assert hf_models == [
+            "google/gemma-2-2b",
+            "google/gemma-2-2b",
+            "google/gemma-2-2b",
+            "google/gemma-2-2b",
+        ]
+
+    def test_batch_prediction_success(self, batch_prediction_mock):
+        """Tests submitting a batch prediction job for an open model."""
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        model = model_garden.OpenModel(
+            model_name=_TEST_BATCH_PREDICTION_MODEL_FULL_RESOURCE_NAME
+        )
+        job = model.batch_predict(
+            input_dataset=_TEST_BQ_INPUT_URI,
+            job_display_name=_TEST_BATCH_PREDICTION_JOB_DISPLAY_NAME,
+            machine_type="g2-standard-12",
+            accelerator_type="NVIDIA_L4",
+            accelerator_count=1,
+            starting_replica_count=1,
+        )
+
+        assert job.gca_resource == _TEST_GAPIC_BATCH_PREDICTION_JOB
+
+        expected_gapic_batch_prediction_job = gca_batch_prediction_job_compat.BatchPredictionJob(
+            display_name=_TEST_BATCH_PREDICTION_JOB_DISPLAY_NAME,
+            model=_TEST_BATCH_PREDICTION_MODEL_FULL_RESOURCE_NAME,
+            input_config=gca_batch_prediction_job_compat.BatchPredictionJob.InputConfig(
+                instances_format="bigquery",
+                bigquery_source=gca_io_compat.BigQuerySource(
+                    input_uri=_TEST_BQ_INPUT_URI
+                ),
+            ),
+            output_config=gca_batch_prediction_job_compat.BatchPredictionJob.OutputConfig(
+                bigquery_destination=gca_io_compat.BigQueryDestination(
+                    output_uri=_TEST_BQ_OUTPUT_PREFIX
+                ),
+                predictions_format="bigquery",
+            ),
+            dedicated_resources=machine_resources.BatchDedicatedResources(
+                machine_spec=machine_resources.MachineSpec(
+                    machine_type="g2-standard-12",
+                    accelerator_type="NVIDIA_L4",
+                    accelerator_count=1,
+                ),
+                starting_replica_count=1,
+            ),
+            manual_batch_tuning_parameters=manual_batch_tuning_parameters.ManualBatchTuningParameters(),
+        )
+
+        batch_prediction_mock.assert_called_once_with(
+            parent=_TEST_PARENT,
+            batch_prediction_job=expected_gapic_batch_prediction_job,
+            timeout=None,
+        )
+
+    def test_deploy_with_psc_success(self, deploy_mock):
+        """Tests deploying a model with Private Service Connect."""
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME)
+        model.deploy(
+            enable_private_service_connect=True,
+            psc_project_allow_list=["project-1", "project-2"],
+        )
+        deploy_mock.assert_called_once_with(
+            types.DeployRequest(
+                publisher_model_name=_TEST_MODEL_FULL_RESOURCE_NAME,
+                destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}",
+                endpoint_config=types.DeployRequest.EndpointConfig(
+                    private_service_connect_config=types.PrivateServiceConnectConfig(
+                        enable_private_service_connect=True,
+                        project_allowlist=["project-1", "project-2"],
+                    )
+                ),
+            )
+        )
+
+    def test_check_license_agreement_status_success(
+        self, check_license_agreement_status_mock
+    ):
+        """Tests checking EULA acceptance for a model."""
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME)
+        eula_acceptance = model.check_license_agreement_status()
+        check_license_agreement_status_mock.assert_called_once_with(
+            types.CheckPublisherModelEulaAcceptanceRequest(
+                parent=f"projects/{_TEST_PROJECT}",
+                publisher_model=_TEST_MODEL_FULL_RESOURCE_NAME,
+            )
+        )
+        assert eula_acceptance
+
+    def test_accept_model_license_agreement_success(
+        self, accept_model_license_agreement_mock
+    ):
+        """Tests accepting EULA for a model."""
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        model = model_garden.OpenModel(model_name=_TEST_MODEL_FULL_RESOURCE_NAME)
+        eula_acceptance = model.accept_model_license_agreement()
+        accept_model_license_agreement_mock.assert_called_once_with(
+            types.AcceptPublisherModelEulaRequest(
+                parent=f"projects/{_TEST_PROJECT}",
+                publisher_model=_TEST_MODEL_FULL_RESOURCE_NAME,
+            )
+        )
+        assert eula_acceptance == types.PublisherModelEulaAcceptance(
+            project_number=_TEST_PROJECT_NUMBER,
+            publisher_model=_TEST_MODEL_FULL_RESOURCE_NAME,
+            publisher_model_eula_acked=True,
+        )
+
+
+@pytest.mark.usefixtures(
+    "google_auth_mock",
+    "deploy_mock",
+)
+class TestVertexAIModelGardenCustomModel:
+    """Test cases for the Model Garden CustomModel class."""
+
+    def setup_method(self):
+        importlib.reload(aiplatform.initializer)
+        importlib.reload(aiplatform)
+        aiplatform.init(project=_TEST_PROJECT)
+
+    def teardown_method(self):
+        aiplatform.initializer.global_pool.shutdown(wait=True)
+
+    def test_deploy_custom_model_gcs_uri_only_success(self, deploy_mock):
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        model = model_garden_preview.CustomModel(gcs_uri=_TEST_GCS_URI)
+        model.deploy()
+        deploy_mock.assert_called_once_with(
+            types.DeployRequest(
+                destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}",
+                custom_model=types.DeployRequest.CustomModel(
+                    gcs_uri=_TEST_GCS_URI,
+                ),
+                deploy_config=types.DeployRequest.DeployConfig(
+                    dedicated_resources=types.DedicatedResources(
+                        min_replica_count=1,
+                        max_replica_count=1,
+                    )
+                ),
+            )
+        )
+
+    def test_deploy_custom_model_no_gcs_uri_raise_error(self, deploy_mock):
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        with pytest.raises(ValueError) as exception:
+            model = model_garden_preview.CustomModel()
+            model.deploy()
+        assert str(exception.value) == "gcs_uri must be specified."
+
+    def test_deploy_custom_model_machine_type_only_raise_error(self, deploy_mock):
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        with pytest.raises(ValueError) as exception:
+            model = model_garden_preview.CustomModel(gcs_uri=_TEST_GCS_URI)
+            model.deploy(machine_type="n1-standard-4")
+        assert (
+            str(exception.value)
+            == "machine_type, accelerator_type and accelerator_count must all"
+            " be provided or not provided."
+        )
+
+    def test_deploy_custom_model_with_all_config_success(self, deploy_mock):
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        model = model_garden_preview.CustomModel(gcs_uri=_TEST_GCS_URI)
+        model.deploy(
+            machine_type="n1-standard-4",
+            accelerator_type="NVIDIA_TESLA_T4",
+            accelerator_count=1,
+            min_replica_count=2,
+            max_replica_count=3,
+            endpoint_display_name="custom-model-endpoint",
+            model_display_name="custom-model-id",
+        )
+        deploy_mock.assert_called_once_with(
+            types.DeployRequest(
+                destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}",
+                custom_model=types.DeployRequest.CustomModel(
+                    gcs_uri=_TEST_GCS_URI,
+                ),
+                model_config=types.DeployRequest.ModelConfig(
+                    model_display_name="custom-model-id",
+                ),
+                deploy_config=types.DeployRequest.DeployConfig(
+                    dedicated_resources=types.DedicatedResources(
+                        min_replica_count=2,
+                        max_replica_count=3,
+                        machine_spec=types.MachineSpec(
+                            machine_type="n1-standard-4",
+                            accelerator_type="NVIDIA_TESLA_T4",
+                            accelerator_count=1,
+                        ),
+                    ),
+                ),
+                endpoint_config=types.DeployRequest.EndpointConfig(
+                    endpoint_display_name="custom-model-endpoint",
+                ),
+            )
+        )
+
+    def test_deploy_custom_model_with_psc_success(self, deploy_mock):
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        model = model_garden_preview.CustomModel(gcs_uri=_TEST_GCS_URI)
+        model.deploy(
+            machine_type="n1-standard-4",
+            accelerator_type="NVIDIA_TESLA_T4",
+            accelerator_count=1,
+            enable_private_service_connect=True,
+            psc_project_allow_list=["test-project"],
+        )
+        deploy_mock.assert_called_once_with(
+            types.DeployRequest(
+                destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}",
+                custom_model=types.DeployRequest.CustomModel(
+                    gcs_uri=_TEST_GCS_URI,
+                ),
+                endpoint_config=types.DeployRequest.EndpointConfig(
+                    private_service_connect_config=types.PrivateServiceConnectConfig(
+                        enable_private_service_connect=True,
+                        project_allowlist=["test-project"],
+                    ),
+                ),
+                deploy_config=types.DeployRequest.DeployConfig(
+                    dedicated_resources=types.DedicatedResources(
+                        min_replica_count=1,
+                        max_replica_count=1,
+                        machine_spec=types.MachineSpec(
+                            machine_type="n1-standard-4",
+                            accelerator_type="NVIDIA_TESLA_T4",
+                            accelerator_count=1,
+                        ),
+                    ),
+                ),
+            )
+        )
+
+    def test_deploy_custom_model_with_reservation_success(self, deploy_mock):
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        model = model_garden_preview.CustomModel(gcs_uri=_TEST_GCS_URI)
+        model.deploy(
+            machine_type="n1-standard-4",
+            accelerator_type="NVIDIA_TESLA_T4",
+            accelerator_count=1,
+            reservation_affinity_type="SPECIFIC_RESERVATION",
+            reservation_affinity_key="compute.googleapis.com/reservation-name",
+            reservation_affinity_values=[
+                "projects/test-project/zones/us-central1-a/reservations/test-reservation"
+            ],
+        )
+        deploy_mock.assert_called_once_with(
+            types.DeployRequest(
+                destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}",
+                custom_model=types.DeployRequest.CustomModel(
+                    gcs_uri=_TEST_GCS_URI,
+                ),
+                deploy_config=types.DeployRequest.DeployConfig(
+                    dedicated_resources=types.DedicatedResources(
+                        min_replica_count=1,
+                        max_replica_count=1,
+                        machine_spec=types.MachineSpec(
+                            machine_type="n1-standard-4",
+                            accelerator_type="NVIDIA_TESLA_T4",
+                            accelerator_count=1,
+                            reservation_affinity=types.ReservationAffinity(
+                                reservation_affinity_type="SPECIFIC_RESERVATION",
+                                key="compute.googleapis.com/reservation-name",
+                                values=[
+                                    "projects/test-project/zones/us-central1-a/reservations/test-reservation"
+                                ],
+                            ),
+                        ),
+                    ),
+                ),
+            )
+        )
+
+    def test_deploy_custom_model_with_system_labels_success(self, deploy_mock):
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        model = model_garden_preview.CustomModel(gcs_uri=_TEST_GCS_URI)
+        model.deploy(system_labels={"test-key": "test-value"})
+        deploy_mock.assert_called_once_with(
+            types.DeployRequest(
+                destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}",
+                custom_model=types.DeployRequest.CustomModel(
+                    gcs_uri=_TEST_GCS_URI,
+                ),
+                deploy_config=types.DeployRequest.DeployConfig(
+                    dedicated_resources=types.DedicatedResources(
+                        min_replica_count=1,
+                        max_replica_count=1,
+                    ),
+                    system_labels={"test-key": "test-value"},
+                ),
+            )
+        )
+
+    @pytest.mark.parametrize("filter_by_user_quota", [True, False])
+    def test_list_deploy_options_with_recommendations(self, filter_by_user_quota):
+        """Tests list_deploy_options when recommend_spec returns recommendations."""
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        mock_model_service_client = mock.Mock()
+        with mock.patch.object(
+            aiplatform.initializer.global_config,
+            "create_client",
+            return_value=mock_model_service_client,
+        ):
+            quota_state = types.RecommendSpecResponse.Recommendation.QuotaState
+            mock_response = types.RecommendSpecResponse(
+                recommendations=[
+                    types.RecommendSpecResponse.Recommendation(
+                        spec=types.RecommendSpecResponse.MachineAndModelContainerSpec(
+                            machine_spec=types.MachineSpec(
+                                machine_type="n1-standard-4",
+                                accelerator_type=types.AcceleratorType.NVIDIA_TESLA_T4,
+                                accelerator_count=1,
+                            )
+                        ),
+                        region="us-central1",
+                        user_quota_state=quota_state.QUOTA_STATE_USER_HAS_QUOTA,
+                    ),
+                    types.RecommendSpecResponse.Recommendation(
+                        spec=types.RecommendSpecResponse.MachineAndModelContainerSpec(
+                            machine_spec=types.MachineSpec(
+                                machine_type="n1-standard-8",
+                                accelerator_type=types.AcceleratorType.NVIDIA_TESLA_V100,
+                                accelerator_count=2,
+                            )
+                        ),
+                        region="us-east1",
+                        user_quota_state=quota_state.QUOTA_STATE_NO_USER_QUOTA,
+                    ),
+                    types.RecommendSpecResponse.Recommendation(
+                        spec=types.RecommendSpecResponse.MachineAndModelContainerSpec(
+                            machine_spec=types.MachineSpec(
+                                machine_type="g2-standard-24",
+                                accelerator_type=types.AcceleratorType.NVIDIA_L4,
+                                accelerator_count=2,
+                            )
+                        ),
+                        region="us-central1",
+                        user_quota_state=quota_state.QUOTA_STATE_UNSPECIFIED,
+                    ),
+                ]
+            )
+            mock_model_service_client.recommend_spec.return_value = mock_response
+
+            custom_model = model_garden_preview.CustomModel(gcs_uri=_TEST_GCS_URI)
+            result = custom_model.list_deploy_options(
+                filter_by_user_quota=filter_by_user_quota
+            )
+
+            if filter_by_user_quota:
+                expected_output = textwrap.dedent(
+                    """\
+                    [Option 1]
+                    machine_type="n1-standard-4",
+                    accelerator_type="NVIDIA_TESLA_T4",
+                    accelerator_count=1,
+                    region="us-central1",
+                    user_quota_state="QUOTA_STATE_USER_HAS_QUOTA\""""
+                )
+            else:
+                expected_output = textwrap.dedent(
+                    """\
+                    [Option 1]
+                    machine_type="n1-standard-4",
+                    accelerator_type="NVIDIA_TESLA_T4",
+                    accelerator_count=1,
+                    region="us-central1",
+                    user_quota_state="QUOTA_STATE_USER_HAS_QUOTA"
+
+                    [Option 2]
+                    machine_type="n1-standard-8",
+                    accelerator_type="NVIDIA_TESLA_V100",
+                    accelerator_count=2,
+                    region="us-east1",
+                    user_quota_state="QUOTA_STATE_NO_USER_QUOTA"
+
+                    [Option 3]
+                    machine_type="g2-standard-24",
+                    accelerator_type="NVIDIA_L4",
+                    accelerator_count=2,
+                    region="us-central1\""""
+                )
+            assert result == expected_output
+            mock_model_service_client.recommend_spec.assert_called_once_with(
+                types.RecommendSpecRequest(
+                    gcs_uri=_TEST_GCS_URI,
+                    parent=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}",
+                    check_machine_availability=True,
+                    check_user_quota=filter_by_user_quota,
+                ),
+                timeout=60,
+            )
+
+    def test_list_deploy_options_with_specs(self):
+        """Tests list_deploy_options returning all compatible specs when available_machines is False."""
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        mock_model_service_client = mock.Mock()
+        with mock.patch.object(
+            aiplatform.initializer.global_config,
+            "create_client",
+            return_value=mock_model_service_client,
+        ):
+            mock_response = types.RecommendSpecResponse(
+                specs=[
+                    types.RecommendSpecResponse.MachineAndModelContainerSpec(
+                        machine_spec=types.MachineSpec(
+                            machine_type="n1-standard-4",
+                            accelerator_type=types.AcceleratorType.NVIDIA_TESLA_T4,
+                            accelerator_count=1,
+                        )
+                    ),
+                    types.RecommendSpecResponse.MachineAndModelContainerSpec(
+                        machine_spec=types.MachineSpec(
+                            machine_type="n1-standard-8",
+                            accelerator_type=types.AcceleratorType.NVIDIA_TESLA_V100,
+                            accelerator_count=2,
+                        )
+                    ),
+                ]
+            )
+            mock_model_service_client.recommend_spec.return_value = mock_response
+
+            custom_model = model_garden_preview.CustomModel(gcs_uri=_TEST_GCS_URI)
+            result = custom_model.list_deploy_options(
+                available_machines=False, filter_by_user_quota=False
+            )
+
+            expected_output = textwrap.dedent(
+                """\
+                [Option 1]
+                machine_type="n1-standard-4",
+                accelerator_type="NVIDIA_TESLA_T4",
+                accelerator_count=1
+
+                [Option 2]
+                machine_type="n1-standard-8",
+                accelerator_type="NVIDIA_TESLA_V100",
+                accelerator_count=2"""
+            )
+            assert result == expected_output
+            mock_model_service_client.recommend_spec.assert_called_once_with(
+                types.RecommendSpecRequest(
+                    gcs_uri=_TEST_GCS_URI,
+                    parent=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}",
+                    check_machine_availability=False,
+                    check_user_quota=False,
+                ),
+                timeout=60,
+            )
+
+    def test_list_deploy_options_exception(self):
+        """Tests list_deploy_options when recommend_spec raises an exception."""
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        mock_model_service_client = mock.Mock()
+        with mock.patch.object(
+            aiplatform.initializer.global_config,
+            "create_client",
+            return_value=mock_model_service_client,
+        ):
+            mock_model_service_client.recommend_spec.side_effect = ValueError(
+                "Test Error"
+            )
+            custom_model = model_garden_preview.CustomModel(gcs_uri=_TEST_GCS_URI)
+            with pytest.raises(ValueError) as exception:
+                custom_model.list_deploy_options()
+            assert str(exception.value) == "Test Error"
+            mock_model_service_client.recommend_spec.assert_called_once()
+
+
+class TestVertexAIModelGardenModel:
+    """Test cases for the Model Garden Model class."""
+
+    def setup_method(self):
+        importlib.reload(aiplatform.initializer)
+        importlib.reload(aiplatform)
+        aiplatform.init(project=_TEST_PROJECT)
+
+    def teardown_method(self):
+        aiplatform.initializer.global_pool.shutdown(wait=True)
+
+    def test_no_model_name_raises_error(self):
+        """Tests that constructing a Model without a model name raises an error."""
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        with pytest.raises(ValueError) as exception:
+            model_garden_preview.Model()
+        assert str(exception.value) == "model_name must be specified."
+
+    def test_deploy_full_resource_name_success(self, deploy_mock):
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        model = model_garden_preview.Model(model_name=_TEST_MODEL_FULL_RESOURCE_NAME)
+        model.deploy()
+        deploy_mock.assert_called_once_with(
+            types.DeployRequest(
+                publisher_model_name=_TEST_MODEL_FULL_RESOURCE_NAME,
+                destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}",
+            )
+        )
+
+    def test_deploy_simplified_resource_name_success(self, deploy_mock):
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        model = model_garden_preview.Model(
+            model_name=_TEST_MODEL_SIMPLIFIED_RESOURCE_NAME
+        )
+        model.deploy()
+        deploy_mock.assert_called_once_with(
+            types.DeployRequest(
+                publisher_model_name=_TEST_MODEL_FULL_RESOURCE_NAME,
+                destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}",
+            )
+        )
+
+    def test_deploy_hugging_face_id_success(self, deploy_mock):
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        model = model_garden_preview.Model(model_name=_TEST_MODEL_HUGGING_FACE_ID)
+        model.deploy()
+        deploy_mock.assert_called_once_with(
+            types.DeployRequest(
+                hugging_face_model_id=_TEST_MODEL_HUGGING_FACE_ID.lower(),
+                destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}",
+            )
+        )
+
+    def test_deploy_gcs_uri_success(self, deploy_mock):
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        model = model_garden_preview.Model(model_name=_TEST_GCS_URI)
+        model.deploy()
+        deploy_mock.assert_called_once_with(
+            types.DeployRequest(
+                destination=f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}",
+                custom_model=types.DeployRequest.CustomModel(
+                    gcs_uri=_TEST_GCS_URI,
+                ),
+                deploy_config=types.DeployRequest.DeployConfig(
+                    dedicated_resources=types.DedicatedResources(
+                        min_replica_count=1,
+                        max_replica_count=1,
+                    )
+                ),
+            )
+        )
+
+    def test_deploy_model_registry_model_raises_error(self):
+        """Tests that Model Registry models raise NotImplementedError."""
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        with pytest.raises(NotImplementedError) as exception:
+            model_garden_preview.Model(model_name=_TEST_MODEL_NAME)
+        assert str(exception.value) == "Model Registry models are not supported yet."
diff --git a/tests/unit/vertexai/test_rubric_based_eval.py b/tests/unit/vertexai/test_rubric_based_eval.py
index 557873f198..1fdc0e02cd 100644
--- a/tests/unit/vertexai/test_rubric_based_eval.py
+++ b/tests/unit/vertexai/test_rubric_based_eval.py
@@ -30,6 +30,7 @@ from vertexai.preview.evaluation.metrics import (
     predefined_rubric_metrics,
 )
+import copy
 import pandas as pd
 import pytest
@@ -281,7 +282,7 @@ def test_pointwise_text_quality_metric(self):
         with mock.patch.object(
             target=gapic_evaluation_services.EvaluationServiceClient,
             attribute="evaluate_instances",
-            side_effect=_MOCK_POINTWISE_RESPONSE,
+            side_effect=[copy.deepcopy(x) for x in _MOCK_POINTWISE_RESPONSE],
         ):
             eval_result = EvalTask(
                 dataset=_TEST_EVAL_DATASET, metrics=[metric]