From 403bdafa99c0d74a9433c83d02ebd0265c2594bd Mon Sep 17 00:00:00 2001
From: Nathan Park
Date: Thu, 1 May 2025 15:49:15 -0700
Subject: [PATCH 1/5] Improve error logging and documentation for issue 4007

---
 doc/frameworks/pytorch/using_pytorch.rst | 46 ++++++++++++++++++++++++
 src/sagemaker/utils.py                   | 35 ++++++++++++------
 2 files changed, 71 insertions(+), 10 deletions(-)

diff --git a/doc/frameworks/pytorch/using_pytorch.rst b/doc/frameworks/pytorch/using_pytorch.rst
index 4141dd84db..4c5d7c9cae 100644
--- a/doc/frameworks/pytorch/using_pytorch.rst
+++ b/doc/frameworks/pytorch/using_pytorch.rst
@@ -1048,6 +1048,43 @@ see `For versions 1.1 and lower <#for-versions-1.1-and-lower>`_.
 
 Where ``requirements.txt`` is an optional file that specifies dependencies on third-party libraries.
 
+Important Packaging Instructions
+------------------------------
+
+When creating your model artifact (``model.tar.gz``), follow these steps to avoid common deployment issues:
+
+1. Navigate to the directory containing your model files:
+
+   .. code:: bash
+
+       cd my_model
+
+2. Create the tar archive from within this directory:
+
+   .. code:: bash
+
+       tar czvf ../model.tar.gz *
+
+**Common Mistakes to Avoid:**
+
+* Do NOT create the archive from the parent directory using ``tar czvf model.tar.gz my_model/``. 
+  This creates an extra directory level that will cause deployment errors.
+* Ensure ``inference.py`` is directly under the ``code/`` directory in your archive.
+* Verify your archive structure using:
+
+  .. code:: bash
+
+      tar tvf model.tar.gz
+
+  You should see output similar to:
+
+  ::
+
+      model.pth
+      code/
+      code/inference.py
+      code/requirements.txt
+
 Create a ``PyTorchModel`` object
 --------------------------------
 
@@ -1066,6 +1103,15 @@ Now call the :class:`sagemaker.pytorch.model.PyTorchModel` constructor to create
 
 Now you can call the ``predict()`` method to get predictions from your deployed model.
 
+Troubleshooting
+--------------
+
+If you encounter a ``FileNotFoundError`` for ``inference.py``, check:
+
+1. That your model artifact is packaged correctly following the instructions above
+2. The structure of your ``model.tar.gz`` file matches the expected layout
+3. You're creating the archive from within the model directory, not from its parent
+
 ***********************************************
 Attach an estimator to an existing training job
 ***********************************************
diff --git a/src/sagemaker/utils.py b/src/sagemaker/utils.py
index 1a75a3a5cc..617f8459d7 100644
--- a/src/sagemaker/utils.py
+++ b/src/sagemaker/utils.py
@@ -13,10 +13,12 @@
 """Placeholder docstring"""
 from __future__ import absolute_import
 
+import abc
 import contextlib
 import copy
 import errno
 import inspect
+import json
 import logging
 import os
 import random
@@ -25,31 +27,30 @@
 import tarfile
 import tempfile
 import time
-from functools import lru_cache
-from typing import Union, Any, List, Optional, Dict
-import json
-import abc
 import uuid
 from datetime import datetime
-from os.path import abspath, realpath, dirname, normpath, join as joinpath
-
+from functools import lru_cache
 from importlib import import_module
+from os.path import abspath, dirname
+from os.path import join as joinpath
+from os.path import normpath, realpath
+from typing import Any, Dict, List, Optional, Union
 
 import boto3
 import botocore
 from botocore.utils import merge_dicts
-from six.moves.urllib import parse
 from six import viewitems
+from six.moves.urllib import parse
 
 from sagemaker import deprecations
 from sagemaker.config import validate_sagemaker_config
 from sagemaker.config.config_utils import (
-    _log_sagemaker_config_single_substitution,
     _log_sagemaker_config_merge,
+    _log_sagemaker_config_single_substitution,
 )
 from sagemaker.enums import RoutingStrategy
 from sagemaker.session_settings import SessionSettings
-from sagemaker.workflow import is_pipeline_variable, is_pipeline_parameter_string
+from sagemaker.workflow import is_pipeline_parameter_string, is_pipeline_variable
 from sagemaker.workflow.entities import PipelineVariable
 
 ALTERNATE_DOMAINS = {
@@ -624,7 +625,21 @@ def _create_or_update_code_dir(
         if os.path.exists(os.path.join(code_dir, inference_script)):
             pass
         else:
-            raise
+            raise FileNotFoundError(
+                f"Could not find '{inference_script}'. Common solutions:\n"
+                "1. Make sure inference.py exists in the code/ directory\n"
+                "2. Package your model correctly:\n"
+                "   - ✅ DO: Navigate to the directory containing model files and run:\n"
+                "        cd /path/to/model_files\n"
+                "        tar czvf ../model.tar.gz *\n"
+                "   - ❌ DON'T: Create from parent directory:\n"
+                "        tar czvf model.tar.gz model/\n"
+                "\nExpected structure in model.tar.gz:\n"
+                "  ├── model.pth (or your model file)\n"
+                "  └── code/\n"
+                "      ├── inference.py\n"
+                "      └── requirements.txt"
+            )
 
     for dependency in dependencies:
         lib_dir = os.path.join(code_dir, "lib")

From a25ce1fcfd77888d28c1322efab49823177fab67 Mon Sep 17 00:00:00 2001
From: Nathan Park
Date: Fri, 2 May 2025 11:15:00 -0700
Subject: [PATCH 2/5] Fix a whitespace

---
 doc/frameworks/pytorch/using_pytorch.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/frameworks/pytorch/using_pytorch.rst b/doc/frameworks/pytorch/using_pytorch.rst
index 4c5d7c9cae..9bd48ef984 100644
--- a/doc/frameworks/pytorch/using_pytorch.rst
+++ b/doc/frameworks/pytorch/using_pytorch.rst
@@ -1049,7 +1049,7 @@ see `For versions 1.1 and lower <#for-versions-1.1-and-lower>`_.
 Where ``requirements.txt`` is an optional file that specifies dependencies on third-party libraries.
 
 Important Packaging Instructions
-------------------------------
+--------------------------------
 
 When creating your model artifact (``model.tar.gz``), follow these steps to avoid common deployment issues:
 
@@ -1067,7 +1067,7 @@ When creating your model artifact (``model.tar.gz``), follow these steps to avoi
 
 **Common Mistakes to Avoid:**
 
-* Do NOT create the archive from the parent directory using ``tar czvf model.tar.gz my_model/``. 
+* Do NOT create the archive from the parent directory using ``tar czvf model.tar.gz my_model/``.
   This creates an extra directory level that will cause deployment errors.
 * Ensure ``inference.py`` is directly under the ``code/`` directory in your archive.
 * Verify your archive structure using:
@@ -1104,7 +1104,7 @@ Now call the :class:`sagemaker.pytorch.model.PyTorchModel` constructor to create
 Now you can call the ``predict()`` method to get predictions from your deployed model.
 
 Troubleshooting
---------------
+---------------
 
 If you encounter a ``FileNotFoundError`` for ``inference.py``, check:
 

From fd20c3f241a4140cf596d77756a4db9263c4df0d Mon Sep 17 00:00:00 2001
From: Nathan Park
Date: Sun, 4 May 2025 20:16:47 -0700
Subject: [PATCH 3/5] Add hyperlink to RTDs

---
 src/sagemaker/utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/sagemaker/utils.py b/src/sagemaker/utils.py
index 617f8459d7..c3896e32a3 100644
--- a/src/sagemaker/utils.py
+++ b/src/sagemaker/utils.py
@@ -639,6 +639,8 @@ def _create_or_update_code_dir(
             "  └── code/\n"
             "      ├── inference.py\n"
             "      └── requirements.txt"
+            "\nFor more details, see the documentation:\n"
+            "https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#bring-your-own-model"
         )
 
     for dependency in dependencies:

From 72aacf8329265349afce6516fcad520fbd353a5a Mon Sep 17 00:00:00 2001
From: Nathan Park
Date: Sun, 4 May 2025 20:27:45 -0700
Subject: [PATCH 4/5] Condense line to < 120

---
 src/sagemaker/utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/sagemaker/utils.py b/src/sagemaker/utils.py
index c3896e32a3..18e3ae75a0 100644
--- a/src/sagemaker/utils.py
+++ b/src/sagemaker/utils.py
@@ -625,6 +625,10 @@ def _create_or_update_code_dir(
         if os.path.exists(os.path.join(code_dir, inference_script)):
             pass
         else:
+            docs_url = (
+                "https://sagemaker.readthedocs.io/en/stable/"
+                "frameworks/pytorch/using_pytorch.html#bring-your-own-model"
+            )
             raise FileNotFoundError(
                 f"Could not find '{inference_script}'. Common solutions:\n"
                 "1. Make sure inference.py exists in the code/ directory\n"
@@ -640,7 +644,7 @@ def _create_or_update_code_dir(
             "      ├── inference.py\n"
            "      └── requirements.txt"
             "\nFor more details, see the documentation:\n"
-            "https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#bring-your-own-model"
+            f"{docs_url}"
         )
 
     for dependency in dependencies:

From 4f9c9a5e40c3ee77b94b1e3dfc2df2f37aaa3b32 Mon Sep 17 00:00:00 2001
From: Nathan Park
Date: Mon, 5 May 2025 08:32:00 -0700
Subject: [PATCH 5/5] Better doc link

---
 src/sagemaker/utils.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/sagemaker/utils.py b/src/sagemaker/utils.py
index 18e3ae75a0..d4faa5ad9f 100644
--- a/src/sagemaker/utils.py
+++ b/src/sagemaker/utils.py
@@ -625,10 +625,6 @@ def _create_or_update_code_dir(
         if os.path.exists(os.path.join(code_dir, inference_script)):
             pass
         else:
-            docs_url = (
-                "https://sagemaker.readthedocs.io/en/stable/"
-                "frameworks/pytorch/using_pytorch.html#bring-your-own-model"
-            )
             raise FileNotFoundError(
                 f"Could not find '{inference_script}'. Common solutions:\n"
                 "1. Make sure inference.py exists in the code/ directory\n"
@@ -642,9 +638,10 @@ def _create_or_update_code_dir(
             "  ├── model.pth (or your model file)\n"
             "  └── code/\n"
             "      ├── inference.py\n"
-            "      └── requirements.txt"
+            "      └── requirements.txt\n"
             "\nFor more details, see the documentation:\n"
-            f"{docs_url}"
+            + "https://sagemaker.readthedocs.io/en/stable/"
+            + "frameworks/pytorch/using_pytorch.html#bring-your-own-model"
         )
 
     for dependency in dependencies:
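
The packaging workflow that the new documentation and error message describe can be exercised with nothing but the Python standard library. The sketch below is illustrative only and is not part of the patches above; the ``my_model/`` directory and its file names are assumptions taken from the documented example layout. It mirrors ``cd my_model && tar czvf ../model.tar.gz *`` and then checks that ``code/inference.py`` sits at the top level of the archive, which is the condition the improved ``FileNotFoundError`` in ``_create_or_update_code_dir`` reports when it is violated.

    import tarfile
    from pathlib import Path

    # Assumed local layout (matches the documented example):
    #   my_model/model.pth
    #   my_model/code/inference.py
    #   my_model/code/requirements.txt
    model_dir = Path("my_model")
    archive = Path("model.tar.gz")

    # Equivalent to `cd my_model && tar czvf ../model.tar.gz *`: each entry is added
    # at the archive root, so no extra "my_model/" directory level is introduced.
    with tarfile.open(archive, "w:gz") as tar:
        for entry in sorted(model_dir.iterdir()):
            tar.add(entry, arcname=entry.name)

    # Equivalent to `tar tvf model.tar.gz`: list the members and confirm the layout
    # the error message expects, i.e. inference.py directly under code/.
    with tarfile.open(archive, "r:gz") as tar:
        members = tar.getnames()
        print("\n".join(members))
        if not any(name.lstrip("./") == "code/inference.py" for name in members):
            raise SystemExit("code/inference.py is not at the top level of the archive")

Running ``tar tvf model.tar.gz`` on the result should list ``model.pth``, ``code/``, ``code/inference.py``, and ``code/requirements.txt`` with no leading ``my_model/`` prefix, matching the expected structure shown in both the documentation and the error message.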