From e94237b6ad545682726b3f9a7f5ccd301fb70b8a Mon Sep 17 00:00:00 2001 From: Olusegun Durojaye Date: Mon, 8 Dec 2025 06:21:12 -0500 Subject: [PATCH 1/2] mlflow and tracking of trained models --- .../mlflow-tracking/README.md | 92 +++++++++++++++ .../mlflow-tracking/setup_mlflow_env.sh | 36 ++++++ .../mlflow-tracking/train_and_track.py | 109 ++++++++++++++++++ 3 files changed, 237 insertions(+) create mode 100644 examples/MLOps_Ops_and_Enterprise_Features/mlflow-tracking/README.md create mode 100644 examples/MLOps_Ops_and_Enterprise_Features/mlflow-tracking/setup_mlflow_env.sh create mode 100644 examples/MLOps_Ops_and_Enterprise_Features/mlflow-tracking/train_and_track.py diff --git a/examples/MLOps_Ops_and_Enterprise_Features/mlflow-tracking/README.md b/examples/MLOps_Ops_and_Enterprise_Features/mlflow-tracking/README.md new file mode 100644 index 00000000..4cef68bf --- /dev/null +++ b/examples/MLOps_Ops_and_Enterprise_Features/mlflow-tracking/README.md @@ -0,0 +1,92 @@ +# πŸ“ˆ MLflow Experiment Tracking Template (GPU Ready) + +## 🌟 Overview + +This template provides a robust, reproducible framework for **tracking Deep Learning experiments** on GPU-accelerated hardware. It leverages **MLflow Tracking** to automatically log hyperparameters, model artifacts, and vital **GPU system utilization metrics** (memory, temperature, and usage) during the training process. + +This system is essential for comparing model performance and hardware efficiency across different runsβ€”a key capability for MLOps on platforms like **Saturn Cloud**. + +### Key Features + + * **GPU Readiness:** Dynamically detects and utilizes available CUDA devices. + * **Automatic Tracking:** Uses `mlflow.pytorch.autolog()` to capture hyperparameters and model architecture. + * **System Metrics:** Logs GPU/CPU usage and memory over time using `log_system_metrics=True`. + * **Centralized UI:** Easy verification and comparison of runs via the **MLflow UI table**. + +----- + +## πŸ› οΈ How to Run the Template + +### 1\. Project Setup (Bash Script) + +This script sets up a stable Python environment, installs PyTorch, MLflow, and the necessary GPU monitoring packages (`nvidia-ml-py`). + +#### File: `setup_mlflow_env.sh` + +#### Step A: Grant Execution Permission + +In your terminal, grant executable permission to the setup script. + +```bash +chmod +x setup_mlflow_env.sh +``` + +#### Step B: Execute the Setup + +Run the script to install all dependencies. + +```bash +./setup_mlflow_env.sh +``` + +----- + +### 2\. Procedures (Execution & Monitoring) + +#### Step C: Activate the Environment + +You must do this every time you open a new terminal session. + +```bash +source mlflow_gpu_env_stable/bin/activate +``` + +#### Step D: Configure Tracking Location + +The template uses the environment variable `MLFLOW_TRACKING_URI` to determine where to log data. + +| Mode | Configuration (Terminal Command) | Use Case | +| :--- | :--- | :--- | +| **Local (Default)** | (No command needed) | Development and testing where logs are written to the local `mlruns/` folder. | +| **Remote (Server)** | `export MLFLOW_TRACKING_URI="http://:5000"` | Production jobs requiring centralized, shared tracking (e.g., **Saturn Cloud Managed MLflow**). | + +#### Step E: Run the Tracking Sample + +Execute the main pipeline script (`train_and_track.py`). 
+
+```bash
+python train_and_track.py
+```
+
+#### Step F: Verification (Checking Tracked Data)
+
+  * **Local UI Access:** If running locally, start the UI server:
+    ```bash
+    mlflow ui --host 0.0.0.0 --port 5000
+    ```
+    Then, access the exposed IP and port in your browser.
+  * **Remote UI Access:** Navigate to the host address of your remote tracking server. The **MLflow UI table** will display the run, confirming successful logging of all parameters, metrics, and **GPU utilization**.
+
+-----
+
+## πŸ”— Conclusion and Scaling on Saturn Cloud
+
+This template creates a fully observable training environment, fulfilling the core requirements of MLOps for GPU-accelerated workloads. All run detailsβ€”from hyperparameters to **GPU utilization metrics**β€”are now centralized and ready for comparison.
+
+To maximize performance, streamline infrastructure management, and integrate MLOps practices, deploy this template on **Saturn Cloud**:
+
+  * **Official Saturn Cloud Website:** [Saturn Cloud](https://saturncloud.io/)
+  * **MLOps Guide:** Saturn Cloud enables a robust MLOps lifecycle by simplifying infrastructure, scaling, and experiment tracking. [A Practical Guide to MLOps](https://saturncloud.io/docs/design-principles/concepts/mlops/)
+  * **GPU Clusters:** Easily provision and manage GPU-equipped compute resources, including high-performance NVIDIA A100/H100 GPUs, directly within **Saturn Cloud**. [Saturn Cloud Documentation](https://saturncloud.io/docs/user-guide/)
+
+**Start building your scalable MLOps pipeline today on Saturn Cloud\!**
\ No newline at end of file
diff --git a/examples/MLOps_Ops_and_Enterprise_Features/mlflow-tracking/setup_mlflow_env.sh b/examples/MLOps_Ops_and_Enterprise_Features/mlflow-tracking/setup_mlflow_env.sh
new file mode 100644
index 00000000..e04726bf
--- /dev/null
+++ b/examples/MLOps_Ops_and_Enterprise_Features/mlflow-tracking/setup_mlflow_env.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+ENV_NAME="mlflow_gpu_env_stable"
+PYTHON_VERSION="3.12"
+CUDA_VERSION="12"
+
+echo "================================================="
+echo "πŸš€ Setting up MLflow GPU Tracking Environment (Python $PYTHON_VERSION)"
+echo "================================================="
+
+# --- 1. Create and Activate Stable VENV ---
+rm -rf $ENV_NAME
+python$PYTHON_VERSION -m venv $ENV_NAME
+source $ENV_NAME/bin/activate
+echo "βœ… Virtual Environment created and activated."
+
+# --- 2. Install Core Libraries ---
+echo "--- Installing Core MLflow and PyTorch Libraries ---"
+
+# Install PyTorch (GPU version for CUDA 12.1)
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+
+# Install MLflow and helper libraries
+pip install mlflow==2.11.3 numpy scikit-learn pandas
+
+# --- 3. Replace Deprecated PYNVML for System Metrics ---
+echo "--- Replacing deprecated pynvml with nvidia-ml-py ---"
+
+# Uninstall old package (if it exists)
+pip uninstall -y pynvml
+
+# Install the correct GPU monitoring package and prerequisites
+pip install psutil nvidia-ml-py
+
+echo "--- Installation Complete ---"
+echo "βœ… Environment is ready. Run 'source $ENV_NAME/bin/activate' before executing the Python script."
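+
+# Optional sanity check (assumes an NVIDIA GPU and driver are present): confirm that the
+# freshly installed nvidia-ml-py bindings can see the device that MLflow's system metrics
+# (log_system_metrics=True) will monitor. A failure here only means GPU metrics will be
+# missing from the tracked runs.
+python -c "import pynvml; pynvml.nvmlInit(); print('GPU visible:', pynvml.nvmlDeviceGetName(pynvml.nvmlDeviceGetHandleByIndex(0)))" || true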
\ No newline at end of file diff --git a/examples/MLOps_Ops_and_Enterprise_Features/mlflow-tracking/train_and_track.py b/examples/MLOps_Ops_and_Enterprise_Features/mlflow-tracking/train_and_track.py new file mode 100644 index 00000000..1e2b7628 --- /dev/null +++ b/examples/MLOps_Ops_and_Enterprise_Features/mlflow-tracking/train_and_track.py @@ -0,0 +1,109 @@ +import os +import time +import torch +import torch.nn as nn +import torch.optim as optim +from torch.utils.data import TensorDataset, DataLoader + +import mlflow +import mlflow.pytorch +import numpy as np + +# --- Configuration --- +# 1. MLflow Tracking URI (MLflow server or local './mlruns') +MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", "file:./mlruns") +MLFLOW_EXPERIMENT_NAME = "GPU_DeepLearning_RunPod" + +# 2. Hyperparameters (These will be automatically logged by mlflow.pytorch.autolog()) +# Note: Autologging handles logging the optimizer details and LR automatically. +PARAMS = { + "learning_rate": 0.001, + "epochs": 5, + "batch_size": 32, + "model_type": "SimpleConvNet", + "optimizer": "Adam" +} + +# --- PyTorch Model Definition --- +class SimpleConvNet(nn.Module): + def __init__(self): + super(SimpleConvNet, self).__init__() + self.conv1 = nn.Conv2d(1, 10, kernel_size=5) + self.relu = nn.ReLU() + self.fc = nn.Linear(10 * 24 * 24, 1) + + def forward(self, x): + x = self.relu(self.conv1(x)) + x = x.view(-1, 10 * 24 * 24) + x = self.fc(x) + return x + +def train_and_log(device): + + # --- 1. MLflow Setup --- + mlflow.set_tracking_uri(MLFLOW_TRACKING_URI) + mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME) + + # 2. ENABLE AUTOLOGGING: Automatically logs model, params, and metrics (except custom loops) + mlflow.pytorch.autolog(log_models=True, log_datasets=False) # + + # 3. START RUN: Enable system metrics logging inside the run context + with mlflow.start_run(run_name="GPU_Train_Run", log_system_metrics=True) as run: + + # Log system information manually (GPU type and custom params not auto-logged) + if device.type == 'cuda': + mlflow.log_param("gpu_device", torch.cuda.get_device_name(0)) + mlflow.log_params(PARAMS) + + # --- Training Execution --- + print(f"Starting training on device: {device} with LR={PARAMS['learning_rate']}") + + # Simulate Data Setup + data = torch.randn(100, 1, 28, 28, device=device) + labels = torch.randint(0, 2, (100, 1), dtype=torch.float32, device=device) + dataloader = DataLoader(TensorDataset(data, labels), batch_size=PARAMS['batch_size']) + + model = SimpleConvNet().to(device) + optimizer = optim.Adam(model.parameters(), lr=PARAMS['learning_rate']) + criterion = nn.BCEWithLogitsLoss() + + # Training Loop + for epoch in range(PARAMS['epochs']): + total_loss = 0.0 + + for inputs, targets in dataloader: + optimizer.zero_grad() + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + total_loss += loss.item() + + avg_loss = total_loss / len(dataloader) + + # Manually log the primary metric (optional, as autolog might cover this in integrated loops) + mlflow.log_metric("avg_loss_manual", avg_loss, step=epoch) + + print(f"Epoch {epoch+1} - Loss: {avg_loss:.4f}") + + # 4. 
+        # Final Logging
+        mlflow.log_metric("final_loss", avg_loss)
+
+        # Note: hyperparameters were logged explicitly above via mlflow.log_params();
+        # mlflow.pytorch.autolog() additionally captures model details where supported.
+
+        print("\nβœ… Training complete.")
+        print(f"MLflow Run ID: {run.info.run_id}")
+
+
+def main():
+    if torch.cuda.is_available():
+        device = torch.device("cuda")
+        print("πŸ’‘ GPU detected and available.")
+    else:
+        device = torch.device("cpu")
+        print("⚠️ GPU not detected. Running on CPU.")
+
+    train_and_log(device)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From 0600648b32912a9fef7f751e70dda08782897014 Mon Sep 17 00:00:00 2001
From: Olusegun Durojaye
Date: Tue, 9 Dec 2025 09:10:46 -0500
Subject: [PATCH 2/2] cost-performance benchmarking

---
 .../cost_benchmark/README.md              | 105 ++++++++++++
 .../cost_benchmark/cost_benchmark.py      | 155 ++++++++++++++++++
 .../cost_benchmark/setup_benchmark_env.sh |  20 +++
 3 files changed, 280 insertions(+)
 create mode 100644 examples/MLOps_Ops_and_Enterprise_Features/cost_benchmark/README.md
 create mode 100644 examples/MLOps_Ops_and_Enterprise_Features/cost_benchmark/cost_benchmark.py
 create mode 100644 examples/MLOps_Ops_and_Enterprise_Features/cost_benchmark/setup_benchmark_env.sh

diff --git a/examples/MLOps_Ops_and_Enterprise_Features/cost_benchmark/README.md b/examples/MLOps_Ops_and_Enterprise_Features/cost_benchmark/README.md
new file mode 100644
index 00000000..966de34d
--- /dev/null
+++ b/examples/MLOps_Ops_and_Enterprise_Features/cost_benchmark/README.md
@@ -0,0 +1,105 @@
+# πŸ’° Cost/Performance Benchmark
+
+## 🌟 Overview
+
+This template provides a practical framework for **FinOps (Financial Operations)** by running a **Cost/Performance Benchmark** on deep learning tasks. It measures the trade-off between speed and cost, providing data to answer the core question: *Which hardware configuration delivers the best performance per dollar?*
+
+It uses a **custom Python logger** to record key metrics, generating a structured report that can be used to compare different machine types (e.g., A100 vs. V100, or CPU vs. GPU).
+
+### Key Metrics Tracked
+
+  * **Cost/Epoch:** Estimated cost per epoch, derived from the configured hourly rate.
+  * **Tokens/sec:** Measures the raw speed/throughput of the hardware.
+  * **Job Summary:** Provides total estimated cost and total execution time.
+  * **Hardware:** Tracks the CPU vs. GPU execution path.
+
+-----
+
+## πŸ› οΈ Implementation Details
+
+### 1\. Project Setup (Bash Script)
+
+Save the following as `setup_benchmark_env.sh`. This script creates a virtual environment and installs PyTorch plus the helper libraries.
+
+```bash
+#!/bin/bash
+
+ENV_NAME="cost_benchmark_env"
+PYTHON_VERSION="3.12"
+
+echo "================================================="
+echo "πŸš€ Setting up Cost/Performance Benchmark Environment"
+echo "================================================="
+
+# 1. Create and Activate Stable VENV
+rm -rf $ENV_NAME
+python$PYTHON_VERSION -m venv $ENV_NAME
+source $ENV_NAME/bin/activate
+
+# 2. Install PyTorch (Required for accurate CUDA event timing)
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+
+# 3. Install Helpers
+pip install numpy pandas psutil
+
+echo "--- Installation Complete ---"
+```
+
+#### Execution
+
+1. **Grant Permission:** `chmod +x setup_benchmark_env.sh`
+2. **Run Setup:** `./setup_benchmark_env.sh`
+
+-----
+
+### 2\. Procedures (Job Execution)
+
+#### Step A: Activate the Environment
+
+```bash
+source cost_benchmark_env/bin/activate
+```
+
+#### Step B: Configure Pricing (CRITICAL)
+
+Before running the script, you **must** update the `GPU_HOURLY_RATE` constant in `cost_benchmark.py` to reflect the actual hourly cost of the machine you are testing on Saturn Cloud.
+
+```python
+# --- Configuration & Constants in cost_benchmark.py ---
+# UPDATE THIS VALUE MANUALLY based on your Saturn Cloud instance type
+GPU_HOURLY_RATE = 3.20  # Example $/hour for a high-end GPU (must be updated manually)
+```
+
+#### Step C: Run the Benchmark
+
+Execute the Python script (`cost_benchmark.py`).
+
+```bash
+python cost_benchmark.py
+```
+
+### Verification and Reporting
+
+The script writes structured output to the console and to a persistent file named **`benchmark_results.log`**.
+
+| Log Entry Example | Metric Significance |
+| :--- | :--- |
+| `Time: 0.0500s` | Raw speed (lower is better). |
+| `Cost: $0.00004` | **Cost/Epoch** (lower is better for efficiency). |
+| `Tokens/s: 64000` | **Throughput/Speed** (higher is better for performance). |
+
+This log file serves as the definitive source for generating a comparative chart (Cost/Epoch vs. Tokens/sec) for optimal rightsizing.
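+
+For a quick comparison across machine types, the per-epoch lines in `benchmark_results.log` can be loaded into a small table. The sketch below is illustrative rather than part of the template (the file name `parse_benchmark_log.py` is an assumption); it relies only on the log format configured in `setup_logger()` and on pandas, which the setup script already installs.
+
+```python
+# parse_benchmark_log.py -- minimal sketch for turning benchmark_results.log into a
+# comparable table (assumes the default "%(asctime)s | %(levelname)s | %(message)s" format).
+import re
+
+import pandas as pd
+
+# Matches the per-epoch lines emitted by run_training_benchmark()
+pattern = re.compile(
+    r"EPOCH: (\d+)/\d+ \| Time: ([\d.]+)s \| Cost: \$([\d.]+) \| Tokens/s: (\d+)"
+)
+
+rows = []
+with open("benchmark_results.log") as log_file:
+    for line in log_file:
+        match = pattern.search(line)
+        if match:
+            epoch, seconds, cost, tokens_per_sec = match.groups()
+            rows.append({
+                "epoch": int(epoch),
+                "time_s": float(seconds),
+                "cost_usd": float(cost),
+                "tokens_per_sec": float(tokens_per_sec),
+            })
+
+df = pd.DataFrame(rows)
+print(df)
+print(f"Mean cost/epoch: ${df['cost_usd'].mean():.5f} | Mean tokens/s: {df['tokens_per_sec'].mean():.0f}")
+```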
+
+-----
+
+## πŸ”— Conclusion and Scaling on Saturn Cloud
+
+The **Cost/Performance Benchmark** template is fundamental to the **Optimize** phase of the FinOps lifecycle. By quantifying the true expense of your speed, you can make data-driven decisions to reduce cloud waste.
+
+To operationalize this benchmarking practice, **Saturn Cloud** offers the ideal platform:
+
+  * **FinOps Integration:** Saturn Cloud is an all-in-one solution for data science and MLOps, essential for implementing robust FinOps practices.
+  * **Rightsizing and Optimization:** Easily run this job on different GPU types within Saturn Cloud to determine the most cost-effective solution before deploying models to production. [Saturn Cloud MLOps Documentation](https://www.saturncloud.io/docs/design-principles/concepts/mlops/)
+  * **Building a Cost-Conscious Culture:** Integrate cost awareness directly into your MLOps pipeline, aligning technical performance with financial goals. [Saturn Cloud Homepage](https://saturncloud.io/)
+
+**Optimize your cloud spend by deploying this template on Saturn Cloud\!**
\ No newline at end of file
diff --git a/examples/MLOps_Ops_and_Enterprise_Features/cost_benchmark/cost_benchmark.py b/examples/MLOps_Ops_and_Enterprise_Features/cost_benchmark/cost_benchmark.py
new file mode 100644
index 00000000..3e7a3a86
--- /dev/null
+++ b/examples/MLOps_Ops_and_Enterprise_Features/cost_benchmark/cost_benchmark.py
@@ -0,0 +1,155 @@
+import os
+import time
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import logging
+import sys
+import numpy as np
+
+# --- Configuration & Constants ---
+# Use the correct GPU pricing for your cloud provider (e.g., Saturn Cloud, AWS, GCP)
+# Example: NVIDIA A100 pricing (approximate, for demonstration)
+GPU_HOURLY_RATE = 3.20  # $/hour for a high-end GPU (Must be updated manually)
+LOG_FILE = "benchmark_results.log"
+
+# Hyperparameters for the simulated job
+EPOCHS = 5
+BATCH_SIZE = 32
+TOTAL_SAMPLES = 50000  # Notional dataset size (the simulated loop runs one batch per epoch)
+TOTAL_TOKENS_PER_SAMPLE = 100  # Represents tokens in an NLP task or features in an image
+TOTAL_TOKENS = TOTAL_SAMPLES * TOTAL_TOKENS_PER_SAMPLE
+
+# --- Custom Logger Setup ---
+
+def setup_logger():
+    """Configures the logger to write structured output to a file."""
+    # Create the logger object
+    logger = logging.getLogger('BenchmarkLogger')
+    logger.setLevel(logging.INFO)
+
+    # Define a custom format that includes time and specific placeholders
+    # We use a custom format to easily parse the final report later
+    formatter = logging.Formatter(
+        '%(asctime)s | %(levelname)s | %(message)s'
+    )
+
+    # File Handler
+    file_handler = logging.FileHandler(LOG_FILE, mode='w')
+    file_handler.setFormatter(formatter)
+    logger.addHandler(file_handler)
+
+    # Console Handler (for real-time feedback)
+    stream_handler = logging.StreamHandler(sys.stdout)
+    stream_handler.setFormatter(formatter)
+    logger.addHandler(stream_handler)
+
+    return logger
+
+# --- Model & Timing Functions ---
+
+class SimpleModel(nn.Module):
+    def __init__(self, input_size, output_size):
+        super().__init__()
+        self.linear = nn.Linear(input_size, output_size)
+    def forward(self, x):
+        return self.linear(x)
+
+def run_training_benchmark(logger, device):
+
+    logger.info(f"--- STARTING BENCHMARK ON {device.type.upper()} ---")
+
+    # Model configuration
+    INPUT_SIZE = 512
+    OUTPUT_SIZE = 1
+
+    # Model and Data Setup (on the target device)
+    model = SimpleModel(INPUT_SIZE, OUTPUT_SIZE).to(device)
+    dummy_input = torch.randn(BATCH_SIZE, INPUT_SIZE, device=device)
+    dummy_target = torch.randn(BATCH_SIZE, OUTPUT_SIZE, device=device)
+    optimizer = optim.Adam(model.parameters())
+    criterion = nn.MSELoss()
+
+    # Total estimated cost
+    total_estimated_cost = 0.0
+
+    # Synchronization is crucial for accurate GPU timing
+    if device.type == 'cuda':
+        # Warm-up run is necessary to avoid compilation time bias
+        logger.info("Performing CUDA warm-up run...")
+        _ = model(dummy_input)
+        torch.cuda.synchronize()
+
+    # Start timing the entire job
+    job_start_time = time.time()
+
+    for epoch in range(1, EPOCHS + 1):
+
+        if device.type == 'cuda':
+            # Use synchronized CUDA events for precise timing
+            start_event = torch.cuda.Event(enable_timing=True)
+            end_event = torch.cuda.Event(enable_timing=True)
+            start_event.record()
+        else:
+            start_event = time.time()
+
+        # --- Simulated Training Step ---
+        optimizer.zero_grad()
+        output = model(dummy_input)
+        loss = criterion(output, dummy_target)
+        loss.backward()
+        optimizer.step()
+        # --- End Simulated Training Step ---
+
+        if device.type == 'cuda':
+            end_event.record()
+            torch.cuda.synchronize() # Wait for GPU to finish
+            # elapsed_time returns milliseconds, convert to seconds
+            epoch_time_s = start_event.elapsed_time(end_event) / 1000.0
+        else:
+            epoch_time_s = time.time() - start_event
+
+        # --- COST AND PERFORMANCE CALCULATION ---
+
+        # 1. Cost Calculation
+        cost_per_epoch = (epoch_time_s / 3600.0) * GPU_HOURLY_RATE
+        total_estimated_cost += cost_per_epoch
+
+        # 2. Performance Calculation (Throughput)
+        throughput_samples_sec = BATCH_SIZE / epoch_time_s
+        throughput_tokens_sec = (BATCH_SIZE * TOTAL_TOKENS_PER_SAMPLE) / epoch_time_s
+
+        # --- LOGGING THE RESULTS ---
+        logger.info(
+            f"EPOCH: {epoch}/{EPOCHS} | "
+            f"Time: {epoch_time_s:.4f}s | "
+            f"Cost: ${cost_per_epoch:.5f} | "
+            f"Tokens/s: {throughput_tokens_sec:.0f}"
+        )
+
+    job_total_time = time.time() - job_start_time
+
+    # --- FINAL REPORT ---
+    logger.info("--- JOB SUMMARY ---")
+    logger.info(f"FINAL_COST: ${total_estimated_cost:.4f}")
+    logger.info(f"TOTAL_TIME: {job_total_time:.2f}s")
+    logger.info(f"TOTAL_TOKENS_PROCESSED: {EPOCHS * BATCH_SIZE * TOTAL_TOKENS_PER_SAMPLE}")  # one simulated batch per epoch
+    logger.info("-------------------")
+
+
+def main():
+    logger = setup_logger()
+    logger.info(f"Configuration: GPU Hourly Rate = ${GPU_HOURLY_RATE}/hr")
+
+    # 1. Check for GPU availability
+    if torch.cuda.is_available():
+        device = torch.device("cuda")
+        logger.info("GPU detected. Running GPU Benchmark.")
+    else:
+        device = torch.device("cpu")
+        logger.warning("GPU not detected. Running CPU Benchmark.")
+
+    run_training_benchmark(logger, device)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/examples/MLOps_Ops_and_Enterprise_Features/cost_benchmark/setup_benchmark_env.sh b/examples/MLOps_Ops_and_Enterprise_Features/cost_benchmark/setup_benchmark_env.sh
new file mode 100644
index 00000000..c2bedaef
--- /dev/null
+++ b/examples/MLOps_Ops_and_Enterprise_Features/cost_benchmark/setup_benchmark_env.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+ENV_NAME="cost_benchmark_env"
+PYTHON_VERSION="3.12"
+
+echo "--- Setting up Cost/Performance Benchmark Environment ---"
+
+# 1. Create and Activate Stable VENV
+rm -rf $ENV_NAME
+python$PYTHON_VERSION -m venv $ENV_NAME
+source $ENV_NAME/bin/activate
+
+# 2. Install PyTorch (GPU version for CUDA 12)
+# We need PyTorch for accurate CUDA timing events.
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+
+# 3. Install Helpers
+pip install numpy pandas psutil
+
+echo "βœ… Environment setup complete."
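+
+# Optional sanity check (assumes a CUDA-capable GPU and driver): cost_benchmark.py uses
+# CUDA events for timing only when torch can see a GPU, and otherwise falls back to
+# wall-clock timing on the CPU, so it is worth confirming which path will be measured.
+python -c "import torch; print('CUDA available:', torch.cuda.is_available())"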