WikarNotAvailable
diff --git a/‎scripts/models/kmeans/Benchmark_inference.ipynb‎
Lines changed: 261 additions & 0 deletions b/‎scripts/models/kmeans/Benchmark_inference.ipynb‎
Lines changed: 261 additions & 0 deletions
diff --git a/‎scripts/models/kmeans/Elbow.ipynb‎
Lines changed: 95506 additions & 0 deletions b/‎scripts/models/kmeans/Elbow.ipynb‎
Lines changed: 95506 additions & 0 deletions
diff --git a/‎scripts/models/kmeans/calculate_metrics.py‎
Lines changed: 96 additions & 0 deletions b/‎scripts/models/kmeans/calculate_metrics.py‎
Lines changed: 96 additions & 0 deletions
diff --git a/‎scripts/models/kmeans/inference.ipynb‎
Lines changed: 225 additions & 0 deletions b/‎scripts/models/kmeans/inference.ipynb‎
Lines changed: 225 additions & 0 deletions
diff --git a/‎scripts/models/kmeans/results/Batch_1mln_Epoch_1.jpeg‎
28.3 KB b/‎scripts/models/kmeans/results/Batch_1mln_Epoch_1.jpeg‎
28.3 KB
diff --git a/‎scripts/models/kmeans/results/Batch_5k_Epoch_1.jpeg‎
28.6 KB b/‎scripts/models/kmeans/results/Batch_5k_Epoch_1.jpeg‎
28.6 KB
diff --git a/‎scripts/models/kmeans/results/Batch_5k_Epoch_5.jpeg‎
31.3 KB b/‎scripts/models/kmeans/results/Batch_5k_Epoch_5.jpeg‎
31.3 KB
diff --git a/‎scripts/models/kmeans/results/best_confusion_matrix.png‎
30.7 KB b/‎scripts/models/kmeans/results/best_confusion_matrix.png‎
30.7 KB
diff --git a/‎scripts/models/kmeans/results/final_Batch_5k_Epoch_1.jpeg‎
29.8 KB b/‎scripts/models/kmeans/results/final_Batch_5k_Epoch_1.jpeg‎
29.8 KB
@@ -0,0 +1,96 @@
+import pandas as pd
+from sklearn.metrics import classification_report
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# Load the data
+data_file = "test_data.csv.out"
+df = pd.read_csv(data_file, header=None, names=["label", "cluster", "distance"])
+
+# Ensure the label column is in boolean format
+df["label"] = df["label"].astype(bool)
+
+# Define the range of quantiles to test
+quantile_values = [x / 1000 for x in range(500, 1000)]  # From 0.500 to 0.999 in steps of 0.001
+
+# Store results for each quantile
+results = []
+
+for quantile in quantile_values:
+    # Calculate the threshold
+    threshold = df["distance"].quantile(quantile)
+    
+    # Mark anomalies based on the threshold
+    df["is_anomaly"] = df["distance"] > threshold
+
+    # Generate classification report
+    report = classification_report(
+        df["label"], df["is_anomaly"],
+        target_names=["Normal", "Anomaly"],
+        labels=[False, True],  # Explicitly define the label order
+        output_dict=True
+    )
+
+    # Extract relevant metrics
+    macro_avg_recall = report["macro avg"]["recall"]
+    weighted_avg_recall = report["weighted avg"]["recall"]
+
+    # Store results
+    results.append({
+        "quantile": quantile,
+        "threshold": threshold,
+        "macro_avg_recall": macro_avg_recall,
+        "weighted_avg_recall": weighted_avg_recall,
+        "avg_recall": (macro_avg_recall + weighted_avg_recall) / 2,
+        "classification_report_dict": report
+    })
+
+# Convert results to a DataFrame
+results_df = pd.DataFrame(results)
+
+# Find the best quantile based on the average recall
+results_df["avg_recall"] = (results_df["macro_avg_recall"] + results_df["weighted_avg_recall"]) / 2
+best_result = results_df.loc[results_df["avg_recall"].idxmax()]
+
+# Extract the best classification report in dictionary format
+best_report_dict = best_result["classification_report_dict"]
+
+# Convert the dictionary classification report to a formatted string
+df["is_anomaly"] = df["distance"] > best_result["threshold"]
+formatted_report = classification_report(
+    df["label"], df["is_anomaly"], target_names=["Normal", "Anomaly"], labels=[False, True]
+)
+
+# Print the detailed report
+report_text = f"""
+Detailed Report:
+
+Best Quantile: {best_result['quantile']}
+Threshold: {best_result['threshold']}
+
+Metrics:
+- Macro Avg Recall: {best_result['macro_avg_recall']:.4f}
+- Weighted Avg Recall: {best_result['weighted_avg_recall']:.4f}
+- Average Recall: {best_result['avg_recall']:.4f}
+
+Classification Report:
+{formatted_report}
+
+Full results saved to: quantile_analysis_results.csv
+"""
+print(report_text)
+
+# Save results to a CSV file
+results_df.to_csv("quantile_analysis_results.csv", index=False)
+
+# Generate and save confusion matrix for the best quantile
+best_threshold = best_result["threshold"]
+conf_matrix = pd.crosstab(df["label"], df["is_anomaly"], rownames=["True"], colnames=["Predicted"], dropna=False)
+
+plt.figure(figsize=(8, 6))
+sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["Normal", "Anomaly"], yticklabels=["Normal", "Anomaly"])
+plt.xlabel("Predicted Label")
+plt.ylabel("True Label")
+plt.title(f"Confusion Matrix (Quantile {best_result['quantile']:.3f}, Threshold {best_threshold:.2f})")
+plt.savefig("best_confusion_matrix.png")
+plt.close()
@@ -0,0 +1,225 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "7dfeb162",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pydantic/_internal/_fields.py:192: UserWarning: Field name \"json\" in \"MonitoringDatasetFormat\" shadows an attribute in parent \"Base\"\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[01/24/25 16:55:26] </span><span style=\"color: #0069ff; text-decoration-color: #0069ff; font-weight: bold\">INFO    </span> Found credentials from IAM Role:                                   <a href=\"file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">credentials.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py#1075\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1075</span></a>\n",
+       "<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span>         BaseNotebookInstanceEc2InstanceRole                                <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">                   </span>\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[2;36m[01/24/25 16:55:26]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO    \u001b[0m Found credentials from IAM Role:                                   \u001b]8;id=580850;file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=355709;file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py#1075\u001b\\\u001b[2m1075\u001b[0m\u001b]8;;\u001b\\\n",
+       "\u001b[2;36m                    \u001b[0m         BaseNotebookInstanceEc2InstanceRole                                \u001b[2m                   \u001b[0m\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml\n",
+      "sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[01/24/25 16:55:30] </span><span style=\"color: #0069ff; text-decoration-color: #0069ff; font-weight: bold\">INFO    </span> Found credentials from IAM Role:                                   <a href=\"file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">credentials.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py#1075\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1075</span></a>\n",
+       "<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span>         BaseNotebookInstanceEc2InstanceRole                                <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">                   </span>\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[2;36m[01/24/25 16:55:30]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO    \u001b[0m Found credentials from IAM Role:                                   \u001b]8;id=735281;file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=225806;file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py#1075\u001b\\\u001b[2m1075\u001b[0m\u001b]8;;\u001b\\\n",
+       "\u001b[2;36m                    \u001b[0m         BaseNotebookInstanceEc2InstanceRole                                \u001b[2m                   \u001b[0m\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span><span style=\"color: #0069ff; text-decoration-color: #0069ff; font-weight: bold\">INFO    </span> Found credentials from IAM Role:                                   <a href=\"file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">credentials.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py#1075\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1075</span></a>\n",
+       "<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span>         BaseNotebookInstanceEc2InstanceRole                                <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">                   </span>\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[2;36m                   \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO    \u001b[0m Found credentials from IAM Role:                                   \u001b]8;id=233629;file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=672198;file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py#1075\u001b\\\u001b[2m1075\u001b[0m\u001b]8;;\u001b\\\n",
+       "\u001b[2;36m                    \u001b[0m         BaseNotebookInstanceEc2InstanceRole                                \u001b[2m                   \u001b[0m\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span><span style=\"color: #0069ff; text-decoration-color: #0069ff; font-weight: bold\">INFO    </span> Same images used for training and inference. Defaulting to image     <a href=\"file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/image_uris.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">image_uris.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/image_uris.py#391\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">391</span></a>\n",
+       "<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span>         scope: inference.                                                    <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">                 </span>\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[2;36m                   \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO    \u001b[0m Same images used for training and inference. Defaulting to image     \u001b]8;id=483594;file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/image_uris.py\u001b\\\u001b[2mimage_uris.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=138965;file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/image_uris.py#391\u001b\\\u001b[2m391\u001b[0m\u001b]8;;\u001b\\\n",
+       "\u001b[2;36m                    \u001b[0m         scope: inference.                                                    \u001b[2m                 \u001b[0m\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span><span style=\"color: #0069ff; text-decoration-color: #0069ff; font-weight: bold\">INFO    </span> Ignoring unnecessary instance type: <span style=\"color: #e100e1; text-decoration-color: #e100e1; font-style: italic\">None</span>.                            <a href=\"file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/image_uris.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">image_uris.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/image_uris.py#528\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">528</span></a>\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[2;36m                   \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO    \u001b[0m Ignoring unnecessary instance type: \u001b[3;38;2;225;0;225mNone\u001b[0m.                            \u001b]8;id=443075;file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/image_uris.py\u001b\\\u001b[2mimage_uris.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=3198;file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/image_uris.py#528\u001b\\\u001b[2m528\u001b[0m\u001b]8;;\u001b\\\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running batch inference for k=6...\n",
+      "Model data URL for k=6: s3://bdp-models/kmeans/k_6/kmeans-2025-01-22-18-29-22-012/output/model.tar.gz\n",
+      "Error registering model kmeans-k-6: An error occurred (ValidationException) when calling the CreateModel operation: Cannot create already existing model \"arn:aws:sagemaker:eu-north-1:982534349340:model/kmeans-k-6\".\n"
+     ]
+    }
+   ],
+   "source": [
+    "import boto3\n",
+    "import sagemaker\n",
+    "from sagemaker import Session\n",
+    "from sagemaker.transformer import Transformer\n",
+    "from botocore.exceptions import ClientError\n",
+    "\n",
+    "\n",
+    "s3_client = boto3.client('s3', region_name=\"eu-north-1\")\n",
+    "def find_model_path(bucket_name, prefix):\n",
+    "    \"\"\"Return the s3:// URL of the first key under prefix ending in model.tar.gz; raise FileNotFoundError if none.\"\"\"\n",
+    "    try:\n",
+    "        pages = s3_client.get_paginator('list_objects_v2').paginate(Bucket=bucket_name, Prefix=prefix)  # one call lists at most 1000 keys\n",
+    "        for obj in (o for page in pages for o in page.get('Contents', [])):\n",
+    "            if obj['Key'].endswith('model.tar.gz'):\n",
+    "                return f\"s3://{bucket_name}/{obj['Key']}\"\n",
+    "        raise FileNotFoundError(f\"No model.tar.gz found under prefix {prefix}\")\n",
+    "    except ClientError as e:\n",
+    "        print(f\"Error listing objects in S3: {e}\")\n",
+    "        raise\n",
+    "\n",
+    "# SageMaker and boto3 settings (the session and client below share one region)\n",
+    "region = \"eu-north-1\"\n",
+    "boto_session = boto3.Session(region_name=region)\n",
+    "sagemaker_client = boto3.client(\"sagemaker\", region_name=region)\n",
+    "sagemaker_session = Session(boto_session=boto_session)\n",
+    "role = sagemaker.get_execution_role()\n",
+    "image_uri = sagemaker.image_uris.retrieve(\"kmeans\", region=region)\n",
+    "\n",
+    "# Cluster counts to process; each k gets its own registered model and transform job.\n",
+    "k_values = range(6, 7)\n",
+    "\n",
+    "for k in k_values:\n",
+    "    print(f\"Running batch inference for k={k}...\")\n",
+    "\n",
+    "    prefix = f\"kmeans/k_{k}/\"\n",
+    "    model_path = find_model_path(bucket_name=\"bdp-models\", prefix=prefix)\n",
+    "\n",
+    "    print(f\"Model data URL for k={k}: {model_path}\")\n",
+    "\n",
+    "    # Define output path\n",
+    "    output_path = f\"s3://bdp-inference-results/kmeans/k_{k}/\"\n",
+    "    # Register the SageMaker model. NOTE(review): any ClientError, including 'already existing model',\n",
+    "    # skips inference for this k via `continue` - confirm that skipping is intended.\n",
+    "    model_name = f\"kmeans-k-{k}\"\n",
+    "    try:\n",
+    "        sagemaker_client.create_model(\n",
+    "            ModelName=model_name,\n",
+    "            PrimaryContainer={\n",
+    "                \"Image\": image_uri,\n",
+    "                \"ModelDataUrl\": model_path,\n",
+    "                \"Environment\": {},\n",
+    "            },\n",
+    "            ExecutionRoleArn=role,\n",
+    "        )\n",
+    "        print(f\"Model {model_name} registered successfully.\")\n",
+    "    except ClientError as e:\n",
+    "        print(f\"Error registering model {model_name}: {str(e)}\")\n",
+    "        continue\n",
+    "\n",
+    "    # Create a Transformer for batch inference, bound to the model registered for this k\n",
+    "    transformer = Transformer(\n",
+    "        model_name=model_name,\n",
+    "        instance_count=1,\n",
+    "        instance_type=\"ml.c5.2xlarge\",\n",
+    "        strategy=\"MultiRecord\",\n",
+    "        output_path=output_path,\n",
+    "        assemble_with=\"Line\",\n",
+    "        accept=\"text/csv\",\n",
+    "        sagemaker_session=sagemaker_session,\n",
+    "        input_filter=\"$[1:]\",  # drop column 0 before rows are sent to the model\n",
+    "        output_filter=\"$[0,-2,-1]\",  # keep column 0 and the last two columns of the joined record\n",
+    "    )\n",
+    "\n",
+    "    # Start the batch transform job; a failure is reported and the loop moves on to the next k\n",
+    "    try:\n",
+    "        transformer.transform(\n",
+    "            data='s3://bdp-test-data/scaled/test-data.csv',\n",
+    "            content_type=\"text/csv\",\n",
+    "            split_type=\"Line\",\n",
+    "            join_source=\"Input\"\n",
+    "        )\n",
+    "        print(f\"Inference for k={k} completed. Results saved to: {output_path}\")\n",
+    "    except Exception as e:\n",
+    "        print(f\"Error during inference for k={k}: {str(e)}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0033e9a4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "conda_python3",
+   "language": "python",
+   "name": "conda_python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}