Skip to content

Commit 44d9296

Browse files
committed
Add Kmeans with results
1 parent 49d1586 commit 44d9296

12 files changed

Lines changed: 96716 additions & 0 deletions

scripts/models/kmeans/Benchmark_inference.ipynb

Lines changed: 261 additions & 0 deletions
Large diffs are not rendered by default.

scripts/models/kmeans/Elbow.ipynb

Lines changed: 95506 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
"""Sweep distance quantiles to choose an anomaly threshold.

Each record in the input file carries a ground-truth label, a cluster id and
a distance.  For every candidate quantile of the distance column, records
whose distance exceeds that quantile are flagged as anomalies and scored
against the labels.  The quantile with the highest mean of macro-average and
weighted-average recall wins.

Files written:
    quantile_analysis_results.csv  -- metrics for every quantile tried
    best_confusion_matrix.png      -- confusion matrix at the best quantile
"""
import pandas as pd
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Class order shared by the classification reports and the confusion matrix.
CLASS_LABELS = [False, True]
CLASS_NAMES = ["Normal", "Anomaly"]

# Load the data (the file has no header row, so column names are given here)
data_file = "test_data.csv.out"
df = pd.read_csv(data_file, header=None, names=["label", "cluster", "distance"])

# Ensure the label column is in boolean format
# NOTE(review): astype(bool) maps every non-zero / non-empty value (NaN
# included) to True -- confirm the labels are stored as 0/1 or True/False.
df["label"] = df["label"].astype(bool)

# Define the range of quantiles to test
quantile_values = [x / 1000 for x in range(500, 1000)]  # From 0.500 to 0.999 in steps of 0.001

# Store results for each quantile
results = []

for quantile in quantile_values:
    # Calculate the threshold
    threshold = df["distance"].quantile(quantile)

    # Mark anomalies based on the threshold.  A local Series is used so the
    # DataFrame is not rewritten on every iteration.
    predicted = df["distance"] > threshold

    # Generate classification report
    report = classification_report(
        df["label"], predicted,
        target_names=CLASS_NAMES,
        labels=CLASS_LABELS,  # Explicitly define the label order
        output_dict=True
    )

    # Extract relevant metrics
    macro_avg_recall = report["macro avg"]["recall"]
    weighted_avg_recall = report["weighted avg"]["recall"]

    # Store results
    results.append({
        "quantile": quantile,
        "threshold": threshold,
        "macro_avg_recall": macro_avg_recall,
        "weighted_avg_recall": weighted_avg_recall,
        "avg_recall": (macro_avg_recall + weighted_avg_recall) / 2,
        "classification_report_dict": report
    })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Find the best quantile based on the average recall.  avg_recall is already
# stored per row; idxmax returns the first maximum, so ties go to the lowest
# quantile.
best_result = results_df.loc[results_df["avg_recall"].idxmax()]
best_threshold = best_result["threshold"]

# Extract the best classification report in dictionary format
best_report_dict = best_result["classification_report_dict"]

# Re-apply the best threshold and build the human-readable text report
df["is_anomaly"] = df["distance"] > best_threshold
formatted_report = classification_report(
    df["label"], df["is_anomaly"], target_names=CLASS_NAMES, labels=CLASS_LABELS
)

# Save results to a CSV file first, so the "saved to" line below is true
# by the time it is printed.
results_df.to_csv("quantile_analysis_results.csv", index=False)

# Print the detailed report
report_text = f"""
Detailed Report:

Best Quantile: {best_result['quantile']}
Threshold: {best_result['threshold']}

Metrics:
- Macro Avg Recall: {best_result['macro_avg_recall']:.4f}
- Weighted Avg Recall: {best_result['weighted_avg_recall']:.4f}
- Average Recall: {best_result['avg_recall']:.4f}

Classification Report:
{formatted_report}

Full results saved to: quantile_analysis_results.csv
"""
print(report_text)

# Generate and save confusion matrix for the best quantile
conf_matrix = pd.crosstab(df["label"], df["is_anomaly"], rownames=["True"], colnames=["Predicted"], dropna=False)
# crosstab only emits rows/columns for values that actually occur.  Force the
# full 2x2 layout so the hard-coded tick labels below always line up with the
# right class, even when one class is never present or never predicted.
conf_matrix = conf_matrix.reindex(index=CLASS_LABELS, columns=CLASS_LABELS, fill_value=0)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=CLASS_NAMES, yticklabels=CLASS_NAMES)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title(f"Confusion Matrix (Quantile {best_result['quantile']:.3f}, Threshold {best_threshold:.2f})")
plt.savefig("best_confusion_matrix.png")
plt.close()
Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"id": "7dfeb162",
7+
"metadata": {},
8+
"outputs": [
9+
{
10+
"name": "stderr",
11+
"output_type": "stream",
12+
"text": [
13+
"/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pydantic/_internal/_fields.py:192: UserWarning: Field name \"json\" in \"MonitoringDatasetFormat\" shadows an attribute in parent \"Base\"\n",
14+
" warnings.warn(\n"
15+
]
16+
},
17+
{
18+
"data": {
19+
"text/html": [
20+
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[01/24/25 16:55:26] </span><span style=\"color: #0069ff; text-decoration-color: #0069ff; font-weight: bold\">INFO </span> Found credentials from IAM Role: <a href=\"file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">credentials.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py#1075\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1075</span></a>\n",
21+
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> BaseNotebookInstanceEc2InstanceRole <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span>\n",
22+
"</pre>\n"
23+
],
24+
"text/plain": [
25+
"\u001b[2;36m[01/24/25 16:55:26]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Found credentials from IAM Role: \u001b]8;id=580850;file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=355709;file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py#1075\u001b\\\u001b[2m1075\u001b[0m\u001b]8;;\u001b\\\n",
26+
"\u001b[2;36m \u001b[0m BaseNotebookInstanceEc2InstanceRole \u001b[2m \u001b[0m\n"
27+
]
28+
},
29+
"metadata": {},
30+
"output_type": "display_data"
31+
},
32+
{
33+
"name": "stdout",
34+
"output_type": "stream",
35+
"text": [
36+
"sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml\n",
37+
"sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml\n"
38+
]
39+
},
40+
{
41+
"data": {
42+
"text/html": [
43+
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[01/24/25 16:55:30] </span><span style=\"color: #0069ff; text-decoration-color: #0069ff; font-weight: bold\">INFO </span> Found credentials from IAM Role: <a href=\"file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">credentials.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py#1075\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1075</span></a>\n",
44+
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> BaseNotebookInstanceEc2InstanceRole <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span>\n",
45+
"</pre>\n"
46+
],
47+
"text/plain": [
48+
"\u001b[2;36m[01/24/25 16:55:30]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Found credentials from IAM Role: \u001b]8;id=735281;file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=225806;file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py#1075\u001b\\\u001b[2m1075\u001b[0m\u001b]8;;\u001b\\\n",
49+
"\u001b[2;36m \u001b[0m BaseNotebookInstanceEc2InstanceRole \u001b[2m \u001b[0m\n"
50+
]
51+
},
52+
"metadata": {},
53+
"output_type": "display_data"
54+
},
55+
{
56+
"data": {
57+
"text/html": [
58+
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #0069ff; text-decoration-color: #0069ff; font-weight: bold\">INFO </span> Found credentials from IAM Role: <a href=\"file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">credentials.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py#1075\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1075</span></a>\n",
59+
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> BaseNotebookInstanceEc2InstanceRole <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span>\n",
60+
"</pre>\n"
61+
],
62+
"text/plain": [
63+
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Found credentials from IAM Role: \u001b]8;id=233629;file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=672198;file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/credentials.py#1075\u001b\\\u001b[2m1075\u001b[0m\u001b]8;;\u001b\\\n",
64+
"\u001b[2;36m \u001b[0m BaseNotebookInstanceEc2InstanceRole \u001b[2m \u001b[0m\n"
65+
]
66+
},
67+
"metadata": {},
68+
"output_type": "display_data"
69+
},
70+
{
71+
"data": {
72+
"text/html": [
73+
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #0069ff; text-decoration-color: #0069ff; font-weight: bold\">INFO </span> Same images used for training and inference. Defaulting to image <a href=\"file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/image_uris.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">image_uris.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/image_uris.py#391\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">391</span></a>\n",
74+
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> scope: inference. <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span>\n",
75+
"</pre>\n"
76+
],
77+
"text/plain": [
78+
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Same images used for training and inference. Defaulting to image \u001b]8;id=483594;file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/image_uris.py\u001b\\\u001b[2mimage_uris.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=138965;file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/image_uris.py#391\u001b\\\u001b[2m391\u001b[0m\u001b]8;;\u001b\\\n",
79+
"\u001b[2;36m \u001b[0m scope: inference. \u001b[2m \u001b[0m\n"
80+
]
81+
},
82+
"metadata": {},
83+
"output_type": "display_data"
84+
},
85+
{
86+
"data": {
87+
"text/html": [
88+
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #0069ff; text-decoration-color: #0069ff; font-weight: bold\">INFO </span> Ignoring unnecessary instance type: <span style=\"color: #e100e1; text-decoration-color: #e100e1; font-style: italic\">None</span>. <a href=\"file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/image_uris.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">image_uris.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/image_uris.py#528\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">528</span></a>\n",
89+
"</pre>\n"
90+
],
91+
"text/plain": [
92+
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Ignoring unnecessary instance type: \u001b[3;38;2;225;0;225mNone\u001b[0m. \u001b]8;id=443075;file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/image_uris.py\u001b\\\u001b[2mimage_uris.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=3198;file:///home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/image_uris.py#528\u001b\\\u001b[2m528\u001b[0m\u001b]8;;\u001b\\\n"
93+
]
94+
},
95+
"metadata": {},
96+
"output_type": "display_data"
97+
},
98+
{
99+
"name": "stdout",
100+
"output_type": "stream",
101+
"text": [
102+
"Running batch inference for k=6...\n",
103+
"Model data URL for k=6: s3://bdp-models/kmeans/k_6/kmeans-2025-01-22-18-29-22-012/output/model.tar.gz\n",
104+
"Error registering model kmeans-k-6: An error occurred (ValidationException) when calling the CreateModel operation: Cannot create already existing model \"arn:aws:sagemaker:eu-north-1:982534349340:model/kmeans-k-6\".\n"
105+
]
106+
}
107+
],
"source": [
 "import boto3\n",
 "import sagemaker\n",
 "from sagemaker import Session\n",
 "from sagemaker.transformer import Transformer\n",
 "from botocore.exceptions import ClientError\n",
 "\n",
 "\n",
 "# SageMaker and boto3 settings\n",
 "region = \"eu-north-1\"\n",
 "boto_session = boto3.Session(region_name=region)\n",
 "s3_client = boto3.client(\"s3\", region_name=region)\n",
 "sagemaker_client = boto3.client(\"sagemaker\", region_name=region)\n",
 "sagemaker_session = Session(boto_session=boto_session)\n",
 "role = sagemaker.get_execution_role()\n",
 "image_uri = sagemaker.image_uris.retrieve(\"kmeans\", region=region)\n",
 "\n",
 "\n",
 "def find_model_path(bucket_name, prefix):\n",
 "    \"\"\"Return the s3:// URI of the first model.tar.gz listed under prefix.\n",
 "\n",
 "    Raises FileNotFoundError when no key under the prefix ends with\n",
 "    model.tar.gz. S3 client errors are printed and re-raised.\n",
 "    \"\"\"\n",
 "    try:\n",
 "        # Paginate: a single list_objects_v2 call returns at most 1000 keys.\n",
 "        paginator = s3_client.get_paginator(\"list_objects_v2\")\n",
 "        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):\n",
 "            for obj in page.get(\"Contents\", []):\n",
 "                if obj[\"Key\"].endswith(\"model.tar.gz\"):\n",
 "                    return f\"s3://{bucket_name}/{obj['Key']}\"\n",
 "    except ClientError as e:\n",
 "        print(f\"Error listing objects in S3: {e}\")\n",
 "        raise\n",
 "    raise FileNotFoundError(f\"No model.tar.gz found under prefix {prefix}\")\n",
 "\n",
 "\n",
 "k_values = range(6, 7)\n",
 "\n",
 "for k in k_values:\n",
 "    print(f\"Running batch inference for k={k}...\")\n",
 "\n",
 "    prefix = f\"kmeans/k_{k}/\"\n",
 "    model_path = find_model_path(bucket_name=\"bdp-models\", prefix=prefix)\n",
 "\n",
 "    print(f\"Model data URL for k={k}: {model_path}\")\n",
 "\n",
 "    # Define output path\n",
 "    output_path = f\"s3://bdp-inference-results/kmeans/k_{k}/\"\n",
 "\n",
 "    # Register the SageMaker model\n",
 "    model_name = f\"kmeans-k-{k}\"\n",
 "    try:\n",
 "        sagemaker_client.create_model(\n",
 "            ModelName=model_name,\n",
 "            PrimaryContainer={\n",
 "                \"Image\": image_uri,\n",
 "                \"ModelDataUrl\": model_path,\n",
 "                \"Environment\": {},\n",
 "            },\n",
 "            ExecutionRoleArn=role,\n",
 "        )\n",
 "        print(f\"Model {model_name} registered successfully.\")\n",
 "    except ClientError as e:\n",
 "        error = e.response.get(\"Error\", {})\n",
 "        already_exists = (\n",
 "            error.get(\"Code\") == \"ValidationException\"\n",
 "            and \"already existing\" in error.get(\"Message\", \"\")\n",
 "        )\n",
 "        if not already_exists:\n",
 "            print(f\"Error registering model {model_name}: {str(e)}\")\n",
 "            continue\n",
 "        # A model left over from an earlier run must not abort inference:\n",
 "        # reuse it. NOTE(review): the existing model may point at older\n",
 "        # model data than model_path -- delete it first if that matters.\n",
 "        print(f\"Model {model_name} already exists; reusing it.\")\n",
 "\n",
 "    # Create a Transformer for batch inference.\n",
 "    # input_filter drops the first column of each record before it is sent\n",
 "    # to the model; output_filter keeps the first column of the joined\n",
 "    # record plus its last two columns.\n",
 "    transformer = Transformer(\n",
 "        model_name=model_name,\n",
 "        instance_count=1,\n",
 "        instance_type=\"ml.c5.2xlarge\",\n",
 "        strategy=\"MultiRecord\",\n",
 "        output_path=output_path,\n",
 "        assemble_with=\"Line\",\n",
 "        accept=\"text/csv\",\n",
 "        sagemaker_session=sagemaker_session,\n",
 "        input_filter=\"$[1:]\",\n",
 "        output_filter=\"$[0,-2,-1]\"\n",
 "    )\n",
 "\n",
 "    # Start the batch transform job\n",
 "    try:\n",
 "        transformer.transform(\n",
 "            data='s3://bdp-test-data/scaled/test-data.csv',\n",
 "            content_type=\"text/csv\",\n",
 "            split_type=\"Line\",\n",
 "            join_source=\"Input\"\n",
 "        )\n",
 "        print(f\"Inference for k={k} completed. Results saved to: {output_path}\")\n",
 "    except Exception as e:\n",
 "        print(f\"Error during inference for k={k}: {str(e)}\")\n"
]
194+
},
195+
{
196+
"cell_type": "code",
197+
"execution_count": null,
198+
"id": "0033e9a4",
199+
"metadata": {},
200+
"outputs": [],
201+
"source": []
202+
}
203+
],
204+
"metadata": {
205+
"kernelspec": {
206+
"display_name": "conda_python3",
207+
"language": "python",
208+
"name": "conda_python3"
209+
},
210+
"language_info": {
211+
"codemirror_mode": {
212+
"name": "ipython",
213+
"version": 3
214+
},
215+
"file_extension": ".py",
216+
"mimetype": "text/x-python",
217+
"name": "python",
218+
"nbconvert_exporter": "python",
219+
"pygments_lexer": "ipython3",
220+
"version": "3.10.16"
221+
}
222+
},
223+
"nbformat": 4,
224+
"nbformat_minor": 5
225+
}
28.3 KB
Loading
28.6 KB
Loading
31.3 KB
Loading
30.7 KB
Loading
29.8 KB
Loading

0 commit comments

Comments
 (0)