From 236b32cca3f64bc5749cdf8513d4b305310b8323 Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Fri, 26 Jul 2024 13:40:53 +0330 Subject: [PATCH 01/27] Fix md file --- README.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index cd45cf6..9914192 100644 --- a/README.md +++ b/README.md @@ -23,8 +23,15 @@ AnoGraph and AnoGraph-K detect graph anomalies by first mapping the graph to a h ## Demo -1. To run on DARPA dataset `bash demo.sh DARPA` -2. To run on ISCX dataset `bash demo.sh ISCX` +1. To run on DARPA dataset +``` +bash demo.sh DARPA +``` + +2. To run on ISCX dataset +``` +bash demo.sh ISCX +``` ## Datasets 1. [DARPA](http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html) From 7c7369fd9041f7ebe178021ca8a17ce77c844fa3 Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Fri, 26 Jul 2024 13:45:53 +0330 Subject: [PATCH 02/27] #include --- code/anograph.cpp | 1 + code/anographk.cpp | 1 + code/hcms.cpp | 1 + 3 files changed, 3 insertions(+) diff --git a/code/anograph.cpp b/code/anograph.cpp index 89b8593..9237043 100644 --- a/code/anograph.cpp +++ b/code/anograph.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "anograph.hpp" #include "hcms.hpp" diff --git a/code/anographk.cpp b/code/anographk.cpp index 22ff76a..9df85df 100644 --- a/code/anographk.cpp +++ b/code/anographk.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include "anographk.hpp" #include "hcms.hpp" diff --git a/code/hcms.cpp b/code/hcms.cpp index 57a45b7..052130e 100644 --- a/code/hcms.cpp +++ b/code/hcms.cpp @@ -1,4 +1,5 @@ #include +#include #include "hcms.hpp" #include "anoedgeglobal.hpp" From 20ff7bf7a78c2d2215e2e5153d15e0dbdf045f09 Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Fri, 26 Jul 2024 13:47:13 +0330 Subject: [PATCH 03/27] Fix warning --- code/utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/utils.cpp b/code/utils.cpp index 48c2ea7..9bbed8e 100644 --- a/code/utils.cpp +++ 
b/code/utils.cpp @@ -8,7 +8,7 @@ using namespace std; void WriteUtils::writeScoresAndLabels(vector scores, vector labels, string output_file) { assert (scores.size() == labels.size()); FILE* output_file_ptr = fopen(output_file.c_str(), "w"); - for (int i = 0; i < scores.size(); i++) { + for (unsigned long int i = 0; i < scores.size(); i++) { fprintf(output_file_ptr, "%.4f %d\n", scores[i], labels[i]); } fclose(output_file_ptr); From 05600f13e62a1cf707b8bb6ac2a209bc0eee78ca Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Fri, 26 Jul 2024 14:13:51 +0330 Subject: [PATCH 04/27] Fix type --- code/utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/utils.cpp b/code/utils.cpp index 9bbed8e..0ce632b 100644 --- a/code/utils.cpp +++ b/code/utils.cpp @@ -8,7 +8,7 @@ using namespace std; void WriteUtils::writeScoresAndLabels(vector scores, vector labels, string output_file) { assert (scores.size() == labels.size()); FILE* output_file_ptr = fopen(output_file.c_str(), "w"); - for (unsigned long int i = 0; i < scores.size(); i++) { + for (size_t i = 0; i < scores.size(); i++) { fprintf(output_file_ptr, "%.4f %d\n", scores[i], labels[i]); } fclose(output_file_ptr); From e71130cb2d8eff4695e44809ec7cd9911f2f588e Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Fri, 26 Jul 2024 14:22:22 +0330 Subject: [PATCH 05/27] add results folder --- results/.gitkeep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 results/.gitkeep diff --git a/results/.gitkeep b/results/.gitkeep new file mode 100644 index 0000000..e69de29 From 14b5bd42af0cfb26c35af5ef48f961d11e6ad1ec Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Fri, 26 Jul 2024 14:23:20 +0330 Subject: [PATCH 06/27] add file opening validation --- code/utils.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/code/utils.cpp b/code/utils.cpp index 0ce632b..1324196 100644 --- a/code/utils.cpp +++ b/code/utils.cpp @@ -8,6 +8,13 @@ using namespace std; void 
WriteUtils::writeScoresAndLabels(vector scores, vector labels, string output_file) { assert (scores.size() == labels.size()); FILE* output_file_ptr = fopen(output_file.c_str(), "w"); + + if (output_file_ptr == NULL) { + cerr << "file path is : " << output_file << endl; + cerr << "Can not open file to write" << endl; + return; + } + for (size_t i = 0; i < scores.size(); i++) { fprintf(output_file_ptr, "%.4f %d\n", scores[i], labels[i]); } From 8789f47ce8cae31a4d8aca0c38eed4a9fca5b449 Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Fri, 26 Jul 2024 14:38:56 +0330 Subject: [PATCH 07/27] Add installing python dependencies in bash script --- code/demo.sh | 4 ++++ code/requirements.txt | 1 + 2 files changed, 5 insertions(+) create mode 100644 code/requirements.txt diff --git a/code/demo.sh b/code/demo.sh index 16b483b..73c11aa 100644 --- a/code/demo.sh +++ b/code/demo.sh @@ -18,6 +18,10 @@ if [ $1 == "DARPA" ]; then echo "Running AnoEdge-L" ./main anoedge_l DARPA 2 32 0.9 + echo "Installing python dependencies" + pip3 install -r requirements.txt + + echo "Running python metrics" python3 metrics.py --dataset DARPA --time_window 30 --edge_threshold 50 fi diff --git a/code/requirements.txt b/code/requirements.txt new file mode 100644 index 0000000..f5c56c9 --- /dev/null +++ b/code/requirements.txt @@ -0,0 +1 @@ +scikit-learn==1.3.2 From f615a6b332282bb1ebfb8b68a7e211727c6c33e8 Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Fri, 26 Jul 2024 14:41:33 +0330 Subject: [PATCH 08/27] add IDEA files --- .idea/.gitignore | 8 ++++++++ .idea/misc.xml | 20 ++++++++++++++++++++ .idea/vcs.xml | 6 ++++++ 3 files changed, 34 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/misc.xml create mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests 
+/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..7859412 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,20 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file From 2f11792e0a55165ad245e23706b60d686eb56436 Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Sat, 27 Jul 2024 20:37:00 +0330 Subject: [PATCH 09/27] Fix data type --- code/utils.cpp | 2 +- code/utils.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/code/utils.cpp b/code/utils.cpp index 1324196..9852b3b 100644 --- a/code/utils.cpp +++ b/code/utils.cpp @@ -21,7 +21,7 @@ void WriteUtils::writeScoresAndLabels(vector scores, vector labels, fclose(output_file_ptr); } -void WriteUtils::writeTime(double total_time, int num_records, string output_file) { +void WriteUtils::writeTime(double total_time, size_t num_records, string output_file) { FILE* output_file_ptr = fopen(output_file.c_str(), "w"); fprintf(output_file_ptr, "Average time taken: %.4fs\n", (1.0f*total_time)/num_records); fprintf(output_file_ptr, "Total time taken: %.4fs\n", 1.0f*total_time); diff --git a/code/utils.hpp b/code/utils.hpp index 48f8848..e958a3c 100644 --- a/code/utils.hpp +++ b/code/utils.hpp @@ -15,7 +15,7 @@ const string TIME_FILE_SUFFIX = "_time.csv"; class WriteUtils { public: static void writeScoresAndLabels(vector scores, vector labels, string output_file); - static void writeTime(double total_time, int num_records, string output_file); + static void writeTime(double total_time, size_t num_records, string output_file); }; class ReadUtils { From eed41bd64e2bab49ae84ed4bae99381865f7a932 Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Sat, 27 Jul 2024 20:41:39 +0330 Subject: [PATCH 10/27] Fix size_t --- 
code/anoedgeglobal.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/code/anoedgeglobal.cpp b/code/anoedgeglobal.cpp index 4df3fc4..228794a 100644 --- a/code/anoedgeglobal.cpp +++ b/code/anoedgeglobal.cpp @@ -23,10 +23,10 @@ vector AnoedgeGlobal::getScores() { vector scores; Hcms count(rows, buckets); - int num_records = src.size(); + size_t num_records = src.size(); int last_time = 0; - for (int i = 0; i < num_records; i++) { + for (size_t i = 0; i < num_records; i++) { if (times[i] - last_time > 0) { count.decay(decay_factor); } @@ -51,8 +51,8 @@ void AnoedgeGlobal::run() { } double AnoedgeGlobal::getAnoedgeglobalDensity(vector>& mat, int src, int dst) { - int num_rows = mat.size(); - int num_cols = mat[0].size(); + size_t num_rows = mat.size(); + size_t num_cols = mat[0].size(); bool row_flag[num_rows]; bool col_flag[num_cols]; @@ -94,7 +94,7 @@ double AnoedgeGlobal::getAnoedgeglobalDensity(vector>& mat, int s double cur_mat_sum = mat[src][dst]; double output = cur_mat_sum/sqrt(marked_rows*marked_cols); - int ctr = num_rows + num_cols - 2; + size_t ctr = num_rows + num_cols - 2; while (ctr--) { if (max_row.second >= max_col.second) { row_flag[max_row.first] = true; From e3d61fef3e2b4c53f604144799bbe1974a66f9b4 Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Sun, 28 Jul 2024 06:27:44 +0330 Subject: [PATCH 11/27] add AnoGraph_EDA_nodes.ipynb --- notebooks/AnoGraph_EDA_nodes.ipynb | 3000 ++++++++++++++++++++++++++++ 1 file changed, 3000 insertions(+) create mode 100644 notebooks/AnoGraph_EDA_nodes.ipynb diff --git a/notebooks/AnoGraph_EDA_nodes.ipynb b/notebooks/AnoGraph_EDA_nodes.ipynb new file mode 100644 index 0000000..c08f66e --- /dev/null +++ b/notebooks/AnoGraph_EDA_nodes.ipynb @@ -0,0 +1,3000 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ 
+ { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "21Ut0P4sQXrU", + "outputId": "ba909905-f0d4-4ba5-fc2a-027b90cc0a65" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Cloning into 'AnoGraph'...\n", + "remote: Enumerating objects: 105, done.\u001b[K\n", + "remote: Counting objects: 100% (105/105), done.\u001b[K\n", + "remote: Compressing objects: 100% (70/70), done.\u001b[K\n", + "remote: Total 105 (delta 47), reused 77 (delta 31), pack-reused 0\u001b[K\n", + "Receiving objects: 100% (105/105), 5.13 MiB | 2.94 MiB/s, done.\n", + "Resolving deltas: 100% (47/47), done.\n", + "Updating files: 100% (30/30), done.\n" + ] + } + ], + "source": [ + "!git clone https://github.com/MKasaei00/AnoGraph.git" + ] + }, + { + "cell_type": "code", + "source": [ + "%cd AnoGraph/code" + ], + "metadata": { + "id": "VOmuROnNQlj-", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "1aa27db2-51de-44c1-ad7f-992700ecd821" + }, + "execution_count": 54, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/content/AnoGraph/code/AnoGraph/code\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Import necessary libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt" + ], + "metadata": { + "id": "RpwDWpYUQtY0", + "collapsed": true + }, + "execution_count": 55, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Load the CSV file\n", + "dataset_name = 'DARPA'\n", + "file_path = f\"../data/{dataset_name}/Data.csv\"\n", + "data = pd.read_csv(file_path,header=None, names=['u', 'v', 'time_stamp'])" + ], + "metadata": { + "id": "jWnsVy60Gz5D" + }, + "execution_count": 56, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Convert 'u' and 'v' to integers\n", + "data['u'] = data['u'].astype(int)\n", + "data['v'] = data['v'].astype(int)" + ], + 
"metadata": { + "id": "CwiZyGNuHEwS" + }, + "execution_count": 57, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "data" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 423 + }, + "id": "ujikBoknHxRZ", + "outputId": "2d9318cb-8217-4528-e501-2036ceeb6951" + }, + "execution_count": 58, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " u v time_stamp\n", + "0 7577 9765 1\n", + "1 7577 9765 1\n", + "2 9764 9763 2\n", + "3 9764 9763 2\n", + "4 9765 7577 3\n", + "... ... ... ...\n", + "4554339 10215 9763 46571\n", + "4554340 10215 9763 46571\n", + "4554341 10215 9763 46571\n", + "4554342 10215 9763 46571\n", + "4554343 9763 9764 46572\n", + "\n", + "[4554344 rows x 3 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
uvtime_stamp
0757797651
1757797651
2976497632
3976497632
4976575773
............
455433910215976346571
455434010215976346571
455434110215976346571
455434210215976346571
45543439763976446572
\n", + "

4554344 rows × 3 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "data" + } + }, + "metadata": {}, + "execution_count": 58 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Display basic information about the dataset\n", + "data.info()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5bp9WQINHtA4", + "outputId": "653794ba-1b9d-4bc5-e23e-c31eaa92520e" + }, + "execution_count": 59, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "RangeIndex: 4554344 entries, 0 to 4554343\n", + "Data columns (total 3 columns):\n", + " # Column Dtype\n", + "--- ------ -----\n", + " 0 u int64\n", + " 1 v int64\n", + " 2 time_stamp int64\n", + "dtypes: int64(3)\n", + "memory usage: 104.2 MB\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Descriptive statistics for u, v, and time_stamp\n", + "data.describe()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 + }, + "id": "gkBTfCQ2H1D_", + "outputId": "ae1ddcd5-778d-481c-e7eb-c44bcb1cb0d7" + }, + "execution_count": 60, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " u v time_stamp\n", + "count 4.554344e+06 4.554344e+06 4.554344e+06\n", + "mean 9.716706e+03 9.768405e+03 2.794812e+04\n", + "std 8.326770e+03 4.535230e+03 1.160524e+04\n", + "min 3.000000e+00 0.000000e+00 1.000000e+00\n", + "25% 2.538000e+03 7.607000e+03 2.236800e+04\n", + "50% 8.409000e+03 7.863000e+03 2.794500e+04\n", + "75% 1.139400e+04 1.013900e+04 3.785600e+04\n", + "max 2.552400e+04 2.552400e+04 4.657200e+04" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
uvtime_stamp
count4.554344e+064.554344e+064.554344e+06
mean9.716706e+039.768405e+032.794812e+04
std8.326770e+034.535230e+031.160524e+04
min3.000000e+000.000000e+001.000000e+00
25%2.538000e+037.607000e+032.236800e+04
50%8.409000e+037.863000e+032.794500e+04
75%1.139400e+041.013900e+043.785600e+04
max2.552400e+042.552400e+044.657200e+04
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"data\",\n \"rows\": 8,\n \"fields\": [\n {\n \"column\": \"u\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1606892.581859376,\n \"min\": 3.0,\n \"max\": 4554344.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 9716.70583205836,\n 8409.0,\n 4554344.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"v\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1606915.5274845983,\n \"min\": 0.0,\n \"max\": 4554344.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 9768.405486937307,\n 7863.0,\n 4554344.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"time_stamp\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1601465.603446348,\n \"min\": 1.0,\n \"max\": 4554344.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 27948.12267430831,\n 27945.0,\n 4554344.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 60 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Calculate additional statistics\n", + "percentiles = [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999]\n", + "additional_stats = data.describe(percentiles=percentiles)\n", + "additional_stats" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 457 + }, + "id": "p-590scHIESw", + "outputId": "9d0955b7-1744-47d2-ec12-df46daeea96e" + }, + "execution_count": 61, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " u v time_stamp\n", + "count 4.554344e+06 4.554344e+06 4.554344e+06\n", + "mean 9.716706e+03 9.768405e+03 2.794812e+04\n", + "std 8.326770e+03 4.535230e+03 1.160524e+04\n", + "min 3.000000e+00 0.000000e+00 1.000000e+00\n", + "10% 2.330000e+02 7.607000e+03 1.165400e+04\n", + "25% 2.538000e+03 7.607000e+03 2.236800e+04\n", + "50% 8.409000e+03 7.863000e+03 
2.794500e+04\n", + "75% 1.139400e+04 1.013900e+04 3.785600e+04\n", + "90% 2.552300e+04 1.816400e+04 4.306300e+04\n", + "95% 2.552300e+04 2.054300e+04 4.429300e+04\n", + "99% 2.552300e+04 2.401500e+04 4.577300e+04\n", + "99.9% 2.552300e+04 2.544200e+04 4.647500e+04\n", + "max 2.552400e+04 2.552400e+04 4.657200e+04" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
uvtime_stamp
count4.554344e+064.554344e+064.554344e+06
mean9.716706e+039.768405e+032.794812e+04
std8.326770e+034.535230e+031.160524e+04
min3.000000e+000.000000e+001.000000e+00
10%2.330000e+027.607000e+031.165400e+04
25%2.538000e+037.607000e+032.236800e+04
50%8.409000e+037.863000e+032.794500e+04
75%1.139400e+041.013900e+043.785600e+04
90%2.552300e+041.816400e+044.306300e+04
95%2.552300e+042.054300e+044.429300e+04
99%2.552300e+042.401500e+044.577300e+04
99.9%2.552300e+042.544200e+044.647500e+04
max2.552400e+042.552400e+044.657200e+04
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "additional_stats", + "summary": "{\n \"name\": \"additional_stats\",\n \"rows\": 13,\n \"fields\": [\n {\n \"column\": \"u\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1259301.3872468325,\n \"min\": 3.0,\n \"max\": 4554344.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 25523.0,\n 9716.70583205836,\n 2538.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"v\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1259450.2323463897,\n \"min\": 0.0,\n \"max\": 4554344.0,\n \"num_unique_values\": 12,\n \"samples\": [\n 25442.0,\n 24015.0,\n 4554344.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"time_stamp\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1254794.221144536,\n \"min\": 1.0,\n \"max\": 4554344.0,\n \"num_unique_values\": 13,\n \"samples\": [\n 46475.0,\n 44293.0,\n 4554344.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 61 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# @title Most repeated values\n", + "\n", + "# Display most repeated values\n", + "most_repeated_values_u = data['u'].value_counts().head()\n", + "most_repeated_values_v = data['v'].value_counts().head()\n", + "most_repeated_values_time = data['time_stamp'].value_counts().head()\n", + "\n", + "# Convert the most repeated values to dataframes for better display\n", + "most_repeated_u_df = pd.DataFrame(most_repeated_values_u).reset_index()\n", + "most_repeated_u_df.columns = ['Value', 'Count']\n", + "\n", + "most_repeated_v_df = pd.DataFrame(most_repeated_values_v).reset_index()\n", + "most_repeated_v_df.columns = ['Value', 'Count']\n", + "\n", + "most_repeated_time_df = pd.DataFrame(most_repeated_values_time).reset_index()\n", + "most_repeated_time_df.columns = ['Value', 'Count']" + ], + 
"metadata": { + "cellView": "form", + "id": "0ZI4OF7ZIWpv" + }, + "execution_count": 62, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "display(most_repeated_u_df)\n", + "display(most_repeated_v_df)\n", + "display(most_repeated_time_df)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 584 + }, + "id": "POs5BkstI8Q_", + "outputId": "8d3d27b3-e549-4f7e-94bc-7a7ff03215b0" + }, + "execution_count": 63, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + " Value Count\n", + "0 233 1024185\n", + "1 25523 499613\n", + "2 9763 281555\n", + "3 10215 250796\n", + "4 2538 224542" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCount
02331024185
125523499613
29763281555
310215250796
42538224542
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "most_repeated_u_df", + "summary": "{\n \"name\": \"most_repeated_u_df\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"Value\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 9893,\n \"min\": 233,\n \"max\": 25523,\n \"num_unique_values\": 5,\n \"samples\": [\n 25523,\n 2538,\n 9763\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 335726,\n \"min\": 224542,\n \"max\": 1024185,\n \"num_unique_values\": 5,\n \"samples\": [\n 499613,\n 224542,\n 281555\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " Value Count\n", + "0 7607 1639430\n", + "1 7863 646164\n", + "2 9763 347190\n", + "3 8117 233892\n", + "4 10215 201902" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCount
076071639430
17863646164
29763347190
38117233892
410215201902
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "most_repeated_v_df", + "summary": "{\n \"name\": \"most_repeated_v_df\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"Value\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1189,\n \"min\": 7607,\n \"max\": 10215,\n \"num_unique_values\": 5,\n \"samples\": [\n 7863,\n 10215,\n 9763\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 599590,\n \"min\": 201902,\n \"max\": 1639430,\n \"num_unique_values\": 5,\n \"samples\": [\n 646164,\n 201902,\n 347190\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " Value Count\n", + "0 11654 65009\n", + "1 14975 64169\n", + "2 14974 26548\n", + "3 14976 22054\n", + "4 27963 13252" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCount
01165465009
11497564169
21497426548
31497622054
42796313252
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "most_repeated_time_df", + "summary": "{\n \"name\": \"most_repeated_time_df\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"Value\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 6344,\n \"min\": 11654,\n \"max\": 27963,\n \"num_unique_values\": 5,\n \"samples\": [\n 14975,\n 27963,\n 14974\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 24555,\n \"min\": 13252,\n \"max\": 65009,\n \"num_unique_values\": 5,\n \"samples\": [\n 64169,\n 13252,\n 26548\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "# @title Scatter plot for u(source nodes) , v(destination nodes)\n", + "\n", + "from matplotlib import pyplot as plt\n", + "\n", + "# Create the scatter plot\n", + "plt.figure(figsize=(10, 6))\n", + "\n", + "# Plot data from most_repeated_u_df in blue\n", + "plt.scatter(most_repeated_u_df['Value'], most_repeated_u_df['Count'], color='blue', s=32, alpha=0.8, label='U Data')\n", + "\n", + "# Plot data from most_repeated_v_df in orange\n", + "plt.scatter(most_repeated_v_df['Value'], most_repeated_v_df['Count'], color='orange', s=32, alpha=0.8, label='V Data')\n", + "\n", + "# Customize the plot\n", + "plt.xlabel('Value')\n", + "plt.ylabel('Count')\n", + "plt.title('Value vs Count of Source Node')\n", + "plt.legend()\n", + "plt.gca().spines[['top', 'right']].set_visible(False)\n", + "\n", + "# Show the plot\n", + "plt.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 564 + }, + "cellView": "form", + "id": "rRMEIbynLyD8", + "outputId": "b1f18d86-d561-4311-9a5e-a746169fc323" + }, + "execution_count": 64, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "# @title Visualizing Source and Destination Node Distributions with Multi-Bar Histograms\n", + "\n", + "def plot_multi_bar_distribution(u_column, v_column, bins=50):\n", + " plt.figure(figsize=(14, 8))\n", + "\n", + " u_counts, u_bins = np.histogram(u_column, bins=bins)\n", + " v_counts, v_bins = np.histogram(v_column, bins=bins)\n", + "\n", + " width = (u_bins[1] - u_bins[0]) / 3\n", + " plt.bar(u_bins[:-1] - width/2, u_counts, width=width, color='blue', alpha=0.7, label='u')\n", + " plt.bar(v_bins[:-1] + width/2, v_counts, width=width, color='orange', alpha=0.7, label='v')\n", + "\n", + " plt.title('Distribution of u and v')\n", + " plt.xlabel('Value')\n", + " plt.ylabel('Frequency')\n", + " plt.legend()\n", + " plt.grid(True)\n", + " plt.show()\n", + "\n", + "# Plot multi-bar distributions for u and v\n", + "plot_multi_bar_distribution(data['u'], data['v'])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 718 + }, + "id": "uRg2cjKKLQaF", + "outputId": "9a0c7f95-97cf-46a5-deb1-5c75fa2548ff" + }, + "execution_count": 65, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "data['time_stamp']" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-b0wmfUiM-YN", + "outputId": "dda9e873-c051-4dff-fdfc-c13e6bfb7f2f" + }, + "execution_count": 66, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 1\n", + "1 1\n", + "2 2\n", + "3 2\n", + "4 3\n", + " ... \n", + "4554339 46571\n", + "4554340 46571\n", + "4554341 46571\n", + "4554342 46571\n", + "4554343 46572\n", + "Name: time_stamp, Length: 4554344, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 66 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Analysis of the graph's final state\n" + ], + "metadata": { + "id": "R4Tc5SsgNvja" + } + }, + { + "cell_type": "code", + "source": [ + "# @title Calculate in-degree and out-degree for every node\n", + "in_degree = data['v'].value_counts()\n", + "out_degree = data['u'].value_counts()\n", + "degree_df['total_degree'] = degree_df['in_degree'] + degree_df['out_degree']" + ], + "metadata": { + "id": "pzfxUpBCNw56" + }, + "execution_count": 67, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Create a DataFrame for degrees\n", + "degree_df = pd.DataFrame({\n", + " 'node': list(set(data['u']).union(set(data['v']))),\n", + " 'in_degree': in_degree,\n", + " 'out_degree': out_degree\n", + "}).fillna(0).astype(int)\n", + "\n", + "# Add a new column that is the sum of in-degree and out-degree\n", + "degree_df['total_degree'] = degree_df['in_degree'] + degree_df['out_degree']\n", + "\n", + "# Display the new DataFrame\n", + "display(degree_df)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 423 + }, + "id": "kHXqyniFN3fq", + "outputId": "cc362a14-0e4b-42c7-e1d7-b1a61ae1231d" + }, + "execution_count": 68, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + " node 
in_degree out_degree total_degree\n", + "0 0 21 0 21\n", + "1 1 21 0 21\n", + "2 2 84 0 84\n", + "3 3 0 777 777\n", + "4 4 0 1 1\n", + "... ... ... ... ...\n", + "25520 25520 36 0 36\n", + "25521 25521 99 0 99\n", + "25522 25522 0 1001 1001\n", + "25523 25523 0 499613 499613\n", + "25524 25524 1809 1 1810\n", + "\n", + "[25525 rows x 4 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodein_degreeout_degreetotal_degree
0021021
1121021
2284084
330777777
44011
...............
255202552036036
255212552199099
2552225522010011001
25523255230499613499613
2552425524180911810
\n", + "

25525 rows × 4 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "degree_df", + "summary": "{\n \"name\": \"degree_df\",\n \"rows\": 25525,\n \"fields\": [\n {\n \"column\": \"node\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 7368,\n \"min\": 0,\n \"max\": 25524,\n \"num_unique_values\": 25525,\n \"samples\": [\n 15191,\n 3820,\n 24525\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"in_degree\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 11439,\n \"min\": 0,\n \"max\": 1639430,\n \"num_unique_values\": 681,\n \"samples\": [\n 1873,\n 703,\n 109\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"out_degree\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8059,\n \"min\": 0,\n \"max\": 1024185,\n \"num_unique_values\": 159,\n \"samples\": [\n 23455,\n 5225,\n 597\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"total_degree\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14535,\n \"min\": 1,\n \"max\": 1649547,\n \"num_unique_values\": 749,\n \"samples\": [\n 5702,\n 885,\n 274\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "# @title Find the top k nodes by total degree\n", + "k = 20\n", + "top_k_nodes = degree_df.nlargest(k, 'total_degree')\n", + "\n", + "# Plot multi-bar chart\n", + "plt.figure(figsize=(14, 8))\n", + "width = 0.35\n", + "x = np.arange(len(top_k_nodes['node'].astype(str)))\n", + "\n", + "plt.bar(x - width/2, top_k_nodes['in_degree'], width, label='In-Degree', color='blue')\n", + "plt.bar(x + width/2, top_k_nodes['out_degree'], width, label='Out-Degree', color='orange')\n", + "\n", + "plt.xlabel('Node')\n", + "plt.ylabel('Degree')\n", + "plt.title(f'Top {k} Nodes by Total Degree')\n", + "plt.xticks(x, top_k_nodes['node'].astype(str), rotation=90)\n", 
+ "plt.legend()\n", + "plt.grid(True)\n", + "plt.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 748 + }, + "id": "EQqA9_UaQXoE", + "outputId": "ce370a3c-2b39-4a5e-f29f-c539a83b6027" + }, + "execution_count": 69, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "3eDRjVjKQ1Ll" + }, + "execution_count": 69, + "outputs": [] + } + ] +} \ No newline at end of file From ea85dc4d29ee24fd19ae85e9b9974127dbab28ea Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Mon, 19 Aug 2024 20:24:55 +0330 Subject: [PATCH 12/27] Add .gitignore --- .gitignore | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0a5675b --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +# cpp object files +*.o + +# results files +results/*.csv + +# preprocessed data labels +data/*/Label_*_*.csv + +# main executable file +main \ No newline at end of file From cee10929f5643cb72433b91adf41039bd9358b83 Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Mon, 19 Aug 2024 20:31:25 +0330 Subject: [PATCH 13/27] Fix minor size_t types --- code/anoedgeglobal.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/code/anoedgeglobal.cpp b/code/anoedgeglobal.cpp index 228794a..1e278aa 100644 --- a/code/anoedgeglobal.cpp +++ b/code/anoedgeglobal.cpp @@ -60,11 +60,11 @@ double AnoedgeGlobal::getAnoedgeglobalDensity(vector>& mat, int s double row_slice_sum[num_rows]; double col_slice_sum[num_cols]; - for (int i = 0; i < num_rows; i++) { + for (size_t i = 0; i < num_rows; i++) { row_flag[i] = false; row_slice_sum[i] = mat[i][dst]; } - for (int i = 0; i < num_cols; i++) { + for (size_t i = 0; i < num_cols; i++) { col_flag[i] = false; col_slice_sum[i] = mat[src][i]; } @@ -75,14 +75,14 @@ double AnoedgeGlobal::getAnoedgeglobalDensity(vector>& mat, int s col_slice_sum[dst] = mat[src][dst]; pair max_row = {-1, -1.0}; - for (int i = 0; i < num_rows; i++) { + for (size_t i = 0; i < num_rows; i++) { if (!row_flag[i] && (row_slice_sum[i] >= max_row.second)) { max_row = {i, row_slice_sum[i]}; } } pair max_col = {-1, -1.0}; - for 
(int i = 0; i < num_cols; i++) { + for (size_t i = 0; i < num_cols; i++) { if (!col_flag[i] && (col_slice_sum[i] >= max_col.second)) { max_col = {i, col_slice_sum[i]}; } @@ -101,7 +101,7 @@ double AnoedgeGlobal::getAnoedgeglobalDensity(vector>& mat, int s marked_rows++; max_col = {-1, -1.0}; - for (int i = 0; i < num_cols; i++) { + for (size_t i = 0; i < num_cols; i++) { if (col_flag[i]) { cur_mat_sum = cur_mat_sum + mat[max_row.first][i]; } else { @@ -113,7 +113,7 @@ double AnoedgeGlobal::getAnoedgeglobalDensity(vector>& mat, int s } max_row = {-1, -1.0}; - for (int i = 0; i < num_rows; i++) { + for (size_t i = 0; i < num_rows; i++) { if (!row_flag[i] && (row_slice_sum[i] >= max_row.second)) { max_row = {i, row_slice_sum[i]}; } @@ -123,7 +123,7 @@ double AnoedgeGlobal::getAnoedgeglobalDensity(vector>& mat, int s marked_cols++; max_row = {-1, -1.0}; - for (int i = 0; i < num_rows; i++) { + for (size_t i = 0; i < num_rows; i++) { if (row_flag[i]) { cur_mat_sum = cur_mat_sum + mat[i][max_col.first]; } else { @@ -135,7 +135,7 @@ double AnoedgeGlobal::getAnoedgeglobalDensity(vector>& mat, int s } max_col = {-1, -1.0}; - for (int i = 0; i < num_cols; i++) { + for (size_t i = 0; i < num_cols; i++) { if (!col_flag[i] && (col_slice_sum[i] >= max_col.second)) { max_col = {i, col_slice_sum[i]}; } From 9281682f99dd500cb4363e4621363c1ba61666a6 Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Mon, 19 Aug 2024 20:43:23 +0330 Subject: [PATCH 14/27] Some refactoring in python code --- code/metrics.py | 65 +++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/code/metrics.py b/code/metrics.py index 2309ade..0ec859f 100644 --- a/code/metrics.py +++ b/code/metrics.py @@ -1,6 +1,8 @@ import pandas as pd import argparse from sklearn import metrics + + parser = argparse.ArgumentParser() parser.add_argument('--dataset', default='DARPA') parser.add_argument('--time_window', type=int, default=30) @@ -15,9 +17,9 @@ def 
print_anoedge_auc_time(base_path, dataset_name, algorithm): fpr, tpr, _ = metrics.roc_curve(data.label, data.score) auc = metrics.roc_auc_score(data.label, data.score) - print ("%s,%s" % (algorithm, dataset_name)) - print ("AUC: %.3f" % (auc)) - print ("Time: %s\n" % (time_values["total"].iloc[1])) + print("%s,%s" % (algorithm, dataset_name)) + print("AUC: %.3f" % (auc)) + print("Time: %s\n" % (time_values["total"].iloc[1])) def print_anograph_auc_time(base_path, dataset_name, time_window, edge_threshold, algorithm): data = pd.read_csv(base_path + algorithm + "_" + dataset_name + "_" + str(time_window) + "_" + str(edge_threshold) + "_score.csv", names=['score', 'label'], sep=" ") @@ -26,36 +28,45 @@ def print_anograph_auc_time(base_path, dataset_name, time_window, edge_threshold fpr, tpr, _ = metrics.roc_curve(data.label, data.score) auc = metrics.roc_auc_score(data.label, data.score) - print ("%s,%s" % (algorithm, dataset_name)) - print ("AUC: %.3f" % (auc)) - print ("Time: %s\n" % (time_values["total"].iloc[1])) + print("%s,%s" % (algorithm, dataset_name)) + print("AUC: %.3f" % auc) + print("Time: %s\n" % (time_values["total"].iloc[1])) -if __name__ == "__main__": - if args.dataset == 'DARPA': - print_anograph_auc_time("../results/", "DARPA", args.time_window, args.edge_threshold, "anograph") - print_anograph_auc_time("../results/", "DARPA", args.time_window, args.edge_threshold, "anograph_k") +def run_darpa(): + print_anograph_auc_time("../results/", "DARPA", args.time_window, args.edge_threshold, "anograph") + print_anograph_auc_time("../results/", "DARPA", args.time_window, args.edge_threshold, "anograph_k") + print_anoedge_auc_time("../results/", "DARPA", "anoedge_g") + print_anoedge_auc_time("../results/", "DARPA", "anoedge_l") - print_anoedge_auc_time("../results/", "DARPA", "anoedge_g") - print_anoedge_auc_time("../results/", "DARPA", "anoedge_l") - if args.dataset == 'ISCX': - print_anograph_auc_time("../results/", "ISCX", args.time_window, 
args.edge_threshold, "anograph") - print_anograph_auc_time("../results/", "ISCX", args.time_window, args.edge_threshold, "anograph_k") +def run_iscx(): + print_anograph_auc_time("../results/", "ISCX", args.time_window, args.edge_threshold, "anograph") + print_anograph_auc_time("../results/", "ISCX", args.time_window, args.edge_threshold, "anograph_k") + print_anoedge_auc_time("../results/", "ISCX", "anoedge_g") + print_anoedge_auc_time("../results/", "ISCX", "anoedge_l") - print_anoedge_auc_time("../results/", "ISCX", "anoedge_g") - print_anoedge_auc_time("../results/", "ISCX", "anoedge_l") - if args.dataset == 'IDS2018': - print_anograph_auc_time("../results/", "IDS2018", args.time_window, args.edge_threshold, "anograph") - print_anograph_auc_time("../results/", "IDS2018", args.time_window, args.edge_threshold, "anograph_k") +def run_ids2018(): + print_anograph_auc_time("../results/", "IDS2018", args.time_window, args.edge_threshold, "anograph") + print_anograph_auc_time("../results/", "IDS2018", args.time_window, args.edge_threshold, "anograph_k") + print_anoedge_auc_time("../results/", "IDS2018", "anoedge_g") + print_anoedge_auc_time("../results/", "IDS2018", "anoedge_l") - print_anoedge_auc_time("../results/", "IDS2018", "anoedge_g") - print_anoedge_auc_time("../results/", "IDS2018", "anoedge_l") - if args.dataset == 'DDOS2019': - print_anograph_auc_time("../results/", "DDOS2019", args.time_window, args.edge_threshold, "anograph") - print_anograph_auc_time("../results/", "DDOS2019", args.time_window, args.edge_threshold, "anograph_k") +def run_ddos2019(): + print_anograph_auc_time("../results/", "DDOS2019", args.time_window, args.edge_threshold, "anograph") + print_anograph_auc_time("../results/", "DDOS2019", args.time_window, args.edge_threshold, "anograph_k") + print_anoedge_auc_time("../results/", "DDOS2019", "anoedge_g") + print_anoedge_auc_time("../results/", "DDOS2019", "anoedge_l") - print_anoedge_auc_time("../results/", "DDOS2019", "anoedge_g") - 
print_anoedge_auc_time("../results/", "DDOS2019", "anoedge_l") \ No newline at end of file + +if __name__ == "__main__": + if args.dataset == 'DARPA': + run_darpa() + elif args.dataset == 'ISCX': + run_iscx() + elif args.dataset == 'IDS2018': + run_ids2018() + elif args.dataset == 'DDOS2019': + run_ddos2019() \ No newline at end of file From 9c440a62e5dc8851d9f932dcf06bc46961dfca8f Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Mon, 19 Aug 2024 20:46:14 +0330 Subject: [PATCH 15/27] reformat python code --- code/metrics.py | 82 ++++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 39 deletions(-) diff --git a/code/metrics.py b/code/metrics.py index 0ec859f..d6dd2a5 100644 --- a/code/metrics.py +++ b/code/metrics.py @@ -2,7 +2,6 @@ import argparse from sklearn import metrics - parser = argparse.ArgumentParser() parser.add_argument('--dataset', default='DARPA') parser.add_argument('--time_window', type=int, default=30) @@ -11,62 +10,67 @@ def print_anoedge_auc_time(base_path, dataset_name, algorithm): - data = pd.read_csv(base_path + algorithm + "_" + dataset_name + "_score.csv", names=['score', 'label'], sep=" ") - time_values = pd.read_csv(base_path + algorithm + "_" + dataset_name + "_time.csv", names=['avg', 'total'], sep=" ") + data = pd.read_csv(base_path + algorithm + "_" + dataset_name + "_score.csv", names=['score', 'label'], sep=" ") + time_values = pd.read_csv(base_path + algorithm + "_" + dataset_name + "_time.csv", names=['avg', 'total'], sep=" ") + + fpr, tpr, _ = metrics.roc_curve(data.label, data.score) + auc = metrics.roc_auc_score(data.label, data.score) - fpr, tpr, _ = metrics.roc_curve(data.label, data.score) - auc = metrics.roc_auc_score(data.label, data.score) + print("%s,%s" % (algorithm, dataset_name)) + print("AUC: %.3f" % (auc)) + print("Time: %s\n" % (time_values["total"].iloc[1])) - print("%s,%s" % (algorithm, dataset_name)) - print("AUC: %.3f" % (auc)) - print("Time: %s\n" % 
(time_values["total"].iloc[1])) def print_anograph_auc_time(base_path, dataset_name, time_window, edge_threshold, algorithm): - data = pd.read_csv(base_path + algorithm + "_" + dataset_name + "_" + str(time_window) + "_" + str(edge_threshold) + "_score.csv", names=['score', 'label'], sep=" ") - time_values = pd.read_csv(base_path + algorithm + "_" + dataset_name + "_" + str(time_window) + "_" + str(edge_threshold) + "_time.csv", names=['avg', 'total'], sep=" ") + data = pd.read_csv( + base_path + algorithm + "_" + dataset_name + "_" + str(time_window) + "_" + str(edge_threshold) + "_score.csv", + names=['score', 'label'], sep=" ") + time_values = pd.read_csv( + base_path + algorithm + "_" + dataset_name + "_" + str(time_window) + "_" + str(edge_threshold) + "_time.csv", + names=['avg', 'total'], sep=" ") - fpr, tpr, _ = metrics.roc_curve(data.label, data.score) - auc = metrics.roc_auc_score(data.label, data.score) + fpr, tpr, _ = metrics.roc_curve(data.label, data.score) + auc = metrics.roc_auc_score(data.label, data.score) - print("%s,%s" % (algorithm, dataset_name)) - print("AUC: %.3f" % auc) - print("Time: %s\n" % (time_values["total"].iloc[1])) + print("%s,%s" % (algorithm, dataset_name)) + print("AUC: %.3f" % auc) + print("Time: %s\n" % (time_values["total"].iloc[1])) def run_darpa(): - print_anograph_auc_time("../results/", "DARPA", args.time_window, args.edge_threshold, "anograph") - print_anograph_auc_time("../results/", "DARPA", args.time_window, args.edge_threshold, "anograph_k") - print_anoedge_auc_time("../results/", "DARPA", "anoedge_g") - print_anoedge_auc_time("../results/", "DARPA", "anoedge_l") + print_anograph_auc_time("../results/", "DARPA", args.time_window, args.edge_threshold, "anograph") + print_anograph_auc_time("../results/", "DARPA", args.time_window, args.edge_threshold, "anograph_k") + print_anoedge_auc_time("../results/", "DARPA", "anoedge_g") + print_anoedge_auc_time("../results/", "DARPA", "anoedge_l") def run_iscx(): - 
print_anograph_auc_time("../results/", "ISCX", args.time_window, args.edge_threshold, "anograph") - print_anograph_auc_time("../results/", "ISCX", args.time_window, args.edge_threshold, "anograph_k") - print_anoedge_auc_time("../results/", "ISCX", "anoedge_g") - print_anoedge_auc_time("../results/", "ISCX", "anoedge_l") + print_anograph_auc_time("../results/", "ISCX", args.time_window, args.edge_threshold, "anograph") + print_anograph_auc_time("../results/", "ISCX", args.time_window, args.edge_threshold, "anograph_k") + print_anoedge_auc_time("../results/", "ISCX", "anoedge_g") + print_anoedge_auc_time("../results/", "ISCX", "anoedge_l") def run_ids2018(): - print_anograph_auc_time("../results/", "IDS2018", args.time_window, args.edge_threshold, "anograph") - print_anograph_auc_time("../results/", "IDS2018", args.time_window, args.edge_threshold, "anograph_k") - print_anoedge_auc_time("../results/", "IDS2018", "anoedge_g") - print_anoedge_auc_time("../results/", "IDS2018", "anoedge_l") + print_anograph_auc_time("../results/", "IDS2018", args.time_window, args.edge_threshold, "anograph") + print_anograph_auc_time("../results/", "IDS2018", args.time_window, args.edge_threshold, "anograph_k") + print_anoedge_auc_time("../results/", "IDS2018", "anoedge_g") + print_anoedge_auc_time("../results/", "IDS2018", "anoedge_l") def run_ddos2019(): - print_anograph_auc_time("../results/", "DDOS2019", args.time_window, args.edge_threshold, "anograph") - print_anograph_auc_time("../results/", "DDOS2019", args.time_window, args.edge_threshold, "anograph_k") - print_anoedge_auc_time("../results/", "DDOS2019", "anoedge_g") - print_anoedge_auc_time("../results/", "DDOS2019", "anoedge_l") + print_anograph_auc_time("../results/", "DDOS2019", args.time_window, args.edge_threshold, "anograph") + print_anograph_auc_time("../results/", "DDOS2019", args.time_window, args.edge_threshold, "anograph_k") + print_anoedge_auc_time("../results/", "DDOS2019", "anoedge_g") + 
print_anoedge_auc_time("../results/", "DDOS2019", "anoedge_l") if __name__ == "__main__": - if args.dataset == 'DARPA': - run_darpa() - elif args.dataset == 'ISCX': - run_iscx() - elif args.dataset == 'IDS2018': - run_ids2018() - elif args.dataset == 'DDOS2019': - run_ddos2019() \ No newline at end of file + if args.dataset == 'DARPA': + run_darpa() + elif args.dataset == 'ISCX': + run_iscx() + elif args.dataset == 'IDS2018': + run_ids2018() + elif args.dataset == 'DDOS2019': + run_ddos2019() From c9bf30d009c45f2ffb2b36e31c66160d103105c8 Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Mon, 19 Aug 2024 20:46:52 +0330 Subject: [PATCH 16/27] better python code :) --- code/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/metrics.py b/code/metrics.py index d6dd2a5..6746e17 100644 --- a/code/metrics.py +++ b/code/metrics.py @@ -17,7 +17,7 @@ def print_anoedge_auc_time(base_path, dataset_name, algorithm): auc = metrics.roc_auc_score(data.label, data.score) print("%s,%s" % (algorithm, dataset_name)) - print("AUC: %.3f" % (auc)) + print("AUC: %.3f" % auc) print("Time: %s\n" % (time_values["total"].iloc[1])) From 24234d2ae6b633122b54f901a0088aa400e97ffe Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Mon, 19 Aug 2024 20:50:28 +0330 Subject: [PATCH 17/27] refactoring python code --- code/metrics.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/code/metrics.py b/code/metrics.py index 6746e17..4b48baf 100644 --- a/code/metrics.py +++ b/code/metrics.py @@ -10,8 +10,9 @@ def print_anoedge_auc_time(base_path, dataset_name, algorithm): - data = pd.read_csv(base_path + algorithm + "_" + dataset_name + "_score.csv", names=['score', 'label'], sep=" ") - time_values = pd.read_csv(base_path + algorithm + "_" + dataset_name + "_time.csv", names=['avg', 'total'], sep=" ") + file_name = base_path + algorithm + "_" + dataset_name + data = pd.read_csv(file_name + "_score.csv", names=['score', 'label'], sep=" ") + 
time_values = pd.read_csv(file_name + "_time.csv", names=['avg', 'total'], sep=" ") fpr, tpr, _ = metrics.roc_curve(data.label, data.score) auc = metrics.roc_auc_score(data.label, data.score) @@ -22,12 +23,9 @@ def print_anoedge_auc_time(base_path, dataset_name, algorithm): def print_anograph_auc_time(base_path, dataset_name, time_window, edge_threshold, algorithm): - data = pd.read_csv( - base_path + algorithm + "_" + dataset_name + "_" + str(time_window) + "_" + str(edge_threshold) + "_score.csv", - names=['score', 'label'], sep=" ") - time_values = pd.read_csv( - base_path + algorithm + "_" + dataset_name + "_" + str(time_window) + "_" + str(edge_threshold) + "_time.csv", - names=['avg', 'total'], sep=" ") + file_name = base_path + algorithm + "_" + dataset_name + "_" + str(time_window) + "_" + str(edge_threshold) + data = pd.read_csv(file_name + "_score.csv", names=['score', 'label'], sep=" ") + time_values = pd.read_csv(file_name + "_time.csv", names=['avg', 'total'], sep=" ") fpr, tpr, _ = metrics.roc_curve(data.label, data.score) auc = metrics.roc_auc_score(data.label, data.score) From b908d51f0d1e092bed23bccd2a8d13e2e10f327d Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Mon, 19 Aug 2024 21:29:46 +0330 Subject: [PATCH 18/27] Change .gitignore --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0a5675b..4d4566f 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,7 @@ results/*.csv data/*/Label_*_*.csv # main executable file -main \ No newline at end of file +main + +# idea +.idea \ No newline at end of file From d1036cda4312ef62afa34cbf79db17acf23f2d0d Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Mon, 19 Aug 2024 21:30:37 +0330 Subject: [PATCH 19/27] refactor python --- code/metrics.py | 46 ++++++++++++---------------------------------- 1 file changed, 12 insertions(+), 34 deletions(-) diff --git a/code/metrics.py b/code/metrics.py index 4b48baf..ba01aaa 100644 --- a/code/metrics.py 
+++ b/code/metrics.py @@ -8,6 +8,8 @@ parser.add_argument("--edge_threshold", type=int, default=50) args = parser.parse_args() +results_path = "../results/" + def print_anoedge_auc_time(base_path, dataset_name, algorithm): file_name = base_path + algorithm + "_" + dataset_name @@ -35,40 +37,16 @@ def print_anograph_auc_time(base_path, dataset_name, time_window, edge_threshold print("Time: %s\n" % (time_values["total"].iloc[1])) -def run_darpa(): - print_anograph_auc_time("../results/", "DARPA", args.time_window, args.edge_threshold, "anograph") - print_anograph_auc_time("../results/", "DARPA", args.time_window, args.edge_threshold, "anograph_k") - print_anoedge_auc_time("../results/", "DARPA", "anoedge_g") - print_anoedge_auc_time("../results/", "DARPA", "anoedge_l") - - -def run_iscx(): - print_anograph_auc_time("../results/", "ISCX", args.time_window, args.edge_threshold, "anograph") - print_anograph_auc_time("../results/", "ISCX", args.time_window, args.edge_threshold, "anograph_k") - print_anoedge_auc_time("../results/", "ISCX", "anoedge_g") - print_anoedge_auc_time("../results/", "ISCX", "anoedge_l") - - -def run_ids2018(): - print_anograph_auc_time("../results/", "IDS2018", args.time_window, args.edge_threshold, "anograph") - print_anograph_auc_time("../results/", "IDS2018", args.time_window, args.edge_threshold, "anograph_k") - print_anoedge_auc_time("../results/", "IDS2018", "anoedge_g") - print_anoedge_auc_time("../results/", "IDS2018", "anoedge_l") - - -def run_ddos2019(): - print_anograph_auc_time("../results/", "DDOS2019", args.time_window, args.edge_threshold, "anograph") - print_anograph_auc_time("../results/", "DDOS2019", args.time_window, args.edge_threshold, "anograph_k") - print_anoedge_auc_time("../results/", "DDOS2019", "anoedge_g") - print_anoedge_auc_time("../results/", "DDOS2019", "anoedge_l") +def run_with_dataset(dataset_name): + print_anograph_auc_time(results_path, dataset_name, args.time_window, args.edge_threshold, "anograph") + 
print_anograph_auc_time(results_path, dataset_name, args.time_window, args.edge_threshold, "anograph_k") + print_anoedge_auc_time(results_path, dataset_name, "anoedge_g") + print_anoedge_auc_time(results_path, dataset_name, "anoedge_l") if __name__ == "__main__": - if args.dataset == 'DARPA': - run_darpa() - elif args.dataset == 'ISCX': - run_iscx() - elif args.dataset == 'IDS2018': - run_ids2018() - elif args.dataset == 'DDOS2019': - run_ddos2019() + datasets = ["DARPA", "ISCX", "IDS2018", "DDOS2019"] + if args.dataset in datasets: + run_with_dataset(args.dataset) + else: + print(f"Could not detect dataset {args.dataset}") From 367a9458265468320f2ba2a95b5c543f9de0b0cf Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Mon, 19 Aug 2024 21:35:04 +0330 Subject: [PATCH 20/27] extract print_auc_time --- code/metrics.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/code/metrics.py b/code/metrics.py index ba01aaa..ea0560a 100644 --- a/code/metrics.py +++ b/code/metrics.py @@ -13,19 +13,15 @@ def print_anoedge_auc_time(base_path, dataset_name, algorithm): file_name = base_path + algorithm + "_" + dataset_name - data = pd.read_csv(file_name + "_score.csv", names=['score', 'label'], sep=" ") - time_values = pd.read_csv(file_name + "_time.csv", names=['avg', 'total'], sep=" ") - - fpr, tpr, _ = metrics.roc_curve(data.label, data.score) - auc = metrics.roc_auc_score(data.label, data.score) - - print("%s,%s" % (algorithm, dataset_name)) - print("AUC: %.3f" % auc) - print("Time: %s\n" % (time_values["total"].iloc[1])) + print_auc_time(algorithm, dataset_name, file_name) def print_anograph_auc_time(base_path, dataset_name, time_window, edge_threshold, algorithm): file_name = base_path + algorithm + "_" + dataset_name + "_" + str(time_window) + "_" + str(edge_threshold) + print_auc_time(algorithm, dataset_name, file_name) + + +def print_auc_time(algorithm, dataset_name, file_name): data = pd.read_csv(file_name + "_score.csv", names=['score', 
'label'], sep=" ") time_values = pd.read_csv(file_name + "_time.csv", names=['avg', 'total'], sep=" ") From 3553c02d1db5f72235bab037437b65dad6669f34 Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Mon, 19 Aug 2024 21:44:13 +0330 Subject: [PATCH 21/27] refactor process_data.py --- code/process_data.py | 82 +++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 36 deletions(-) diff --git a/code/process_data.py b/code/process_data.py index 4ae62cb..fa9a9f8 100644 --- a/code/process_data.py +++ b/code/process_data.py @@ -4,39 +4,49 @@ from numpy import savetxt -def process_dataset(base_path, dataset_name, time_param, edge_thershold): - records = [] - with open(base_path + dataset_name + "/Data.csv", "r") as f: - for line in f: - if len(line) <= 1: - continue - src, dst, time = line.split("\n")[0].split(",") - records.append((int(src), int(dst), int(time))) - - labels = [] - with open(base_path + dataset_name + "/Label.csv", "r") as f: - for line in f: - if len(line) <= 1: - continue - label = line.split("\n")[0] - labels.append(int(label)) - - assert len(records) == len(labels) - - record_labels = [(record[0], record[1], record[2], label) for record, label in zip(records, labels)] - - write_format = str(time_param) + "_" + str(edge_thershold) - - data = pd.DataFrame(np.array(record_labels)) - - labels = [] - data[2] = (data[2]/time_param).astype(int) - for i in pd.unique(data[2]): - labels.append(sum(data[data[2]==i][3])) - - labels = np.array(labels) - labels = labels >= edge_thershold - labels = labels * 1 - savetxt(base_path + dataset_name + "/Label_" + write_format + ".csv", labels, delimiter='\n',fmt="%d") - -process_dataset("../data/", str(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3])) + +def process_dataset(base_path, dataset_name, time_param, edge_threshold): + records = [] + with open(base_path + dataset_name + "/Data.csv", "r") as f: + for line in f: + if len(line) <= 1: + continue + src, dst, time = 
line.split("\n")[0].split(",") + records.append((int(src), int(dst), int(time))) + + labels = [] + with open(base_path + dataset_name + "/Label.csv", "r") as f: + for line in f: + if len(line) <= 1: + continue + label = line.split("\n")[0] + labels.append(int(label)) + + assert len(records) == len(labels) + + record_labels = [(record[0], record[1], record[2], label) for record, label in zip(records, labels)] + + write_format = str(time_param) + "_" + str(edge_threshold) + + data = pd.DataFrame(np.array(record_labels)) + + labels = [] + data[2] = (data[2] / time_param).astype(int) + for i in pd.unique(data[2]): + labels.append(sum(data[data[2] == i][3])) + + labels = np.array(labels) + labels = labels >= edge_threshold + labels = labels * 1 + savetxt(base_path + dataset_name + "/Label_" + write_format + ".csv", labels, delimiter='\n', fmt="%d") + + +def main(): + dataset_name = str(sys.argv[1]) + time_param = int(sys.argv[2]) + edge_threshold = int(sys.argv[3]) + process_dataset("../data/", dataset_name, time_param, edge_threshold) + + +if __name__ == "__main__": + main() From 560bac841ab48af887832f2ae51e72cc9886f900 Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Tue, 20 Aug 2024 17:00:18 +0330 Subject: [PATCH 22/27] use os.path.join --- code/process_data.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/code/process_data.py b/code/process_data.py index fa9a9f8..88ee80a 100644 --- a/code/process_data.py +++ b/code/process_data.py @@ -1,13 +1,15 @@ import pandas as pd import numpy as np import sys +import os from numpy import savetxt def process_dataset(base_path, dataset_name, time_param, edge_threshold): records = [] - with open(base_path + dataset_name + "/Data.csv", "r") as f: + data_path = os.path.join(base_path, dataset_name, "Data.csv") + with open(data_path, "r") as f: for line in f: if len(line) <= 1: continue @@ -15,7 +17,8 @@ def process_dataset(base_path, dataset_name, time_param, edge_threshold): records.append((int(src), 
int(dst), int(time))) labels = [] - with open(base_path + dataset_name + "/Label.csv", "r") as f: + labels_path = os.path.join(base_path, dataset_name, "Label.csv") + with open(labels_path, "r") as f: for line in f: if len(line) <= 1: continue From f5a1ec6cf8feaae64c3c5763451eba2525f5d6f9 Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Tue, 20 Aug 2024 17:09:54 +0330 Subject: [PATCH 23/27] refactor process_data.py --- code/process_data.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/code/process_data.py b/code/process_data.py index 88ee80a..50d0ac6 100644 --- a/code/process_data.py +++ b/code/process_data.py @@ -7,6 +7,29 @@ def process_dataset(base_path, dataset_name, time_param, edge_threshold): + record_labels = generate_record_labels(base_path, dataset_name) + labels = generate_final_labels(edge_threshold, record_labels, time_param) + write_to_file(base_path, dataset_name, edge_threshold, labels, time_param) + + +def write_to_file(base_path, dataset_name, edge_threshold, labels, time_param): + file_path = os.path.join(base_path, dataset_name, f"Label_{str(time_param)}_{str(edge_threshold)}.csv") + savetxt(file_path, labels, delimiter='\n', fmt="%d") + + +def generate_final_labels(edge_threshold, record_labels, time_param): + data = pd.DataFrame(np.array(record_labels)) + labels = [] + data[2] = (data[2] / time_param).astype(int) + for i in pd.unique(data[2]): + labels.append(sum(data[data[2] == i][3])) + labels = np.array(labels) + labels = labels >= edge_threshold + labels = labels * 1 + return labels + + +def generate_record_labels(base_path, dataset_name): records = [] data_path = os.path.join(base_path, dataset_name, "Data.csv") with open(data_path, "r") as f: @@ -29,19 +52,7 @@ def process_dataset(base_path, dataset_name, time_param, edge_threshold): record_labels = [(record[0], record[1], record[2], label) for record, label in zip(records, labels)] - write_format = str(time_param) + "_" + 
str(edge_threshold) - - data = pd.DataFrame(np.array(record_labels)) - - labels = [] - data[2] = (data[2] / time_param).astype(int) - for i in pd.unique(data[2]): - labels.append(sum(data[data[2] == i][3])) - - labels = np.array(labels) - labels = labels >= edge_threshold - labels = labels * 1 - savetxt(base_path + dataset_name + "/Label_" + write_format + ".csv", labels, delimiter='\n', fmt="%d") + return record_labels def main(): From 69d14a20fe441e45e09d67b1772ec6b3a20ff000 Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Tue, 20 Aug 2024 17:43:11 +0330 Subject: [PATCH 24/27] Add code README.MD --- code/README.MD | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 code/README.MD diff --git a/code/README.MD b/code/README.MD new file mode 100644 index 0000000..1751a32 --- /dev/null +++ b/code/README.MD @@ -0,0 +1,11 @@ +# توضیح بخش‌های مختلف کد + +## process_data.py + +در این فایل پردازش داده‌ه‌ها به صورت زیر انجام می‌شود. + +داده‌های زمانی براساس پارامتر مشخصی گسسته‌سازی می‌شوند و به ازای هر +time_stamp +زمان یک برچسب مشخص می‌شود. + +روش تضمیم‌گیری برای برچسب به این‌صورت است که اگر بیش از تعداد مشخصی از یال‌های آن دسته‌ی زمانی برچسب مثبت داشته باشند آن لحظه‌ی زمانی به صورت کامل مثبت اعلام می‌شود. \ No newline at end of file From 5daf0561c46b896719b680286884f6674153b376 Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Tue, 20 Aug 2024 17:50:38 +0330 Subject: [PATCH 25/27] complete README.MD --- code/README.MD | 17 ++++++++++++++++- code/process_data.py | 2 ++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/code/README.MD b/code/README.MD index 1751a32..1e43527 100644 --- a/code/README.MD +++ b/code/README.MD @@ -8,4 +8,19 @@ time_stamp زمان یک برچسب مشخص می‌شود. -روش تضمیم‌گیری برای برچسب به این‌صورت است که اگر بیش از تعداد مشخصی از یال‌های آن دسته‌ی زمانی برچسب مثبت داشته باشند آن لحظه‌ی زمانی به صورت کامل مثبت اعلام می‌شود. 
\ No newline at end of file +روش تصمیم‌گیری برای برچسب به این‌صورت است که اگر بیش از تعداد مشخصی از یال‌های آن دسته‌ی زمانی برچسب مثبت داشته باشند آن لحظه‌ی زمانی به صورت کامل مثبت اعلام می‌شود. + +```python +def generate_final_labels(edge_threshold, record_labels, time_param): + data = pd.DataFrame(np.array(record_labels)) + labels = [] + data[2] = (data[2] / time_param).astype(int) + + for i in pd.unique(data[2]): + labels.append(sum(data[data[2] == i][3])) + + labels = np.array(labels) + labels = labels >= edge_threshold + labels = labels * 1 + return labels +``` \ No newline at end of file diff --git a/code/process_data.py b/code/process_data.py index 50d0ac6..03e0696 100644 --- a/code/process_data.py +++ b/code/process_data.py @@ -21,8 +21,10 @@ def generate_final_labels(edge_threshold, record_labels, time_param): data = pd.DataFrame(np.array(record_labels)) labels = [] data[2] = (data[2] / time_param).astype(int) + for i in pd.unique(data[2]): labels.append(sum(data[data[2] == i][3])) + labels = np.array(labels) labels = labels >= edge_threshold labels = labels * 1 From 3af75e252364c49c4e974659a44a755a8b6dcfcf Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Tue, 20 Aug 2024 17:51:16 +0330 Subject: [PATCH 26/27] add python configs --- .idea/misc.xml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.idea/misc.xml b/.idea/misc.xml index 7859412..48b747e 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,5 +1,8 @@ + + \ No newline at end of file From c9cee155ca8471558fb821f7048cf333ecb61f5b Mon Sep 17 00:00:00 2001 From: Mohammad Kasaei Date: Tue, 20 Aug 2024 17:55:52 +0330 Subject: [PATCH 27/27] add comments to .sh file --- code/demo.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/code/demo.sh b/code/demo.sh index 73c11aa..1dc894b 100644 --- a/code/demo.sh +++ b/code/demo.sh @@ -13,9 +13,19 @@ if [ $1 == "DARPA" ]; then ./main anograph_k DARPA 30 50 2 32 5 echo "Running AnoEdge-G" + # Algorithm => anoedge_g + # Dataset => DARPA + #
Rows => 2 + # Buckets => 32 + # Decay factor => 0.9 ./main anoedge_g DARPA 2 32 0.9 echo "Running AnoEdge-L" + # Algorithm => anoedge_l + # Dataset => DARPA + # Rows => 2 + # Buckets => 32 + # Decay factor => 0.9 ./main anoedge_l DARPA 2 32 0.9 echo "Installing python dependencies"