
Commit f7b26f6

Features/bdp 94 lof (#37)
* refactor in local folder
* lof poc
1 parent 6f295ed commit f7b26f6

20 files changed: 739 additions & 118 deletions
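The branch name (bdp 94 lof) and the "lof poc" note suggest this commit introduces a Local Outlier Factor proof of concept; the LOF code itself is not among the hunks shown below. Purely as a sketch of the technique, assuming a scikit-learn-style PoC (which this page does not confirm):

# Illustrative only: a minimal Local Outlier Factor run with scikit-learn.
# The commit's actual LOF code is not visible in the diffs below, and the
# feature matrix here is synthetic.
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

X = np.random.default_rng(42).normal(size=(1000, 8))  # stand-in for per-address features

lof = LocalOutlierFactor(n_neighbors=20, contamination=0.01)
labels = lof.fit_predict(X)              # -1 = outlier, 1 = inlier
scores = -lof.negative_outlier_factor_   # higher means more anomalous

print(f"{int((labels == -1).sum())} of {len(X)} points flagged as anomalous")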
Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+from pyspark.sql import SparkSession
+from scripts.local.shared.schemas import transaction_schema
+from scripts.local.aggregate.components.aggregations import aggregate
+
+spark = (
+    SparkSession.builder.appName("DataAggregations")
+    .config("spark.sql.parquet.enableVectorizedReader", "true")
+    .config("spark.sql.parquet.mergeSchema", "false")  # No need as we explicitly specify the schema
+    .config("spark.executor.memory", "6g")
+    .config("spark.driver.memory", "2g")
+    # .config("spark.local.dir", "/mnt/d/spark-temp")  # Change the temp directory
+    .getOrCreate()
+)
+
+source_dir = "data/historical/etl/transactions"
+output_dir = "data/historical/aggregations"
+
+aggregate(spark, source_dir, output_dir, transaction_schema)
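This runner imports transaction_schema from scripts/local/shared/schemas, which is not part of this diff. Purely for orientation, a hypothetical shape consistent with the columns the aggregation code touches might be:

# Hypothetical sketch of transaction_schema, inferred from the columns
# referenced in aggregations.py; the real definition is not shown in this
# commit, and the "value" field in particular is a guess.
from pyspark.sql.types import (
    StructType, StructField, StringType, TimestampType, LongType, DecimalType,
)

transaction_schema = StructType([
    StructField("transaction_id", StringType()),    # dropped before aggregating
    StructField("block_number", LongType()),        # dropped before aggregating
    StructField("transaction_index", LongType()),   # dropped before aggregating
    StructField("sender_address", StringType()),
    StructField("receiver_address", StringType()),
    StructField("block_timestamp", TimestampType()),
    StructField("network_name", StringType()),      # "bitcoin" or "ethereum"
    StructField("value", DecimalType(38, 0)),       # hypothetical amount field
])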
Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+from pyspark.sql import SparkSession
+from scripts.local.shared.benchmark_schemas import benchmark_transaction_schema
+from components.aggregations import aggregate
+
+spark = (
+    SparkSession.builder.appName("DataAggregations")
+    .config("spark.sql.parquet.enableVectorizedReader", "true")
+    .config("spark.sql.parquet.mergeSchema", "false")  # No need as we explicitly specify the schema
+    .config("spark.executor.memory", "6g")
+    .config("spark.driver.memory", "2g")
+    # .config("spark.local.dir", "/mnt/d/spark-temp")  # Change the temp directory
+    .getOrCreate()
+)
+
+source_dir = "data/benchmark/etl/transactions"
+output_dir = "data/benchmark/aggregations"
+
+aggregate(spark, source_dir, output_dir, benchmark_transaction_schema)
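This benchmark runner duplicates the historical runner's session setup verbatim, differing only in schema, paths, and the import path used for aggregate. A shared factory, sketched below, is one way the duplication could be factored out; it is not part of this commit:

# Sketch only, not in this commit: the builder chain repeated in both runner
# scripts, moved into one helper (name and location hypothetical).
from pyspark.sql import SparkSession

def build_spark(app_name: str = "DataAggregations") -> SparkSession:
    return (
        SparkSession.builder.appName(app_name)
        .config("spark.sql.parquet.enableVectorizedReader", "true")
        .config("spark.sql.parquet.mergeSchema", "false")
        .config("spark.executor.memory", "6g")
        .config("spark.driver.memory", "2g")
        .getOrCreate()
    )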

scripts/local/etl/aggregations.py renamed to scripts/local/aggregate/components/aggregations.py

Lines changed: 16 additions & 30 deletions
@@ -1,7 +1,6 @@
 from pyspark.sql.functions import mean, mode, stddev, count, median, sum, min, max, col, lit, count_distinct, unix_timestamp, lag, first, when, monotonically_increasing_id
-from pyspark.sql import SparkSession, DataFrame
+from pyspark.sql import DataFrame
 from pyspark.sql.window import Window
-from scripts.local.shared.schemas import transaction_schema
 
 def calculate_aggregations(df):
     sender_window = Window.partitionBy("sender_address").orderBy("block_timestamp")
@@ -147,39 +146,26 @@ def preprocess_btc_df(df):
 
     return df_btc_send.unionByName(df_btc_receive)
 
-
-spark = (
-    SparkSession.builder.appName("DataAggregations")
-    .config("spark.sql.parquet.enableVectorizedReader", "true")
-    .config("spark.sql.parquet.mergeSchema", "false")  # No need as we explicitly specify the schema
-    .config("spark.executor.memory", "6g")
-    .config("spark.driver.memory", "2g")
-    # .config("spark.local.dir", "/mnt/d/spark-temp")  # Change the temp directory
-    .getOrCreate()
-)
-
-source_dir = "data/historical/etl/transactions"  # data/benchmark/etl/transactions or data/historical/etl/transactions
-output_dir = "data/historical/aggregations"  # data/benchmark/aggregations or data/historical/aggregations
-
-cols_to_drop = ["transaction_id", "block_number", "transaction_index"]
-transaction_df = spark.read.schema(transaction_schema).parquet(source_dir).drop(*cols_to_drop)
+def aggregate(spark, source_dir, output_dir, schema):
+    cols_to_drop = ["transaction_id", "block_number", "transaction_index"]
+    transaction_df = spark.read.schema(schema).parquet(source_dir).drop(*cols_to_drop)
 
-unique_degrees_df = calculate_unique_degrees(transaction_df)
+    unique_degrees_df = calculate_unique_degrees(transaction_df)
 
-df_eth = transaction_df.where(col("network_name") == "ethereum")
-df_btc = transaction_df.where(col("network_name") == "bitcoin")
+    df_eth = transaction_df.where(col("network_name") == "ethereum")
+    df_btc = transaction_df.where(col("network_name") == "bitcoin")
 
-df_btc = preprocess_btc_df(df_btc)
+    df_btc = preprocess_btc_df(df_btc)
 
-df_btc_aggregations = calculate_aggregations(df_btc)
-df_eth_aggregations = calculate_aggregations(df_eth)
+    df_btc_aggregations = calculate_aggregations(df_btc)
+    df_eth_aggregations = calculate_aggregations(df_eth)
 
-df_btc_aggregations = df_btc_aggregations.withColumn("network_name", lit("bitcoin"))
-df_eth_aggregations = df_eth_aggregations.withColumn("network_name", lit("ethereum"))
+    df_btc_aggregations = df_btc_aggregations.withColumn("network_name", lit("bitcoin"))
+    df_eth_aggregations = df_eth_aggregations.withColumn("network_name", lit("ethereum"))
 
-aggregations_df = df_btc_aggregations.unionByName(df_eth_aggregations)
-aggregations_df = aggregations_df.join(unique_degrees_df, "address", "outer").na.fill(0)
+    aggregations_df = df_btc_aggregations.unionByName(df_eth_aggregations)
+    aggregations_df = aggregations_df.join(unique_degrees_df, "address", "outer").na.fill(0)
 
-aggregations_df.coalesce(1).write.parquet(output_dir, mode="overwrite", compression="zstd")
+    aggregations_df.coalesce(1).write.parquet(output_dir, mode="overwrite", compression="zstd")
 
-spark.stop()
+    spark.stop()
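The surviving imports (lag, unix_timestamp, first, when) and the sender_window definition at the top of the file point to per-address, time-ordered window features. A generic illustration of that pattern, not the repo's actual feature code:

# Generic window-function illustration; calculate_aggregations' real feature
# set lies outside the hunks shown above.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lag, unix_timestamp
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("a", "2024-01-01 00:00:00"), ("a", "2024-01-01 00:05:00")],
    ["sender_address", "block_timestamp"],
).withColumn("block_timestamp", col("block_timestamp").cast("timestamp"))

sender_window = Window.partitionBy("sender_address").orderBy("block_timestamp")

# Seconds since the same sender's previous transaction (null for the first).
df = df.withColumn(
    "seconds_since_prev_tx",
    unix_timestamp("block_timestamp")
    - lag(unix_timestamp("block_timestamp")).over(sender_window),
)
df.show()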

scripts/local/anomalies_detection/preprocessing/components/preprocess_data_second_step.py

Lines changed: 0 additions & 11 deletions
This file was deleted.

scripts/local/anomalies_detection/preprocessing/preprocess_benchmark_datasets.py

Lines changed: 0 additions & 23 deletions
This file was deleted.

scripts/local/etl/benchmark_components/etl.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 from functools import reduce
 from typing import Callable
 from pyspark.sql import SparkSession, DataFrame
-from scripts.local.shared.schemas import benchmark_input_schema
+from scripts.local.shared.benchmark_schemas import benchmark_input_schema
 
 def extract(spark: SparkSession, path: str) -> DataFrame:
     df = spark.read.schema(benchmark_input_schema).csv(path, header=True)
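The functools.reduce and Callable imports in etl.py suggest transforms are composed as a pipeline of DataFrame-to-DataFrame functions; the transform code itself is outside this hunk. A common shape for that pattern, as a sketch:

# Sketch of the reduce/Callable composition pattern the imports hint at;
# the transform names below are hypothetical.
from functools import reduce
from typing import Callable
from pyspark.sql import DataFrame

Transform = Callable[[DataFrame], DataFrame]

def pipeline(df: DataFrame, transforms: list[Transform]) -> DataFrame:
    # Feed the output of each transform into the next, left to right.
    return reduce(lambda acc, fn: fn(acc), transforms, df)

# e.g. pipeline(raw_df, [drop_nulls, cast_types, add_features])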
File renamed without changes.

scripts/local/anomalies_detection/preprocessing/components/join_transactions_with_aggregations.py renamed to scripts/local/preprocessing/components/join_transactions_with_aggregations.py

Lines changed: 3 additions & 4 deletions
@@ -1,9 +1,8 @@
 from pyspark.sql.functions import col
-from scripts.local.shared.schemas import transaction_scaled_schema, aggregations_scaled_schema
 from scripts.local.shared.consts import sender_fields, receiver_fields, common_fields
 
-def join_transactions_with_aggregations(spark, transactions_dir, aggregations_dir, output_dir):
-    transactions_df = spark.read.schema(transaction_scaled_schema).parquet(transactions_dir)
+def join_transactions_with_aggregations(spark, transactions_dir, aggregations_dir, output_dir, transactions_scaled_schema, aggregations_scaled_schema):
+    transactions_df = spark.read.schema(transactions_scaled_schema).parquet(transactions_dir)
     aggregations_df = spark.read.schema(aggregations_scaled_schema).parquet(aggregations_dir)
 
     sender_aggregations = aggregations_df.select(
@@ -30,4 +29,4 @@ def join_transactions_with_aggregations(spark, transactions_dir, aggregations_di
         compression="zstd"
     )
 
-    return final_df
+    return final_df
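With the schemas now injected as parameters rather than imported, callers choose which scaled schemas to read with. A hypothetical call site under the new signature (schema names taken from the removed import; directory paths invented for illustration):

# Hypothetical call site; the paths are made up, and the schema module shown
# is the one the removed import pointed at.
from scripts.local.shared.schemas import (
    transaction_scaled_schema,
    aggregations_scaled_schema,
)

final_df = join_transactions_with_aggregations(
    spark,
    transactions_dir="data/historical/scaled/transactions",
    aggregations_dir="data/historical/scaled/aggregations",
    output_dir="data/historical/joined",
    transactions_scaled_schema=transaction_scaled_schema,
    aggregations_scaled_schema=aggregations_scaled_schema,
)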

scripts/local/anomalies_detection/preprocessing/components/preprocess_data_first_step.py renamed to scripts/local/preprocessing/components/preprocess_data_first_step.py

File renamed without changes.
