11from pyspark .sql .functions import mean , mode , stddev , count , median , sum , min , max , col , lit , count_distinct , unix_timestamp , lag , first , when , monotonically_increasing_id
2- from pyspark .sql import SparkSession , DataFrame
2+ from pyspark .sql import DataFrame
33from pyspark .sql .window import Window
4- from scripts .local .shared .schemas import transaction_schema
54
65def calculate_aggregations (df ):
76 sender_window = Window .partitionBy ("sender_address" ).orderBy ("block_timestamp" )
@@ -147,39 +146,26 @@ def preprocess_btc_df(df):
147146
148147 return df_btc_send .unionByName (df_btc_receive )
149148
def aggregate(spark, source_dir, output_dir, schema):
    """Read raw transaction parquet data, build per-address aggregations for
    the bitcoin and ethereum networks, and write the combined result.

    Args:
        spark: active SparkSession used for reading and writing parquet.
        source_dir: path of the source transaction parquet dataset.
        output_dir: destination path for the aggregated parquet output.
        schema: explicit Spark schema applied when reading the source files.

    Side effects:
        Writes a single zstd-compressed parquet file to ``output_dir`` and
        stops the provided SparkSession before returning.
    """
    # Row-identity columns are not used by any aggregation; drop them early.
    cols_to_drop = ["transaction_id", "block_number", "transaction_index"]
    transaction_df = spark.read.schema(schema).parquet(source_dir).drop(*cols_to_drop)

    unique_degrees_df = calculate_unique_degrees(transaction_df)

    # Split by network so each side can be processed independently.
    df_eth = transaction_df.where(col("network_name") == "ethereum")
    df_btc = transaction_df.where(col("network_name") == "bitcoin")

    # Bitcoin rows are reshaped (send/receive handling) before aggregating.
    df_btc = preprocess_btc_df(df_btc)

    df_btc_aggregations = calculate_aggregations(df_btc)
    df_eth_aggregations = calculate_aggregations(df_eth)

    # Re-attach the network label lost during aggregation.
    df_btc_aggregations = df_btc_aggregations.withColumn("network_name", lit("bitcoin"))
    df_eth_aggregations = df_eth_aggregations.withColumn("network_name", lit("ethereum"))

    aggregations_df = df_btc_aggregations.unionByName(df_eth_aggregations)
    # Outer join keeps addresses present in only one of the two frames;
    # metrics missing on the other side are zero-filled.
    aggregations_df = aggregations_df.join(unique_degrees_df, "address", "outer").na.fill(0)

    # coalesce(1) emits a single output file per run.
    aggregations_df.coalesce(1).write.parquet(output_dir, mode="overwrite", compression="zstd")

    # NOTE(review): stopping a caller-supplied session here makes the session
    # unusable after this call — confirm callers expect that.
    spark.stop()