Skip to content

Commit ab3df95

Browse files
authored
Features/long awaited changes (#44)
* Isolation forest and unscaled features * import fix
1 parent 8cd24f3 commit ab3df95

File tree

10 files changed

+872
-33
lines changed

10 files changed

+872
-33
lines changed

infrastructure/main.tf

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@ module "glue_catalog" {
1212
glue_role_arn = module.iam.glue_role_arn
1313
bdp_cleaned_transactions_bucket = module.s3_buckets.bdp_cleaned_transactions_bucket
1414
bdp_wallets_aggregations_bucket = module.s3_buckets.bdp_wallets_aggregations_bucket
15-
bdp_features_bucket = module.s3_buckets.bdp_features_bucket
15+
bdp_scaled_features_bucket = module.s3_buckets.bdp_scaled_features_bucket
16+
bdp_unscaled_features_bucket = module.s3_buckets.bdp_unscaled_features_bucket
1617
}
1718

1819
module "iam_github_role" {

infrastructure/modules/glue_catalog/main.tf

Lines changed: 320 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -395,9 +395,9 @@ resource "aws_glue_catalog_table_optimizer" "wallets_aggregations_compaction_opt
395395
}
396396
}
397397

398-
/*resource "aws_glue_catalog_table" "features" {
398+
/*resource "aws_glue_catalog_table" "scaled_features" {
399399
database_name = aws_glue_catalog_database.bdp_db.name
400-
name = "features"
400+
name = "scaled_features"
401401
402402
table_type = "EXTERNAL_TABLE"
403403
open_table_format_input {
@@ -418,7 +418,7 @@ resource "aws_glue_catalog_table_optimizer" "wallets_aggregations_compaction_opt
418418
}
419419
420420
storage_descriptor {
421-
location = "s3://${var.bdp_features_bucket}"
421+
location = "s3://${var.bdp_scaled_features_bucket}"
422422
input_format = "org.apache.hadoop.mapred.FileInputFormat"
423423
output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
424424
compressed = true
@@ -677,10 +677,10 @@ resource "aws_glue_catalog_table_optimizer" "wallets_aggregations_compaction_opt
677677
}*/
678678

679679

680-
resource "aws_glue_catalog_table_optimizer" "features_orphan_files_deletion_optimizer" {
680+
resource "aws_glue_catalog_table_optimizer" "scaled_features_orphan_files_deletion_optimizer" {
681681
catalog_id = "982534349340"
682682
database_name = aws_glue_catalog_database.bdp_db.name
683-
table_name = "features"
683+
table_name = "scaled_features"
684684
type = "orphan_file_deletion"
685685

686686
configuration {
@@ -690,16 +690,328 @@ resource "aws_glue_catalog_table_optimizer" "features_orphan_files_deletion_opti
690690
orphan_file_deletion_configuration {
691691
iceberg_configuration {
692692
orphan_file_retention_period_in_days = 2
693-
location = "s3://${var.bdp_features_bucket}"
693+
location = "s3://${var.bdp_scaled_features_bucket}"
694694
}
695695
}
696696
}
697697
}
698698

699-
resource "aws_glue_catalog_table_optimizer" "features_compaction_optimizer" {
699+
resource "aws_glue_catalog_table_optimizer" "scaled_features_compaction_optimizer" {
700700
catalog_id = "982534349340"
701701
database_name = aws_glue_catalog_database.bdp_db.name
702-
table_name = "features"
702+
table_name = "scaled_features"
703+
type = "compaction"
704+
705+
configuration {
706+
role_arn = var.glue_role_arn
707+
enabled = true
708+
}
709+
}
710+
711+
/*resource "aws_glue_catalog_table" "unscaled_features" {
712+
database_name = aws_glue_catalog_database.bdp_db.name
713+
name = "unscaled_features"
714+
715+
table_type = "EXTERNAL_TABLE"
716+
open_table_format_input {
717+
iceberg_input {
718+
metadata_operation = "CREATE"
719+
}
720+
}
721+
722+
// Commented out because of https://github.com/hashicorp/terraform-provider-aws/issues/36531
723+
partition_keys {
724+
name = "network_name"
725+
type = "boolean"
726+
}
727+
728+
parameters = {
729+
"write.format.default" = "parquet",
730+
"write.parquet.compression-codec" = "zstd"
731+
}
732+
733+
storage_descriptor {
734+
location = "s3://${var.bdp_unscaled_features_bucket}"
735+
input_format = "org.apache.hadoop.mapred.FileInputFormat"
736+
output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
737+
compressed = true
738+
739+
ser_de_info {
740+
name = "features_serde"
741+
serialization_library = "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"
742+
}
743+
744+
columns {
745+
name = "block_timestamp"
746+
type = "bigint"
747+
}
748+
columns {
749+
name = "block_number"
750+
type = "bigint"
751+
}
752+
columns {
753+
name = "transaction_index"
754+
type = "bigint"
755+
}
756+
columns {
757+
name = "fee"
758+
type = "float"
759+
}
760+
columns {
761+
name = "total_transferred_value"
762+
type = "float"
763+
}
764+
columns {
765+
name = "total_input_value"
766+
type = "float"
767+
}
768+
columns {
769+
name = "sent_value"
770+
type = "float"
771+
}
772+
columns {
773+
name = "received_value"
774+
type = "float"
775+
}
776+
columns {
777+
name = "network_name"
778+
type = "boolean"
779+
}
780+
columns {
781+
name = "avg_sent_value"
782+
type = "float"
783+
}
784+
columns {
785+
name = "avg_received_value"
786+
type = "float"
787+
}
788+
columns {
789+
name = "avg_total_value_for_sender"
790+
type = "float"
791+
}
792+
columns {
793+
name = "avg_total_value_for_receiver"
794+
type = "float"
795+
}
796+
columns {
797+
name = "sum_sent_value"
798+
type = "float"
799+
}
800+
columns {
801+
name = "sum_received_value"
802+
type = "float"
803+
}
804+
columns {
805+
name = "sum_total_value_for_sender"
806+
type = "float"
807+
}
808+
columns {
809+
name = "sum_total_value_for_receiver"
810+
type = "float"
811+
}
812+
columns {
813+
name = "min_sent_value"
814+
type = "float"
815+
}
816+
columns {
817+
name = "min_received_value"
818+
type = "float"
819+
}
820+
columns {
821+
name = "min_total_value_for_sender"
822+
type = "float"
823+
}
824+
columns {
825+
name = "min_total_value_for_receiver"
826+
type = "float"
827+
}
828+
columns {
829+
name = "max_sent_value"
830+
type = "float"
831+
}
832+
columns {
833+
name = "max_received_value"
834+
type = "float"
835+
}
836+
columns {
837+
name = "max_total_value_for_sender"
838+
type = "float"
839+
}
840+
columns {
841+
name = "max_total_value_for_receiver"
842+
type = "float"
843+
}
844+
columns {
845+
name = "median_sent_value"
846+
type = "float"
847+
}
848+
columns {
849+
name = "median_received_value"
850+
type = "float"
851+
}
852+
columns {
853+
name = "median_total_value_for_sender"
854+
type = "float"
855+
}
856+
columns {
857+
name = "median_total_value_for_receiver"
858+
type = "float"
859+
}
860+
columns {
861+
name = "mode_sent_value"
862+
type = "float"
863+
}
864+
columns {
865+
name = "mode_received_value"
866+
type = "float"
867+
}
868+
columns {
869+
name = "mode_total_value_for_sender"
870+
type = "float"
871+
}
872+
columns {
873+
name = "mode_total_value_for_receiver"
874+
type = "float"
875+
}
876+
columns {
877+
name = "stddev_sent_value"
878+
type = "float"
879+
}
880+
columns {
881+
name = "stddev_received_value"
882+
type = "float"
883+
}
884+
columns {
885+
name = "stddev_total_value_for_sender"
886+
type = "float"
887+
}
888+
columns {
889+
name = "stddev_total_value_for_receiver"
890+
type = "float"
891+
}
892+
columns {
893+
name = "num_sent_transactions"
894+
type = "bigint"
895+
}
896+
columns {
897+
name = "num_received_transactions"
898+
type = "bigint"
899+
}
900+
columns {
901+
name = "avg_time_between_sent_transactions"
902+
type = "float"
903+
}
904+
columns {
905+
name = "avg_time_between_received_transactions"
906+
type = "float"
907+
}
908+
columns {
909+
name = "avg_outgoing_speed_count"
910+
type = "float"
911+
}
912+
columns {
913+
name = "avg_incoming_speed_count"
914+
type = "float"
915+
}
916+
columns {
917+
name = "avg_outgoing_speed_value"
918+
type = "float"
919+
}
920+
columns {
921+
name = "avg_incoming_speed_value"
922+
type = "float"
923+
}
924+
columns {
925+
name = "avg_outgoing_acceleration_count"
926+
type = "float"
927+
}
928+
columns {
929+
name = "avg_incoming_acceleration_count"
930+
type = "float"
931+
}
932+
columns {
933+
name = "avg_outgoing_acceleration_value"
934+
type = "float"
935+
}
936+
columns {
937+
name = "avg_incoming_acceleration_value"
938+
type = "float"
939+
}
940+
columns {
941+
name = "avg_fee_paid"
942+
type = "float"
943+
}
944+
columns {
945+
name = "total_fee_paid"
946+
type = "float"
947+
}
948+
columns {
949+
name = "min_fee_paid"
950+
type = "float"
951+
}
952+
columns {
953+
name = "max_fee_paid"
954+
type = "float"
955+
}
956+
columns {
957+
name = "activity_duration_for_sender"
958+
type = "bigint"
959+
}
960+
columns {
961+
name = "first_transaction_timestamp_for_sender"
962+
type = "bigint"
963+
}
964+
columns {
965+
name = "last_transaction_timestamp_for_sender"
966+
type = "bigint"
967+
}
968+
columns {
969+
name = "activity_duration_for_receiver"
970+
type = "bigint"
971+
}
972+
columns {
973+
name = "first_transaction_timestamp_for_receiver"
974+
type = "bigint"
975+
}
976+
columns {
977+
name = "last_transaction_timestamp_for_receiver"
978+
type = "bigint"
979+
}
980+
columns {
981+
name = "unique_out_degree"
982+
type = "bigint"
983+
}
984+
columns {
985+
name = "unique_in_degree"
986+
type = "bigint"
987+
}
988+
989+
}
990+
}*/
991+
992+
resource "aws_glue_catalog_table_optimizer" "unscaled_features_orphan_files_deletion_optimizer" {
993+
catalog_id = "982534349340"
994+
database_name = aws_glue_catalog_database.bdp_db.name
995+
table_name = "unscaled_features"
996+
type = "orphan_file_deletion"
997+
998+
configuration {
999+
role_arn = var.glue_role_arn
1000+
enabled = true
1001+
1002+
orphan_file_deletion_configuration {
1003+
iceberg_configuration {
1004+
orphan_file_retention_period_in_days = 2
1005+
location = "s3://${var.bdp_unscaled_features_bucket}"
1006+
}
1007+
}
1008+
}
1009+
}
1010+
1011+
resource "aws_glue_catalog_table_optimizer" "unscaled_features_compaction_optimizer" {
1012+
catalog_id = "982534349340"
1013+
database_name = aws_glue_catalog_database.bdp_db.name
1014+
table_name = "unscaled_features"
7031015
type = "compaction"
7041016

7051017
configuration {

infrastructure/modules/glue_catalog/variables.tf

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,14 @@ variable "bdp_wallets_aggregations_bucket" {
88
description = "Wallets aggregations bucket name"
99
}
1010

11-
variable "bdp_features_bucket" {
11+
variable "bdp_scaled_features_bucket" {
1212
type = string
13-
description = "Features bucket name"
13+
description = "Scaled features bucket name"
14+
}
15+
16+
variable "bdp_unscaled_features_bucket" {
17+
type = string
18+
description = "Unscaled features bucket name"
1419
}
1520

1621
variable "glue_role_arn" {

0 commit comments

Comments
 (0)