
Commit 5ce75e5

finished data validation. Artifacts and more logs will be added later
1 parent b9b34a6 commit 5ce75e5

File tree

12 files changed: +335 additions, -22125 deletions


Artifacts/10_15_2025_16_06_42/data_ingestion/feature_store/phishingData.csv

Lines changed: 0 additions & 11056 deletions
This file was deleted.

Artifacts/10_15_2025_16_06_42/data_ingestion/ingested/test.csv

Lines changed: 0 additions & 2212 deletions
This file was deleted.

Artifacts/10_15_2025_16_06_42/data_ingestion/ingested/train.csv

Lines changed: 0 additions & 8845 deletions
This file was deleted.

data_schema/schema.yaml

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
columns:
- having_IP_Address: int64
- URL_Length: int64
- Shortining_Service: int64
- having_At_Symbol: int64
- double_slash_redirecting: int64
- Prefix_Suffix: int64
- having_Sub_Domain: int64
- SSLfinal_State: int64
- Domain_registeration_length: int64
- Favicon: int64
- port: int64
- HTTPS_token: int64
- Request_URL: int64
- URL_of_Anchor: int64
- Links_in_tags: int64
- SFH: int64
- Submitting_to_email: int64
- Abnormal_URL: int64
- Redirect: int64
- on_mouseover: int64
- RightClick: int64
- popUpWidnow: int64
- Iframe: int64
- age_of_domain: int64
- DNSRecord: int64
- web_traffic: int64
- Page_Rank: int64
- Google_Index: int64
- Links_pointing_to_page: int64
- Statistical_report: int64
- Result: int64


numerical_columns:
- having_IP_Address
- URL_Length
- Shortining_Service
- having_At_Symbol
- double_slash_redirecting
- Prefix_Suffix
- having_Sub_Domain
- SSLfinal_State
- Domain_registeration_length
- Favicon
- port
- HTTPS_token
- Request_URL
- URL_of_Anchor
- Links_in_tags
- SFH
- Submitting_to_email
- Abnormal_URL
- Redirect
- on_mouseover
- RightClick
- popUpWidnow
- Iframe
- age_of_domain
- DNSRecord
- web_traffic
- Page_Rank
- Google_Index
- Links_pointing_to_page
- Statistical_report
- Result
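As a quick sanity check on this schema, here is a minimal sketch of loading it and counting the declared columns, assuming PyYAML (the repo's own read_yaml_file helper is not part of this commit's diff):

import yaml

# load the schema the same way the validator will consume it
with open("data_schema/schema.yaml") as f:
    schema = yaml.safe_load(f)

# 'columns' is a list of one-key mappings, e.g. {'having_IP_Address': 'int64'}
print(len(schema["columns"]))             # 31
print(len(schema["numerical_columns"]))   # 31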

main.py

Lines changed: 19 additions & 2 deletions
@@ -1,7 +1,8 @@
 from network_security.components.data_ingestion import DataIngestion
+from network_security.components.data_validation import DataValidation
 from network_security.exceptions.exception import NetworkSecurityException
 from network_security.logging.logger import logging
-from network_security.entity.config_entity import DataIngestionConfig, TrainingPipelineConfig
+from network_security.entity.config_entity import DataIngestionConfig, DataValidationConfig, TrainingPipelineConfig
 import sys
 
@@ -11,12 +12,28 @@
         logging.info("Started data ingestion")
 
         training_pipeline_config = TrainingPipelineConfig()
+
+        # data ingestion configuration
         data_ingestion_config = DataIngestionConfig(training_pipeline_config=training_pipeline_config)
         data_ingestion = DataIngestion(data_ingestion_config=data_ingestion_config)
 
         # initiating the data ingestion process
         logging.info("Initiating data ingestion")
         data_ingestion_artifact = data_ingestion.initiate_data_ingestion()
-        print(data_ingestion_artifact)
+        print(f"Data Ingestion Artifact: \n{data_ingestion_artifact} \n")
+        logging.info("Data ingestion completed")
+
+        # data validation configuration
+        data_validation_config = DataValidationConfig(training_pipeline_config=training_pipeline_config)
+        data_validation = DataValidation(
+            data_ingestion_artifact=data_ingestion_artifact,
+            data_validation_config=data_validation_config,
+        )
+
+        # initiating data validation
+        logging.info("Initiating data validation")
+
+        data_validation_artifact = data_validation.initiate_data_validation()
+        print(f"Data Validation Artifact: \n{data_validation_artifact} \n")
     except Exception as e:
         raise NetworkSecurityException(e, sys)
network_security/components/data_validation.py

Lines changed: 147 additions & 0 deletions
@@ -0,0 +1,147 @@
from network_security.entity.artifact_entity import DataIngestionArtifact, DataValidationArtifact
from network_security.entity.config_entity import DataValidationConfig
from network_security.exceptions.exception import NetworkSecurityException
from network_security.logging.logger import logging
from network_security.utils.main_utils.utils import read_yaml_file, write_yaml_file
from network_security.constants.training_pipeline import SCHEMA_FILE_PATH

from scipy.stats import ks_2samp  # two-sample Kolmogorov-Smirnov test, used to detect data drift
import pandas as pd
import numpy as np
import os, sys


class DataValidation:
    """
    Static methods
    """
    @staticmethod
    def read_data(file_path: str) -> pd.DataFrame:
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    """
    Class methods start here
    """
    def __init__(self,
                 data_ingestion_artifact: DataIngestionArtifact,
                 data_validation_config: DataValidationConfig):
        try:
            self.data_ingestion_artifact = data_ingestion_artifact
            self.data_validation_config = data_validation_config
            self._schema_config = read_yaml_file(SCHEMA_FILE_PATH)
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def initiate_data_validation(self) -> DataValidationArtifact:
        try:
            train_file_path = self.data_ingestion_artifact.train_file_path
            test_file_path = self.data_ingestion_artifact.test_file_path
            logging.info("Reading train and test data for validation")

            # reading train and test data
            train_df = DataValidation.read_data(train_file_path)
            test_df = DataValidation.read_data(test_file_path)

            # validating number of columns in train dataframe
            status = self.validate_number_of_columns(train_df)
            if not status:
                logging.info("Number of columns in train dataframe is not as per schema")

            # validating number of columns in test dataframe
            status = self.validate_number_of_columns(test_df)
            if not status:
                logging.info("Number of columns in test dataframe is not as per schema")

            # checking for data drift
            status = self.detect_data_drift(base_df=train_df, current_df=test_df)
            dir_path = os.path.dirname(self.data_validation_config.valid_train_file_path)
            os.makedirs(dir_path, exist_ok=True)

            # saving the validated train and test data in their respective paths
            train_df.to_csv(self.data_validation_config.valid_train_file_path, index=False, header=True)
            test_df.to_csv(self.data_validation_config.valid_test_file_path, index=False, header=True)

            data_validation_artifact = DataValidationArtifact(
                validation_status=status,
                valid_train_file_path=self.data_ingestion_artifact.train_file_path,
                valid_test_file_path=self.data_ingestion_artifact.test_file_path,
                invalid_train_file_path=None,
                invalid_test_file_path=None,
                drift_report_file_path=self.data_validation_config.drift_report_file_path,
            )

            return data_validation_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def validate_number_of_columns(self, dataframe: pd.DataFrame) -> bool:
        try:
            num_of_cols = len(self._schema_config['columns'])
            logging.info(f"Required number of columns: {num_of_cols}")
            logging.info(f"Dataframe has columns: {dataframe.shape[1]}")
            return dataframe.shape[1] == num_of_cols
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def validate_number_of_numeric_columns(self, dataframe: pd.DataFrame) -> bool:
        try:
            # get expected numerical columns from schema
            numerical_columns = self._schema_config['numerical_columns']
            dataframe_columns = dataframe.columns.tolist()

            # check which numerical columns are present in the dataframe
            present_numerical_cols = [col for col in numerical_columns if col in dataframe_columns]
            missing_numerical_cols = [col for col in numerical_columns if col not in dataframe_columns]

            logging.info(f"Required number of numerical columns: {len(numerical_columns)}")
            logging.info(f"Dataframe has numerical columns: {len(present_numerical_cols)}")

            if missing_numerical_cols:
                logging.warning(f"Missing numerical columns: {missing_numerical_cols}")
                return False

            return True

        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def detect_data_drift(self,
                          base_df: pd.DataFrame,
                          current_df: pd.DataFrame,
                          threshold: float = 0.05) -> bool:
        try:
            status = True
            report = {}
            for col in base_df.columns:
                d1 = base_df[col]
                d2 = current_df[col]

                # KS test: a small p-value means the two samples are unlikely
                # to come from the same distribution, i.e. drift is detected
                ks_result = ks_2samp(d1, d2)
                if threshold <= ks_result.pvalue:
                    is_found = False
                else:
                    is_found = True
                    status = False

                report.update({
                    col: {
                        "p_value": float(ks_result.pvalue),
                        "drift_status": is_found,
                    }
                })

            # creating directory for drift report file path
            drift_report_file_path = self.data_validation_config.drift_report_file_path
            dir_path = os.path.dirname(drift_report_file_path)
            os.makedirs(dir_path, exist_ok=True)

            # writing the drift report to the yaml file
            write_yaml_file(
                file_path=drift_report_file_path,
                content=report,
            )

            # True means no drift was detected in any column
            return status
        except Exception as e:
            raise NetworkSecurityException(e, sys)
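For intuition about the drift check in detect_data_drift above, here is a small self-contained sketch of how ks_2samp behaves on matching vs. shifted samples (synthetic data and sample sizes are made up for illustration; same 0.05 threshold as the method's default):

import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(42)
base = rng.normal(loc=0.0, scale=1.0, size=1000)      # stands in for a train column
same = rng.normal(loc=0.0, scale=1.0, size=1000)      # test column, same distribution
shifted = rng.normal(loc=0.5, scale=1.0, size=1000)   # test column, mean-shifted

threshold = 0.05
for name, current in [("same", same), ("shifted", shifted)]:
    result = ks_2samp(base, current)
    drift_found = result.pvalue < threshold  # low p-value => distributions differ
    print(f"{name}: p_value={result.pvalue:.4f}, drift_status={drift_found}")
# typically prints drift_status=False for 'same' and True for 'shifted'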
network_security/constants/training_pipeline.py

Lines changed: 22 additions & 8 deletions
@@ -1,10 +1,26 @@
+import os, sys
+
 """
 Training Pipeline Constants
 
 This module contains all constants used throughout the training pipeline.
 Constants are organized by functionality with descriptive prefixes.
 """
 
+
+"""
+Defining common constant variables for training pipeline
+"""
+TARGET_COLUMN: str = "Result"
+PIPELINE_NAME: str = "NetworkSecurity"
+ARTIFACT_DIR: str = "Artifacts"
+FILE_NAME: str = "phishingData.csv"
+
+TRAIN_FILE_NAME: str = "train.csv"
+TEST_FILE_NAME: str = "test.csv"
+
+SCHEMA_FILE_PATH: str = os.path.join("data_schema", "schema.yaml")
+
 
 """
 Data Ingestion related constants start with DATA_INGESTION_* prefix
@@ -18,12 +34,10 @@
 
 
 """
-Defining common constant variables for training pipeline
+Defining constants for data validation
 """
-TARGET_COLUMN: str = "Result"
-PIPELINE_NAME: str = "NetworkSecurity"
-ARTIFACT_DIR: str = "Artifacts"
-FILE_NAME: str = "phishingData.csv"
-
-TRAIN_FILE_NAME: str = "train.csv"
-TEST_FILE_NAME: str = "test.csv"
+DATA_VALIDATION_DIR_NAME: str = "data_validation"
+DATA_VALIDATION_VALID_DIR: str = "validated"
+DATA_VALIDATION_INVALID_DIR: str = "invalid"
+DATA_VALIDATION_DRIFT_REPORT_DIR: str = "drift_report"
+DATA_VALIDATION_DRIFT_REPORT_FILE_NAME: str = "report.yaml"

network_security/entity/artifact_entity.py

Lines changed: 10 additions & 1 deletion
@@ -5,4 +5,13 @@
 @dataclass
 class DataIngestionArtifact:
     train_file_path: str
-    test_file_path: str
+    test_file_path: str
+
+@dataclass
+class DataValidationArtifact:
+    validation_status: bool
+    valid_train_file_path: str
+    valid_test_file_path: str
+    invalid_train_file_path: str
+    invalid_test_file_path: str
+    drift_report_file_path: str

network_security/entity/config_entity.py

Lines changed: 38 additions & 1 deletion
@@ -36,4 +36,41 @@ def __init__(self, training_pipeline_config:TrainingPipelineConfig):
         )
         self.train_test_split_ratio: float = training_pipeline.DATA_INGESTION_TRAIN_TEST_SPLIT_RATION
         self.collection_name: str = training_pipeline.DATA_INGESTION_COLLECTION_NAME
-        self.db_name: str = training_pipeline.DATA_INGESTION_DATABASE_NAME
+        self.db_name: str = training_pipeline.DATA_INGESTION_DATABASE_NAME
+
+
+class DataValidationConfig:
+    def __init__(self, training_pipeline_config: TrainingPipelineConfig):
+        self.data_validation_dir: str = os.path.join(
+            training_pipeline_config.artifact_dir,
+            training_pipeline.DATA_VALIDATION_DIR_NAME,
+        )
+        self.valid_data_dir: str = os.path.join(
+            self.data_validation_dir,
+            training_pipeline.DATA_VALIDATION_VALID_DIR,
+        )
+        self.invalid_data_dir: str = os.path.join(
+            self.data_validation_dir,
+            training_pipeline.DATA_VALIDATION_INVALID_DIR,
+        )
+        # validated train/test files live under the "validated" directory
+        self.valid_train_file_path: str = os.path.join(
+            self.valid_data_dir,
+            training_pipeline.TRAIN_FILE_NAME,
+        )
+        self.valid_test_file_path: str = os.path.join(
+            self.valid_data_dir,
+            training_pipeline.TEST_FILE_NAME,
+        )
+        # invalid_data_dir already includes data_validation_dir, so join from it directly
+        self.invalid_train_file_path: str = os.path.join(
+            self.invalid_data_dir,
+            training_pipeline.TRAIN_FILE_NAME,
+        )
+        self.invalid_test_file_path: str = os.path.join(
+            self.invalid_data_dir,
+            training_pipeline.TEST_FILE_NAME,
+        )
+        self.drift_report_file_path: str = os.path.join(
+            self.data_validation_dir,
+            training_pipeline.DATA_VALIDATION_DRIFT_REPORT_DIR,
+            training_pipeline.DATA_VALIDATION_DRIFT_REPORT_FILE_NAME,
+        )
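To make the resulting layout concrete, a minimal sketch of the paths DataValidationConfig produces, assuming artifact_dir resolves to a timestamped folder under Artifacts (the timestamp below is taken from the deleted artifact paths above, purely for illustration):

import os

# hypothetical value; the real TrainingPipelineConfig builds this from ARTIFACT_DIR plus a timestamp
artifact_dir = os.path.join("Artifacts", "10_15_2025_16_06_42")

data_validation_dir = os.path.join(artifact_dir, "data_validation")
print(os.path.join(data_validation_dir, "validated", "train.csv"))
# Artifacts/10_15_2025_16_06_42/data_validation/validated/train.csv (on POSIX)
print(os.path.join(data_validation_dir, "drift_report", "report.yaml"))
# Artifacts/10_15_2025_16_06_42/data_validation/drift_report/report.yaml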

network_security/utils/main_utils/__init__.py

Whitespace-only changes.
