1- from networksecurity .entity .artifact_entity import DataValidationArtifact , DataIngestionArtifact
1+ from networksecurity .entity .artifact_entity import DataValidationArtifact , DataIngestionArtifact
22from networksecurity .entity .config_entity import DataValidationConfig
33from networksecurity .exception .exception import CustomException
4+ from networksecurity .utils .main_utils import read_yaml_file , write_yaml_file
45from networksecurity .logger .customlogger import Custom_Logger
56from networksecurity .constants .training_pipeline import SCHEMA_FILE_PATH
6- from scipy .stats import ks_2samp
7+
8+ from evidently .report import Report
9+ from evidently .metric_preset import DataDriftPreset
10+
711import pandas as pd
8- import numpy as np
912import os , sys
10- import yaml
1113
class DataValidation:
    """Validates ingested train/test data against the project schema and
    generates a data drift report with Evidently.

    Responsibilities:
      * Load the train/test CSVs produced by the data-ingestion stage.
      * Check the column count against the YAML schema at SCHEMA_FILE_PATH.
      * Produce an Evidently drift report (HTML, plus an optional YAML summary).
    """

    def __init__(self, data_validation_config: DataValidationConfig, data_ingestion_artifact: DataIngestionArtifact):
        """Store stage config/artifacts, load the schema, and build a logger.

        Args:
            data_validation_config: Paths/settings for this validation stage.
            data_ingestion_artifact: Output of the ingestion stage (train/test file paths).

        Raises:
            CustomException: If the schema file cannot be read or logger setup fails.
        """
        try:
            self.data_validation_config = data_validation_config
            self.data_ingestion_artifact = data_ingestion_artifact
            # Schema is loaded once here so every validation method shares it.
            self._schema_config = read_yaml_file(SCHEMA_FILE_PATH)
            self.logger = Custom_Logger().get_logger()
        except Exception as e:
            raise CustomException(e, sys) from e

    @staticmethod
    def read_data(file_path: str) -> pd.DataFrame:
        """Reads data from a file and returns it as a DataFrame.

        Args:
            file_path: Path to a CSV file.

        Raises:
            CustomException: Wrapping FileNotFoundError for a missing path,
                or any pandas parsing error.
        """
        try:
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"The file {file_path} does not exist.")
            return pd.read_csv(file_path)
        except Exception as e:
            raise CustomException(e, sys) from e

    def validate_number_of_columns(self, dataframe: pd.DataFrame) -> bool:
        """Validates the number of columns in the DataFrame against the schema.

        NOTE(review): only the column COUNT is compared; names/order are logged
        but not enforced — confirm that is intentional for this pipeline.

        Args:
            dataframe: Frame to validate.

        Returns:
            True when the column count matches the schema, False otherwise.

        Raises:
            CustomException: If the schema config is malformed.
        """
        try:
            expected_columns = self._schema_config['columns']
            actual_columns = dataframe.columns.tolist()

            self.logger.info(f"Expected columns: {expected_columns}")
            self.logger.info(f"Actual columns: {actual_columns}")

            if len(actual_columns) != len(expected_columns):
                self.logger.critical(f"❌ Column mismatch: Expected {len(expected_columns)}, got {len(actual_columns)}")
                return False
            return True
        except Exception as e:
            raise CustomException(e, sys) from e

    def get_data_drift_report(self, train_df: pd.DataFrame, test_df: pd.DataFrame, html_path: str, yaml_path: str = None):
        """Generates a data drift report using Evidently (HTML + optional YAML).

        Args:
            train_df: Reference data (training split).
            test_df: Current data (test split) compared against the reference.
            html_path: Destination for the rendered HTML report.
            yaml_path: Optional destination for a YAML drift summary; skipped
                when None.

        Raises:
            CustomException: If report generation or file writing fails.
        """
        try:
            self.logger.info("📊 Generating data drift report...")

            report = Report(metrics=[DataDriftPreset()])
            report.run(reference_data=train_df, current_data=test_df)

            report.save_html(html_path)
            self.logger.info(f"✅ Drift report saved: {html_path}")

            # If YAML path is given, save the summary
            if yaml_path:
                result_dict = report.as_dict()
                # metrics[0] is the DataDriftPreset's dataset-level result.
                drift_result = result_dict["metrics"][0]["result"]

                summary = {
                    "drift_detected": drift_result["dataset_drift"],
                    "drifted_feature_count": drift_result["number_of_drifted_columns"],
                    "total_feature_count": drift_result["number_of_columns"],
                    "drift_share": drift_result["share_of_drifted_columns"],
                    # .get() because per-column details may be absent from the
                    # dataset-level metric depending on the Evidently version.
                    "drifted_features": drift_result.get("drifted_columns", [])
                }

                write_yaml_file(yaml_path, summary)
                self.logger.info(f"📄 Drift summary saved to YAML: {yaml_path}")

        except Exception as e:
            self.logger.error("❌ Drift report generation failed.")
            raise CustomException(e, sys) from e

    def initiate_data_validation(self) -> DataValidationArtifact:
        """Runs the complete data validation pipeline.

        Returns:
            DataValidationArtifact with validation status and report paths.

        Raises:
            CustomException: On any read, schema-mismatch, or drift-report failure.
        """
        try:
            train_file_path = self.data_ingestion_artifact.train_file_path
            test_file_path = self.data_ingestion_artifact.test_file_path

            train_df = self.read_data(train_file_path)
            test_df = self.read_data(test_file_path)
            self.logger.info("✅ Train and test data loaded successfully.")

            # Validate schema for both
            if not self.validate_number_of_columns(train_df):
                raise ValueError("Train data schema mismatch.")
            if not self.validate_number_of_columns(test_df):
                raise ValueError("Test data schema mismatch.")

            # Drift Report (HTML + YAML)
            drift_html = self.data_validation_config.drift_report_path
            drift_yaml = self.data_validation_config.drift_yaml_path

            self.get_data_drift_report(train_df, test_df, drift_html, drift_yaml)

            return DataValidationArtifact(
                validation_status=True,
                drift_report_file_path=drift_html,
                drift_summary_file_path=drift_yaml
            )

        except Exception as e:
            raise CustomException(e, sys) from e
0 commit comments