Skip to content

Commit 3f01694

Browse files
committed
Added the config entity and artifact entity updates
1 parent 4e82514 commit 3f01694

File tree

4 files changed

+150
-29
lines changed

4 files changed

+150
-29
lines changed
Lines changed: 100 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,110 @@
1-
from networksecurity.entity.artifact_entity import DataValidationArtifact , DataIngestionArtifact
1+
from networksecurity.entity.artifact_entity import DataValidationArtifact, DataIngestionArtifact
22
from networksecurity.entity.config_entity import DataValidationConfig
33
from networksecurity.exception.exception import CustomException
4+
from networksecurity.utils.main_utils import read_yaml_file, write_yaml_file
45
from networksecurity.logger.customlogger import Custom_Logger
56
from networksecurity.constants.training_pipeline import SCHEMA_FILE_PATH
6-
from scipy.stats import ks_2samp
7+
8+
from evidently.report import Report
9+
from evidently.metric_preset import DataDriftPreset
10+
711
import pandas as pd
8-
import numpy as np
912
import os, sys
10-
import yaml
1113

1214
class DataValidation:
1315
def __init__(self, data_validation_config: DataValidationConfig, data_ingestion_artifact: DataIngestionArtifact):
14-
self.data_validation_config = data_validation_config
15-
self.data_ingestion_artifact = data_ingestion_artifact
16-
self._schema_config = read_yaml_file(SCHEMA_FILE_PATH)
17-
self.logger = Custom_Logger().get_logger()
16+
try:
17+
self.data_validation_config = data_validation_config
18+
self.data_ingestion_artifact = data_ingestion_artifact
19+
self._schema_config = read_yaml_file(SCHEMA_FILE_PATH)
20+
self.logger = Custom_Logger().get_logger()
21+
except Exception as e:
22+
raise CustomException(e, sys) from e
23+
24+
@staticmethod
25+
def read_data(file_path: str) -> pd.DataFrame:
26+
"""Reads data from a file and returns it as a DataFrame."""
27+
try:
28+
if not os.path.exists(file_path):
29+
raise FileNotFoundError(f"The file {file_path} does not exist.")
30+
return pd.read_csv(file_path)
31+
except Exception as e:
32+
raise CustomException(e, sys) from e
33+
34+
def validate_number_of_columns(self, dataframe: pd.DataFrame) -> bool:
35+
"""Validates the number of columns in the DataFrame against the schema."""
36+
try:
37+
expected_columns = self._schema_config['columns']
38+
actual_columns = dataframe.columns.tolist()
39+
40+
self.logger.info(f"Expected columns: {expected_columns}")
41+
self.logger.info(f"Actual columns: {actual_columns}")
42+
43+
if len(actual_columns) != len(expected_columns):
44+
self.logger.critical(f"❌ Column mismatch: Expected {len(expected_columns)}, got {len(actual_columns)}")
45+
return False
46+
return True
47+
except Exception as e:
48+
raise CustomException(e, sys) from e
49+
50+
def get_data_drift_report(self, train_df: pd.DataFrame, test_df: pd.DataFrame, html_path: str, yaml_path: str = None):
51+
"""Generates a data drift report using Evidently (HTML + optional YAML)."""
52+
try:
53+
self.logger.info("📊 Generating data drift report...")
54+
55+
report = Report(metrics=[DataDriftPreset()])
56+
report.run(reference_data=train_df, current_data=test_df)
57+
58+
report.save_html(html_path)
59+
self.logger.info(f"✅ Drift report saved: {html_path}")
60+
61+
# If YAML path is given, save the summary
62+
if yaml_path:
63+
result_dict = report.as_dict()
64+
drift_result = result_dict["metrics"][0]["result"]
65+
66+
summary = {
67+
"drift_detected": drift_result["dataset_drift"],
68+
"drifted_feature_count": drift_result["number_of_drifted_columns"],
69+
"total_feature_count": drift_result["number_of_columns"],
70+
"drift_share": drift_result["share_of_drifted_columns"],
71+
"drifted_features": drift_result.get("drifted_columns", [])
72+
}
73+
74+
write_yaml_file(yaml_path, summary)
75+
self.logger.info(f"📄 Drift summary saved to YAML: {yaml_path}")
76+
77+
except Exception as e:
78+
self.logger.error("❌ Drift report generation failed.")
79+
raise CustomException(e, sys) from e
80+
81+
def initiate_data_validation(self) -> DataValidationArtifact:
82+
"""Runs the complete data validation pipeline."""
83+
try:
84+
train_file_path = self.data_ingestion_artifact.train_file_path
85+
test_file_path = self.data_ingestion_artifact.test_file_path
86+
87+
train_df = self.read_data(train_file_path)
88+
test_df = self.read_data(test_file_path)
89+
self.logger.info("✅ Train and test data loaded successfully.")
90+
91+
# Validate schema for both
92+
if not self.validate_number_of_columns(train_df):
93+
raise ValueError("Train data schema mismatch.")
94+
if not self.validate_number_of_columns(test_df):
95+
raise ValueError("Test data schema mismatch.")
96+
97+
# Drift Report (HTML + YAML)
98+
drift_html = self.data_validation_config.drift_report_path
99+
drift_yaml = self.data_validation_config.drift_yaml_path
100+
101+
self.get_data_drift_report(train_df, test_df, drift_html, drift_yaml)
102+
103+
return DataValidationArtifact(
104+
validation_status=True,
105+
drift_report_file_path=drift_html,
106+
drift_summary_file_path=drift_yaml
107+
)
18108

19-
109+
except Exception as e:
110+
raise CustomException(e, sys) from e

networksecurity/entity/artifact_entity.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,15 @@
22

33
@dataclass
class DataIngestionArtifact:
    """Output of the data-ingestion stage: locations of the split CSV files."""
    # Path to the ingested training split.
    # NOTE(review): field is named `trained_file_path`; downstream code that
    # reads `train_file_path` will fail — confirm consumers use this exact name.
    trained_file_path: str
    # Path to the ingested test split.
    test_file_path: str
77

88
@dataclass
class DataValidationArtifact:
    """Output of the data-validation stage."""
    # True when the validated data passed all checks.
    validation_status: bool
    # Locations of data that passed validation.
    valid_train_file_path: str
    valid_test_file_path: str
    # Locations of data that failed validation.
    invalid_train_file_path: str
    invalid_test_file_path: str
    # Drift report outputs: the HTML report and its YAML summary.
    drift_report_file_path: str
    drift_summary_file_path: str

networksecurity/entity/config_entity.py

Lines changed: 40 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -37,18 +37,46 @@ class DataValidationConfigEntity:
3737

3838
def __post_init__(self):
3939
self.data_validation_dir: str = os.path.join(
40-
self.training_pipeline_config.artifact_dir, training_pipeline.DATA_VALIDATION_DIR_NAME
41-
)
42-
self.valid_data_dir: str = os.path.join(self.data_validation_dir, training_pipeline.DATA_VALIDATION_VALID_DIR)
43-
self.invalid_data_dir: str = os.path.join(self.data_validation_dir, training_pipeline.DATA_VALIDATION_INVALID_DIR)
44-
self.valid_train_file_path : str = os.path.join(self.valid_data_dir, training_pipeline.TRAIN_FILE_NAME)
45-
self.valid_test_file_path : str = os.path.join(self.valid_data_dir, training_pipeline.TEST_FILE_NAME)
46-
self.invalid_train_file_path : str = os.path.join(self.invalid_data_dir, training_pipeline.TRAIN_FILE_NAME)
47-
self.invalid_test_file_path : str = os.path.join(self.invalid_data_dir, training_pipeline.TEST_FILE_NAME)
48-
# New: Drift report paths
49-
self.drift_report_file_path : str = os.path.join(
40+
self.training_pipeline_config.artifact_dir,
41+
training_pipeline.DATA_VALIDATION_DIR_NAME
42+
)
43+
44+
self.valid_data_dir: str = os.path.join(
45+
self.data_validation_dir,
46+
training_pipeline.DATA_VALIDATION_VALID_DIR
47+
)
48+
self.invalid_data_dir: str = os.path.join(
49+
self.data_validation_dir,
50+
training_pipeline.DATA_VALIDATION_INVALID_DIR
51+
)
52+
53+
self.valid_train_file_path: str = os.path.join(
54+
self.valid_data_dir,
55+
training_pipeline.TRAIN_FILE_NAME
56+
)
57+
self.valid_test_file_path: str = os.path.join(
58+
self.valid_data_dir,
59+
training_pipeline.TEST_FILE_NAME
60+
)
61+
self.invalid_train_file_path: str = os.path.join(
62+
self.invalid_data_dir,
63+
training_pipeline.TRAIN_FILE_NAME
64+
)
65+
self.invalid_test_file_path: str = os.path.join(
66+
self.invalid_data_dir,
67+
training_pipeline.TEST_FILE_NAME
68+
)
69+
70+
# Drift Report HTML path
71+
self.drift_report_file_path: str = os.path.join(
5072
self.data_validation_dir,
51-
training_pipeline.DATA_VALIDATION_DRIFT_REPORT_DIR,
73+
training_pipeline.DATA_VALIDATION_DRIFT_REPORT_DIR,
5274
training_pipeline.DATA_VALIDATION_DRIFT_REPORT_FILE_NAME
5375
)
54-
76+
77+
# ✅ Drift Report YAML Summary Path (NEW)
78+
self.drift_yaml_path: str = os.path.join(
79+
self.data_validation_dir,
80+
training_pipeline.DATA_VALIDATION_DRIFT_REPORT_DIR,
81+
training_pipeline.DATA_VALIDATION_DRIFT_SUMMARY_FILE_NAME
82+
)

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ numpy==1.26
55
scikit-learn
66
matplotlib
77
seaborn
8+
evidently
89
setuptools
910
scipy
1011
dvc

0 commit comments

Comments
 (0)