Income_Prediction_Machine_Learning/data.py at master · SeanFitzpatrick0/Income_Prediction_Machine_Learning · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import os
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler

# Absolute path of the directory the cleaned CSV files are written to
# (resolved against the working directory when this module is imported).
CLEAN_DATA_DIR = os.path.abspath('data/clean_data')
# Absolute path of the directory the raw input CSV files are read from.
DATA_DIR = os.path.abspath('data')

# Module-level cache of target-encoding mappings. It is filled by the first
# call to get_target_mappings() and returned as-is by every later call.
target_encodings = {}


def get_target_mappings(df, target_column, encodeing_columns, mean_smoothing_weight=0.3):
    ''' Builds (once) and returns the target-encoding mappings.
        df: The training data.
        target_column: The name of the target column.
        encodeing_columns: List of column names to target encode.
        mean_smoothing_weight: Weight given to the overall target mean when it is
            blended with each category's own target mean (0 for no smoothing).
        Returns: The module-level dictionary of mappings. The entry stored under
            target_column is the overall target mean; every other entry maps the
            categories of that column to their smoothed target mean.
            (e.g. { 'Country' : { 'Ireland': X, ...}, ...})

        The mappings are cached in the module-level target_encodings dictionary:
        only the first call computes anything, later calls return the cached
        mappings and ignore their arguments.
    '''
    # Already populated by an earlier call: hand the cached mappings back.
    if target_encodings:
        return target_encodings

    # Overall mean, used both for smoothing and as the fallback encoding
    target_encodings[target_column] = df[target_column].mean()
    category_weight = 1 - mean_smoothing_weight

    for name in encodeing_columns:
        per_category_mean = df.groupby(name)[target_column].mean()
        # Weighted blend of the category mean and the overall mean
        target_encodings[name] = (per_category_mean * category_weight) + (
            target_encodings[target_column] * mean_smoothing_weight)

    return target_encodings


def get_clean_data(filepath, target_column, remove_columns=None, target_encode=None, one_hot_encode=None, standardize=True):
    ''' Loads a csv file and cleans it
        filepath: Filepath to load data.
        target_column: The name of the target column.
        remove_columns: List of column names to remove (None for no columns).
        target_encode: List of column names to target encode (None for no columns).
        one_hot_encode: List of column names to one hot encode (None for no columns).
        standardize: Bool, standardizes every column except the target column.
        Return: DataFrame containing cleaned data
        Raises: ValueError if target_column is not in the file, if it is listed in
            remove_columns or target_encode, or if a column to remove is also
            listed for target or one hot encoding.

        The loaded file is expected to contain the columns 'Year of Record', 'Age',
        'Gender', 'University Degree' and 'Hair Color'; their missing values are
        filled in before any encoding happens.
    '''
    # None stands for "no columns", so no mutable list is shared between calls
    remove_columns = [] if remove_columns is None else remove_columns
    target_encode = [] if target_encode is None else target_encode
    one_hot_encode = [] if one_hot_encode is None else one_hot_encode

    # Load data
    df = pd.read_csv(filepath)

    # Validate parameters
    if target_column not in df:
        raise ValueError('Target column ({}) is not a column in the dataset at ({})'.format(
            target_column, filepath))
    if target_column in remove_columns:
        raise ValueError(
            'You can\'t remove your target column ({})'.format(target_column))
    if target_column in target_encode:
        raise ValueError(
            'You can\'t target encode your target column ({})'.format(target_column))
    for column in remove_columns:
        if column in target_encode:
            raise ValueError(
                'You can\'t target encode a column you will remove ({})'.format(column))
        if column in one_hot_encode:
            raise ValueError(
                'You can\'t one hot encode a column you will remove ({})'.format(column))

    # Fix samples with missing values
    # Replace missing year & age with medians, then store them as integers
    for column in ('Year of Record', 'Age'):
        df[column] = df[column].fillna(df[column].median()).astype(int)

    # Fix gender values
    # (np.nan rather than np.NaN: the upper-case alias was removed in NumPy 2.0)
    df['Gender'] = df['Gender'].replace(['0', 'unknown', np.nan], 'other')

    # Fix university values
    df['University Degree'] = df['University Degree'].replace(
        ['0', np.nan], 'No')

    # Fix hair color values
    df['Hair Color'] = df['Hair Color'].replace(['0', np.nan], 'Unknown')

    # Target encode classes
    if target_encode:
        target_mappings = get_target_mappings(df, target_column, target_encode)

        for column in target_encode:
            # Categories without a mapping fall back to the overall target mean
            df[column] = df[column].map(target_mappings[column]).fillna(
                target_mappings[target_column])

    # One hot encode class labels (the dummy columns are appended at the end)
    for column in one_hot_encode:
        dummies = pd.get_dummies(df[column], prefix=column)
        df = pd.concat([df.drop(columns=column), dummies], axis=1)

    # Remove un-wanted features
    for column in remove_columns:
        del df[column]

    # Standardize data
    if standardize:
        # NOTE(review): the scaler is fit on whichever file was loaded, so the
        # training and submission files are each scaled with their own
        # statistics -- confirm this is intended.
        scale_columns = [
            column for column in df.columns if column != target_column]
        df[scale_columns] = StandardScaler().fit_transform(df[scale_columns])

    return df


def create_submission(model, submission_data, filepath=None):
    ''' Creates a submission csv file
        model: Model used to make predictions (its predict method is called with
            the feature values).
        submission_data: Dataframe with cleaned submission data; it must contain
            the 'Income in EUR' column, which is dropped before predicting.
        filepath: The filepath to save submission. When empty, the file is saved
            to data/submission/submission.csv under the working directory.

        The submission template is read from
        data/tcd ml 2019-20 income prediction submission file.csv under the
        working directory and its 'Income' column is filled with the predictions.
    '''
    # make predictions
    target_variable = 'Income in EUR'
    X = submission_data.drop(columns=target_variable).values
    y_pred = model.predict(X)

    # create submission file
    # os.path.join keeps the paths valid on every platform (a literal backslash
    # is only a separator on Windows)
    submission_template_filepath = os.path.realpath(os.path.join(
        'data', 'tcd ml 2019-20 income prediction submission file.csv'))

    submission_save_filepath = filepath if filepath else os.path.realpath(
        os.path.join('data', 'submission', 'submission.csv'))

    # Make sure the output directory exists before writing to it
    save_directory = os.path.dirname(submission_save_filepath)
    if save_directory:
        os.makedirs(save_directory, exist_ok=True)

    submission_template = pd.read_csv(submission_template_filepath)
    # Flatten so that (n, 1) shaped predictions fit into a single column
    submission_template['Income'] = pd.Series(np.asarray(y_pred).flatten())

    submission_template.to_csv(submission_save_filepath, index=False)
    print(f'Submission written to {submission_save_filepath}')


def remove_outliers(X, y, z_score_threshold=20):
    ''' Drops the samples whose target value is an outlier
        X: numpy array of feature data.
        y: numpy array of target data.
        z_score_threshold: Samples are kept only when the absolute z-score of
            their y value is below this threshold.
        Returns: The filtered (X, y) pair. The number of removed samples is printed.
    '''
    absolute_z = np.abs(stats.zscore(y))
    keep_mask = absolute_z < z_score_threshold

    kept_X = X[keep_mask]
    kept_y = y[keep_mask]

    removed = absolute_z[absolute_z >= z_score_threshold]
    print(f'{len(removed)} outliers were removed.')
    return kept_X, kept_y


if __name__ == '__main__':
    # Script entry point: cleans the raw training and submission csv files found
    # in DATA_DIR and writes the results to CLEAN_DATA_DIR.
    print('--- Creating Clean Data ---')

    target_variable = 'Income in EUR'
    cleaning_options = dict(
        remove_columns=['Instance', 'Hair Color', 'Wears Glasses'],
        target_encode=['Country', 'Profession'],
        one_hot_encode=['Gender', 'University Degree'],
    )

    # (raw file name, clean file name, message printed once written).
    # The training file comes first: the target-encoding mappings are computed
    # on the first call to get_target_mappings and reused afterwards.
    datasets = [
        ('tcd ml 2019-20 income prediction training (with labels).csv',
         'clean_training_data.csv',
         'Clean training data written to: {}'),
        ('tcd ml 2019-20 income prediction test (without labels).csv',
         'clean_submission_data.csv',
         'Clean submission data written to: {}'),
    ]

    for raw_name, clean_name, written_message in datasets:
        raw_path = os.path.join(DATA_DIR, raw_name)
        cleaned = get_clean_data(raw_path, target_variable, **cleaning_options)

        clean_path = os.path.join(CLEAN_DATA_DIR, clean_name)
        cleaned.to_csv(clean_path)
        print(written_message.format(clean_path))