|  | 
|  | 1 | +''' | 
|  | 2 | +Filename: excel_anonymizer.py | 
|  | 3 | +Author: Siddharth Bhatia | 
|  | 4 | +''' | 
|  | 5 | + | 
|  | 6 | +import argparse | 
|  | 7 | +import logging | 
|  | 8 | +import logging.config | 
|  | 9 | + | 
|  | 10 | +import pandas as pd | 
|  | 11 | +from presidio_analyzer import AnalyzerEngine | 
|  | 12 | +from presidio_anonymizer import AnonymizerEngine | 
|  | 13 | +from presidio_anonymizer.entities.engine import OperatorConfig | 
|  | 14 | +from faker import Faker | 
|  | 15 | + | 
def _anonymized_output_path(filename):
    """Return the path the anonymized copy of *filename* is written to.

    The input's extension is dropped and ``-anonymized.xlsx`` is appended,
    e.g. ``reports/tax.xlsx`` -> ``reports/tax-anonymized.xlsx``.
    """
    # Local import keeps this block self-contained (no file-level import).
    import os

    # splitext removes exactly one trailing extension. str.rstrip(".xlsx")
    # would instead strip every trailing '.', 'x', 'l' or 's' character and
    # mangle names such as "tax.xlsx" (-> "ta").
    stem, _extension = os.path.splitext(filename)
    return f"{stem}-anonymized.xlsx"


def main():
    """Anonymize an Excel workbook named on the command line.

    Reads the workbook given by the positional ``filename`` argument, runs
    every non-empty cell through Presidio's analyzer and anonymizer using
    Faker-backed replacement operators, and writes the result next to the
    input as ``<name>-anonymized.xlsx``. With ``-v``/``--verbose`` the
    intermediate data is logged at INFO level.
    """
    # Disable loggers from all imported modules
    logging.config.dictConfig({
        'version': 1,
        'disable_existing_loggers': True,
    })

    # Command-line interface
    parser = argparse.ArgumentParser(
        prog='excel_anonymizer.py',
        description='Anonymizes an Excel file and '
                    'synthesizes new data in its place.',
        epilog='Made by Siddharth Bhatia')
    parser.add_argument('filename', help="your excel file here")
    parser.add_argument('-v', '--verbose', action='store_true')
    args = parser.parse_args()

    filename = args.filename

    if args.verbose:
        logging.basicConfig(format="%(message)s", level=logging.INFO)
        logging.info("Verbose output.")

    def log(message):
        """Log *message* at INFO level, but only in verbose mode."""
        if args.verbose:
            logging.info(message)

    df = pd.read_excel(filename)
    log(df)
    log("")

    # Original column order, reused when the output frame is rebuilt
    columns_ordered_list = df.columns.values.tolist()
    log(f"Columns: {columns_ordered_list}")
    log("")

    # Map (row index, column) -> cell value, in row-major order.
    # df.at is used instead of iterrows() because iterrows() upcasts each
    # row to a single dtype (an int next to a float would become "123.0").
    cell_data = {
        (index, column): df.at[index, column]
        for index in df.index
        for column in df.columns
    }

    log(f"Cell Data: {cell_data}")
    log("")
    log("###")

    # Presidio engines
    analyzer = AnalyzerEngine()
    anonymizer = AnonymizerEngine()

    # Single Faker instance used by every operator below. The lambdas look
    # this variable up when they are called, so it must be the en_IN one.
    fake = Faker(locale="en_IN")

    # Replacement generators per detected entity type. Entity types not
    # listed here are handled by Presidio's default operator.
    fake_operators = {
        "PERSON": OperatorConfig(
            "custom", {"lambda": lambda _: fake.name()}),
        "PHONE_NUMBER": OperatorConfig(
            "custom", {"lambda": lambda _: fake.phone_number()}),
        "LOCATION": OperatorConfig(
            "custom", {"lambda": lambda _: str(fake.country())}),
        "EMAIL_ADDRESS": OperatorConfig(
            "custom", {"lambda": lambda _: fake.email()}),
        "DATE_TIME": OperatorConfig(
            "custom", {"lambda": lambda _: str(fake.date_time())}),
        "CREDIT_CARD": OperatorConfig(
            "custom", {"lambda": lambda _: fake.credit_card_number()}),
        "US_BANK_NUMBER": OperatorConfig(
            "custom", {"lambda": lambda _: fake.credit_card_number()}),
    }

    # Only values are replaced (keys are untouched), so updating the dict
    # while iterating over it is safe.
    for location, value in cell_data.items():
        log(value)

        # Leave empty cells empty instead of writing the text "nan".
        if pd.isna(value):
            log("")
            continue

        text = str(value)

        # Analyze + anonymize the cell text
        analyzer_results = analyzer.analyze(text=text, language="en")
        log(analyzer_results)

        anonymized_results = anonymizer.anonymize(
            text=text,
            analyzer_results=analyzer_results,
            operators=fake_operators,
        )

        log(f"text: {anonymized_results.text}")
        log("")
        cell_data[location] = anonymized_results.text
    log("---")

    # Rebuild a frame with the original row index and column order.
    column_position = {
        name: position for position, name in enumerate(columns_ordered_list)
    }
    rows = {}
    for (index, column), value in cell_data.items():
        row = rows.setdefault(index, [None] * len(columns_ordered_list))
        row[column_position[column]] = value
    anonymized_df = pd.DataFrame.from_dict(
        rows, columns=columns_ordered_list, orient="index")
    log(anonymized_df)

    output_path = _anonymized_output_path(filename)
    anonymized_df.to_excel(
        output_path,
        # Don't save the auto-generated numeric index
        index=False
    )

    print(f"Output generated: {output_path}")
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
0 commit comments