diff --git a/json/dell.json b/json/dell.json new file mode 100644 index 0000000..4e5eb6e --- /dev/null +++ b/json/dell.json @@ -0,0 +1,424 @@ +{ + "meta": { + "company": "Dell", + "year": 2010 + }, + "section_a": { "reporting_type": "Consolidated Report" }, + "section_b": { + "parent_company": "DELL INC - (DELL USA) - ROUND ROCK", + "address": { + "street_address": "401 DELL WAY", + "city": "ROUND ROCK", + "state": "TX", + "zip_code": "78682" + }, + "previous_year_report": true + }, + "section_c": { + "at_least_100": true, + "affiliated": false, + "DUNS": true + }, + "section_d": [ + { + "job_category": "Executive/Senior Level Officials and Managers", + "data": [ + { + "race": "Hispanic or Latino", + "men": 0, + "women": 0 + }, + { + "race": "White", + "men": 98, + "women": 28 + }, + { + "race": "Black or African American", + "men": 1, + "women": 0 + }, + { + "race": "Native Hawaiian or Other Pacific Islander", + "men": 0, + "women": 0 + }, + { + "race": "Asian", + "men": 8, + "women": 2 + }, + { + "race": "American Indian or Alaska Native", + "men": 0, + "women": 0 + }, + { + "race": "Two or More Races", + "men": 0, + "women": 0 + } + ] + }, + { + "job_category": "First/Mid Level Officials and Managers", + "data": [ + { + "race": "Hispanic or Latino", + "men": 170, + "women": 90 + }, + { + "race": "White", + "men": 2604, + "women": 975 + }, + { + "race": "Black or African American", + "men": 132, + "women": 78 + }, + { + "race": "Native Hawaiian or Other Pacific Islander", + "men": 1, + "women": 1 + }, + { + "race": "Asian", + "men": 244, + "women": 78 + }, + { + "race": "American Indian or Alaska Native", + "men": 8, + "women": 6 + }, + { + "race": "Two or More Races", + "men": 16, + "women": 3 + } + ] + }, + { + "job_category": "Professionals", + "data": [ + { + "race": "Hispanic or Latino", + "men": 904, + "women": 482 + }, + { + "race": "White", + "men": 8164, + "women": 3562 + }, + { + "race": "Black or African American", + "men": 683, + "women": 434 + }, + { 
+ "race": "Native Hawaiian or Other Pacific Islander", + "men": 17, + "women": 7 + }, + { + "race": "Asian", + "men": 1884, + "women": 780 + }, + { + "race": "American Indian or Alaska Native", + "men": 51, + "women": 19 + }, + { + "race": "Two or More Races", + "men": 98, + "women": 41 + } + ] + }, + { + "job_category": "Technicians", + "data": [ + { + "race": "Hispanic or Latino", + "men": 311, + "women": 57 + }, + { + "race": "White", + "men": 2186, + "women": 303 + }, + { + "race": "Black or African American", + "men": 341, + "women": 104 + }, + { + "race": "Native Hawaiian or Other Pacific Islander", + "men": 7, + "women": 0 + }, + { + "race": "Asian", + "men": 201, + "women": 29 + }, + { + "race": "American Indian or Alaska Native", + "men": 33, + "women": 10 + }, + { + "race": "Two or More Races", + "men": 49, + "women": 7 + } + ] + }, + { + "job_category": "Sales Workers", + "data": [ + { + "race": "Hispanic or Latino", + "men": 287, + "women": 80 + }, + { + "race": "White", + "men": 3318, + "women": 964 + }, + { + "race": "Black or African American", + "men": 226, + "women": 71 + }, + { + "race": "Native Hawaiian or Other Pacific Islander", + "men": 6, + "women": 1 + }, + { + "race": "Asian", + "men": 145, + "women": 39 + }, + { + "race": "American Indian or Alaska Native", + "men": 24, + "women": 9 + }, + { + "race": "Two or More Races", + "men": 47, + "women": 17 + } + ] + }, + { + "job_category": "Administrative Support Workers", + "data": [ + { + "race": "Hispanic or Latino", + "men": 232, + "women": 470 + }, + { + "race": "White", + "men": 720, + "women": 2203 + }, + { + "race": "Black or African American", + "men": 192, + "women": 609 + }, + { + "race": "Native Hawaiian or Other Pacific Islander", + "men": 2, + "women": 17 + }, + { + "race": "Asian", + "men": 81, + "women": 180 + }, + { + "race": "American Indian or Alaska Native", + "men": 11, + "women": 36 + }, + { + "race": "Two or More Races", + "men": 9, + "women": 31 + } + ] + }, + { + 
"job_category": "Craft Workers", + "data": [ + { + "race": "Hispanic or Latino", + "men": 1, + "women": 0 + }, + { + "race": "White", + "men": 0, + "women": 0 + }, + { + "race": "Black or African American", + "men": 0, + "women": 1 + }, + { + "race": "Native Hawaiian or Other Pacific Islander", + "men": 0, + "women": 0 + }, + { + "race": "Asian", + "men": 0, + "women": 0 + }, + { + "race": "American Indian or Alaska Native", + "men": 0, + "women": 0 + }, + { + "race": "Two or More Races", + "men": 0, + "women": 0 + } + ] + }, + { + "job_category": "Operatives", + "data": [ + { + "race": "Hispanic or Latino", + "men": 75, + "women": 40 + }, + { + "race": "White", + "men": 104, + "women": 67 + }, + { + "race": "Black or African American", + "men": 79, + "women": 88 + }, + { + "race": "Native Hawaiian or Other Pacific Islander", + "men": 0, + "women": 0 + }, + { + "race": "Asian", + "men": 28, + "women": 15 + }, + { + "race": "American Indian or Alaska Native", + "men": 1, + "women": 0 + }, + { + "race": "Two or More Races", + "men": 0, + "women": 0 + } + ] + }, + { + "job_category": "Laborers and Helpers", + "data": [ + { + "race": "Hispanic or Latino", + "men": 0, + "women": 0 + }, + { + "race": "White", + "men": 0, + "women": 0 + }, + { + "race": "Black or African American", + "men": 0, + "women": 0 + }, + { + "race": "Native Hawaiian or Other Pacific Islander", + "men": 0, + "women": 0 + }, + { + "race": "Asian", + "men": 0, + "women": 0 + }, + { + "race": "American Indian or Alaska Native", + "men": 0, + "women": 0 + }, + { + "race": "Two or More Races", + "men": 0, + "women": 0 + } + ] + }, + { + "job_category": "Service Workers", + "data": [ + { + "race": "Hispanic or Latino", + "men": 6, + "women": 9 + }, + { + "race": "White", + "men": 45, + "women": 65 + }, + { + "race": "Black or African American", + "men": 7, + "women": 15 + }, + { + "race": "Native Hawaiian or Other Pacific Islander", + "men": 0, + "women": 0 + }, + { + "race": "Asian", + "men": 2, + 
"women": 1 + }, + { + "race": "American Indian or Alaska Native", + "men": 0, + "women": 0 + }, + { + "race": "Two or More Races", + "men": 1, + "women": 0 + } + ] + } + ] +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..b2ddf72 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +jsonschema==2.3.0 diff --git a/schema/README.md b/schema/README.md new file mode 100644 index 0000000..6f41ecd --- /dev/null +++ b/schema/README.md @@ -0,0 +1,16 @@ +OpenDiversityData.org JSON Schema +================= + +This directory contains a [JSON Schema](http://json-schema.org/ 'JSON Schema Homepage') for the U.S. government's [EEO-1 Survey](http://www.eeoc.gov/employers/eeo1survey/ 'EEO-1 Survey Homepage'). + +Unfortunately, JSON Schema syntax does not allow comments. Some explanation of choices made in the schema are contained instead within this document. + +At the highest level, there are two required properties of the JSON object composing the data for a single EEO-1 report: the `meta` property and the `section_d` property. Other than the `meta` property, every other property - i.e., those beginning with `section_` describes a section of an EEO-1 report, as represented in the [sample EEO-1 form](http://www.eeoc.gov/employers/eeo1survey/upload/eeo1-2.pdf 'Sample EEO-1 Reporting Form') linked in [the instructions for EEO-1 reporting](http://www.eeoc.gov/employers/eeo1survey/2007instructions.cfm 'Instructions for EEO-1 Reporting') and as represented in the PDFs that many companies release as the public version of their EEO-1 report, as for example [Dell's](http://money.cnn.com/technology/storysupplement/diversity-data/dell-2010-EEO-1.html 'Dell's 2010 EEO-1 Report') (here embedded in the webpage as an image, which is unfortunately quite common). 
+ +Sections A, B, and C of the standard EEO-1 report are not required in the schema because they don't contain the data of interest, and it's possible that some presentations of publicly released EEO-1 data won't include them. + +The `section_d` property of the report object contains the data of interest. It describes an array of exactly ten objects because there are ten job categories across which companies must report diversity statistics. The categories themselves are enumerated as possible values of the `job_category` property of each array object. The `data` for each object in the array is itself an array, where each array item is an object containing three fields: `race`, which is a string value limited to the seven categories defined for the EEO-1 Survey; and `men` and `women`, which are integer values with a minimum of zero. The array value of `data` must contain seven items, since there are seven possible categories of `race`. + +### Caveats + +In its present form, the schema requires ten objects because there are ten job categories in an EEO-1 report and seven objects within each job category's data array because there are seven races for reporting under EEO-1. In both of these arrays, each object must be unique within the array, but it would still be possible to submit ill-formed data if a job category label is repeated (so that one of the other nine is missing) and the data for the duplicate category is different. A much more verbose schema would be required to guard against this case, so we are opting to take on the risk of such ill-formed data (at least for now). diff --git a/schema/eeo-schema.json b/schema/eeo-schema.json new file mode 100644 index 0000000..38fd805 --- /dev/null +++ b/schema/eeo-schema.json @@ -0,0 +1,132 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "opendiversitydata", + "version": "0.0.2", + "description": "JSON schema for U.S. 
EEO-1 survey data", + "type": "object", + "properties": { + "meta": { + "type": "object", + "properties": { + "company": { + "type": "string" + }, + "year": { + "type": "integer", + "minimum": 1965, + "maximum": 2014 + } + }, + "required": ["company", "year"] + }, + "section_a": { + "type": "object", + "properties": { + "reporting_type": { + "type": "string", + "enum": [ + "Single-establishment Employer Report", + "Consolidated Report", + "Headquarters Unit Report", + "Individual Establishment Report", + "Special Report" + ] + } + }, + "required": ["reporting_type"] + }, + "section_b": { + "type": "object", + "properties": { + "parent_company": { + "type": "string" + }, + "address": { + "type": "object", + "properties": { + "street_address": { "type": "string" }, + "city": { "type": "string" }, + "state": { + "type": "string", + "enum": ["AL", "AK", "AS", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FM", "FL", "GA", "GU", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MH", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "MP", "OH", "OK", "OR", "PW", "PA", "PR", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VI", "VA", "WA", "WV", "WI", "WY"] + }, + "zip_code": { "type": "string" } + }, + "required": ["street_address", "city", "state", "zip_code"] + }, + "previous_year_report": { + "type": "boolean" + } + }, + "required": ["parent_company", "address", "previous_year_report"] + }, + "section_c": { + "type": "object", + "properties": { + "at_least_100": { "type": "boolean" }, + "affiliated": { "type": "boolean" }, + "DUNS": { "type": "boolean" } + }, + "required": ["at_least_100", "affiliated", "DUNS"] + }, + "section_d": { + "type": "array", + "items": { + "type": "object", + "properties": { + "job_category": { + "type": "string", + "enum": [ + "Executive/Senior Level Officials and Managers", + "First/Mid Level Officials and Managers", + "Professionals", + "Technicians", + "Sales Workers", + "Administrative Support 
def main():
    """Validate a JSON data file against a JSON Schema named on the command line.

    Command-line options (read from ``sys.argv`` by argparse):
        -s / --schema   name of the JSON Schema file
        -d / --data     name of the JSON data file

    Both options are needed. If either is missing, the original complaint is
    reported through ``parser.error``, which prints the usage line plus the
    message to stderr and exits with status 2, so shell callers can tell a
    usage mistake from a successful validation.

    Returns:
        None when the data file satisfies the schema.

    Raises:
        SystemExit: when --data or --schema is not supplied.
        Whatever ``jsonschema.validate`` raises when validation fails; the
        exception is deliberately left to propagate so the full report is shown.
    """
    parser = argparse.ArgumentParser(description='Validate a JSON data file against a specified JSON Schema.')
    parser.add_argument('-s', '--schema', action='store', dest='schema', help='name of JSON Schema file')
    parser.add_argument('-d', '--data', action='store', dest='data', help='name of JSON data file')
    args = parser.parse_args()

    # Check the data file first, then the schema, to keep the original
    # precedence of the two complaints.
    if not args.data:
        parser.error("You need to pass me a data file to validate, teapot. >:(")
    if not args.schema:
        parser.error("You need to pass me a schema to use for validation, teapot. >:(")

    # Plain text mode: the 'U' flag was deprecated in Python 3 and is rejected
    # from 3.11 on, and JSON parsing does not depend on the newline flavour.
    # The context managers close both files even if parsing fails.
    with open(args.data) as data_file:
        data = json.load(data_file)

    with open(args.schema) as schema_file:
        schema = json.load(schema_file)

    jsonschema.validate(data, schema)
def main():
    """Print the aggregates of one Open Diversity Data JSON file.

    The single positional command-line argument is the path of a JSON file
    whose top-level ``section_d`` value is a list of objects shaped like
    ``{"job_category": ..., "data": [{"race": ..., "men": n, "women": n}]}``.

    Two reports are written to stdout so they can be compared with the PDF:
      * the head count of every job category plus the absolute total;
      * men/women totals per race, summed over all job categories.

    A job category absent from the file, or a race absent from a category's
    data, contributes zero instead of aborting the run. When a label appears
    more than once, the last occurrence is the one used for the lookup.

    Raises:
        SystemExit: on a usage error (argparse exits with status 2).
        KeyError: if the file lacks ``section_d`` or an entry lacks one of
            the keys read above.
    """
    parser = argparse.ArgumentParser(description='Print the aggregates from an Open Diversity Data JSON file to check the totals against the PDF.')
    parser.add_argument('json', action='store', help='Open Diversity Data JSON file to check totals on')
    args = parser.parse_args()

    # argparse already rejects a missing positional argument, so this only
    # catches an explicitly empty path such as ''.
    if not args.json:
        parser.error("Sorry, you need to provide some data for me to aggregate :(")

    # Plain text mode: the 'U' flag is rejected by Python 3.11+, and the
    # context manager makes sure the file is closed.
    with open(args.json) as handle:
        jsondata = json.load(handle)['section_d']

    job_categories = [
        "Executive/Senior Level Officials and Managers",
        "First/Mid Level Officials and Managers",
        "Professionals",
        "Technicians",
        "Sales Workers",
        "Administrative Support Workers",
        "Craft Workers",
        "Operatives",
        "Laborers and Helpers",
        "Service Workers"
    ]

    races = [
        "Hispanic or Latino",
        "White",
        "Black or African American",
        "Native Hawaiian or Other Pacific Islander",
        "Asian",
        "American Indian or Alaska Native",
        "Two or More Races"
    ]

    # One pass builds the lookup; later duplicates overwrite earlier ones.
    data_by_category = dict((obj['job_category'], obj['data']) for obj in jsondata)

    overall_totals = {}
    # Start every race at zero so the report never hits a missing key.
    race_totals = dict((race, {'men': 0, 'women': 0}) for race in races)

    for cat in job_categories:
        data = data_by_category.get(cat) or []
        # The category total counts every row present, whatever its race label.
        overall_totals[cat] = sum(datum['men'] + datum['women'] for datum in data)

        rows_by_race = dict((datum['race'], datum) for datum in data)
        for race in races:
            race_data = rows_by_race.get(race)
            if race_data is None:
                continue
            race_totals[race]['men'] += race_data['men']
            race_totals[race]['women'] += race_data['women']

    print()
    print("### Overall Totals ###")
    for cat in job_categories:
        print("%s: %d" %(cat, overall_totals[cat]))
    print("Absolute total: %d" %sum([overall_totals[cat] for cat in job_categories]))
    print()
    print("### Totals by Race and Gender ###")
    # The first race is reported on its own line; the rest are grouped by gender.
    hispanic = races[0]
    non_hispanic = races[1:]
    print(hispanic, ': Men %d, Women %d' %(race_totals[hispanic]['men'], race_totals[hispanic]['women']))
    print("Non-Hispanic or Latino, Men:")
    for race in non_hispanic:
        print("\t%s: %d" %(race, race_totals[race]['men']))
    print("Non-Hispanic or Latino, Women:")
    for race in non_hispanic:
        print("\t%s: %d" %(race, race_totals[race]['women']))
    print()