diff --git a/README.md b/README.md index c5e5ca0..d1a2131 100644 --- a/README.md +++ b/README.md @@ -39,13 +39,13 @@ counties = county_geo(2024) states = state_geo(2024) ``` -County and county-age population counts +State and county population data, stratified by several different variables: ```python -from kintsugi.county_pop import county_pop, county_age_pop +from kintsugi.population import county_pop, state_age_pop lf_county_pop = county_pop(2024) -lf_county_age_pop = county_age_pop(2024) +lf_state_age_pop = state_age_pop(2024) ``` Low-population county groups diff --git a/src/kintsugi/_data.py b/src/kintsugi/_data.py index ab4f6ff..69d334e 100644 --- a/src/kintsugi/_data.py +++ b/src/kintsugi/_data.py @@ -22,15 +22,24 @@ "geo/cb_2020_us_state_5m.zip": "aedc60e0d1924a9030ee6d39ff0ed27ad7d1b0bc86807ea809391a6b9008ffb3", "geo/cb_2024_us_county_5m.zip": "a867f8734059b45d1d54a0ba56189dd7e73c42eb451418fa56de44c35232614b", "geo/cb_2024_us_state_5m.zip": "c9db0e395c11a1f94a8017fde4f4c7cbee1dca6eb37ba8f1ccaab927df70885f", - "pop/county_cc/county_pop_2016.parquet": "1d337d32b401b1d101f643e4f734dc62f6fc4659d9c168cab025fcfacdc930ec", - "pop/county_cc/county_pop_2017.parquet": "7f3d834d37d505baee184352cc3c2144cb5dde1745a51356c8b5debc0fddc768", - "pop/county_cc/county_pop_2018.parquet": "45e476c3bbe375b2de44b261bccec032609320de311cc95c02b1125216d8c748", - "pop/county_cc/county_pop_2019.parquet": "06081711d88339c4e2af398e3e7345d336b26b5c3a6148b00f0c4273b51a7f4b", - "pop/county_cc/county_pop_2020.parquet": "4ba406a680041dd3cb4025733fffe852383a9171dcd7ceaaa3fa4e551573dc57", - "pop/county_cc/county_pop_2021.parquet": "527c058c14b8de7826748bb883969bed8960a9e060e2aa010bd6367f41458306", - "pop/county_cc/county_pop_2022.parquet": "bffccaf83d23245378cbc900f5f7bc1740c7dd2c5085570b20d44649d5afcbc1", - "pop/county_cc/county_pop_2023.parquet": "dc5941017a40488424faae38fcca8b7032024523e823af17c0d539b657ee239a", - "pop/county_cc/county_pop_2024.parquet": "cae4e9e5d956dfdd60a68a06887e0c4a1f8918f81e09c8fe2015f3b1feb85d82", + "pop/county_cc/county_pop_2016.parquet": "74caad19bf5eed856ad9b6f63c65f7fceca612dec680d0768890de2265116607", + "pop/county_cc/county_pop_2017.parquet": "d93d027929861e115cf34b15f1ff7c697c8eaa327b73cd8132710a11860a63d5", + "pop/county_cc/county_pop_2018.parquet": "be3d3bab642a9f6f111c792a431f940b1753373194993885e4d47c136feed91a", + "pop/county_cc/county_pop_2019.parquet": "98801f118cd795c026a8269d5ac6674f98b9d47e0207c6a2721a5b7f4b6e5c08", + "pop/county_cc/county_pop_2020.parquet": "f1e4f282d297dc5498b6f839412c0815ca6f9e0a15d83d5d3867f2d70aa8413d", + "pop/county_cc/county_pop_2021.parquet": "3af369564ebb0e1fda25b440e5bf133ecb2d2eab60ab40f5db1f0a0955db713b", + "pop/county_cc/county_pop_2022.parquet": "977856eb5fffd508442ccedaa54c92e338b037135e5a9be55a03c7132863d9ca", + "pop/county_cc/county_pop_2023.parquet": "a4d66c302a557c1565ec9f43bad5ea9d4267576d1fbd17d8939e5a858a3d73e7", + "pop/county_cc/county_pop_2024.parquet": "12b16c7c20329a3df2f4120f6ec9a9a7313147fad0fd03bc360b1de5769c8abd", + "pop/state/state_pop_2016.parquet": "bac51c5ba4a9ff7305e92b3b2804c854fc20b9cbcf01156e5439d92668c0c81e", + "pop/state/state_pop_2017.parquet": "6fb950b1b78409af8130317b08b437b742c0906ff9d5c38655c1189103b8dddc", + "pop/state/state_pop_2018.parquet": "913fca35299028a842325000e58e33cd3912c1e900d480f00b468095398e57f8", + "pop/state/state_pop_2019.parquet": "7ca2c87065f24857178bb33a7512cb799a92890596bac6fff1cbeb3c69f6fc36", + "pop/state/state_pop_2020.parquet": "275b861e07f1c2327fb5382a28e84a5fb7ac4f896ae9f91b06612f6197af9611", + "pop/state/state_pop_2021.parquet": "8b47a5c9fdca838954c8ddac8265ad00d590281c7b444019070c81b9942a727e", + "pop/state/state_pop_2022.parquet": "ea113b3766c44bbf250e01b0b9509e810590119b3b9470b13dc347d43aed042b", + "pop/state/state_pop_2023.parquet": "e96a982342510fe6a1ba90fc85a9bd6fbdd8687bceaf76e6e117606429d2d160", + "pop/state/state_pop_2024.parquet": "b79bca471a68b8c3742ec30d41a2b65ab1227152e81239faf00763188752c6ff", "county_groups.parquet": "7d7c150b5efd5596e0eaaed27abd6dc86137f08ff677c2606d402b9d165b87fa", "state.txt": "bea4e03f71a1fa0045ae732aabad11fa541e5932b071c2369bb0d325e8cba5a0", } diff --git a/src/kintsugi/county_groups.py b/src/kintsugi/county_groups.py index 285abc4..f8a9a42 100644 --- a/src/kintsugi/county_groups.py +++ b/src/kintsugi/county_groups.py @@ -4,7 +4,7 @@ import polars as pl from ._data import get_dataset -from .county_pop import county_pop +from .population import county_pop @overload diff --git a/src/kintsugi/county_pop.py b/src/kintsugi/county_pop.py deleted file mode 100644 index 284780e..0000000 --- a/src/kintsugi/county_pop.py +++ /dev/null @@ -1,176 +0,0 @@ -from typing import Literal, NamedTuple, overload - -import pandas as pd -import polars as pl - -from ._data import get_dataset - -type VintageYear = Literal[ - 2016, - 2017, - 2018, - 2019, - 2020, - 2021, - 2022, - 2023, - 2024, -] - - -class Vintage(NamedTuple): - year_lb: int - year_ub: int - county_fips: set[str] - - -def get_vintage(vintage_year: VintageYear) -> Vintage: - """ - Get info like year bounds for a given vintage year - """ - vintage_year_lb = 2016 - vintage_year_ub = 2024 - if not (vintage_year_lb <= vintage_year <= vintage_year_ub): - raise ValueError( - f"Must choose a vintage year between {vintage_year_lb} and {vintage_year_ub}" - ) - - data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") - county_fips = set( - pl.scan_parquet(data) - .select("county_fips") - .unique() - .collect() - .to_series() - .to_list() - ) - if vintage_year <= 2020: - year_lb = 2010 - else: - year_lb = 2020 - - return Vintage(year_lb, vintage_year, county_fips) - - -@overload -def county_pop( - year: int, - *, - vintage_year: VintageYear | None = ..., - as_pandas: Literal[False] = ..., -) -> pl.LazyFrame: ... - - -@overload -def county_pop( - year: int, *, vintage_year: VintageYear | None = ..., as_pandas: Literal[True] -) -> pd.DataFrame: ... - - -def county_pop( - year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False -) -> pl.LazyFrame | pd.DataFrame: - """ - County population estimates for select years. Uses county population - by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-detail.html - The raw files are not present in the kintsugi-data repo because of their large size. - Instead, we use parquet files containing a subset of columns. - - It's recommended to use the latest possible vintage to get a given year's data. However, - you may specify a specific vintage year if, for example, you need a certain set of county - geographies. If `vintage_year` is `None` (by default), data for years in the range [2010, 2019] - are sourced from the 2020 vintage (2010-2020 data), while data for years in the range - [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). - - Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/counties/asrh/cc-est2024-alldata.csv - """ - if vintage_year is None: - if 2010 <= year <= 2019: - vintage_year = 2020 - else: - vintage_year = 2024 - - vintage = get_vintage(vintage_year) - if not (vintage.year_lb <= year <= vintage.year_ub): - raise ValueError( - f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" - ) - - data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") - lf = ( - pl.scan_parquet(data) - .filter( - pl.col("year") == year, - pl.col("age_grp") == "tot", - ) - .select("state_name", "county_name", "county_fips", "year", "tot_pop") - .sort("county_fips") - ) - - if as_pandas: - return lf.collect().to_pandas() - - return lf - - -@overload -def county_age_pop( - year: int, - *, - vintage_year: VintageYear | None = ..., - as_pandas: Literal[False] = ..., -) -> pl.LazyFrame: ... - - -@overload -def county_age_pop( - year: int, *, vintage_year: VintageYear | None = ..., as_pandas: Literal[True] -) -> pd.DataFrame: ... - - -def county_age_pop( - year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False -) -> pl.LazyFrame | pd.DataFrame: - """ - County-age population estimates for select years. Uses county population - by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-detail.html - The raw files are not present in the kintsugi-data repo because of their large size. - Instead, we use parquet files containing a subset of columns. - - It's recommended to use the latest possible vintage to get a given year's data. However, - you may specify a specific vintage year if, for example, you need a certain set of county - geographies. If `vintage_year` is `None` (by default), data for years in the range [2010, 2019] - are sourced from the 2020 vintage (2010-2020 data), while data for years in the range - [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). - - Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/counties/asrh/cc-est2024-alldata.csv - """ - if vintage_year is None: - if 2010 <= year <= 2019: - vintage_year = 2020 - else: - vintage_year = 2024 - - vintage = get_vintage(vintage_year) - if not (vintage.year_lb <= year <= vintage.year_ub): - raise ValueError( - f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" - ) - - data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") - lf = ( - pl.scan_parquet(data) - .filter( - pl.col("year") == year, - pl.col("age_grp") != "tot", - ) - .select( - "state_name", "county_name", "county_fips", "year", "age_grp", "tot_pop" - ) - .sort("county_fips", "age_grp") - ) - - if as_pandas: - return lf.collect().to_pandas() - - return lf diff --git a/src/kintsugi/population.py b/src/kintsugi/population.py new file mode 100644 index 0000000..ff5ddef --- /dev/null +++ b/src/kintsugi/population.py @@ -0,0 +1,762 @@ +from typing import Literal, NamedTuple, overload + +import pandas as pd +import polars as pl + +from ._data import get_dataset + +type VintageYear = Literal[ + 2016, + 2017, + 2018, + 2019, + 2020, + 2021, + 2022, + 2023, + 2024, +] + + +# class Vintage(NamedTuple): +# year_lb: int +# year_ub: int +# county_fips: set[str] + + +def validate_vintage_year(year: int, vintage_year: VintageYear) -> None: + """Validate year against vintage_year""" + vintage_year_lb = 2016 + vintage_year_ub = 2024 + if not (vintage_year_lb <= vintage_year <= vintage_year_ub): + raise ValueError( + f"Must choose a vintage year between {vintage_year_lb} and {vintage_year_ub}" + ) + + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if not (year_lb <= year <= vintage_year): + raise ValueError(f"Must choose a year between {year_lb} and {vintage_year}") + + +# def _get_vintage(vintage_year: VintageYear) -> Vintage: +# """Get info like year bounds for a given vintage year.""" +# vintage_year_lb = 2016 +# vintage_year_ub = 2024 +# if not (vintage_year_lb <= vintage_year <= vintage_year_ub): +# raise ValueError( +# f"Must choose a vintage year between {vintage_year_lb} and {vintage_year_ub}" +# ) +# +# data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") +# county_fips = set( +# pl.scan_parquet(data) +# .select("county_fips") +# .unique() +# .collect() +# .to_series() +# .to_list() +# ) +# if vintage_year <= 2020: +# year_lb = 2010 +# else: +# year_lb = 2020 +# +# return Vintage(year_lb, vintage_year, county_fips) + + +# TODO: should docstrings have info on the schema? + +# match conventions in kintsugi-data processing script +sex_enum = pl.Enum(["tot", "male", "female"]) +race_enum_no_hispanic = pl.Enum(["white", "black", "aian", "asian", "nhpi"]) +race_enum_incl_hispanic = pl.Enum( + ["white", "black", "aian", "asian", "nhpi", "hispanic"] +) +hispanic_enum = pl.Enum(["tot", "not_hispanic", "hispanic"]) + + +@overload +def state_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... + + +@overload +def state_pop( + year: int, *, vintage_year: VintageYear | None = ..., as_pandas: Literal[True] +) -> pd.DataFrame: ... + + +def state_pop( + year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False +) -> pl.LazyFrame | pd.DataFrame: + """ + State population estimates for select years. + + Uses state population by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-detail.html. + The raw files are not present in the kintsugi-data repo. Instead, parquet files containing a subset of columns are used. + + It's recommended to use the latest possible vintage to get a given year's data. However, + a specific vintage year may be provided. If `vintage_year` is `None` (the default), data + for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), + while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + + Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/state/asrh/sc-est2024-alldata5.csv + """ + if vintage_year is None: + if 2010 <= year <= 2019: + vintage_year = 2020 + else: + vintage_year = 2024 + + validate_vintage_year(year, vintage_year) + data = get_dataset(f"pop/state/state_pop_{vintage_year}.parquet") + lf = ( + pl.scan_parquet(data) + .filter( + pl.col("year") == year, + pl.col("sex") == "tot", + pl.col("hispanic_origin") == "tot", + ) + .group_by(["state_name", "state_fips", "year"]) + .agg(tot_pop=pl.col("tot_pop").sum()) + .sort("state_fips") + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def state_age_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... + + +@overload +def state_age_pop( + year: int, *, vintage_year: VintageYear | None = ..., as_pandas: Literal[True] +) -> pd.DataFrame: ... + + +def state_age_pop( + year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False +) -> pl.LazyFrame | pd.DataFrame: + """ + State-age population estimates for select years. + + Age is given in years, not binned groups. Note that an age value of `85` corresponds to >= 85 years old. + Uses state population by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-detail.html. + The raw files are not present in the kintsugi-data repo. Instead, parquet files containing a subset of columns are used. + + It's recommended to use the latest possible vintage to get a given year's data. However, + a specific vintage year may be provided. If `vintage_year` is `None` (the default), data + for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), + while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + + Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/state/asrh/sc-est2024-alldata5.csv + """ + if vintage_year is None: + if 2010 <= year <= 2019: + vintage_year = 2020 + else: + vintage_year = 2024 + + validate_vintage_year(year, vintage_year) + data = get_dataset(f"pop/state/state_pop_{vintage_year}.parquet") + lf = ( + pl.scan_parquet(data) + .filter( + pl.col("year") == year, + pl.col("sex") == "tot", + pl.col("hispanic_origin") == "tot", + ) + .group_by(["state_name", "state_fips", "year", "age"]) + .agg(tot_pop=pl.col("tot_pop").sum()) + .sort("state_fips", "age") + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def state_sex_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... + + +@overload +def state_sex_pop( + year: int, *, vintage_year: VintageYear | None = ..., as_pandas: Literal[True] +) -> pd.DataFrame: ... + + +def state_sex_pop( + year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False +) -> pl.LazyFrame | pd.DataFrame: + """ + State-sex population estimates for select years. Uses state population by characteristics + data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-detail.html + The raw files are not present in the kintsugi-data repo. Instead, we use parquet files containing a subset of columns. + + It's recommended to use the latest possible vintage to get a given year's data. However, + you may specify a specific vintage year. If `vintage_year` is `None` (by default), data + for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), + while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + + Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/state/asrh/sc-est2024-alldata5.csv + """ + if vintage_year is None: + if 2010 <= year <= 2019: + vintage_year = 2020 + else: + vintage_year = 2024 + + validate_vintage_year(year, vintage_year) + data = get_dataset(f"pop/state/state_pop_{vintage_year}.parquet") + lf = ( + pl.scan_parquet(data) + .filter( + pl.col("year") == year, + pl.col("sex") != "tot", + pl.col("hispanic_origin") == "tot", + ) + .group_by(["state_name", "state_fips", "year", "sex"]) + .agg(tot_pop=pl.col("tot_pop").sum()) + .sort("state_fips", "sex") + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def state_race_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + incl_hispanic_orig: bool = ..., + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... + + +@overload +def state_race_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + incl_hispanic_orig: bool = ..., + as_pandas: Literal[True], +) -> pd.DataFrame: ... + + +def state_race_pop( + year: int, + *, + vintage_year: VintageYear | None = None, + incl_hispanic_orig: bool = False, + as_pandas: bool = False, +) -> pl.LazyFrame | pd.DataFrame: + """ + State-race population estimates for select years. Specify `incl_hispanic_orig=True` to include + Hispanic counts column. Uses state population by characteristics + data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-detail.html + The raw files are not present in the kintsugi-data repo. Instead, we use parquet files containing a subset of columns. + + It's recommended to use the latest possible vintage to get a given year's data. However, + you may specify a specific vintage year. If `vintage_year` is `None` (by default), data + for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), + while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + + Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/state/asrh/sc-est2024-alldata5.csv + """ + if vintage_year is None: + if 2010 <= year <= 2019: + vintage_year = 2020 + else: + vintage_year = 2024 + + validate_vintage_year(year, vintage_year) + data = get_dataset(f"pop/state/state_pop_{vintage_year}.parquet") + lf = ( + pl.scan_parquet(data) + .filter( + pl.col("year") == year, + pl.col("sex") == "tot", + pl.col("hispanic_origin") != "tot" + if incl_hispanic_orig + else pl.col("hispanic_origin") == "tot", + ) + .group_by( + ["state_name", "state_fips", "year", "race", "hispanic_origin"] + if incl_hispanic_orig + else ["state_name", "state_fips", "year", "race"] + ) + .agg(tot_pop=pl.col("tot_pop").sum()) + .sort( + ["state_fips", "race", "hispanic_origin"] + if incl_hispanic_orig + else ["state_fips", "race"] + ) + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def state_age_sex_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... + + +@overload +def state_age_sex_pop( + year: int, *, vintage_year: VintageYear | None = ..., as_pandas: Literal[True] +) -> pd.DataFrame: ... + + +def state_age_sex_pop( + year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False +) -> pl.LazyFrame | pd.DataFrame: + """ + State-age-sex population estimates for select years. + + Age is given in years, not binned groups. Note that an age value of `85` corresponds to >= 85 years old. + Uses state population by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-detail.html. + The raw files are not present in the kintsugi-data repo. Instead, parquet files containing a subset of columns are used. + + It's recommended to use the latest possible vintage to get a given year's data. However, + a specific vintage year may be provided. If `vintage_year` is `None` (the default), data + for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), + while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + + Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/state/asrh/sc-est2024-alldata5.csv + """ + if vintage_year is None: + if 2010 <= year <= 2019: + vintage_year = 2020 + else: + vintage_year = 2024 + + validate_vintage_year(year, vintage_year) + data = get_dataset(f"pop/state/state_pop_{vintage_year}.parquet") + lf = ( + pl.scan_parquet(data) + .filter( + pl.col("year") == year, + pl.col("sex") != "tot", + pl.col("hispanic_origin") == "tot", + ) + .group_by(["state_name", "state_fips", "year", "age", "sex"]) + .agg(tot_pop=pl.col("tot_pop").sum()) + .sort("state_fips", "age", "sex") + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +age_grps = [ + "tot", + "0-4", + "5-9", + "10-14", + "15-19", + "20-24", + "25-29", + "30-34", + "35-39", + "40-44", + "45-49", + "50-54", + "55-59", + "60-64", + "65-69", + "70-74", + "75-79", + "80-84", + ">=85", +] +age_grp_enum = pl.Enum(age_grps) + + +@overload +def county_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... + + +@overload +def county_pop( + year: int, *, vintage_year: VintageYear | None = ..., as_pandas: Literal[True] +) -> pd.DataFrame: ... + + +def county_pop( + year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False +) -> pl.LazyFrame | pd.DataFrame: + """ + County population estimates for select years. Uses county population + by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-detail.html + The raw files are not present in the kintsugi-data repo because of their large size. + Instead, parquet files containing a subset of columns are used. + + It's recommended to use the latest possible vintage to get a given year's data. However, + a specific vintage year may be provided. If `vintage_year` is `None` (the default), data + for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), + while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + + Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/counties/asrh/cc-est2024-alldata.csv + """ + if vintage_year is None: + if 2010 <= year <= 2019: + vintage_year = 2020 + else: + vintage_year = 2024 + + validate_vintage_year(year, vintage_year) + data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") + lf = ( + pl.scan_parquet(data) + .filter( + pl.col("year") == year, + pl.col("age_grp") == "tot", + ) + .select("state_name", "county_name", "county_fips", "year", "tot_pop") + .sort("county_fips") + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def county_age_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... + + +@overload +def county_age_pop( + year: int, *, vintage_year: VintageYear | None = ..., as_pandas: Literal[True] +) -> pd.DataFrame: ... + + +def county_age_pop( + year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False +) -> pl.LazyFrame | pd.DataFrame: + """ + County-age population estimates for select years. Uses county population + by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-detail.html + The raw files are not present in the kintsugi-data repo because of their large size. + Instead, parquet files containing a subset of columns are used. + + It's recommended to use the latest possible vintage to get a given year's data. However, + a specific vintage year may be provided. If `vintage_year` is `None` (the default), data + for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), + while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + + Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/counties/asrh/cc-est2024-alldata.csv + """ + if vintage_year is None: + if 2010 <= year <= 2019: + vintage_year = 2020 + else: + vintage_year = 2024 + + validate_vintage_year(year, vintage_year) + data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") + lf = ( + pl.scan_parquet(data) + .filter( + pl.col("year") == year, + pl.col("age_grp") != "tot", + ) + .select( + "state_name", "county_name", "county_fips", "year", "age_grp", "tot_pop" + ) + .sort("county_fips", "age_grp") + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def county_sex_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... + + +@overload +def county_sex_pop( + year: int, *, vintage_year: VintageYear | None = ..., as_pandas: Literal[True] +) -> pd.DataFrame: ... + + +def county_sex_pop( + year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False +) -> pl.LazyFrame | pd.DataFrame: + """ + County-sex population estimates for select years. Uses county population + by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-detail.html + The raw files are not present in the kintsugi-data repo because of their large size. + Instead, parquet files containing a subset of columns are used. + + It's recommended to use the latest possible vintage to get a given year's data. However, + a specific vintage year may be provided. If `vintage_year` is `None` (the default), data + for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), + while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + + Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/counties/asrh/cc-est2024-alldata.csv + """ + if vintage_year is None: + if 2010 <= year <= 2019: + vintage_year = 2020 + else: + vintage_year = 2024 + + validate_vintage_year(year, vintage_year) + data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") + lf = ( + pl.scan_parquet(data) + .filter( + pl.col("year") == year, + pl.col("age_grp") == "tot", + ) + .select( + "state_name", "county_name", "county_fips", "year", "tot_male", "tot_female" + ) + .unpivot( + index=["state_name", "county_name", "county_fips", "year"], + variable_name="sex", + value_name="tot_pop", + ) + .with_columns(sex=pl.col("sex").str.replace("tot_", "").cast(sex_enum)) + .sort("county_fips", "sex") + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def county_race_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + incl_hispanic_orig: bool = ..., + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... + + +@overload +def county_race_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + incl_hispanic_orig: bool = ..., + as_pandas: Literal[True], +) -> pd.DataFrame: ... + + +def county_race_pop( + year: int, + *, + vintage_year: VintageYear | None = None, + incl_hispanic_orig: bool = False, + as_pandas: bool = False, +) -> pl.LazyFrame | pd.DataFrame: + """ + County-race population estimates for select years. Specify `incl_hispanic_orig=True` to include + Hispanic counts column. Uses county population by characteristics + data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-detail.html + The raw files are not present in the kintsugi-data repo because of their large size. + Instead, parquet files containing a subset of columns are used. + + It's recommended to use the latest possible vintage to get a given year's data. However, + a specific vintage year may be provided. If `vintage_year` is `None` (the default), data + for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), + while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + + Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/counties/asrh/cc-est2024-alldata.csv + """ + if vintage_year is None: + if 2010 <= year <= 2019: + vintage_year = 2020 + else: + vintage_year = 2024 + + validate_vintage_year(year, vintage_year) + data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") + lf = ( + pl.scan_parquet(data) + .filter( + pl.col("year") == year, + pl.col("age_grp") == "tot", + ) + .select( + "state_name", + "county_name", + "county_fips", + "year", + "white_male", + "white_female", + "black_male", + "black_female", + "aian_male", + "aian_female", + "asian_male", + "asian_female", + "nhpi_male", + "nhpi_female", + "hispanic_male", + "hispanic_female", + ) + .with_columns( + (pl.col(f"{r}_male") + pl.col(f"{r}_female")).alias(r) + for r in ["white", "black", "aian", "asian", "nhpi", "hispanic"] + ) + .select( + "state_name", + "county_name", + "county_fips", + "year", + "white", + "black", + "aian", + "asian", + "nhpi", + "hispanic", + ) + ) + + if not incl_hispanic_orig: + lf = lf.drop("hispanic") + + lf = ( + lf.unpivot( + index=["state_name", "county_name", "county_fips", "year"], + variable_name="race", + value_name="tot_pop", + ) + .cast({"race": race_enum_incl_hispanic}) + .sort("county_fips", "race") + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def county_age_sex_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... + + +@overload +def county_age_sex_pop( + year: int, *, vintage_year: VintageYear | None = ..., as_pandas: Literal[True] +) -> pd.DataFrame: ... + + +def county_age_sex_pop( + year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False +) -> pl.LazyFrame | pd.DataFrame: + """ + County-age-sex population estimates for select years. Uses county population + by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-detail.html + The raw files are not present in the kintsugi-data repo because of their large size. + Instead, parquet files containing a subset of columns are used. + + It's recommended to use the latest possible vintage to get a given year's data. However, + a specific vintage year may be provided. If `vintage_year` is `None` (the default), data + for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), + while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + + Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/counties/asrh/cc-est2024-alldata.csv + """ + if vintage_year is None: + if 2010 <= year <= 2019: + vintage_year = 2020 + else: + vintage_year = 2024 + + validate_vintage_year(year, vintage_year) + data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") + lf = ( + pl.scan_parquet(data) + .filter( + pl.col("year") == year, + pl.col("age_grp") != "tot", + ) + .select( + "state_name", + "county_name", + "county_fips", + "year", + "age_grp", + "tot_male", + "tot_female", + ) + .unpivot( + index=["state_name", "county_name", "county_fips", "year", "age_grp"], + variable_name="sex", + value_name="tot_pop", + ) + .with_columns(sex=pl.col("sex").str.replace("tot_", "").cast(sex_enum)) + .sort("county_fips", "age_grp", "sex") + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf diff --git a/tests/county_pop_test.py b/tests/county_pop_test.py deleted file mode 100644 index 68d1a48..0000000 --- a/tests/county_pop_test.py +++ /dev/null @@ -1,175 +0,0 @@ -import pandera.polars as pa -import polars as pl -import pytest -from pandas import DataFrame -from pandera.polars import PolarsData - -from kintsugi.county_pop import ( - VintageYear, - county_age_pop, - county_pop, - get_vintage, -) - -from .models import BasePolarsModel - -age_grps = { - 0: "tot", - 1: "0-4", - 2: "5-9", - 3: "10-14", - 4: "15-19", - 5: "20-24", - 6: "25-29", - 7: "30-34", - 8: "35-39", - 9: "40-44", - 10: "45-49", - 11: "50-54", - 12: "55-59", - 13: "60-64", - 14: "65-69", - 15: "70-74", - 16: "75-79", - 17: "80-84", - 18: ">=85", -} -age_grp_enum = pl.Enum(age_grps.values()) - - -class CountyPopulation(BasePolarsModel): - state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - county_fips: pl.String = pa.Field(unique=True) # pyright: ignore [reportAny] - year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] - tot_pop: pl.Int64 = pa.Field(gt=0) # pyright: ignore [reportAny] - - class Config: # pyright: ignore [reportIncompatibleVariableOverride] - unique: list[str] = ["state_name", "county_name", "county_fips", "year"] - - @pa.dataframe_check - def has_correct_states(cls, data: PolarsData) -> bool: - return ( - data.lazyframe.select( - pl.col("county_fips") - .str.slice(0, 2) - .is_between(pl.lit("01"), pl.lit("56")) - .all() - ) - .collect() - .item() - is True - ) - - -@pytest.mark.parametrize( - ("year"), - range(2010, 2025), -) -@pytest.mark.parametrize( - ("vintage_year"), - range(2016, 2025), -) -def test_county_pop(year: int, vintage_year: VintageYear) -> None: - if vintage_year <= 2020: - year_lb = 2010 - else: - year_lb = 2020 - - if year_lb <= year <= vintage_year: - county_pop(year, vintage_year=vintage_year).collect().pipe( - CountyPopulation.validate, lazy=True - ) - else: - with pytest.raises(ValueError, match="^Must choose a year between"): - county_pop(year, vintage_year=vintage_year) - - -def test_county_pop_invalid_vintage_year_exception() -> None: - with pytest.raises(ValueError, match="^Must choose a vintage year between"): - county_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] - - -def test_get_vintage_info() -> None: - with pytest.raises(ValueError, match="^Must choose a vintage year between"): - get_vintage(2000) # pyright: ignore [reportArgumentType] - - -@pytest.mark.parametrize( - ("year"), - range(2010, 2025), -) -def test_county_pop_as_pandas(year: int) -> None: - df = county_pop(year, as_pandas=True) - - assert isinstance(df, DataFrame) - - -class CountyAgePopulation(BasePolarsModel): - state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] - age_grp: pl.Enum = pa.Field(dtype_kwargs={"categories": age_grp_enum.categories}) # pyright: ignore [reportAny] - tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] - - class Config: # pyright: ignore [reportIncompatibleVariableOverride] - unique: list[str] = [ - "state_name", - "county_name", - "county_fips", - "year", - "age_grp", - ] - - @pa.dataframe_check - def has_correct_states(cls, data: PolarsData) -> bool: - return ( - data.lazyframe.select( - pl.col("county_fips") - .str.slice(0, 2) - .is_between(pl.lit("01"), pl.lit("56")) - .all() - ) - .collect() - .item() - is True - ) - - -@pytest.mark.parametrize( - ("year"), - range(2010, 2025), -) -@pytest.mark.parametrize( - ("vintage_year"), - range(2016, 2025), -) -def test_county_age_pop(year: int, vintage_year: VintageYear) -> None: - if vintage_year <= 2020: - year_lb = 2010 - else: - year_lb = 2020 - - if year_lb <= year <= vintage_year: - county_age_pop(year, vintage_year=vintage_year).collect().pipe( - CountyAgePopulation.validate, lazy=True - ) - else: - with pytest.raises(ValueError, match="^Must choose a year between"): - county_age_pop(year, vintage_year=vintage_year) - - -@pytest.mark.parametrize( - ("year"), - range(2010, 2025), -) -def test_county_age_pop_as_pandas(year: int) -> None: - df = county_age_pop(year, as_pandas=True) - - assert isinstance(df, DataFrame) - - -def test_county_age_pop_invalid_vintage_year_exception() -> None: - with pytest.raises(ValueError, match="^Must choose a vintage year between"): - county_age_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] diff --git a/tests/population_test.py b/tests/population_test.py new file mode 100644 index 0000000..aef3185 --- /dev/null +++ b/tests/population_test.py @@ -0,0 +1,786 @@ +import pandera.polars as pa +import polars as pl +import pytest +from pandas import DataFrame +from pandera.polars import PolarsData + +from kintsugi.population import ( + VintageYear, + age_grp_enum, + county_age_pop, + county_age_sex_pop, + county_pop, + county_race_pop, + county_sex_pop, + hispanic_enum, + race_enum_incl_hispanic, + race_enum_no_hispanic, + sex_enum, + state_age_pop, + state_age_sex_pop, + state_pop, + state_race_pop, + state_sex_pop, +) + +from .models import BasePolarsModel + + +class StatePopulation(BasePolarsModel): + state_name: pl.String = pa.Field(unique=True) # pyright: ignore [reportAny] + state_fips: pl.String = pa.Field(unique=True, in_range=("01", "56")) # pyright: ignore [reportAny] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + tot_pop: pl.Int64 = pa.Field(gt=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = ["state_name", "state_fips", "year"] + + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) + + @pa.dataframe_check + def has_correct_height(cls, data: PolarsData) -> bool: + return data.lazyframe.select(pl.len()).collect().item() == 51 # pyright: ignore [reportAny] + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_state_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + state_pop(year, vintage_year=vintage_year).collect().pipe( + StatePopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + state_pop(year, vintage_year=vintage_year) + + +def test_state_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + state_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_state_pop_as_pandas(year: int) -> None: + df = state_pop(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +class StateAgePopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + state_fips: pl.String = pa.Field(in_range=("01", "56")) # pyright: ignore [reportAny] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + age: pl.Int64 = pa.Field(in_range=(0, 85)) # pyright: ignore [reportAny] + tot_pop: pl.Int64 = pa.Field(gt=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = ["state_name", "state_fips", "year", "age"] + + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_state_age_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + state_age_pop(year, vintage_year=vintage_year).collect().pipe( + StateAgePopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + state_age_pop(year, vintage_year=vintage_year) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_state_age_pop_as_pandas(year: int) -> None: + df = state_age_pop(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +def test_state_age_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + state_age_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] + + +class StateSexPopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + state_fips: pl.String = pa.Field(in_range=("01", "56")) # pyright: ignore [reportAny] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + sex: pl.Enum = pa.Field(dtype_kwargs={"categories": sex_enum.categories}) # pyright: ignore [reportAny] + tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = ["state_name", "state_fips", "year", "sex"] + + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_state_sex_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + state_sex_pop(year, vintage_year=vintage_year).collect().pipe( + StateSexPopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + state_sex_pop(year, vintage_year=vintage_year) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_state_sex_pop_as_pandas(year: int) -> None: + df = state_sex_pop(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +def test_state_sex_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + state_sex_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] + + +class StateRacePopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + state_fips: pl.String = pa.Field(in_range=("01", "56")) # pyright: ignore [reportAny] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + race: pl.Enum = pa.Field( # pyright: ignore [reportAny] + dtype_kwargs={"categories": race_enum_no_hispanic.categories} + ) + tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = ["state_name", "state_fips", "year", "race"] + + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_state_race_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + state_race_pop(year, vintage_year=vintage_year).collect().pipe( + StateRacePopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + state_race_pop(year, vintage_year=vintage_year) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_state_race_pop_as_pandas(year: int) -> None: + df = state_race_pop(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +def test_state_race_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + state_race_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] + + +class StateRaceHispanicPopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + state_fips: pl.String = pa.Field(in_range=("01", "56")) # pyright: ignore [reportAny] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + race: pl.Enum = pa.Field( # pyright: ignore [reportAny] + dtype_kwargs={"categories": race_enum_no_hispanic.categories} + ) + hispanic_origin: pl.Enum = pa.Field( # pyright: ignore [reportAny] + dtype_kwargs={"categories": hispanic_enum.categories} + ) + tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = [ + "state_name", + "state_fips", + "year", + "race", + "hispanic_origin", + ] + + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_state_race_hispanic_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + state_race_pop( + year, vintage_year=vintage_year, incl_hispanic_orig=True + ).collect().pipe(StateRaceHispanicPopulation.validate, lazy=True) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + state_race_pop(year, vintage_year=vintage_year, incl_hispanic_orig=True) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_state_race_hispanic_pop_as_pandas(year: int) -> None: + df = state_race_pop(year, as_pandas=True, incl_hispanic_orig=True) + + assert isinstance(df, DataFrame) + + +def test_state_race_hispanic_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + state_race_pop(2023, vintage_year=2000, incl_hispanic_orig=True) # pyright: ignore [reportArgumentType] + + +class StateAgeSexPopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + state_fips: pl.String = pa.Field(in_range=("01", "56")) # pyright: ignore [reportAny] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + age: pl.Int64 = pa.Field(in_range=(0, 85)) # pyright: ignore [reportAny] + sex: pl.Enum = pa.Field(dtype_kwargs={"categories": sex_enum.categories}) # pyright: ignore [reportAny] + tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = ["state_name", "state_fips", "year", "age", "sex"] + + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_state_age_sex_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + state_age_sex_pop(year, vintage_year=vintage_year).collect().pipe( + StateAgeSexPopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + state_age_sex_pop(year, vintage_year=vintage_year) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_state_age_sex_pop_as_pandas(year: int) -> None: + df = state_age_sex_pop(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +def test_state_age_sex_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + state_age_sex_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] + + +class CountyPopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_fips: pl.String = pa.Field(unique=True) # pyright: ignore [reportAny] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + tot_pop: pl.Int64 = pa.Field(gt=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = ["state_name", "county_name", "county_fips", "year"] + + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) + + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.col("county_fips") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")) + .all() + ) + .collect() + .item() + is True + ) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_county_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + county_pop(year, vintage_year=vintage_year).collect().pipe( + CountyPopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + county_pop(year, vintage_year=vintage_year) + + +def test_county_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + county_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_county_pop_as_pandas(year: int) -> None: + df = county_pop(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +class CountyAgePopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + age_grp: pl.Enum = pa.Field(dtype_kwargs={"categories": age_grp_enum.categories}) # pyright: ignore [reportAny] + tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = [ + "state_name", + "county_name", + "county_fips", + "year", + "age_grp", + ] + + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) + + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.col("county_fips") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")) + .all() + ) + .collect() + .item() + is True + ) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_county_age_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + county_age_pop(year, vintage_year=vintage_year).collect().pipe( + CountyAgePopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + county_age_pop(year, vintage_year=vintage_year) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_county_age_pop_as_pandas(year: int) -> None: + df = county_age_pop(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +def test_county_age_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + county_age_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] + + +class CountySexPopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + sex: pl.Enum = pa.Field(dtype_kwargs={"categories": sex_enum.categories}) # pyright: ignore [reportAny] + tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = ["state_name", "county_name", "county_fips", "year", "sex"] + + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) + + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.col("county_fips") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")) + .all() + ) + .collect() + .item() + is True + ) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_county_sex_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + county_sex_pop(year, vintage_year=vintage_year).collect().pipe( + CountySexPopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + county_sex_pop(year, vintage_year=vintage_year) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_county_sex_pop_as_pandas(year: int) -> None: + df = county_sex_pop(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +def test_county_sex_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + county_sex_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] + + +class CountyRacePopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + race: pl.Enum = pa.Field( # pyright: ignore [reportAny] + dtype_kwargs={"categories": race_enum_incl_hispanic.categories} + ) + tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = ["state_name", "county_name", "county_fips", "year", "race"] + + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) + + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.col("county_fips") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")) + .all() + ) + .collect() + .item() + is True + ) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_county_race_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + county_race_pop(year, vintage_year=vintage_year).collect().pipe( + CountyRacePopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + county_race_pop(year, vintage_year=vintage_year) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_county_race_pop_as_pandas(year: int) -> None: + df = county_race_pop(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +def test_county_race_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + county_race_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] + + +class CountyRaceHispanicPopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + race: pl.Enum = pa.Field( # pyright: ignore [reportAny] + dtype_kwargs={"categories": race_enum_incl_hispanic.categories} + ) + tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = ["state_name", "county_name", "county_fips", "year", "race"] + + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) + + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.col("county_fips") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")) + .all() + ) + .collect() + .item() + is True + ) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_county_race_hispanic_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + county_race_pop( + year, vintage_year=vintage_year, incl_hispanic_orig=True + ).collect().pipe(CountyRaceHispanicPopulation.validate, lazy=True) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + county_race_pop(year, vintage_year=vintage_year, incl_hispanic_orig=True) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_county_race_hispanic_pop_as_pandas(year: int) -> None: + df = county_race_pop(year, as_pandas=True, incl_hispanic_orig=True) + + assert isinstance(df, DataFrame) + + +def test_county_race_hispanic_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + county_race_pop(2023, vintage_year=2000, incl_hispanic_orig=True) # pyright: ignore [reportArgumentType] + + +class CountyAgeSexPopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + age_grp: pl.Enum = pa.Field(dtype_kwargs={"categories": age_grp_enum.categories}) # pyright: ignore [reportAny] + sex: pl.Enum = pa.Field(dtype_kwargs={"categories": sex_enum.categories}) # pyright: ignore [reportAny] + tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = [ + "state_name", + "county_name", + "county_fips", + "year", + "age_grp", + "sex", + ] + + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) + + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.col("county_fips") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")) + .all() + ) + .collect() + .item() + is True + ) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_county_age_sex_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + county_age_sex_pop(year, vintage_year=vintage_year).collect().pipe( + CountyAgeSexPopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + county_age_sex_pop(year, vintage_year=vintage_year) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_county_age_sex_pop_as_pandas(year: int) -> None: + df = county_age_sex_pop(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +def test_county_age_sex_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + county_age_sex_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType]