From 584e738bd76a8cfb6247701dce5dde0dd52a724e Mon Sep 17 00:00:00 2001 From: winter-again <63322884+winter-again@users.noreply.github.com> Date: Wed, 11 Mar 2026 15:16:06 -0400 Subject: [PATCH 1/4] Consolidate county and state population data into a single module --- README.md | 6 +- src/kintsugi/_data.py | 27 +- src/kintsugi/county_pop.py | 176 ---------- src/kintsugi/population.py | 678 ++++++++++++++++++++++++++++++++++++ tests/county_pop_test.py | 175 ---------- tests/population_test.py | 681 +++++++++++++++++++++++++++++++++++++ 6 files changed, 1380 insertions(+), 363 deletions(-) delete mode 100644 src/kintsugi/county_pop.py create mode 100644 src/kintsugi/population.py delete mode 100644 tests/county_pop_test.py create mode 100644 tests/population_test.py diff --git a/README.md b/README.md index c5e5ca0..d1a2131 100644 --- a/README.md +++ b/README.md @@ -39,13 +39,13 @@ counties = county_geo(2024) states = state_geo(2024) ``` -County and county-age population counts +State and county population data, stratified by several different variables: ```python -from kintsugi.county_pop import county_pop, county_age_pop +from kintsugi.population import county_pop, state_age_pop lf_county_pop = county_pop(2024) -lf_county_age_pop = county_age_pop(2024) +lf_state_age_pop = state_age_pop(2024) ``` Low-population county groups diff --git a/src/kintsugi/_data.py b/src/kintsugi/_data.py index ab4f6ff..69d334e 100644 --- a/src/kintsugi/_data.py +++ b/src/kintsugi/_data.py @@ -22,15 +22,24 @@ "geo/cb_2020_us_state_5m.zip": "aedc60e0d1924a9030ee6d39ff0ed27ad7d1b0bc86807ea809391a6b9008ffb3", "geo/cb_2024_us_county_5m.zip": "a867f8734059b45d1d54a0ba56189dd7e73c42eb451418fa56de44c35232614b", "geo/cb_2024_us_state_5m.zip": "c9db0e395c11a1f94a8017fde4f4c7cbee1dca6eb37ba8f1ccaab927df70885f", - "pop/county_cc/county_pop_2016.parquet": "1d337d32b401b1d101f643e4f734dc62f6fc4659d9c168cab025fcfacdc930ec", - "pop/county_cc/county_pop_2017.parquet": 
"7f3d834d37d505baee184352cc3c2144cb5dde1745a51356c8b5debc0fddc768", - "pop/county_cc/county_pop_2018.parquet": "45e476c3bbe375b2de44b261bccec032609320de311cc95c02b1125216d8c748", - "pop/county_cc/county_pop_2019.parquet": "06081711d88339c4e2af398e3e7345d336b26b5c3a6148b00f0c4273b51a7f4b", - "pop/county_cc/county_pop_2020.parquet": "4ba406a680041dd3cb4025733fffe852383a9171dcd7ceaaa3fa4e551573dc57", - "pop/county_cc/county_pop_2021.parquet": "527c058c14b8de7826748bb883969bed8960a9e060e2aa010bd6367f41458306", - "pop/county_cc/county_pop_2022.parquet": "bffccaf83d23245378cbc900f5f7bc1740c7dd2c5085570b20d44649d5afcbc1", - "pop/county_cc/county_pop_2023.parquet": "dc5941017a40488424faae38fcca8b7032024523e823af17c0d539b657ee239a", - "pop/county_cc/county_pop_2024.parquet": "cae4e9e5d956dfdd60a68a06887e0c4a1f8918f81e09c8fe2015f3b1feb85d82", + "pop/county_cc/county_pop_2016.parquet": "74caad19bf5eed856ad9b6f63c65f7fceca612dec680d0768890de2265116607", + "pop/county_cc/county_pop_2017.parquet": "d93d027929861e115cf34b15f1ff7c697c8eaa327b73cd8132710a11860a63d5", + "pop/county_cc/county_pop_2018.parquet": "be3d3bab642a9f6f111c792a431f940b1753373194993885e4d47c136feed91a", + "pop/county_cc/county_pop_2019.parquet": "98801f118cd795c026a8269d5ac6674f98b9d47e0207c6a2721a5b7f4b6e5c08", + "pop/county_cc/county_pop_2020.parquet": "f1e4f282d297dc5498b6f839412c0815ca6f9e0a15d83d5d3867f2d70aa8413d", + "pop/county_cc/county_pop_2021.parquet": "3af369564ebb0e1fda25b440e5bf133ecb2d2eab60ab40f5db1f0a0955db713b", + "pop/county_cc/county_pop_2022.parquet": "977856eb5fffd508442ccedaa54c92e338b037135e5a9be55a03c7132863d9ca", + "pop/county_cc/county_pop_2023.parquet": "a4d66c302a557c1565ec9f43bad5ea9d4267576d1fbd17d8939e5a858a3d73e7", + "pop/county_cc/county_pop_2024.parquet": "12b16c7c20329a3df2f4120f6ec9a9a7313147fad0fd03bc360b1de5769c8abd", + "pop/state/state_pop_2016.parquet": "bac51c5ba4a9ff7305e92b3b2804c854fc20b9cbcf01156e5439d92668c0c81e", + "pop/state/state_pop_2017.parquet": 
"6fb950b1b78409af8130317b08b437b742c0906ff9d5c38655c1189103b8dddc", + "pop/state/state_pop_2018.parquet": "913fca35299028a842325000e58e33cd3912c1e900d480f00b468095398e57f8", + "pop/state/state_pop_2019.parquet": "7ca2c87065f24857178bb33a7512cb799a92890596bac6fff1cbeb3c69f6fc36", + "pop/state/state_pop_2020.parquet": "275b861e07f1c2327fb5382a28e84a5fb7ac4f896ae9f91b06612f6197af9611", + "pop/state/state_pop_2021.parquet": "8b47a5c9fdca838954c8ddac8265ad00d590281c7b444019070c81b9942a727e", + "pop/state/state_pop_2022.parquet": "ea113b3766c44bbf250e01b0b9509e810590119b3b9470b13dc347d43aed042b", + "pop/state/state_pop_2023.parquet": "e96a982342510fe6a1ba90fc85a9bd6fbdd8687bceaf76e6e117606429d2d160", + "pop/state/state_pop_2024.parquet": "b79bca471a68b8c3742ec30d41a2b65ab1227152e81239faf00763188752c6ff", "county_groups.parquet": "7d7c150b5efd5596e0eaaed27abd6dc86137f08ff677c2606d402b9d165b87fa", "state.txt": "bea4e03f71a1fa0045ae732aabad11fa541e5932b071c2369bb0d325e8cba5a0", } diff --git a/src/kintsugi/county_pop.py b/src/kintsugi/county_pop.py deleted file mode 100644 index 284780e..0000000 --- a/src/kintsugi/county_pop.py +++ /dev/null @@ -1,176 +0,0 @@ -from typing import Literal, NamedTuple, overload - -import pandas as pd -import polars as pl - -from ._data import get_dataset - -type VintageYear = Literal[ - 2016, - 2017, - 2018, - 2019, - 2020, - 2021, - 2022, - 2023, - 2024, -] - - -class Vintage(NamedTuple): - year_lb: int - year_ub: int - county_fips: set[str] - - -def get_vintage(vintage_year: VintageYear) -> Vintage: - """ - Get info like year bounds for a given vintage year - """ - vintage_year_lb = 2016 - vintage_year_ub = 2024 - if not (vintage_year_lb <= vintage_year <= vintage_year_ub): - raise ValueError( - f"Must choose a vintage year between {vintage_year_lb} and {vintage_year_ub}" - ) - - data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") - county_fips = set( - pl.scan_parquet(data) - .select("county_fips") - .unique() - .collect() 
- .to_series() - .to_list() - ) - if vintage_year <= 2020: - year_lb = 2010 - else: - year_lb = 2020 - - return Vintage(year_lb, vintage_year, county_fips) - - -@overload -def county_pop( - year: int, - *, - vintage_year: VintageYear | None = ..., - as_pandas: Literal[False] = ..., -) -> pl.LazyFrame: ... - - -@overload -def county_pop( - year: int, *, vintage_year: VintageYear | None = ..., as_pandas: Literal[True] -) -> pd.DataFrame: ... - - -def county_pop( - year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False -) -> pl.LazyFrame | pd.DataFrame: - """ - County population estimates for select years. Uses county population - by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-detail.html - The raw files are not present in the kintsugi-data repo because of their large size. - Instead, we use parquet files containing a subset of columns. - - It's recommended to use the latest possible vintage to get a given year's data. However, - you may specify a specific vintage year if, for example, you need a certain set of county - geographies. If `vintage_year` is `None` (by default), data for years in the range [2010, 2019] - are sourced from the 2020 vintage (2010-2020 data), while data for years in the range - [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). 
- - Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/counties/asrh/cc-est2024-alldata.csv - """ - if vintage_year is None: - if 2010 <= year <= 2019: - vintage_year = 2020 - else: - vintage_year = 2024 - - vintage = get_vintage(vintage_year) - if not (vintage.year_lb <= year <= vintage.year_ub): - raise ValueError( - f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" - ) - - data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") - lf = ( - pl.scan_parquet(data) - .filter( - pl.col("year") == year, - pl.col("age_grp") == "tot", - ) - .select("state_name", "county_name", "county_fips", "year", "tot_pop") - .sort("county_fips") - ) - - if as_pandas: - return lf.collect().to_pandas() - - return lf - - -@overload -def county_age_pop( - year: int, - *, - vintage_year: VintageYear | None = ..., - as_pandas: Literal[False] = ..., -) -> pl.LazyFrame: ... - - -@overload -def county_age_pop( - year: int, *, vintage_year: VintageYear | None = ..., as_pandas: Literal[True] -) -> pd.DataFrame: ... - - -def county_age_pop( - year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False -) -> pl.LazyFrame | pd.DataFrame: - """ - County-age population estimates for select years. Uses county population - by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-detail.html - The raw files are not present in the kintsugi-data repo because of their large size. - Instead, we use parquet files containing a subset of columns. - - It's recommended to use the latest possible vintage to get a given year's data. However, - you may specify a specific vintage year if, for example, you need a certain set of county - geographies. If `vintage_year` is `None` (by default), data for years in the range [2010, 2019] - are sourced from the 2020 vintage (2010-2020 data), while data for years in the range - [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). 
- - Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/counties/asrh/cc-est2024-alldata.csv - """ - if vintage_year is None: - if 2010 <= year <= 2019: - vintage_year = 2020 - else: - vintage_year = 2024 - - vintage = get_vintage(vintage_year) - if not (vintage.year_lb <= year <= vintage.year_ub): - raise ValueError( - f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" - ) - - data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") - lf = ( - pl.scan_parquet(data) - .filter( - pl.col("year") == year, - pl.col("age_grp") != "tot", - ) - .select( - "state_name", "county_name", "county_fips", "year", "age_grp", "tot_pop" - ) - .sort("county_fips", "age_grp") - ) - - if as_pandas: - return lf.collect().to_pandas() - - return lf diff --git a/src/kintsugi/population.py b/src/kintsugi/population.py new file mode 100644 index 0000000..0ce8ac9 --- /dev/null +++ b/src/kintsugi/population.py @@ -0,0 +1,678 @@ +from typing import Literal, NamedTuple, overload + +import pandas as pd +import polars as pl + +from ._data import get_dataset + +type VintageYear = Literal[ + 2016, + 2017, + 2018, + 2019, + 2020, + 2021, + 2022, + 2023, + 2024, +] + + +class Vintage(NamedTuple): + year_lb: int + year_ub: int + county_fips: set[str] + + +def get_vintage(vintage_year: VintageYear) -> Vintage: + """ + Get info like year bounds for a given vintage year + """ + vintage_year_lb = 2016 + vintage_year_ub = 2024 + if not (vintage_year_lb <= vintage_year <= vintage_year_ub): + raise ValueError( + f"Must choose a vintage year between {vintage_year_lb} and {vintage_year_ub}" + ) + + data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") + county_fips = set( + pl.scan_parquet(data) + .select("county_fips") + .unique() + .collect() + .to_series() + .to_list() + ) + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + return Vintage(year_lb, vintage_year, county_fips) + + +@overload +def 
state_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... + + +@overload +def state_pop( + year: int, *, vintage_year: VintageYear | None = ..., as_pandas: Literal[True] +) -> pd.DataFrame: ... + + +def state_pop( + year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False +) -> pl.LazyFrame | pd.DataFrame: + """ + State population estimates for select years. Uses state population + by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-detail.html + The raw files are not present in the kintsugi-data repo. Instead, we use parquet files containing a subset of columns. + + It's recommended to use the latest possible vintage to get a given year's data. However, + you may specify a specific vintage year. If `vintage_year` is `None` (by default), data + for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), + while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). 
+ + Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/state/asrh/sc-est2024-alldata5.csv + """ + if vintage_year is None: + if 2010 <= year <= 2019: + vintage_year = 2020 + else: + vintage_year = 2024 + + vintage = get_vintage(vintage_year) + if not (vintage.year_lb <= year <= vintage.year_ub): + raise ValueError( + f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" + ) + + data = get_dataset(f"pop/state/state_pop_{vintage_year}.parquet") + lf = ( + pl.scan_parquet(data) + .filter( + pl.col("year") == year, + pl.col("sex") == "tot", + pl.col("hispanic_origin") == "tot", + ) + .drop("sex", "hispanic_origin") + .group_by(["state_name", "state_fips", "year"]) + .agg(tot_pop=pl.col("tot_pop").sum()) + .sort("state_fips") + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def state_age_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... + + +@overload +def state_age_pop( + year: int, *, vintage_year: VintageYear | None = ..., as_pandas: Literal[True] +) -> pd.DataFrame: ... + + +def state_age_pop( + year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False +) -> pl.LazyFrame | pd.DataFrame: + """ + State-age population estimates for select years. Age is given in years, not binned groups. + Note that an age value of `85` corresponds to >= 85 years old. + Uses state population by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-detail.html + The raw files are not present in the kintsugi-data repo. Instead, we use parquet files containing a subset of columns. + + It's recommended to use the latest possible vintage to get a given year's data. However, + you may specify a specific vintage year. 
If `vintage_year` is `None` (by default), data + for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), + while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + + Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/state/asrh/sc-est2024-alldata5.csv + """ + if vintage_year is None: + if 2010 <= year <= 2019: + vintage_year = 2020 + else: + vintage_year = 2024 + + vintage = get_vintage(vintage_year) + if not (vintage.year_lb <= year <= vintage.year_ub): + raise ValueError( + f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" + ) + + data = get_dataset(f"pop/state/state_pop_{vintage_year}.parquet") + lf = ( + pl.scan_parquet(data) + .filter( + pl.col("year") == year, + pl.col("sex") == "tot", + pl.col("hispanic_origin") == "tot", + ) + .drop("sex", "hispanic_origin") + .group_by(["state_name", "state_fips", "year", "age"]) + .agg(tot_pop=pl.col("tot_pop").sum()) + .sort("state_fips", "age") + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def state_sex_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... + + +@overload +def state_sex_pop( + year: int, *, vintage_year: VintageYear | None = ..., as_pandas: Literal[True] +) -> pd.DataFrame: ... + + +def state_sex_pop( + year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False +) -> pl.LazyFrame | pd.DataFrame: + """ + State-sex population estimates for select years. Uses state population by characteristics + data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-detail.html + The raw files are not present in the kintsugi-data repo. Instead, we use parquet files containing a subset of columns. + + It's recommended to use the latest possible vintage to get a given year's data. However, + you may specify a specific vintage year. 
If `vintage_year` is `None` (by default), data + for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), + while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + + Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/state/asrh/sc-est2024-alldata5.csv + """ + if vintage_year is None: + if 2010 <= year <= 2019: + vintage_year = 2020 + else: + vintage_year = 2024 + + vintage = get_vintage(vintage_year) + if not (vintage.year_lb <= year <= vintage.year_ub): + raise ValueError( + f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" + ) + + data = get_dataset(f"pop/state/state_pop_{vintage_year}.parquet") + lf = ( + pl.scan_parquet(data) + .filter( + pl.col("year") == year, + pl.col("sex") != "tot", + pl.col("hispanic_origin") == "tot", + ) + .drop("hispanic_origin") + .group_by(["state_name", "state_fips", "year", "sex"]) + .agg(tot_pop=pl.col("tot_pop").sum()) + .sort("state_fips", "sex") + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def state_race_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + incl_hispanic_orig: bool = ..., + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... + + +@overload +def state_race_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + incl_hispanic_orig: bool = ..., + as_pandas: Literal[True], +) -> pd.DataFrame: ... + + +def state_race_pop( + year: int, + *, + vintage_year: VintageYear | None = None, + incl_hispanic_orig: bool = False, + as_pandas: bool = False, +) -> pl.LazyFrame | pd.DataFrame: + """ + State-race population estimates for select years. Specify `incl_hispanic_orig=True` to include + Hispanic counts column. Uses state population by characteristics + data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-detail.html + The raw files are not present in the kintsugi-data repo.
Instead, we use parquet files containing a subset of columns. + + It's recommended to use the latest possible vintage to get a given year's data. However, + you may specify a specific vintage year. If `vintage_year` is `None` (by default), data + for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), + while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + + Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/state/asrh/sc-est2024-alldata5.csv + """ + if vintage_year is None: + if 2010 <= year <= 2019: + vintage_year = 2020 + else: + vintage_year = 2024 + + vintage = get_vintage(vintage_year) + if not (vintage.year_lb <= year <= vintage.year_ub): + raise ValueError( + f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" + ) + + data = get_dataset(f"pop/state/state_pop_{vintage_year}.parquet") + lf = ( + pl.scan_parquet(data) + .filter( + pl.col("year") == year, + pl.col("sex") == "tot", + pl.col("hispanic_origin") != "tot" + if incl_hispanic_orig + else pl.col("hispanic_origin") == "tot", + ) + .drop("sex") + .group_by( + ["state_name", "state_fips", "year", "race", "hispanic_origin"] + if incl_hispanic_orig + else ["state_name", "state_fips", "year", "race"] + ) + .agg(tot_pop=pl.col("tot_pop").sum()) + .sort( + ["state_fips", "race", "hispanic_origin"] + if incl_hispanic_orig + else ["state_fips", "race"] + ) + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def state_age_sex_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... + + +@overload +def state_age_sex_pop( + year: int, *, vintage_year: VintageYear | None = ..., as_pandas: Literal[True] +) -> pd.DataFrame: ... 
+ + +def state_age_sex_pop( + year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False +) -> pl.LazyFrame | pd.DataFrame: + """ + State-age-sex population estimates for select years. Age is given in years, not binned groups. + Note that an age value of `85` corresponds to >= 85 years old. + Uses state population by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-detail.html + The raw files are not present in the kintsugi-data repo. Instead, we use parquet files containing a subset of columns. + + It's recommended to use the latest possible vintage to get a given year's data. However, + you may specify a specific vintage year. If `vintage_year` is `None` (by default), data + for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), + while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + + Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/state/asrh/sc-est2024-alldata5.csv + """ + if vintage_year is None: + if 2010 <= year <= 2019: + vintage_year = 2020 + else: + vintage_year = 2024 + + vintage = get_vintage(vintage_year) + if not (vintage.year_lb <= year <= vintage.year_ub): + raise ValueError( + f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" + ) + + data = get_dataset(f"pop/state/state_pop_{vintage_year}.parquet") + lf = ( + pl.scan_parquet(data) + .filter( + pl.col("year") == year, + pl.col("sex") != "tot", + pl.col("hispanic_origin") == "tot", + ) + .drop("hispanic_origin") + .group_by(["state_name", "state_fips", "year", "age", "sex"]) + .agg(tot_pop=pl.col("tot_pop").sum()) + .sort("state_fips", "age", "sex") + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def county_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... 
+ + +@overload +def county_pop( + year: int, *, vintage_year: VintageYear | None = ..., as_pandas: Literal[True] +) -> pd.DataFrame: ... + + +def county_pop( + year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False +) -> pl.LazyFrame | pd.DataFrame: + """ + County population estimates for select years. Uses county population + by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-detail.html + The raw files are not present in the kintsugi-data repo because of their large size. + Instead, we use parquet files containing a subset of columns. + + It's recommended to use the latest possible vintage to get a given year's data. However, + you may specify a specific vintage year if, for example, you need a certain set of county + geographies. If `vintage_year` is `None` (by default), data for years in the range [2010, 2019] + are sourced from the 2020 vintage (2010-2020 data), while data for years in the range + [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + + Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/counties/asrh/cc-est2024-alldata.csv + """ + if vintage_year is None: + if 2010 <= year <= 2019: + vintage_year = 2020 + else: + vintage_year = 2024 + + vintage = get_vintage(vintage_year) + if not (vintage.year_lb <= year <= vintage.year_ub): + raise ValueError( + f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" + ) + + data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") + lf = ( + pl.scan_parquet(data) + .filter( + pl.col("year") == year, + pl.col("age_grp") == "tot", + ) + .select("state_name", "county_name", "county_fips", "year", "tot_pop") + .sort("county_fips") + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def county_age_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... 
+ + +@overload +def county_age_pop( + year: int, *, vintage_year: VintageYear | None = ..., as_pandas: Literal[True] +) -> pd.DataFrame: ... + + +def county_age_pop( + year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False +) -> pl.LazyFrame | pd.DataFrame: + """ + County-age population estimates for select years. Uses county population + by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-detail.html + The raw files are not present in the kintsugi-data repo because of their large size. + Instead, we use parquet files containing a subset of columns. + + It's recommended to use the latest possible vintage to get a given year's data. However, + you may specify a specific vintage year if, for example, you need a certain set of county + geographies. If `vintage_year` is `None` (by default), data for years in the range [2010, 2019] + are sourced from the 2020 vintage (2010-2020 data), while data for years in the range + [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). 
+ + Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/counties/asrh/cc-est2024-alldata.csv + """ + if vintage_year is None: + if 2010 <= year <= 2019: + vintage_year = 2020 + else: + vintage_year = 2024 + + vintage = get_vintage(vintage_year) + if not (vintage.year_lb <= year <= vintage.year_ub): + raise ValueError( + f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" + ) + + data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") + lf = ( + pl.scan_parquet(data) + .filter( + pl.col("year") == year, + pl.col("age_grp") != "tot", + ) + .select( + "state_name", "county_name", "county_fips", "year", "age_grp", "tot_pop" + ) + .sort("county_fips", "age_grp") + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def county_sex_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... + + +@overload +def county_sex_pop( + year: int, *, vintage_year: VintageYear | None = ..., as_pandas: Literal[True] +) -> pd.DataFrame: ... + + +def county_sex_pop( + year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False +) -> pl.LazyFrame | pd.DataFrame: + """ + County-sex population estimates for select years. Uses county population + by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-detail.html + The raw files are not present in the kintsugi-data repo because of their large size. + Instead, we use parquet files containing a subset of columns. + + It's recommended to use the latest possible vintage to get a given year's data. However, + you may specify a specific vintage year if, for example, you need a certain set of county + geographies. 
If `vintage_year` is `None` (by default), data for years in the range [2010, 2019] + are sourced from the 2020 vintage (2010-2020 data), while data for years in the range + [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + + Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/counties/asrh/cc-est2024-alldata.csv + """ + if vintage_year is None: + if 2010 <= year <= 2019: + vintage_year = 2020 + else: + vintage_year = 2024 + + vintage = get_vintage(vintage_year) + if not (vintage.year_lb <= year <= vintage.year_ub): + raise ValueError( + f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" + ) + + data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") + lf = ( + pl.scan_parquet(data) + .filter( + pl.col("year") == year, + pl.col("age_grp") == "tot", + ) + .select( + "state_name", "county_name", "county_fips", "year", "tot_male", "tot_female" + ) + .sort("county_fips") + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def county_race_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + incl_hispanic_orig: bool = ..., + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... + + +@overload +def county_race_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + incl_hispanic_orig: bool = ..., + as_pandas: Literal[True], +) -> pd.DataFrame: ... + + +def county_race_pop( + year: int, + *, + vintage_year: VintageYear | None = None, + incl_hispanic_orig: bool = False, + as_pandas: bool = False, +) -> pl.LazyFrame | pd.DataFrame: + """ + County-race population estimates for select years. Specify `incl_hispanic_orig=True` to include + Hispanic counts column. Uses county population by characteristics + data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-detail.html + The raw files are not present in the kintsugi-data repo because of their large size.
+ Instead, we use parquet files containing a subset of columns. + + It's recommended to use the latest possible vintage to get a given year's data. However, + you may specify a specific vintage year if, for example, you need a certain set of county + geographies. If `vintage_year` is `None` (by default), data for years in the range [2010, 2019] + are sourced from the 2020 vintage (2010-2020 data), while data for years in the range + [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + + Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/counties/asrh/cc-est2024-alldata.csv + """ + if vintage_year is None: + if 2010 <= year <= 2019: + vintage_year = 2020 + else: + vintage_year = 2024 + + vintage = get_vintage(vintage_year) + if not (vintage.year_lb <= year <= vintage.year_ub): + raise ValueError( + f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" + ) + + data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") + lf = ( + pl.scan_parquet(data) + .filter( + pl.col("year") == year, + pl.col("age_grp") == "tot", + ) + .select( + "state_name", + "county_name", + "county_fips", + "year", + "white_male", + "white_female", + "black_male", + "black_female", + "aian_male", + "aian_female", + "asian_male", + "asian_female", + "nhpi_male", + "nhpi_female", + "hispanic_male", + "hispanic_female", + ) + .with_columns( + (pl.col(f"{r}_male") + pl.col(f"{r}_female")).alias(r) + for r in ["white", "black", "aian", "asian", "nhpi", "hispanic"] + ) + .select( + "state_name", + "county_name", + "county_fips", + "year", + "white", + "black", + "aian", + "asian", + "nhpi", + "hispanic", + ) + .sort("county_fips") + ) + + if not incl_hispanic_orig: + lf = lf.drop("hispanic") + + if as_pandas: + return lf.collect().to_pandas() + + return lf diff --git a/tests/county_pop_test.py b/tests/county_pop_test.py deleted file mode 100644 index 68d1a48..0000000 --- a/tests/county_pop_test.py +++ /dev/null @@ 
-1,175 +0,0 @@ -import pandera.polars as pa -import polars as pl -import pytest -from pandas import DataFrame -from pandera.polars import PolarsData - -from kintsugi.county_pop import ( - VintageYear, - county_age_pop, - county_pop, - get_vintage, -) - -from .models import BasePolarsModel - -age_grps = { - 0: "tot", - 1: "0-4", - 2: "5-9", - 3: "10-14", - 4: "15-19", - 5: "20-24", - 6: "25-29", - 7: "30-34", - 8: "35-39", - 9: "40-44", - 10: "45-49", - 11: "50-54", - 12: "55-59", - 13: "60-64", - 14: "65-69", - 15: "70-74", - 16: "75-79", - 17: "80-84", - 18: ">=85", -} -age_grp_enum = pl.Enum(age_grps.values()) - - -class CountyPopulation(BasePolarsModel): - state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - county_fips: pl.String = pa.Field(unique=True) # pyright: ignore [reportAny] - year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] - tot_pop: pl.Int64 = pa.Field(gt=0) # pyright: ignore [reportAny] - - class Config: # pyright: ignore [reportIncompatibleVariableOverride] - unique: list[str] = ["state_name", "county_name", "county_fips", "year"] - - @pa.dataframe_check - def has_correct_states(cls, data: PolarsData) -> bool: - return ( - data.lazyframe.select( - pl.col("county_fips") - .str.slice(0, 2) - .is_between(pl.lit("01"), pl.lit("56")) - .all() - ) - .collect() - .item() - is True - ) - - -@pytest.mark.parametrize( - ("year"), - range(2010, 2025), -) -@pytest.mark.parametrize( - ("vintage_year"), - range(2016, 2025), -) -def test_county_pop(year: int, vintage_year: VintageYear) -> None: - if vintage_year <= 2020: - year_lb = 2010 - else: - year_lb = 2020 - - if year_lb <= year <= vintage_year: - county_pop(year, vintage_year=vintage_year).collect().pipe( - CountyPopulation.validate, lazy=True - ) - else: - with pytest.raises(ValueError, match="^Must choose a year between"): - county_pop(year, vintage_year=vintage_year) - - -def 
test_county_pop_invalid_vintage_year_exception() -> None: - with pytest.raises(ValueError, match="^Must choose a vintage year between"): - county_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] - - -def test_get_vintage_info() -> None: - with pytest.raises(ValueError, match="^Must choose a vintage year between"): - get_vintage(2000) # pyright: ignore [reportArgumentType] - - -@pytest.mark.parametrize( - ("year"), - range(2010, 2025), -) -def test_county_pop_as_pandas(year: int) -> None: - df = county_pop(year, as_pandas=True) - - assert isinstance(df, DataFrame) - - -class CountyAgePopulation(BasePolarsModel): - state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] - age_grp: pl.Enum = pa.Field(dtype_kwargs={"categories": age_grp_enum.categories}) # pyright: ignore [reportAny] - tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] - - class Config: # pyright: ignore [reportIncompatibleVariableOverride] - unique: list[str] = [ - "state_name", - "county_name", - "county_fips", - "year", - "age_grp", - ] - - @pa.dataframe_check - def has_correct_states(cls, data: PolarsData) -> bool: - return ( - data.lazyframe.select( - pl.col("county_fips") - .str.slice(0, 2) - .is_between(pl.lit("01"), pl.lit("56")) - .all() - ) - .collect() - .item() - is True - ) - - -@pytest.mark.parametrize( - ("year"), - range(2010, 2025), -) -@pytest.mark.parametrize( - ("vintage_year"), - range(2016, 2025), -) -def test_county_age_pop(year: int, vintage_year: VintageYear) -> None: - if vintage_year <= 2020: - year_lb = 2010 - else: - year_lb = 2020 - - if year_lb <= year <= vintage_year: - county_age_pop(year, vintage_year=vintage_year).collect().pipe( - CountyAgePopulation.validate, lazy=True - ) - else: 
- with pytest.raises(ValueError, match="^Must choose a year between"): - county_age_pop(year, vintage_year=vintage_year) - - -@pytest.mark.parametrize( - ("year"), - range(2010, 2025), -) -def test_county_age_pop_as_pandas(year: int) -> None: - df = county_age_pop(year, as_pandas=True) - - assert isinstance(df, DataFrame) - - -def test_county_age_pop_invalid_vintage_year_exception() -> None: - with pytest.raises(ValueError, match="^Must choose a vintage year between"): - county_age_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] diff --git a/tests/population_test.py b/tests/population_test.py new file mode 100644 index 0000000..0ecaaa8 --- /dev/null +++ b/tests/population_test.py @@ -0,0 +1,681 @@ +import pandera.polars as pa +import polars as pl +import pytest +from pandas import DataFrame +from pandera.polars import PolarsData + +from kintsugi.population import ( + VintageYear, + county_age_pop, + county_pop, + county_race_pop, + county_sex_pop, + get_vintage, + state_age_pop, + state_age_sex_pop, + state_pop, + state_race_pop, + state_sex_pop, +) + +from .models import BasePolarsModel + + +class StatePopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + state_fips: pl.String = pa.Field(unique=True, in_range=("01", "56")) # pyright: ignore [reportAny] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + tot_pop: pl.Int64 = pa.Field(gt=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = ["state_name", "state_fips", "year"] + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_state_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + state_pop(year, vintage_year=vintage_year).collect().pipe( + 
StatePopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + state_pop(year, vintage_year=vintage_year) + + +def test_state_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + state_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_state_pop_as_pandas(year: int) -> None: + df = state_pop(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +class StateAgePopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + state_fips: pl.String = pa.Field(in_range=("01", "56")) # pyright: ignore [reportAny] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + age: pl.Int64 = pa.Field(in_range=(0, 85)) # pyright: ignore [reportAny] + tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = [ + "state_name", + "state_fips", + "year", + "age", + ] + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_state_age_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + state_age_pop(year, vintage_year=vintage_year).collect().pipe( + StateAgePopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + state_age_pop(year, vintage_year=vintage_year) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_state_age_pop_as_pandas(year: int) -> None: + df = state_age_pop(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +def test_state_age_pop_invalid_vintage_year_exception() -> None: + with 
pytest.raises(ValueError, match="^Must choose a vintage year between"): + state_age_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] + + +class StateSexPopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + state_fips: pl.String = pa.Field(in_range=("01", "56")) # pyright: ignore [reportAny] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + sex: pl.Enum = pa.Field(dtype_kwargs={"categories": ["tot", "male", "female"]}) # pyright: ignore [reportAny] + tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = [ + "state_name", + "state_fips", + "year", + "sex", + ] + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_state_sex_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + state_sex_pop(year, vintage_year=vintage_year).collect().pipe( + StateSexPopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + state_sex_pop(year, vintage_year=vintage_year) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_state_sex_pop_as_pandas(year: int) -> None: + df = state_sex_pop(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +def test_state_sex_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + state_sex_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] + + +class StateRacePopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + state_fips: pl.String = pa.Field(in_range=("01", "56")) # pyright: ignore [reportAny] + year: pl.Int64 # 
pyright: ignore [reportUninitializedInstanceVariable] + race: pl.Enum = pa.Field( # pyright: ignore [reportAny] + dtype_kwargs={"categories": ["white", "black", "aian", "asian", "nhpi"]} + ) + tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = [ + "state_name", + "state_fips", + "year", + "race", + ] + + +class StateRaceHispanicPopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + state_fips: pl.String = pa.Field(in_range=("01", "56")) # pyright: ignore [reportAny] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + race: pl.Enum = pa.Field( # pyright: ignore [reportAny] + dtype_kwargs={"categories": ["white", "black", "aian", "asian", "nhpi"]} + ) + hispanic_origin: pl.Enum = pa.Field( # pyright: ignore [reportAny] + dtype_kwargs={"categories": ["tot", "not_hispanic", "hispanic"]} + ) + tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = [ + "state_name", + "state_fips", + "year", + "race", + "hispanic_origin", + ] + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_state_race_hispanic_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + state_race_pop( + year, vintage_year=vintage_year, incl_hispanic_orig=True + ).collect().pipe(StateRaceHispanicPopulation.validate, lazy=True) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + state_race_pop(year, vintage_year=vintage_year, incl_hispanic_orig=True) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_state_race_hispanic_pop_as_pandas(year: int) -> None: + df = 
state_race_pop(year, as_pandas=True, incl_hispanic_orig=True) + + assert isinstance(df, DataFrame) + + +def test_state_race_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + state_race_pop(2023, vintage_year=2000, incl_hispanic_orig=True) # pyright: ignore [reportArgumentType] + + +class StateAgeSexPopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + state_fips: pl.String = pa.Field(in_range=("01", "56")) # pyright: ignore [reportAny] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + age: pl.Int64 = pa.Field(in_range=(0, 85)) # pyright: ignore [reportAny] + sex: pl.Enum = pa.Field(dtype_kwargs={"categories": ["tot", "male", "female"]}) # pyright: ignore [reportAny] + tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = ["state_name", "state_fips", "year", "age", "sex"] + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_state_age_sex_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + state_age_sex_pop(year, vintage_year=vintage_year).collect().pipe( + StateAgeSexPopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + state_age_sex_pop(year, vintage_year=vintage_year) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_state_age_sex_pop_as_pandas(year: int) -> None: + df = state_age_sex_pop(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +def test_state_age_sex_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + 
state_age_sex_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] + + +age_grps = { + 0: "tot", + 1: "0-4", + 2: "5-9", + 3: "10-14", + 4: "15-19", + 5: "20-24", + 6: "25-29", + 7: "30-34", + 8: "35-39", + 9: "40-44", + 10: "45-49", + 11: "50-54", + 12: "55-59", + 13: "60-64", + 14: "65-69", + 15: "70-74", + 16: "75-79", + 17: "80-84", + 18: ">=85", +} +age_grp_enum = pl.Enum(age_grps.values()) + + +class CountyPopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_fips: pl.String = pa.Field(unique=True) # pyright: ignore [reportAny] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + tot_pop: pl.Int64 = pa.Field(gt=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = ["state_name", "county_name", "county_fips", "year"] + + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.col("county_fips") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")) + .all() + ) + .collect() + .item() + is True + ) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_county_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + county_pop(year, vintage_year=vintage_year).collect().pipe( + CountyPopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + county_pop(year, vintage_year=vintage_year) + + +def test_county_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + county_pop(2023, vintage_year=2000) # pyright: ignore 
[reportArgumentType] + + +def test_get_vintage_info() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + get_vintage(2000) # pyright: ignore [reportArgumentType] + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_county_pop_as_pandas(year: int) -> None: + df = county_pop(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +class CountyAgePopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + age_grp: pl.Enum = pa.Field(dtype_kwargs={"categories": age_grp_enum.categories}) # pyright: ignore [reportAny] + tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = [ + "state_name", + "county_name", + "county_fips", + "year", + "age_grp", + ] + + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.col("county_fips") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")) + .all() + ) + .collect() + .item() + is True + ) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_county_age_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + county_age_pop(year, vintage_year=vintage_year).collect().pipe( + CountyAgePopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + county_age_pop(year, vintage_year=vintage_year) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def 
test_county_age_pop_as_pandas(year: int) -> None: + df = county_age_pop(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +def test_county_age_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + county_age_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] + + +class CountySexPopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + tot_male: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + tot_female: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = [ + "state_name", + "county_name", + "county_fips", + "year", + ] + + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.col("county_fips") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")) + .all() + ) + .collect() + .item() + is True + ) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_county_sex_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + county_sex_pop(year, vintage_year=vintage_year).collect().pipe( + CountySexPopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + county_sex_pop(year, vintage_year=vintage_year) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_county_sex_pop_as_pandas(year: int) -> None: + df = county_sex_pop(year, 
as_pandas=True) + + assert isinstance(df, DataFrame) + + +def test_county_sex_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + county_sex_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] + + +class CountyRacePopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + white: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + black: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + aian: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + asian: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + nhpi: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = [ + "state_name", + "county_name", + "county_fips", + "year", + ] + + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.col("county_fips") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")) + .all() + ) + .collect() + .item() + is True + ) + + +class CountyRaceHispanicPopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + white: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + black: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + aian: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + asian: pl.Int64 = pa.Field(ge=0) # pyright: 
ignore [reportAny] + nhpi: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + hispanic: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = [ + "state_name", + "county_name", + "county_fips", + "year", + ] + + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.col("county_fips") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")) + .all() + ) + .collect() + .item() + is True + ) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_county_race_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + county_race_pop(year, vintage_year=vintage_year).collect().pipe( + CountyRacePopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + county_race_pop(year, vintage_year=vintage_year) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_county_race_pop_as_pandas(year: int) -> None: + df = county_race_pop(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +def test_county_race_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + county_race_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_county_race_hispanic_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + county_race_pop( + year, vintage_year=vintage_year, incl_hispanic_orig=True + 
).collect().pipe(CountyRaceHispanicPopulation.validate, lazy=True) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + county_race_pop(year, vintage_year=vintage_year, incl_hispanic_orig=True) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_county_race_hispanic_pop_as_pandas(year: int) -> None: + df = county_race_pop(year, as_pandas=True, incl_hispanic_orig=True) + + assert isinstance(df, DataFrame) + + +def test_county_race_hispanic_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + county_race_pop(2023, vintage_year=2000, incl_hispanic_orig=True) # pyright: ignore [reportArgumentType] From 33486f072bba453954c4994477563018afba2c54 Mon Sep 17 00:00:00 2001 From: winter-again <63322884+winter-again@users.noreply.github.com> Date: Wed, 11 Mar 2026 17:18:37 -0400 Subject: [PATCH 2/4] Validate year against vintage year in separate function. Refine state population functions and their tests. 
--- src/kintsugi/population.py | 206 ++++++--- tests/population_test.py | 840 ++++++++++++++++++++----------------- 2 files changed, 588 insertions(+), 458 deletions(-) diff --git a/src/kintsugi/population.py b/src/kintsugi/population.py index 0ce8ac9..a98a610 100644 --- a/src/kintsugi/population.py +++ b/src/kintsugi/population.py @@ -18,16 +18,14 @@ ] -class Vintage(NamedTuple): - year_lb: int - year_ub: int - county_fips: set[str] +# class Vintage(NamedTuple): +# year_lb: int +# year_ub: int +# county_fips: set[str] -def get_vintage(vintage_year: VintageYear) -> Vintage: - """ - Get info like year bounds for a given vintage year - """ +def validate_vintage_year(year: int, vintage_year: VintageYear) -> None: + """Validate year against vintage_year""" vintage_year_lb = 2016 vintage_year_ub = 2024 if not (vintage_year_lb <= vintage_year <= vintage_year_ub): @@ -35,21 +33,47 @@ def get_vintage(vintage_year: VintageYear) -> Vintage: f"Must choose a vintage year between {vintage_year_lb} and {vintage_year_ub}" ) - data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") - county_fips = set( - pl.scan_parquet(data) - .select("county_fips") - .unique() - .collect() - .to_series() - .to_list() - ) if vintage_year <= 2020: year_lb = 2010 else: year_lb = 2020 - return Vintage(year_lb, vintage_year, county_fips) + if not (year_lb <= year <= vintage_year): + raise ValueError(f"Must choose a year between {year_lb} and {vintage_year}") + + +# def _get_vintage(vintage_year: VintageYear) -> Vintage: +# """Get info like year bounds for a given vintage year.""" +# vintage_year_lb = 2016 +# vintage_year_ub = 2024 +# if not (vintage_year_lb <= vintage_year <= vintage_year_ub): +# raise ValueError( +# f"Must choose a vintage year between {vintage_year_lb} and {vintage_year_ub}" +# ) +# +# data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") +# county_fips = set( +# pl.scan_parquet(data) +# .select("county_fips") +# .unique() +# .collect() +# 
.to_series() +# .to_list() +# ) +# if vintage_year <= 2020: +# year_lb = 2010 +# else: +# year_lb = 2020 +# +# return Vintage(year_lb, vintage_year, county_fips) + + +# TODO: should docstrings have info on the schema? + +# match conventions in kintsugi-data processing script +sex_enum = pl.Enum(["tot", "male", "female"]) +race_enum = pl.Enum(["white", "black", "aian", "asian", "nhpi"]) +hispanic_enum = pl.Enum(["tot", "not_hispanic", "hispanic"]) @overload @@ -71,12 +95,13 @@ def state_pop( year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False ) -> pl.LazyFrame | pd.DataFrame: """ - State population estimates for select years. Uses state population - by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-detail.html - The raw files are not present in the kintsugi-data repo. Instead, we use parquet files containing a subset of columns. + State population estimates for select years. + + Uses state population by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-detail.html. + The raw files are not present in the kintsugi-data repo. Instead, parquet files containing a subset of columns are used. It's recommended to use the latest possible vintage to get a given year's data. However, - you may specify a specific vintage year. If `vintage_year` is `None` (by default), data + a specific vintage year may be provided. If `vintage_year` is `None` (the default), data for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). 
@@ -88,12 +113,7 @@ def state_pop( else: vintage_year = 2024 - vintage = get_vintage(vintage_year) - if not (vintage.year_lb <= year <= vintage.year_ub): - raise ValueError( - f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" - ) - + validate_vintage_year(year, vintage_year) data = get_dataset(f"pop/state/state_pop_{vintage_year}.parquet") lf = ( pl.scan_parquet(data) @@ -102,7 +122,6 @@ def state_pop( pl.col("sex") == "tot", pl.col("hispanic_origin") == "tot", ) - .drop("sex", "hispanic_origin") .group_by(["state_name", "state_fips", "year"]) .agg(tot_pop=pl.col("tot_pop").sum()) .sort("state_fips") @@ -133,13 +152,14 @@ def state_age_pop( year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False ) -> pl.LazyFrame | pd.DataFrame: """ - State-age population estimates for select years. Age is given in years, not binned groups. - Note that an age value of `85` corresponds to >= 85 years old. - Uses state population by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-detail.html - The raw files are not present in the kintsugi-data repo. Instead, we use parquet files containing a subset of columns. + State-age population estimates for select years. + + Age is given in years, not binned groups. Note that an age value of `85` corresponds to >= 85 years old. + Uses state population by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-detail.html. + The raw files are not present in the kintsugi-data repo. Instead, parquet files containing a subset of columns are used. It's recommended to use the latest possible vintage to get a given year's data. However, - you may specify a specific vintage year. If `vintage_year` is `None` (by default), data + a specific vintage year may be provided. 
If `vintage_year` is `None` (the default), data for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). @@ -151,12 +171,7 @@ def state_age_pop( else: vintage_year = 2024 - vintage = get_vintage(vintage_year) - if not (vintage.year_lb <= year <= vintage.year_ub): - raise ValueError( - f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" - ) - + validate_vintage_year(year, vintage_year) data = get_dataset(f"pop/state/state_pop_{vintage_year}.parquet") lf = ( pl.scan_parquet(data) @@ -165,7 +180,6 @@ def state_age_pop( pl.col("sex") == "tot", pl.col("hispanic_origin") == "tot", ) - .drop("sex", "hispanic_origin") .group_by(["state_name", "state_fips", "year", "age"]) .agg(tot_pop=pl.col("tot_pop").sum()) .sort("state_fips", "age") @@ -213,12 +227,7 @@ def state_sex_pop( else: vintage_year = 2024 - vintage = get_vintage(vintage_year) - if not (vintage.year_lb <= year <= vintage.year_ub): - raise ValueError( - f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" - ) - + validate_vintage_year(year, vintage_year) data = get_dataset(f"pop/state/state_pop_{vintage_year}.parquet") lf = ( pl.scan_parquet(data) @@ -227,7 +236,6 @@ def state_sex_pop( pl.col("sex") != "tot", pl.col("hispanic_origin") == "tot", ) - .drop("hispanic_origin") .group_by(["state_name", "state_fips", "year", "sex"]) .agg(tot_pop=pl.col("tot_pop").sum()) .sort("state_fips", "sex") @@ -285,12 +293,7 @@ def state_race_pop( else: vintage_year = 2024 - vintage = get_vintage(vintage_year) - if not (vintage.year_lb <= year <= vintage.year_ub): - raise ValueError( - f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" - ) - + validate_vintage_year(year, vintage_year) data = get_dataset(f"pop/state/state_pop_{vintage_year}.parquet") lf = ( pl.scan_parquet(data) @@ -301,7 +304,6 @@ def state_race_pop( if incl_hispanic_orig else 
pl.col("hispanic_origin") == "tot", ) - .drop("sex") .group_by( ["state_name", "state_fips", "year", "race", "hispanic_origin"] if incl_hispanic_orig @@ -340,13 +342,14 @@ def state_age_sex_pop( year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False ) -> pl.LazyFrame | pd.DataFrame: """ - State-age-sex population estimates for select years. Age is given in years, not binned groups. - Note that an age value of `85` corresponds to >= 85 years old. - Uses state population by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-detail.html - The raw files are not present in the kintsugi-data repo. Instead, we use parquet files containing a subset of columns. + State-age-sex population estimates for select years. + + Age is given in years, not binned groups. Note that an age value of `85` corresponds to >= 85 years old. + Uses state population by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-detail.html. + The raw files are not present in the kintsugi-data repo. Instead, parquet files containing a subset of columns are used. It's recommended to use the latest possible vintage to get a given year's data. However, - you may specify a specific vintage year. If `vintage_year` is `None` (by default), data + a specific vintage year may be provided. If `vintage_year` is `None` (the default), data for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). 
@@ -358,12 +361,7 @@ def state_age_sex_pop( else: vintage_year = 2024 - vintage = get_vintage(vintage_year) - if not (vintage.year_lb <= year <= vintage.year_ub): - raise ValueError( - f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" - ) - + validate_vintage_year(year, vintage_year) data = get_dataset(f"pop/state/state_pop_{vintage_year}.parquet") lf = ( pl.scan_parquet(data) @@ -372,7 +370,6 @@ def state_age_sex_pop( pl.col("sex") != "tot", pl.col("hispanic_origin") == "tot", ) - .drop("hispanic_origin") .group_by(["state_name", "state_fips", "year", "age", "sex"]) .agg(tot_pop=pl.col("tot_pop").sum()) .sort("state_fips", "age", "sex") @@ -676,3 +673,78 @@ def county_race_pop( return lf.collect().to_pandas() return lf + + +@overload +def county_age_sex_pop( + year: int, + *, + vintage_year: VintageYear | None = ..., + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... + + +@overload +def county_age_sex_pop( + year: int, *, vintage_year: VintageYear | None = ..., as_pandas: Literal[True] +) -> pd.DataFrame: ... + + +def county_age_sex_pop( + year: int, *, vintage_year: VintageYear | None = None, as_pandas: bool = False +) -> pl.LazyFrame | pd.DataFrame: + """ + County-age-sex population estimates for select years. Uses county population + by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-detail.html + The raw files are not present in the kintsugi-data repo because of their large size. + Instead, we use parquet files containing a subset of columns. + + It's recommended to use the latest possible vintage to get a given year's data. However, + you may specify a specific vintage year if, for example, you need a certain set of county + geographies. If `vintage_year` is `None` (by default), data for years in the range [2010, 2019] + are sourced from the 2020 vintage (2010-2020 data), while data for years in the range + [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). 
+ + Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/counties/asrh/cc-est2024-alldata.csv + """ + if vintage_year is None: + if 2010 <= year <= 2019: + vintage_year = 2020 + else: + vintage_year = 2024 + + vintage = get_vintage(vintage_year) + if not (vintage.year_lb <= year <= vintage.year_ub): + raise ValueError( + f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" + ) + + data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") + lf = ( + pl.scan_parquet(data) + .filter( + pl.col("year") == year, + pl.col("age_grp") != "tot", + ) + .select( + "state_name", + "county_name", + "county_fips", + "year", + "age_grp", + "tot_male", + "tot_female", + ) + .unpivot( + index=["state_name", "county_name", "county_fips", "year", "age_grp"], + variable_name="sex", + value_name="tot_pop", + ) + .with_columns(sex=pl.col("sex").str.replace("tot_", "").cast(sex_enum)) + .sort("county_fips", "age_grp", "sex") + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf diff --git a/tests/population_test.py b/tests/population_test.py index 0ecaaa8..96ced8a 100644 --- a/tests/population_test.py +++ b/tests/population_test.py @@ -10,7 +10,9 @@ county_pop, county_race_pop, county_sex_pop, - get_vintage, + hispanic_enum, + race_enum, + sex_enum, state_age_pop, state_age_sex_pop, state_pop, @@ -22,7 +24,7 @@ class StatePopulation(BasePolarsModel): - state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + state_name: pl.String = pa.Field(unique=True) # pyright: ignore [reportAny] state_fips: pl.String = pa.Field(unique=True, in_range=("01", "56")) # pyright: ignore [reportAny] year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] tot_pop: pl.Int64 = pa.Field(gt=0) # pyright: ignore [reportAny] @@ -30,6 +32,14 @@ class StatePopulation(BasePolarsModel): class Config: # pyright: ignore [reportIncompatibleVariableOverride] unique: list[str] = ["state_name", 
"state_fips", "year"] + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) + + @pa.dataframe_check + def has_correct_height(cls, data: PolarsData) -> bool: + return data.lazyframe.select(pl.len()).collect().item() == 51 # pyright: ignore [reportAny] + @pytest.mark.parametrize( ("year"), @@ -74,15 +84,14 @@ class StateAgePopulation(BasePolarsModel): state_fips: pl.String = pa.Field(in_range=("01", "56")) # pyright: ignore [reportAny] year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] age: pl.Int64 = pa.Field(in_range=(0, 85)) # pyright: ignore [reportAny] - tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + tot_pop: pl.Int64 = pa.Field(gt=0) # pyright: ignore [reportAny] class Config: # pyright: ignore [reportIncompatibleVariableOverride] - unique: list[str] = [ - "state_name", - "state_fips", - "year", - "age", - ] + unique: list[str] = ["state_name", "state_fips", "year", "age"] + + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) @pytest.mark.parametrize( @@ -127,16 +136,15 @@ class StateSexPopulation(BasePolarsModel): state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] state_fips: pl.String = pa.Field(in_range=("01", "56")) # pyright: ignore [reportAny] year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] - sex: pl.Enum = pa.Field(dtype_kwargs={"categories": ["tot", "male", "female"]}) # pyright: ignore [reportAny] + sex: pl.Enum = pa.Field(dtype_kwargs={"categories": sex_enum.categories}) # pyright: ignore [reportAny] tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] class Config: # pyright: ignore [reportIncompatibleVariableOverride] - unique: list[str] = [ - "state_name", - "state_fips", - "year", - "sex", - ] + unique: list[str] = ["state_name", "state_fips", "year", 
"sex"] + + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) @pytest.mark.parametrize( @@ -182,7 +190,7 @@ class StateRacePopulation(BasePolarsModel): state_fips: pl.String = pa.Field(in_range=("01", "56")) # pyright: ignore [reportAny] year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] race: pl.Enum = pa.Field( # pyright: ignore [reportAny] - dtype_kwargs={"categories": ["white", "black", "aian", "asian", "nhpi"]} + dtype_kwargs={"categories": race_enum.categories} ) tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] @@ -194,27 +202,9 @@ class Config: # pyright: ignore [reportIncompatibleVariableOverride] "race", ] - -class StateRaceHispanicPopulation(BasePolarsModel): - state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - state_fips: pl.String = pa.Field(in_range=("01", "56")) # pyright: ignore [reportAny] - year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] - race: pl.Enum = pa.Field( # pyright: ignore [reportAny] - dtype_kwargs={"categories": ["white", "black", "aian", "asian", "nhpi"]} - ) - hispanic_origin: pl.Enum = pa.Field( # pyright: ignore [reportAny] - dtype_kwargs={"categories": ["tot", "not_hispanic", "hispanic"]} - ) - tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] - - class Config: # pyright: ignore [reportIncompatibleVariableOverride] - unique: list[str] = [ - "state_name", - "state_fips", - "year", - "race", - "hispanic_origin", - ] + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) @pytest.mark.parametrize( @@ -225,277 +215,60 @@ class Config: # pyright: ignore [reportIncompatibleVariableOverride] ("vintage_year"), range(2016, 2025), ) -def test_state_race_hispanic_pop(year: int, vintage_year: VintageYear) -> None: +def test_state_race_pop(year: int, 
vintage_year: VintageYear) -> None: if vintage_year <= 2020: year_lb = 2010 else: year_lb = 2020 if year_lb <= year <= vintage_year: - state_race_pop( - year, vintage_year=vintage_year, incl_hispanic_orig=True - ).collect().pipe(StateRaceHispanicPopulation.validate, lazy=True) + state_race_pop(year, vintage_year=vintage_year).collect().pipe( + StateRacePopulation.validate, lazy=True + ) else: with pytest.raises(ValueError, match="^Must choose a year between"): - state_race_pop(year, vintage_year=vintage_year, incl_hispanic_orig=True) + state_race_pop(year, vintage_year=vintage_year) @pytest.mark.parametrize( ("year"), range(2010, 2025), ) -def test_state_race_hispanic_pop_as_pandas(year: int) -> None: - df = state_race_pop(year, as_pandas=True, incl_hispanic_orig=True) +def test_state_race_pop_as_pandas(year: int) -> None: + df = state_race_pop(year, as_pandas=True) assert isinstance(df, DataFrame) def test_state_race_pop_invalid_vintage_year_exception() -> None: with pytest.raises(ValueError, match="^Must choose a vintage year between"): - state_race_pop(2023, vintage_year=2000, incl_hispanic_orig=True) # pyright: ignore [reportArgumentType] + state_race_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] -class StateAgeSexPopulation(BasePolarsModel): +class StateRaceHispanicPopulation(BasePolarsModel): state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] state_fips: pl.String = pa.Field(in_range=("01", "56")) # pyright: ignore [reportAny] year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] - age: pl.Int64 = pa.Field(in_range=(0, 85)) # pyright: ignore [reportAny] - sex: pl.Enum = pa.Field(dtype_kwargs={"categories": ["tot", "male", "female"]}) # pyright: ignore [reportAny] - tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] - - class Config: # pyright: ignore [reportIncompatibleVariableOverride] - unique: list[str] = ["state_name", "state_fips", "year", "age", "sex"] - - 
-@pytest.mark.parametrize( - ("year"), - range(2010, 2025), -) -@pytest.mark.parametrize( - ("vintage_year"), - range(2016, 2025), -) -def test_state_age_sex_pop(year: int, vintage_year: VintageYear) -> None: - if vintage_year <= 2020: - year_lb = 2010 - else: - year_lb = 2020 - - if year_lb <= year <= vintage_year: - state_age_sex_pop(year, vintage_year=vintage_year).collect().pipe( - StateAgeSexPopulation.validate, lazy=True - ) - else: - with pytest.raises(ValueError, match="^Must choose a year between"): - state_age_sex_pop(year, vintage_year=vintage_year) - - -@pytest.mark.parametrize( - ("year"), - range(2010, 2025), -) -def test_state_age_sex_pop_as_pandas(year: int) -> None: - df = state_age_sex_pop(year, as_pandas=True) - - assert isinstance(df, DataFrame) - - -def test_state_age_sex_pop_invalid_vintage_year_exception() -> None: - with pytest.raises(ValueError, match="^Must choose a vintage year between"): - state_age_sex_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] - - -age_grps = { - 0: "tot", - 1: "0-4", - 2: "5-9", - 3: "10-14", - 4: "15-19", - 5: "20-24", - 6: "25-29", - 7: "30-34", - 8: "35-39", - 9: "40-44", - 10: "45-49", - 11: "50-54", - 12: "55-59", - 13: "60-64", - 14: "65-69", - 15: "70-74", - 16: "75-79", - 17: "80-84", - 18: ">=85", -} -age_grp_enum = pl.Enum(age_grps.values()) - - -class CountyPopulation(BasePolarsModel): - state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - county_fips: pl.String = pa.Field(unique=True) # pyright: ignore [reportAny] - year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] - tot_pop: pl.Int64 = pa.Field(gt=0) # pyright: ignore [reportAny] - - class Config: # pyright: ignore [reportIncompatibleVariableOverride] - unique: list[str] = ["state_name", "county_name", "county_fips", "year"] - - @pa.dataframe_check - def has_correct_states(cls, data: PolarsData) -> bool: 
- return ( - data.lazyframe.select( - pl.col("county_fips") - .str.slice(0, 2) - .is_between(pl.lit("01"), pl.lit("56")) - .all() - ) - .collect() - .item() - is True - ) - - -@pytest.mark.parametrize( - ("year"), - range(2010, 2025), -) -@pytest.mark.parametrize( - ("vintage_year"), - range(2016, 2025), -) -def test_county_pop(year: int, vintage_year: VintageYear) -> None: - if vintage_year <= 2020: - year_lb = 2010 - else: - year_lb = 2020 - - if year_lb <= year <= vintage_year: - county_pop(year, vintage_year=vintage_year).collect().pipe( - CountyPopulation.validate, lazy=True - ) - else: - with pytest.raises(ValueError, match="^Must choose a year between"): - county_pop(year, vintage_year=vintage_year) - - -def test_county_pop_invalid_vintage_year_exception() -> None: - with pytest.raises(ValueError, match="^Must choose a vintage year between"): - county_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] - - -def test_get_vintage_info() -> None: - with pytest.raises(ValueError, match="^Must choose a vintage year between"): - get_vintage(2000) # pyright: ignore [reportArgumentType] - - -@pytest.mark.parametrize( - ("year"), - range(2010, 2025), -) -def test_county_pop_as_pandas(year: int) -> None: - df = county_pop(year, as_pandas=True) - - assert isinstance(df, DataFrame) - - -class CountyAgePopulation(BasePolarsModel): - state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] - age_grp: pl.Enum = pa.Field(dtype_kwargs={"categories": age_grp_enum.categories}) # pyright: ignore [reportAny] + race: pl.Enum = pa.Field( # pyright: ignore [reportAny] + dtype_kwargs={"categories": race_enum.categories} + ) + hispanic_origin: pl.Enum = pa.Field( # pyright: ignore [reportAny] + 
dtype_kwargs={"categories": hispanic_enum.categories} + ) tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] class Config: # pyright: ignore [reportIncompatibleVariableOverride] unique: list[str] = [ "state_name", - "county_name", - "county_fips", - "year", - "age_grp", - ] - - @pa.dataframe_check - def has_correct_states(cls, data: PolarsData) -> bool: - return ( - data.lazyframe.select( - pl.col("county_fips") - .str.slice(0, 2) - .is_between(pl.lit("01"), pl.lit("56")) - .all() - ) - .collect() - .item() - is True - ) - - -@pytest.mark.parametrize( - ("year"), - range(2010, 2025), -) -@pytest.mark.parametrize( - ("vintage_year"), - range(2016, 2025), -) -def test_county_age_pop(year: int, vintage_year: VintageYear) -> None: - if vintage_year <= 2020: - year_lb = 2010 - else: - year_lb = 2020 - - if year_lb <= year <= vintage_year: - county_age_pop(year, vintage_year=vintage_year).collect().pipe( - CountyAgePopulation.validate, lazy=True - ) - else: - with pytest.raises(ValueError, match="^Must choose a year between"): - county_age_pop(year, vintage_year=vintage_year) - - -@pytest.mark.parametrize( - ("year"), - range(2010, 2025), -) -def test_county_age_pop_as_pandas(year: int) -> None: - df = county_age_pop(year, as_pandas=True) - - assert isinstance(df, DataFrame) - - -def test_county_age_pop_invalid_vintage_year_exception() -> None: - with pytest.raises(ValueError, match="^Must choose a vintage year between"): - county_age_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] - - -class CountySexPopulation(BasePolarsModel): - state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] - tot_male: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] - tot_female: pl.Int64 = 
pa.Field(ge=0) # pyright: ignore [reportAny] - - class Config: # pyright: ignore [reportIncompatibleVariableOverride] - unique: list[str] = [ - "state_name", - "county_name", - "county_fips", + "state_fips", "year", + "race", + "hispanic_origin", ] - @pa.dataframe_check - def has_correct_states(cls, data: PolarsData) -> bool: - return ( - data.lazyframe.select( - pl.col("county_fips") - .str.slice(0, 2) - .is_between(pl.lit("01"), pl.lit("56")) - .all() - ) - .collect() - .item() - is True - ) + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) @pytest.mark.parametrize( @@ -506,103 +279,50 @@ def has_correct_states(cls, data: PolarsData) -> bool: ("vintage_year"), range(2016, 2025), ) -def test_county_sex_pop(year: int, vintage_year: VintageYear) -> None: +def test_state_race_hispanic_pop(year: int, vintage_year: VintageYear) -> None: if vintage_year <= 2020: year_lb = 2010 else: year_lb = 2020 if year_lb <= year <= vintage_year: - county_sex_pop(year, vintage_year=vintage_year).collect().pipe( - CountySexPopulation.validate, lazy=True - ) + state_race_pop( + year, vintage_year=vintage_year, incl_hispanic_orig=True + ).collect().pipe(StateRaceHispanicPopulation.validate, lazy=True) else: with pytest.raises(ValueError, match="^Must choose a year between"): - county_sex_pop(year, vintage_year=vintage_year) + state_race_pop(year, vintage_year=vintage_year, incl_hispanic_orig=True) @pytest.mark.parametrize( ("year"), range(2010, 2025), ) -def test_county_sex_pop_as_pandas(year: int) -> None: - df = county_sex_pop(year, as_pandas=True) +def test_state_race_hispanic_pop_as_pandas(year: int) -> None: + df = state_race_pop(year, as_pandas=True, incl_hispanic_orig=True) assert isinstance(df, DataFrame) -def test_county_sex_pop_invalid_vintage_year_exception() -> None: +def test_state_race_hispanic_pop_invalid_vintage_year_exception() -> None: with pytest.raises(ValueError, 
match="^Must choose a vintage year between"): - county_sex_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] - - -class CountyRacePopulation(BasePolarsModel): - state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] - white: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] - black: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] - aian: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] - asian: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] - nhpi: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] - - class Config: # pyright: ignore [reportIncompatibleVariableOverride] - unique: list[str] = [ - "state_name", - "county_name", - "county_fips", - "year", - ] - - @pa.dataframe_check - def has_correct_states(cls, data: PolarsData) -> bool: - return ( - data.lazyframe.select( - pl.col("county_fips") - .str.slice(0, 2) - .is_between(pl.lit("01"), pl.lit("56")) - .all() - ) - .collect() - .item() - is True - ) + state_race_pop(2023, vintage_year=2000, incl_hispanic_orig=True) # pyright: ignore [reportArgumentType] -class CountyRaceHispanicPopulation(BasePolarsModel): +class StateAgeSexPopulation(BasePolarsModel): state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + state_fips: pl.String = pa.Field(in_range=("01", "56")) # pyright: ignore [reportAny] year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] - white: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] - black: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] - aian: pl.Int64 = pa.Field(ge=0) # 
pyright: ignore [reportAny] - asian: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] - nhpi: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] - hispanic: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + age: pl.Int64 = pa.Field(in_range=(0, 85)) # pyright: ignore [reportAny] + sex: pl.Enum = pa.Field(dtype_kwargs={"categories": sex_enum.categories}) # pyright: ignore [reportAny] + tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] class Config: # pyright: ignore [reportIncompatibleVariableOverride] - unique: list[str] = [ - "state_name", - "county_name", - "county_fips", - "year", - ] + unique: list[str] = ["state_name", "state_fips", "year", "age", "sex"] - @pa.dataframe_check - def has_correct_states(cls, data: PolarsData) -> bool: - return ( - data.lazyframe.select( - pl.col("county_fips") - .str.slice(0, 2) - .is_between(pl.lit("01"), pl.lit("56")) - .all() - ) - .collect() - .item() - is True - ) + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) @pytest.mark.parametrize( @@ -613,69 +333,407 @@ def has_correct_states(cls, data: PolarsData) -> bool: ("vintage_year"), range(2016, 2025), ) -def test_county_race_pop(year: int, vintage_year: VintageYear) -> None: +def test_state_age_sex_pop(year: int, vintage_year: VintageYear) -> None: if vintage_year <= 2020: year_lb = 2010 else: year_lb = 2020 if year_lb <= year <= vintage_year: - county_race_pop(year, vintage_year=vintage_year).collect().pipe( - CountyRacePopulation.validate, lazy=True + state_age_sex_pop(year, vintage_year=vintage_year).collect().pipe( + StateAgeSexPopulation.validate, lazy=True ) else: with pytest.raises(ValueError, match="^Must choose a year between"): - county_race_pop(year, vintage_year=vintage_year) + state_age_sex_pop(year, vintage_year=vintage_year) @pytest.mark.parametrize( ("year"), range(2010, 2025), ) -def test_county_race_pop_as_pandas(year: 
int) -> None: - df = county_race_pop(year, as_pandas=True) +def test_state_age_sex_pop_as_pandas(year: int) -> None: + df = state_age_sex_pop(year, as_pandas=True) assert isinstance(df, DataFrame) -def test_county_race_pop_invalid_vintage_year_exception() -> None: +def test_state_age_sex_pop_invalid_vintage_year_exception() -> None: with pytest.raises(ValueError, match="^Must choose a vintage year between"): - county_race_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] - - -@pytest.mark.parametrize( - ("year"), - range(2010, 2025), -) -@pytest.mark.parametrize( - ("vintage_year"), - range(2016, 2025), -) -def test_county_race_hispanic_pop(year: int, vintage_year: VintageYear) -> None: - if vintage_year <= 2020: - year_lb = 2010 - else: - year_lb = 2020 - - if year_lb <= year <= vintage_year: - county_race_pop( - year, vintage_year=vintage_year, incl_hispanic_orig=True - ).collect().pipe(CountyRaceHispanicPopulation.validate, lazy=True) - else: - with pytest.raises(ValueError, match="^Must choose a year between"): - county_race_pop(year, vintage_year=vintage_year, incl_hispanic_orig=True) - - -@pytest.mark.parametrize( - ("year"), - range(2010, 2025), -) -def test_county_race_hispanic_pop_as_pandas(year: int) -> None: - df = county_race_pop(year, as_pandas=True, incl_hispanic_orig=True) - - assert isinstance(df, DataFrame) + state_age_sex_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] -def test_county_race_hispanic_pop_invalid_vintage_year_exception() -> None: - with pytest.raises(ValueError, match="^Must choose a vintage year between"): - county_race_pop(2023, vintage_year=2000, incl_hispanic_orig=True) # pyright: ignore [reportArgumentType] +# age_grps = { +# 0: "tot", +# 1: "0-4", +# 2: "5-9", +# 3: "10-14", +# 4: "15-19", +# 5: "20-24", +# 6: "25-29", +# 7: "30-34", +# 8: "35-39", +# 9: "40-44", +# 10: "45-49", +# 11: "50-54", +# 12: "55-59", +# 13: "60-64", +# 14: "65-69", +# 15: "70-74", +# 16: "75-79", +# 17: "80-84", 
+# 18: ">=85", +# } +# age_grp_enum = pl.Enum(age_grps.values()) +# +# +# class CountyPopulation(BasePolarsModel): +# state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] +# county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] +# county_fips: pl.String = pa.Field(unique=True) # pyright: ignore [reportAny] +# year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] +# tot_pop: pl.Int64 = pa.Field(gt=0) # pyright: ignore [reportAny] +# +# class Config: # pyright: ignore [reportIncompatibleVariableOverride] +# unique: list[str] = ["state_name", "county_name", "county_fips", "year"] +# +# @pa.dataframe_check +# def has_correct_states(cls, data: PolarsData) -> bool: +# return ( +# data.lazyframe.select( +# pl.col("county_fips") +# .str.slice(0, 2) +# .is_between(pl.lit("01"), pl.lit("56")) +# .all() +# ) +# .collect() +# .item() +# is True +# ) +# +# +# @pytest.mark.parametrize( +# ("year"), +# range(2010, 2025), +# ) +# @pytest.mark.parametrize( +# ("vintage_year"), +# range(2016, 2025), +# ) +# def test_county_pop(year: int, vintage_year: VintageYear) -> None: +# if vintage_year <= 2020: +# year_lb = 2010 +# else: +# year_lb = 2020 +# +# if year_lb <= year <= vintage_year: +# county_pop(year, vintage_year=vintage_year).collect().pipe( +# CountyPopulation.validate, lazy=True +# ) +# else: +# with pytest.raises(ValueError, match="^Must choose a year between"): +# county_pop(year, vintage_year=vintage_year) +# +# +# def test_county_pop_invalid_vintage_year_exception() -> None: +# with pytest.raises(ValueError, match="^Must choose a vintage year between"): +# county_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] +# +# +# # def test_get_vintage_info() -> None: +# # with pytest.raises(ValueError, match="^Must choose a vintage year between"): +# # get_vintage(2000) # pyright: ignore [reportArgumentType] +# +# +# @pytest.mark.parametrize( +# ("year"), +# range(2010, 2025), +# ) +# def 
test_county_pop_as_pandas(year: int) -> None: +# df = county_pop(year, as_pandas=True) +# +# assert isinstance(df, DataFrame) +# +# +# class CountyAgePopulation(BasePolarsModel): +# state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] +# county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] +# county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] +# year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] +# age_grp: pl.Enum = pa.Field(dtype_kwargs={"categories": age_grp_enum.categories}) # pyright: ignore [reportAny] +# tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] +# +# class Config: # pyright: ignore [reportIncompatibleVariableOverride] +# unique: list[str] = [ +# "state_name", +# "county_name", +# "county_fips", +# "year", +# "age_grp", +# ] +# +# @pa.dataframe_check +# def has_correct_states(cls, data: PolarsData) -> bool: +# return ( +# data.lazyframe.select( +# pl.col("county_fips") +# .str.slice(0, 2) +# .is_between(pl.lit("01"), pl.lit("56")) +# .all() +# ) +# .collect() +# .item() +# is True +# ) +# +# +# @pytest.mark.parametrize( +# ("year"), +# range(2010, 2025), +# ) +# @pytest.mark.parametrize( +# ("vintage_year"), +# range(2016, 2025), +# ) +# def test_county_age_pop(year: int, vintage_year: VintageYear) -> None: +# if vintage_year <= 2020: +# year_lb = 2010 +# else: +# year_lb = 2020 +# +# if year_lb <= year <= vintage_year: +# county_age_pop(year, vintage_year=vintage_year).collect().pipe( +# CountyAgePopulation.validate, lazy=True +# ) +# else: +# with pytest.raises(ValueError, match="^Must choose a year between"): +# county_age_pop(year, vintage_year=vintage_year) +# +# +# @pytest.mark.parametrize( +# ("year"), +# range(2010, 2025), +# ) +# def test_county_age_pop_as_pandas(year: int) -> None: +# df = county_age_pop(year, as_pandas=True) +# +# assert isinstance(df, DataFrame) +# +# +# def test_county_age_pop_invalid_vintage_year_exception() 
-> None: +# with pytest.raises(ValueError, match="^Must choose a vintage year between"): +# county_age_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] +# +# +# class CountySexPopulation(BasePolarsModel): +# state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] +# county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] +# county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] +# year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] +# tot_male: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] +# tot_female: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] +# +# class Config: # pyright: ignore [reportIncompatibleVariableOverride] +# unique: list[str] = [ +# "state_name", +# "county_name", +# "county_fips", +# "year", +# ] +# +# @pa.dataframe_check +# def has_correct_states(cls, data: PolarsData) -> bool: +# return ( +# data.lazyframe.select( +# pl.col("county_fips") +# .str.slice(0, 2) +# .is_between(pl.lit("01"), pl.lit("56")) +# .all() +# ) +# .collect() +# .item() +# is True +# ) +# +# +# @pytest.mark.parametrize( +# ("year"), +# range(2010, 2025), +# ) +# @pytest.mark.parametrize( +# ("vintage_year"), +# range(2016, 2025), +# ) +# def test_county_sex_pop(year: int, vintage_year: VintageYear) -> None: +# if vintage_year <= 2020: +# year_lb = 2010 +# else: +# year_lb = 2020 +# +# if year_lb <= year <= vintage_year: +# county_sex_pop(year, vintage_year=vintage_year).collect().pipe( +# CountySexPopulation.validate, lazy=True +# ) +# else: +# with pytest.raises(ValueError, match="^Must choose a year between"): +# county_sex_pop(year, vintage_year=vintage_year) +# +# +# @pytest.mark.parametrize( +# ("year"), +# range(2010, 2025), +# ) +# def test_county_sex_pop_as_pandas(year: int) -> None: +# df = county_sex_pop(year, as_pandas=True) +# +# assert isinstance(df, DataFrame) +# +# +# def test_county_sex_pop_invalid_vintage_year_exception() -> None: +# 
with pytest.raises(ValueError, match="^Must choose a vintage year between"): +# county_sex_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] +# +# +# class CountyRacePopulation(BasePolarsModel): +# state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] +# county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] +# county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] +# year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] +# white: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] +# black: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] +# aian: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] +# asian: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] +# nhpi: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] +# +# class Config: # pyright: ignore [reportIncompatibleVariableOverride] +# unique: list[str] = [ +# "state_name", +# "county_name", +# "county_fips", +# "year", +# ] +# +# @pa.dataframe_check +# def has_correct_states(cls, data: PolarsData) -> bool: +# return ( +# data.lazyframe.select( +# pl.col("county_fips") +# .str.slice(0, 2) +# .is_between(pl.lit("01"), pl.lit("56")) +# .all() +# ) +# .collect() +# .item() +# is True +# ) +# +# +# class CountyRaceHispanicPopulation(BasePolarsModel): +# state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] +# county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] +# county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] +# year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] +# white: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] +# black: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] +# aian: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] +# asian: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] +# nhpi: pl.Int64 = pa.Field(ge=0) # pyright: ignore 
[reportAny] +# hispanic: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] +# +# class Config: # pyright: ignore [reportIncompatibleVariableOverride] +# unique: list[str] = [ +# "state_name", +# "county_name", +# "county_fips", +# "year", +# ] +# +# @pa.dataframe_check +# def has_correct_states(cls, data: PolarsData) -> bool: +# return ( +# data.lazyframe.select( +# pl.col("county_fips") +# .str.slice(0, 2) +# .is_between(pl.lit("01"), pl.lit("56")) +# .all() +# ) +# .collect() +# .item() +# is True +# ) +# +# +# @pytest.mark.parametrize( +# ("year"), +# range(2010, 2025), +# ) +# @pytest.mark.parametrize( +# ("vintage_year"), +# range(2016, 2025), +# ) +# def test_county_race_pop(year: int, vintage_year: VintageYear) -> None: +# if vintage_year <= 2020: +# year_lb = 2010 +# else: +# year_lb = 2020 +# +# if year_lb <= year <= vintage_year: +# county_race_pop(year, vintage_year=vintage_year).collect().pipe( +# CountyRacePopulation.validate, lazy=True +# ) +# else: +# with pytest.raises(ValueError, match="^Must choose a year between"): +# county_race_pop(year, vintage_year=vintage_year) +# +# +# @pytest.mark.parametrize( +# ("year"), +# range(2010, 2025), +# ) +# def test_county_race_pop_as_pandas(year: int) -> None: +# df = county_race_pop(year, as_pandas=True) +# +# assert isinstance(df, DataFrame) +# +# +# def test_county_race_pop_invalid_vintage_year_exception() -> None: +# with pytest.raises(ValueError, match="^Must choose a vintage year between"): +# county_race_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] +# +# +# @pytest.mark.parametrize( +# ("year"), +# range(2010, 2025), +# ) +# @pytest.mark.parametrize( +# ("vintage_year"), +# range(2016, 2025), +# ) +# def test_county_race_hispanic_pop(year: int, vintage_year: VintageYear) -> None: +# if vintage_year <= 2020: +# year_lb = 2010 +# else: +# year_lb = 2020 +# +# if year_lb <= year <= vintage_year: +# county_race_pop( +# year, vintage_year=vintage_year, incl_hispanic_orig=True +# 
).collect().pipe(CountyRaceHispanicPopulation.validate, lazy=True) +# else: +# with pytest.raises(ValueError, match="^Must choose a year between"): +# county_race_pop(year, vintage_year=vintage_year, incl_hispanic_orig=True) +# +# +# @pytest.mark.parametrize( +# ("year"), +# range(2010, 2025), +# ) +# def test_county_race_hispanic_pop_as_pandas(year: int) -> None: +# df = county_race_pop(year, as_pandas=True, incl_hispanic_orig=True) +# +# assert isinstance(df, DataFrame) +# +# +# def test_county_race_hispanic_pop_invalid_vintage_year_exception() -> None: +# with pytest.raises(ValueError, match="^Must choose a vintage year between"): +# county_race_pop(2023, vintage_year=2000, incl_hispanic_orig=True) # pyright: ignore [reportArgumentType] From 16f7a6711eb67e468f0accaa8c4072397add81d2 Mon Sep 17 00:00:00 2001 From: winter-again <63322884+winter-again@users.noreply.github.com> Date: Wed, 11 Mar 2026 18:03:47 -0400 Subject: [PATCH 3/4] Finalize code and tests for county population functions --- src/kintsugi/population.py | 132 +++--- tests/population_test.py | 813 ++++++++++++++++++++----------------- 2 files changed, 502 insertions(+), 443 deletions(-) diff --git a/src/kintsugi/population.py b/src/kintsugi/population.py index a98a610..ff5ddef 100644 --- a/src/kintsugi/population.py +++ b/src/kintsugi/population.py @@ -72,7 +72,10 @@ def validate_vintage_year(year: int, vintage_year: VintageYear) -> None: # match conventions in kintsugi-data processing script sex_enum = pl.Enum(["tot", "male", "female"]) -race_enum = pl.Enum(["white", "black", "aian", "asian", "nhpi"]) +race_enum_no_hispanic = pl.Enum(["white", "black", "aian", "asian", "nhpi"]) +race_enum_incl_hispanic = pl.Enum( + ["white", "black", "aian", "asian", "nhpi", "hispanic"] +) hispanic_enum = pl.Enum(["tot", "not_hispanic", "hispanic"]) @@ -275,7 +278,7 @@ def state_race_pop( as_pandas: bool = False, ) -> pl.LazyFrame | pd.DataFrame: """ - State-race population estimates for select years. 
Specify `incl_hispanic=True` to include + State-race population estimates for select years. Specify `incl_hispanic_orig=True` to include Hispanic counts column. Uses state population by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-detail.html The raw files are not present in the kintsugi-data repo. Instead, we use parquet files containing a subset of columns. @@ -381,6 +384,30 @@ def state_age_sex_pop( return lf +age_grps = [ + "tot", + "0-4", + "5-9", + "10-14", + "15-19", + "20-24", + "25-29", + "30-34", + "35-39", + "40-44", + "45-49", + "50-54", + "55-59", + "60-64", + "65-69", + "70-74", + "75-79", + "80-84", + ">=85", +] +age_grp_enum = pl.Enum(age_grps) + + @overload def county_pop( year: int, @@ -403,13 +430,12 @@ def county_pop( County population estimates for select years. Uses county population by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-detail.html The raw files are not present in the kintsugi-data repo because of their large size. - Instead, we use parquet files containing a subset of columns. + Instead, parquet files containing a subset of columns are used. It's recommended to use the latest possible vintage to get a given year's data. However, - you may specify a specific vintage year if, for example, you need a certain set of county - geographies. If `vintage_year` is `None` (by default), data for years in the range [2010, 2019] - are sourced from the 2020 vintage (2010-2020 data), while data for years in the range - [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + a specific vintage year may be provided. If `vintage_year` is `None` (the default), data + for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), + while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). 
Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/counties/asrh/cc-est2024-alldata.csv """ @@ -419,12 +445,7 @@ def county_pop( else: vintage_year = 2024 - vintage = get_vintage(vintage_year) - if not (vintage.year_lb <= year <= vintage.year_ub): - raise ValueError( - f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" - ) - + validate_vintage_year(year, vintage_year) data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") lf = ( pl.scan_parquet(data) @@ -464,13 +485,12 @@ def county_age_pop( County-age population estimates for select years. Uses county population by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-detail.html The raw files are not present in the kintsugi-data repo because of their large size. - Instead, we use parquet files containing a subset of columns. + Instead, parquet files containing a subset of columns are used. It's recommended to use the latest possible vintage to get a given year's data. However, - you may specify a specific vintage year if, for example, you need a certain set of county - geographies. If `vintage_year` is `None` (by default), data for years in the range [2010, 2019] - are sourced from the 2020 vintage (2010-2020 data), while data for years in the range - [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + a specific vintage year may be provided. If `vintage_year` is `None` (the default), data + for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), + while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). 
Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/counties/asrh/cc-est2024-alldata.csv """ @@ -480,12 +500,7 @@ def county_age_pop( else: vintage_year = 2024 - vintage = get_vintage(vintage_year) - if not (vintage.year_lb <= year <= vintage.year_ub): - raise ValueError( - f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" - ) - + validate_vintage_year(year, vintage_year) data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") lf = ( pl.scan_parquet(data) @@ -527,13 +542,12 @@ def county_sex_pop( County-sex population estimates for select years. Uses county population by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-detail.html The raw files are not present in the kintsugi-data repo because of their large size. - Instead, we use parquet files containing a subset of columns. + Instead, parquet files containing a subset of columns are used. It's recommended to use the latest possible vintage to get a given year's data. However, - you may specify a specific vintage year if, for example, you need a certain set of county - geographies. If `vintage_year` is `None` (by default), data for years in the range [2010, 2019] - are sourced from the 2020 vintage (2010-2020 data), while data for years in the range - [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + a specific vintage year may be provided. If `vintage_year` is `None` (the default), data + for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), + while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). 
Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/counties/asrh/cc-est2024-alldata.csv """ @@ -543,12 +557,7 @@ def county_sex_pop( else: vintage_year = 2024 - vintage = get_vintage(vintage_year) - if not (vintage.year_lb <= year <= vintage.year_ub): - raise ValueError( - f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" - ) - + validate_vintage_year(year, vintage_year) data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") lf = ( pl.scan_parquet(data) @@ -559,7 +568,13 @@ def county_sex_pop( .select( "state_name", "county_name", "county_fips", "year", "tot_male", "tot_female" ) - .sort("county_fips") + .unpivot( + index=["state_name", "county_name", "county_fips", "year"], + variable_name="sex", + value_name="tot_pop", + ) + .with_columns(sex=pl.col("sex").str.replace("tot_", "").cast(sex_enum)) + .sort("county_fips", "sex") ) if as_pandas: @@ -596,17 +611,16 @@ def county_race_pop( as_pandas: bool = False, ) -> pl.LazyFrame | pd.DataFrame: """ - County-race population estimates for select years. Specify `incl_hispanic=True` to include + County-race population estimates for select years. Specify `incl_hispanic_orig=True` to include Hispanic counts column. Uses county population by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-detail.html The raw files are not present in the kintsugi-data repo because of their large size. - Instead, we use parquet files containing a subset of columns. + Instead, parquet files containing a subset of columns are used. It's recommended to use the latest possible vintage to get a given year's data. However, - you may specify a specific vintage year if, for example, you need a certain set of county - geographies. 
If `vintage_year` is `None` (by default), data for years in the range [2010, 2019] - are sourced from the 2020 vintage (2010-2020 data), while data for years in the range - [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + a specific vintage year may be provided. If `vintage_year` is `None` (the default), data + for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), + while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/counties/asrh/cc-est2024-alldata.csv """ @@ -616,12 +630,7 @@ def county_race_pop( else: vintage_year = 2024 - vintage = get_vintage(vintage_year) - if not (vintage.year_lb <= year <= vintage.year_ub): - raise ValueError( - f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" - ) - + validate_vintage_year(year, vintage_year) data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") lf = ( pl.scan_parquet(data) @@ -663,12 +672,21 @@ def county_race_pop( "nhpi", "hispanic", ) - .sort("county_fips") ) if not incl_hispanic_orig: lf = lf.drop("hispanic") + lf = ( + lf.unpivot( + index=["state_name", "county_name", "county_fips", "year"], + variable_name="race", + value_name="tot_pop", + ) + .cast({"race": race_enum_incl_hispanic}) + .sort("county_fips", "race") + ) + if as_pandas: return lf.collect().to_pandas() @@ -697,13 +715,12 @@ def county_age_sex_pop( County-age-sex population estimates for select years. Uses county population by characteristics data: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-detail.html The raw files are not present in the kintsugi-data repo because of their large size. - Instead, we use parquet files containing a subset of columns. + Instead, parquet files containing a subset of columns are used. It's recommended to use the latest possible vintage to get a given year's data. 
However, - you may specify a specific vintage year if, for example, you need a certain set of county - geographies. If `vintage_year` is `None` (by default), data for years in the range [2010, 2019] - are sourced from the 2020 vintage (2010-2020 data), while data for years in the range - [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). + a specific vintage year may be provided. If `vintage_year` is `None` (the default), data + for years in the range [2010, 2019] are sourced from the 2020 vintage (2010-2020 data), + while data for years in the range [2020, 2024] are sourced from the 2024 vintage (2020-2024 data). Source (2024 example): https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/counties/asrh/cc-est2024-alldata.csv """ @@ -713,12 +730,7 @@ def county_age_sex_pop( else: vintage_year = 2024 - vintage = get_vintage(vintage_year) - if not (vintage.year_lb <= year <= vintage.year_ub): - raise ValueError( - f"Must choose a year between {vintage.year_lb} and {vintage.year_ub}" - ) - + validate_vintage_year(year, vintage_year) data = get_dataset(f"pop/county_cc/county_pop_{vintage_year}.parquet") lf = ( pl.scan_parquet(data) diff --git a/tests/population_test.py b/tests/population_test.py index 96ced8a..aef3185 100644 --- a/tests/population_test.py +++ b/tests/population_test.py @@ -6,12 +6,15 @@ from kintsugi.population import ( VintageYear, + age_grp_enum, county_age_pop, + county_age_sex_pop, county_pop, county_race_pop, county_sex_pop, hispanic_enum, - race_enum, + race_enum_incl_hispanic, + race_enum_no_hispanic, sex_enum, state_age_pop, state_age_sex_pop, @@ -190,17 +193,12 @@ class StateRacePopulation(BasePolarsModel): state_fips: pl.String = pa.Field(in_range=("01", "56")) # pyright: ignore [reportAny] year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] race: pl.Enum = pa.Field( # pyright: ignore [reportAny] - dtype_kwargs={"categories": race_enum.categories} + dtype_kwargs={"categories": 
race_enum_no_hispanic.categories} ) tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] class Config: # pyright: ignore [reportIncompatibleVariableOverride] - unique: list[str] = [ - "state_name", - "state_fips", - "year", - "race", - ] + unique: list[str] = ["state_name", "state_fips", "year", "race"] @pa.check("year") def all_identical(cls, data: PolarsData) -> pl.LazyFrame: @@ -250,7 +248,7 @@ class StateRaceHispanicPopulation(BasePolarsModel): state_fips: pl.String = pa.Field(in_range=("01", "56")) # pyright: ignore [reportAny] year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] race: pl.Enum = pa.Field( # pyright: ignore [reportAny] - dtype_kwargs={"categories": race_enum.categories} + dtype_kwargs={"categories": race_enum_no_hispanic.categories} ) hispanic_origin: pl.Enum = pa.Field( # pyright: ignore [reportAny] dtype_kwargs={"categories": hispanic_enum.categories} @@ -363,377 +361,426 @@ def test_state_age_sex_pop_invalid_vintage_year_exception() -> None: state_age_sex_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] -# age_grps = { -# 0: "tot", -# 1: "0-4", -# 2: "5-9", -# 3: "10-14", -# 4: "15-19", -# 5: "20-24", -# 6: "25-29", -# 7: "30-34", -# 8: "35-39", -# 9: "40-44", -# 10: "45-49", -# 11: "50-54", -# 12: "55-59", -# 13: "60-64", -# 14: "65-69", -# 15: "70-74", -# 16: "75-79", -# 17: "80-84", -# 18: ">=85", -# } -# age_grp_enum = pl.Enum(age_grps.values()) -# -# -# class CountyPopulation(BasePolarsModel): -# state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] -# county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] -# county_fips: pl.String = pa.Field(unique=True) # pyright: ignore [reportAny] -# year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] -# tot_pop: pl.Int64 = pa.Field(gt=0) # pyright: ignore [reportAny] -# -# class Config: # pyright: ignore [reportIncompatibleVariableOverride] -# unique: list[str] = ["state_name", 
"county_name", "county_fips", "year"] -# -# @pa.dataframe_check -# def has_correct_states(cls, data: PolarsData) -> bool: -# return ( -# data.lazyframe.select( -# pl.col("county_fips") -# .str.slice(0, 2) -# .is_between(pl.lit("01"), pl.lit("56")) -# .all() -# ) -# .collect() -# .item() -# is True -# ) -# -# -# @pytest.mark.parametrize( -# ("year"), -# range(2010, 2025), -# ) -# @pytest.mark.parametrize( -# ("vintage_year"), -# range(2016, 2025), -# ) -# def test_county_pop(year: int, vintage_year: VintageYear) -> None: -# if vintage_year <= 2020: -# year_lb = 2010 -# else: -# year_lb = 2020 -# -# if year_lb <= year <= vintage_year: -# county_pop(year, vintage_year=vintage_year).collect().pipe( -# CountyPopulation.validate, lazy=True -# ) -# else: -# with pytest.raises(ValueError, match="^Must choose a year between"): -# county_pop(year, vintage_year=vintage_year) -# -# -# def test_county_pop_invalid_vintage_year_exception() -> None: -# with pytest.raises(ValueError, match="^Must choose a vintage year between"): -# county_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] -# -# -# # def test_get_vintage_info() -> None: -# # with pytest.raises(ValueError, match="^Must choose a vintage year between"): -# # get_vintage(2000) # pyright: ignore [reportArgumentType] -# -# -# @pytest.mark.parametrize( -# ("year"), -# range(2010, 2025), -# ) -# def test_county_pop_as_pandas(year: int) -> None: -# df = county_pop(year, as_pandas=True) -# -# assert isinstance(df, DataFrame) -# -# -# class CountyAgePopulation(BasePolarsModel): -# state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] -# county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] -# county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] -# year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] -# age_grp: pl.Enum = pa.Field(dtype_kwargs={"categories": age_grp_enum.categories}) # pyright: ignore [reportAny] -# 
tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] -# -# class Config: # pyright: ignore [reportIncompatibleVariableOverride] -# unique: list[str] = [ -# "state_name", -# "county_name", -# "county_fips", -# "year", -# "age_grp", -# ] -# -# @pa.dataframe_check -# def has_correct_states(cls, data: PolarsData) -> bool: -# return ( -# data.lazyframe.select( -# pl.col("county_fips") -# .str.slice(0, 2) -# .is_between(pl.lit("01"), pl.lit("56")) -# .all() -# ) -# .collect() -# .item() -# is True -# ) -# -# -# @pytest.mark.parametrize( -# ("year"), -# range(2010, 2025), -# ) -# @pytest.mark.parametrize( -# ("vintage_year"), -# range(2016, 2025), -# ) -# def test_county_age_pop(year: int, vintage_year: VintageYear) -> None: -# if vintage_year <= 2020: -# year_lb = 2010 -# else: -# year_lb = 2020 -# -# if year_lb <= year <= vintage_year: -# county_age_pop(year, vintage_year=vintage_year).collect().pipe( -# CountyAgePopulation.validate, lazy=True -# ) -# else: -# with pytest.raises(ValueError, match="^Must choose a year between"): -# county_age_pop(year, vintage_year=vintage_year) -# -# -# @pytest.mark.parametrize( -# ("year"), -# range(2010, 2025), -# ) -# def test_county_age_pop_as_pandas(year: int) -> None: -# df = county_age_pop(year, as_pandas=True) -# -# assert isinstance(df, DataFrame) -# -# -# def test_county_age_pop_invalid_vintage_year_exception() -> None: -# with pytest.raises(ValueError, match="^Must choose a vintage year between"): -# county_age_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] -# -# -# class CountySexPopulation(BasePolarsModel): -# state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] -# county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] -# county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] -# year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] -# tot_male: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] -# 
tot_female: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] -# -# class Config: # pyright: ignore [reportIncompatibleVariableOverride] -# unique: list[str] = [ -# "state_name", -# "county_name", -# "county_fips", -# "year", -# ] -# -# @pa.dataframe_check -# def has_correct_states(cls, data: PolarsData) -> bool: -# return ( -# data.lazyframe.select( -# pl.col("county_fips") -# .str.slice(0, 2) -# .is_between(pl.lit("01"), pl.lit("56")) -# .all() -# ) -# .collect() -# .item() -# is True -# ) -# -# -# @pytest.mark.parametrize( -# ("year"), -# range(2010, 2025), -# ) -# @pytest.mark.parametrize( -# ("vintage_year"), -# range(2016, 2025), -# ) -# def test_county_sex_pop(year: int, vintage_year: VintageYear) -> None: -# if vintage_year <= 2020: -# year_lb = 2010 -# else: -# year_lb = 2020 -# -# if year_lb <= year <= vintage_year: -# county_sex_pop(year, vintage_year=vintage_year).collect().pipe( -# CountySexPopulation.validate, lazy=True -# ) -# else: -# with pytest.raises(ValueError, match="^Must choose a year between"): -# county_sex_pop(year, vintage_year=vintage_year) -# -# -# @pytest.mark.parametrize( -# ("year"), -# range(2010, 2025), -# ) -# def test_county_sex_pop_as_pandas(year: int) -> None: -# df = county_sex_pop(year, as_pandas=True) -# -# assert isinstance(df, DataFrame) -# -# -# def test_county_sex_pop_invalid_vintage_year_exception() -> None: -# with pytest.raises(ValueError, match="^Must choose a vintage year between"): -# county_sex_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] -# -# -# class CountyRacePopulation(BasePolarsModel): -# state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] -# county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] -# county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] -# year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] -# white: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] -# black: 
pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] -# aian: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] -# asian: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] -# nhpi: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] -# -# class Config: # pyright: ignore [reportIncompatibleVariableOverride] -# unique: list[str] = [ -# "state_name", -# "county_name", -# "county_fips", -# "year", -# ] -# -# @pa.dataframe_check -# def has_correct_states(cls, data: PolarsData) -> bool: -# return ( -# data.lazyframe.select( -# pl.col("county_fips") -# .str.slice(0, 2) -# .is_between(pl.lit("01"), pl.lit("56")) -# .all() -# ) -# .collect() -# .item() -# is True -# ) -# -# -# class CountyRaceHispanicPopulation(BasePolarsModel): -# state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] -# county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] -# county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] -# year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] -# white: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] -# black: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] -# aian: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] -# asian: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] -# nhpi: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] -# hispanic: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] -# -# class Config: # pyright: ignore [reportIncompatibleVariableOverride] -# unique: list[str] = [ -# "state_name", -# "county_name", -# "county_fips", -# "year", -# ] -# -# @pa.dataframe_check -# def has_correct_states(cls, data: PolarsData) -> bool: -# return ( -# data.lazyframe.select( -# pl.col("county_fips") -# .str.slice(0, 2) -# .is_between(pl.lit("01"), pl.lit("56")) -# .all() -# ) -# .collect() -# .item() -# is True -# ) -# -# -# @pytest.mark.parametrize( -# ("year"), -# range(2010, 2025), -# ) -# 
@pytest.mark.parametrize( -# ("vintage_year"), -# range(2016, 2025), -# ) -# def test_county_race_pop(year: int, vintage_year: VintageYear) -> None: -# if vintage_year <= 2020: -# year_lb = 2010 -# else: -# year_lb = 2020 -# -# if year_lb <= year <= vintage_year: -# county_race_pop(year, vintage_year=vintage_year).collect().pipe( -# CountyRacePopulation.validate, lazy=True -# ) -# else: -# with pytest.raises(ValueError, match="^Must choose a year between"): -# county_race_pop(year, vintage_year=vintage_year) -# -# -# @pytest.mark.parametrize( -# ("year"), -# range(2010, 2025), -# ) -# def test_county_race_pop_as_pandas(year: int) -> None: -# df = county_race_pop(year, as_pandas=True) -# -# assert isinstance(df, DataFrame) -# -# -# def test_county_race_pop_invalid_vintage_year_exception() -> None: -# with pytest.raises(ValueError, match="^Must choose a vintage year between"): -# county_race_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] -# -# -# @pytest.mark.parametrize( -# ("year"), -# range(2010, 2025), -# ) -# @pytest.mark.parametrize( -# ("vintage_year"), -# range(2016, 2025), -# ) -# def test_county_race_hispanic_pop(year: int, vintage_year: VintageYear) -> None: -# if vintage_year <= 2020: -# year_lb = 2010 -# else: -# year_lb = 2020 -# -# if year_lb <= year <= vintage_year: -# county_race_pop( -# year, vintage_year=vintage_year, incl_hispanic_orig=True -# ).collect().pipe(CountyRaceHispanicPopulation.validate, lazy=True) -# else: -# with pytest.raises(ValueError, match="^Must choose a year between"): -# county_race_pop(year, vintage_year=vintage_year, incl_hispanic_orig=True) -# -# -# @pytest.mark.parametrize( -# ("year"), -# range(2010, 2025), -# ) -# def test_county_race_hispanic_pop_as_pandas(year: int) -> None: -# df = county_race_pop(year, as_pandas=True, incl_hispanic_orig=True) -# -# assert isinstance(df, DataFrame) -# -# -# def test_county_race_hispanic_pop_invalid_vintage_year_exception() -> None: -# with 
pytest.raises(ValueError, match="^Must choose a vintage year between"): -# county_race_pop(2023, vintage_year=2000, incl_hispanic_orig=True) # pyright: ignore [reportArgumentType] +class CountyPopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_fips: pl.String = pa.Field(unique=True) # pyright: ignore [reportAny] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + tot_pop: pl.Int64 = pa.Field(gt=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = ["state_name", "county_name", "county_fips", "year"] + + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) + + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.col("county_fips") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")) + .all() + ) + .collect() + .item() + is True + ) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_county_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + county_pop(year, vintage_year=vintage_year).collect().pipe( + CountyPopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + county_pop(year, vintage_year=vintage_year) + + +def test_county_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + county_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def 
test_county_pop_as_pandas(year: int) -> None: + df = county_pop(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +class CountyAgePopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + age_grp: pl.Enum = pa.Field(dtype_kwargs={"categories": age_grp_enum.categories}) # pyright: ignore [reportAny] + tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = [ + "state_name", + "county_name", + "county_fips", + "year", + "age_grp", + ] + + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) + + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.col("county_fips") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")) + .all() + ) + .collect() + .item() + is True + ) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_county_age_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + county_age_pop(year, vintage_year=vintage_year).collect().pipe( + CountyAgePopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + county_age_pop(year, vintage_year=vintage_year) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def test_county_age_pop_as_pandas(year: int) -> None: + df = county_age_pop(year, as_pandas=True) + + assert 
isinstance(df, DataFrame) + + +def test_county_age_pop_invalid_vintage_year_exception() -> None: + with pytest.raises(ValueError, match="^Must choose a vintage year between"): + county_age_pop(2023, vintage_year=2000) # pyright: ignore [reportArgumentType] + + +class CountySexPopulation(BasePolarsModel): + state_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_name: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + year: pl.Int64 # pyright: ignore [reportUninitializedInstanceVariable] + sex: pl.Enum = pa.Field(dtype_kwargs={"categories": sex_enum.categories}) # pyright: ignore [reportAny] + tot_pop: pl.Int64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = ["state_name", "county_name", "county_fips", "year", "sex"] + + @pa.check("year") + def all_identical(cls, data: PolarsData) -> pl.LazyFrame: + return data.lazyframe.select((pl.col(data.key).n_unique() == 1).all()) + + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.col("county_fips") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")) + .all() + ) + .collect() + .item() + is True + ) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +@pytest.mark.parametrize( + ("vintage_year"), + range(2016, 2025), +) +def test_county_sex_pop(year: int, vintage_year: VintageYear) -> None: + if vintage_year <= 2020: + year_lb = 2010 + else: + year_lb = 2020 + + if year_lb <= year <= vintage_year: + county_sex_pop(year, vintage_year=vintage_year).collect().pipe( + CountySexPopulation.validate, lazy=True + ) + else: + with pytest.raises(ValueError, match="^Must choose a year between"): + county_sex_pop(year, vintage_year=vintage_year) + + +@pytest.mark.parametrize( + ("year"), + range(2010, 2025), +) +def 
def test_county_sex_pop_invalid_vintage_year_exception() -> None:
    """``county_sex_pop`` must reject vintage year 2000 with a ``ValueError``."""
    expected_msg = "^Must choose a vintage year between"
    with pytest.raises(ValueError, match=expected_msg):
        county_sex_pop(2023, vintage_year=2000)  # pyright: ignore [reportArgumentType]


# Schema that ``test_county_race_pop`` validates ``county_race_pop`` output against.
# One row per (state_name, county_name, county_fips, year, race); ``tot_pop`` must
# be non-negative and ``race`` is an Enum built from ``race_enum_incl_hispanic``.
# NOTE(review): these are the same categories CountyRaceHispanicPopulation uses --
# confirm that is intended for calls made without ``incl_hispanic_orig``.
class CountyRacePopulation(BasePolarsModel):
    state_name: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
    county_name: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
    county_fips: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
    year: pl.Int64  # pyright: ignore [reportUninitializedInstanceVariable]
    race: pl.Enum = pa.Field(  # pyright: ignore [reportAny]
        dtype_kwargs={"categories": race_enum_incl_hispanic.categories}
    )
    tot_pop: pl.Int64 = pa.Field(ge=0)  # pyright: ignore [reportAny]

    class Config:  # pyright: ignore [reportIncompatibleVariableOverride]
        # Column combination that must not repeat across rows.
        unique: list[str] = [
            "state_name",
            "county_name",
            "county_fips",
            "year",
            "race",
        ]

    @pa.check("year")
    def all_identical(cls, data: PolarsData) -> pl.LazyFrame:
        # Passes when the checked column holds exactly one distinct value.
        single_value = pl.col(data.key).n_unique() == 1
        return data.lazyframe.select(single_value.all())

    @pa.dataframe_check
    def has_correct_states(cls, data: PolarsData) -> bool:
        # The two-character prefix of every county_fips must lie between "01" and "56".
        fips_prefix = pl.col("county_fips").str.slice(0, 2)
        prefix_in_range = fips_prefix.is_between(pl.lit("01"), pl.lit("56")).all()
        verdict = data.lazyframe.select(prefix_in_range).collect().item()
        return verdict is True


@pytest.mark.parametrize("year", range(2010, 2025))
@pytest.mark.parametrize("vintage_year", range(2016, 2025))
def test_county_race_pop(year: int, vintage_year: VintageYear) -> None:
    """Validate ``county_race_pop`` for every year/vintage pairing.

    Vintages up to 2020 are expected to accept years from 2010 onwards, later
    vintages from 2020 onwards; in both cases the year may not exceed the
    vintage. Any other pairing must raise ``ValueError``.
    """
    earliest_year = 2010 if vintage_year <= 2020 else 2020

    if not (earliest_year <= year <= vintage_year):
        with pytest.raises(ValueError, match="^Must choose a year between"):
            county_race_pop(year, vintage_year=vintage_year)
        return

    estimates = county_race_pop(year, vintage_year=vintage_year).collect()
    CountyRacePopulation.validate(estimates, lazy=True)


@pytest.mark.parametrize("year", range(2010, 2025))
def test_county_race_pop_as_pandas(year: int) -> None:
    """``as_pandas=True`` must hand back a ``DataFrame`` instance."""
    result = county_race_pop(year, as_pandas=True)
    assert isinstance(result, DataFrame)


def test_county_race_pop_invalid_vintage_year_exception() -> None:
    """``county_race_pop`` must reject vintage year 2000 with a ``ValueError``."""
    expected_msg = "^Must choose a vintage year between"
    with pytest.raises(ValueError, match=expected_msg):
        county_race_pop(2023, vintage_year=2000)  # pyright: ignore [reportArgumentType]


# Schema that ``test_county_race_hispanic_pop`` validates against, i.e. the output
# of ``county_race_pop(..., incl_hispanic_orig=True)``. One row per
# (state_name, county_name, county_fips, year, race); ``tot_pop`` must be
# non-negative and ``race`` is an Enum built from ``race_enum_incl_hispanic``.
class CountyRaceHispanicPopulation(BasePolarsModel):
    state_name: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
    county_name: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
    county_fips: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
    year: pl.Int64  # pyright: ignore [reportUninitializedInstanceVariable]
    race: pl.Enum = pa.Field(  # pyright: ignore [reportAny]
        dtype_kwargs={"categories": race_enum_incl_hispanic.categories}
    )
    tot_pop: pl.Int64 = pa.Field(ge=0)  # pyright: ignore [reportAny]

    class Config:  # pyright: ignore [reportIncompatibleVariableOverride]
        # Column combination that must not repeat across rows.
        unique: list[str] = [
            "state_name",
            "county_name",
            "county_fips",
            "year",
            "race",
        ]

    @pa.check("year")
    def all_identical(cls, data: PolarsData) -> pl.LazyFrame:
        # Passes when the checked column holds exactly one distinct value.
        single_value = pl.col(data.key).n_unique() == 1
        return data.lazyframe.select(single_value.all())

    @pa.dataframe_check
    def has_correct_states(cls, data: PolarsData) -> bool:
        # The two-character prefix of every county_fips must lie between "01" and "56".
        fips_prefix = pl.col("county_fips").str.slice(0, 2)
        prefix_in_range = fips_prefix.is_between(pl.lit("01"), pl.lit("56")).all()
        verdict = data.lazyframe.select(prefix_in_range).collect().item()
        return verdict is True


@pytest.mark.parametrize("year", range(2010, 2025))
@pytest.mark.parametrize("vintage_year", range(2016, 2025))
def test_county_race_hispanic_pop(year: int, vintage_year: VintageYear) -> None:
    """Validate ``county_race_pop(..., incl_hispanic_orig=True)`` for every pairing.

    Vintages up to 2020 are expected to accept years from 2010 onwards, later
    vintages from 2020 onwards; in both cases the year may not exceed the
    vintage. Any other pairing must raise ``ValueError``.
    """
    earliest_year = 2010 if vintage_year <= 2020 else 2020

    if not (earliest_year <= year <= vintage_year):
        with pytest.raises(ValueError, match="^Must choose a year between"):
            county_race_pop(year, vintage_year=vintage_year, incl_hispanic_orig=True)
        return

    estimates = county_race_pop(
        year, vintage_year=vintage_year, incl_hispanic_orig=True
    ).collect()
    CountyRaceHispanicPopulation.validate(estimates, lazy=True)


@pytest.mark.parametrize("year", range(2010, 2025))
def test_county_race_hispanic_pop_as_pandas(year: int) -> None:
    """``as_pandas=True`` must hand back a ``DataFrame`` instance."""
    result = county_race_pop(year, as_pandas=True, incl_hispanic_orig=True)
    assert isinstance(result, DataFrame)


def test_county_race_hispanic_pop_invalid_vintage_year_exception() -> None:
    """Vintage year 2000 must be rejected when ``incl_hispanic_orig=True`` too."""
    expected_msg = "^Must choose a vintage year between"
    with pytest.raises(ValueError, match=expected_msg):
        county_race_pop(2023, vintage_year=2000, incl_hispanic_orig=True)  # pyright: ignore [reportArgumentType]


# Schema that ``test_county_age_sex_pop`` validates ``county_age_sex_pop`` output
# against. One row per (state_name, county_name, county_fips, year, age_grp, sex);
# ``tot_pop`` must be non-negative, and ``age_grp`` / ``sex`` are Enums built from
# ``age_grp_enum`` / ``sex_enum`` respectively.
class CountyAgeSexPopulation(BasePolarsModel):
    state_name: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
    county_name: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
    county_fips: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
    year: pl.Int64  # pyright: ignore [reportUninitializedInstanceVariable]
    age_grp: pl.Enum = pa.Field(dtype_kwargs={"categories": age_grp_enum.categories})  # pyright: ignore [reportAny]
    sex: pl.Enum = pa.Field(dtype_kwargs={"categories": sex_enum.categories})  # pyright: ignore [reportAny]
    tot_pop: pl.Int64 = pa.Field(ge=0)  # pyright: ignore [reportAny]

    class Config:  # pyright: ignore [reportIncompatibleVariableOverride]
        # Column combination that must not repeat across rows.
        unique: list[str] = [
            "state_name",
            "county_name",
            "county_fips",
            "year",
            "age_grp",
            "sex",
        ]

    @pa.check("year")
    def all_identical(cls, data: PolarsData) -> pl.LazyFrame:
        # Passes when the checked column holds exactly one distinct value.
        single_value = pl.col(data.key).n_unique() == 1
        return data.lazyframe.select(single_value.all())

    @pa.dataframe_check
    def has_correct_states(cls, data: PolarsData) -> bool:
        # The two-character prefix of every county_fips must lie between "01" and "56".
        fips_prefix = pl.col("county_fips").str.slice(0, 2)
        prefix_in_range = fips_prefix.is_between(pl.lit("01"), pl.lit("56")).all()
        verdict = data.lazyframe.select(prefix_in_range).collect().item()
        return verdict is True


@pytest.mark.parametrize("year", range(2010, 2025))
@pytest.mark.parametrize("vintage_year", range(2016, 2025))
def test_county_age_sex_pop(year: int, vintage_year: VintageYear) -> None:
    """Validate ``county_age_sex_pop`` for every year/vintage pairing.

    Vintages up to 2020 are expected to accept years from 2010 onwards, later
    vintages from 2020 onwards; in both cases the year may not exceed the
    vintage. Any other pairing must raise ``ValueError``.
    """
    earliest_year = 2010 if vintage_year <= 2020 else 2020

    if not (earliest_year <= year <= vintage_year):
        with pytest.raises(ValueError, match="^Must choose a year between"):
            county_age_sex_pop(year, vintage_year=vintage_year)
        return

    estimates = county_age_sex_pop(year, vintage_year=vintage_year).collect()
    CountyAgeSexPopulation.validate(estimates, lazy=True)


@pytest.mark.parametrize("year", range(2010, 2025))
def test_county_age_sex_pop_as_pandas(year: int) -> None:
    """``as_pandas=True`` must hand back a ``DataFrame`` instance."""
    result = county_age_sex_pop(year, as_pandas=True)
    assert isinstance(result, DataFrame)


def test_county_age_sex_pop_invalid_vintage_year_exception() -> None:
    """``county_age_sex_pop`` must reject vintage year 2000 with a ``ValueError``."""
    expected_msg = "^Must choose a vintage year between"
    with pytest.raises(ValueError, match=expected_msg):
        county_age_sex_pop(2023, vintage_year=2000)  # pyright: ignore [reportArgumentType]