# Patch summary (reviewer preamble; placed ahead of the first "diff --git" header).
#
# Purpose: centralise detection of "missing" CSV cell values instead of
# comparing against the literal string "NA" at each call site.
#
# Files touched, as shown by the diff headers below:
#   * src/dcd_mapping/resource_utils.py
#       - adds the module constant MISSING_VALUE_REPRESENTATIONS, a frozenset of
#         thirteen string spellings (including "", "-" and ".").
#       - adds is_missing_value(value), which returns True when value is None or
#         when value.strip() is a member of that frozenset.
#   * src/dcd_mapping/mavedb_data.py
#       - imports is_missing_value and uses it in _load_scoreset_records for the
#         "score", "hgvs_nt" and "hgvs_pro" columns.
#   * src/dcd_mapping/vrs_map.py
#       - imports is_missing_value and uses it in _map_protein_coding_pro,
#         _hgvs_nt_is_valid and _hgvs_pro_is_valid; "NA" is dropped from the
#         literal sets there because the helper already covers it.
#
# Behaviour visible in the hunks:
#   * In _hgvs_nt_is_valid, _hgvs_pro_is_valid and _map_protein_coding_pro the
#     is_missing_value(...) test is evaluated before len(...), so a None value
#     short-circuits and never reaches len().
#   * Values that were previously passed through (for example "" or "nan") are
#     now classified as missing.
#
# NOTE(review): membership is an exact, case-sensitive match after strip();
#   spellings not enumerated in the set (e.g. "Na", "NAN", "Null") are not
#   treated as missing -- confirm whether that is intended.
# NOTE(review): "-" and "." are treated as missing for every column the helper
#   is applied to -- confirm no legitimate score/hgvs value is exactly one of
#   those strings.
# NOTE(review): the unchanged context `else: row["score"] = row["score"]` in
#   _load_scoreset_records assigns the value to itself; presumably a leftover.
# NOTE(review): in _load_scoreset_records neither visible branch assigns
#   `prefix` when both hgvs_nt and hgvs_pro are missing; the remainder of the
#   chain is outside this hunk -- verify against the full file.
# NOTE(review): is_missing_value calls value.strip(), so a non-str, non-None
#   argument would raise AttributeError -- confirm callers only pass str/None.
# NOTE(review): line breaks and indentation of the patch appear to be collapsed
#   in this copy -- presumably it will not apply as-is; confirm against the
#   original patch file.
diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index 1dad2b2..ed0606b 100644 --- a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -27,6 +27,7 @@ MAVEDB_BASE_URL, authentication_header, http_download, + is_missing_value, ) from dcd_mapping.schemas import ( ScoreRow, @@ -246,13 +247,13 @@ def _load_scoreset_records( with path.open() as csvfile: reader = csv.DictReader(csvfile) for row in reader: - if row["score"] == "NA": + if is_missing_value(row["score"]): row["score"] = None else: row["score"] = row["score"] - if row["hgvs_nt"] != "NA": + if not is_missing_value(row["hgvs_nt"]): prefix = row["hgvs_nt"].split(":")[0] if ":" in row["hgvs_nt"] else None - elif row["hgvs_pro"] != "NA": + elif not is_missing_value(row["hgvs_pro"]): prefix = ( row["hgvs_pro"].split(":")[0] if ":" in row["hgvs_pro"] else None ) diff --git a/src/dcd_mapping/resource_utils.py b/src/dcd_mapping/resource_utils.py index ef29402..b72cc56 100644 --- a/src/dcd_mapping/resource_utils.py +++ b/src/dcd_mapping/resource_utils.py @@ -10,6 +10,25 @@ _logger = logging.getLogger(__name__) +# Common representations of missing/null data in CSV files +MISSING_VALUE_REPRESENTATIONS = frozenset( + { + "NA", + "N/A", + "na", + "n/a", + "NaN", + "nan", + "null", + "NULL", + "None", + "none", + "", + "-", + ".", + } +) + MAVEDB_API_KEY = os.environ.get("MAVEDB_API_KEY") MAVEDB_BASE_URL = os.environ.get("MAVEDB_BASE_URL") ENSEMBL_API_URL = os.environ.get("ENSEMBL_API_URL", "https://rest.ensembl.org") # TODO @@ -24,6 +43,22 @@ LOCAL_STORE_PATH.mkdir(exist_ok=True, parents=True) +def is_missing_value(value: str | None) -> bool: + """Check if a value represents missing/null data. + + This function recognizes multiple common representations of missing data + that may appear in CSV files from external sources, making the codebase + more resilient to upstream changes in NA representation. 
+ + :param value: The value to check + :return: True if the value represents missing data, False otherwise + """ + if value is None: + return True + # Strip whitespace and check against known missing value representations + return value.strip() in MISSING_VALUE_REPRESENTATIONS + + def authentication_header() -> dict | None: """Fetch with api key envvar, if available.""" return {"X-API-key": MAVEDB_API_KEY} if MAVEDB_API_KEY is not None else None diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index ea2bf25..b36e043 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -32,7 +32,7 @@ get_seqrepo, translate_hgvs_to_vrs, ) -from dcd_mapping.resource_utils import request_with_backoff +from dcd_mapping.resource_utils import is_missing_value, request_with_backoff from dcd_mapping.schemas import ( AlignmentResult, MappedScore, @@ -378,7 +378,11 @@ def _map_protein_coding_pro( :param transcript: The transcript selection information for a score set :return: VRS mapping object if mapping succeeds """ - if row.hgvs_pro in {"_wt", "_sy", "NA"} or len(row.hgvs_pro) == 3: + if ( + row.hgvs_pro in {"_wt", "_sy"} + or is_missing_value(row.hgvs_pro) + or len(row.hgvs_pro) == 3 + ): _logger.warning( "Can't process variant syntax %s for %s", row.hgvs_pro, row.accession ) @@ -700,7 +704,7 @@ def _hgvs_nt_is_valid(hgvs_nt: str) -> bool: :return: True if expression appears populated and valid """ return ( - (hgvs_nt != "NA") + (not is_missing_value(hgvs_nt)) and (hgvs_nt not in {"_wt", "_sy", "="}) and (len(hgvs_nt) != 3) ) @@ -713,7 +717,8 @@ def _hgvs_pro_is_valid(hgvs_pro: str) -> bool: :return: True if expression appears populated and valid """ return ( - (hgvs_pro not in {"_wt", "_sy", "NA"}) + (hgvs_pro not in {"_wt", "_sy"}) + and (not is_missing_value(hgvs_pro)) and (len(hgvs_pro) != 3) and ("fs" not in hgvs_pro) )