Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions src/dcd_mapping/mavedb_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
MAVEDB_BASE_URL,
authentication_header,
http_download,
is_missing_value,
)
from dcd_mapping.schemas import (
ScoreRow,
Expand Down Expand Up @@ -246,13 +247,13 @@ def _load_scoreset_records(
with path.open() as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
if row["score"] == "NA":
if is_missing_value(row["score"]):
row["score"] = None
else:
row["score"] = row["score"]
if row["hgvs_nt"] != "NA":
if not is_missing_value(row["hgvs_nt"]):
prefix = row["hgvs_nt"].split(":")[0] if ":" in row["hgvs_nt"] else None
elif row["hgvs_pro"] != "NA":
elif not is_missing_value(row["hgvs_pro"]):
prefix = (
row["hgvs_pro"].split(":")[0] if ":" in row["hgvs_pro"] else None
)
Expand Down
35 changes: 35 additions & 0 deletions src/dcd_mapping/resource_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,25 @@

_logger = logging.getLogger(__name__)

# Spellings of "no value" that may show up in CSV cells. Membership tests
# against this set are case-sensitive, so each casing is listed explicitly.
MISSING_VALUE_REPRESENTATIONS = frozenset(
    [
        "",
        "-",
        ".",
        "N/A",
        "NA",
        "NULL",
        "NaN",
        "None",
        "n/a",
        "na",
        "nan",
        "none",
        "null",
    ]
)

# Service settings read from the process environment when this module is
# imported. The two MaveDB values are None if their variables are unset.
MAVEDB_API_KEY = os.getenv("MAVEDB_API_KEY")
MAVEDB_BASE_URL = os.getenv("MAVEDB_BASE_URL")
# Uses the default URL below when ENSEMBL_API_URL is not set.
ENSEMBL_API_URL = os.getenv("ENSEMBL_API_URL", "https://rest.ensembl.org")  # TODO
Expand All @@ -24,6 +43,22 @@
LOCAL_STORE_PATH.mkdir(exist_ok=True, parents=True)


def is_missing_value(value: str | None) -> bool:
"""Check if a value represents missing/null data.

This function recognizes multiple common representations of missing data
that may appear in CSV files from external sources, making the codebase
more resilient to upstream changes in NA representation.

:param value: The value to check
:return: True if the value represents missing data, False otherwise
"""
if value is None:
return True
# Strip whitespace and check against known missing value representations
return value.strip() in MISSING_VALUE_REPRESENTATIONS


def authentication_header() -> dict | None:
    """Build the API-key request header from ``MAVEDB_API_KEY``.

    :return: ``{"X-API-key": <key>}`` when ``MAVEDB_API_KEY`` is not None,
        otherwise ``None``
    """
    if MAVEDB_API_KEY is None:
        return None
    return {"X-API-key": MAVEDB_API_KEY}
Expand Down
13 changes: 9 additions & 4 deletions src/dcd_mapping/vrs_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
get_seqrepo,
translate_hgvs_to_vrs,
)
from dcd_mapping.resource_utils import request_with_backoff
from dcd_mapping.resource_utils import is_missing_value, request_with_backoff
from dcd_mapping.schemas import (
AlignmentResult,
MappedScore,
Expand Down Expand Up @@ -378,7 +378,11 @@ def _map_protein_coding_pro(
:param transcript: The transcript selection information for a score set
:return: VRS mapping object if mapping succeeds
"""
if row.hgvs_pro in {"_wt", "_sy", "NA"} or len(row.hgvs_pro) == 3:
if (
row.hgvs_pro in {"_wt", "_sy"}
or is_missing_value(row.hgvs_pro)
or len(row.hgvs_pro) == 3
):
_logger.warning(
"Can't process variant syntax %s for %s", row.hgvs_pro, row.accession
)
Expand Down Expand Up @@ -700,7 +704,7 @@ def _hgvs_nt_is_valid(hgvs_nt: str) -> bool:
:return: True if expression appears populated and valid
"""
return (
(hgvs_nt != "NA")
(not is_missing_value(hgvs_nt))
and (hgvs_nt not in {"_wt", "_sy", "="})
and (len(hgvs_nt) != 3)
)
Expand All @@ -713,7 +717,8 @@ def _hgvs_pro_is_valid(hgvs_pro: str) -> bool:
:return: True if expression appears populated and valid
"""
return (
(hgvs_pro not in {"_wt", "_sy", "NA"})
(hgvs_pro not in {"_wt", "_sy"})
and (not is_missing_value(hgvs_pro))
and (len(hgvs_pro) != 3)
and ("fs" not in hgvs_pro)
)
Expand Down