diff --git a/Makefile b/Makefile
index 694eea7d04..6577beb0cb 100644
--- a/Makefile
+++ b/Makefile
@@ -149,11 +149,15 @@ test-schema:
 		--include rdf \
 		-d tmp $(SOURCE_SCHEMA_PATH)
 
+# Note: The final command in this recipe uses `pytest` to run tests. However,
+#       `pytest` is not currently listed as a dependency in `pyproject.toml`.
+# TODO: Add `pytest` as a development dependency in `pyproject.toml`.
 test-python:
 	$(RUN) python -m unittest discover
 	$(RUN) python -m doctest nmdc_schema/nmdc_data.py
 	$(RUN) python -m doctest nmdc_schema/id_helpers.py
 	$(RUN) python -m doctest src/scripts/make_typecode_to_class_map.py
+	$(RUN) python -m pytest -v tests/test_data.py
 
 lint:
 	$(RUN) linkml-lint $(SOURCE_SCHEMA_PATH) > local/lint.log
diff --git a/src/data/__init__.py b/src/data/__init__.py
new file mode 100644
index 0000000000..7340870b6d
--- /dev/null
+++ b/src/data/__init__.py
@@ -0,0 +1,125 @@
+"""Sample data, and functions that facilitate accessing that sample data.
+
+Note: This module was initialized as a copy/paste/adapt of the `src/sample_data/__init__.py` module in
+      another repository containing a LinkML schema—i.e. the `bertron-schema` repository—linked here:
+      https://github.com/ber-data/bertron-schema/blob/445b359ee6dcd7472dfe19e86db50f56536bf645/src/sample_data/__init__.py
+"""
+
+import json
+from importlib import resources
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+try:
+    # Python 3.11+ provides `Traversable` in `importlib.resources.abc`.
+    from importlib.resources.abc import Traversable
+except ImportError:
+    # Older interpreters only have the `importlib.abc` alias, which was
+    # deprecated in Python 3.12 and removed in Python 3.14.
+    from importlib.abc import Traversable
+
+# Define the path someone could use to `import` the Python package _containing_ the
+# `invalid/` and `valid/` directories (e.g. `import {something}`); which, currently,
+# happens to be the directory containing this `__init__.py` file.
+# NOTE(review): `resources.files()` resolves this name through the import system, so it
+#       presumably only works when `data` is importable as a top-level package. Confirm
+#       against the packaging configuration; `__name__` would follow the actual import.
+PACKAGE_IMPORT_PATH = "data"
+
+# Define a mapping from file extension to a function that can be used to parse the
+# content of a file having that extension.
+PARSERS_BY_FILE_EXTENSION = {
+    "yaml": yaml.safe_load,
+    "yml": yaml.safe_load,
+    "json": json.loads,
+}
+
+
+def _get_traversable() -> Traversable:
+    """Get a `Traversable` object for the `data/` package.
+
+    The `Traversable` object can be used to access resources contained
+    within the `data` package, whether this function is called
+    from this package's source tree, or in an installed distribution.
+
+    Returns:
+        A `Traversable` object for the `data/` package.
+
+    References:
+    - https://docs.python.org/3/library/importlib.resources.html#importlib.resources.files
+    - https://docs.python.org/3/library/importlib.resources.abc.html#importlib.resources.abc.Traversable
+
+    """
+    # Create a `Traversable` object that can be passed to the `resources.as_file()` function.
+    return resources.files(PACKAGE_IMPORT_PATH)
+
+
+def get_sample_data_file_paths() -> list[str]:
+    """List the paths to all available sample data files.
+
+    These are the paths that can be passed to the `get_sample_data` and
+    `get_sample_data_text` functions.
+
+    Returns:
+        A sorted list of POSIX-style file paths (relative to the `data/` directory)
+        to all ".yaml", ".yml", and ".json" files residing within `data/`.
+
+    """
+    # Expose the package as a real directory so that it can be searched with `glob`.
+    with resources.as_file(_get_traversable()) as path:
+        paths = [
+            p.relative_to(path).as_posix()  # "/" separators on every OS
+            for ext in PARSERS_BY_FILE_EXTENSION
+            for p in path.glob(f"**/*.{ext}")
+            if p.is_file()  # skip directories whose names match a pattern
+        ]
+    return sorted(paths)
+
+
+def get_sample_data_text(file_path: str, encoding: str = "utf-8") -> str:
+    """Get the text content of a sample data file.
+
+    Args:
+        file_path: The path to the sample data file, relative to the `data/` directory.
+        encoding: The text encoding to use when reading the file.
+
+    Returns:
+        The text content of the specified sample data file.
+
+    References:
+    - https://docs.python.org/3/library/importlib.resources.abc.html#importlib.resources.abc.Traversable.read_text
+
+    """
+    # A `Traversable` can read itself, so no temporary on-disk copy is needed.
+    traversable = _get_traversable().joinpath(file_path)
+    return traversable.read_text(encoding=encoding)
+
+
+def get_sample_data(file_path: str, encoding: str = "utf-8") -> Any:  # noqa: ANN401
+    """Get a Python value representing the content of a JSON/YAML-formatted sample data file.
+
+    Args:
+        file_path: The path to the sample data file, relative to the `data/` directory.
+        encoding: The text encoding to use when reading the file.
+
+    Returns:
+        The content of the specified sample data file, parsed into a Python value
+        (in practice this is typically a dictionary or list).
+
+    Raises:
+        ValueError: If the file name's extension has no registered parser.
+
+    """
+    # Determine which parsing function we will use, based upon the file's extension.
+    file_extension = Path(file_path).suffix.lstrip(".")  # ".yaml" -> "yaml"
+    parse = PARSERS_BY_FILE_EXTENSION.get(file_extension)
+    if parse is None:
+        # Reject the file before reading it. The `!r` below calls `repr()` on the
+        # path, so the path appears wrapped in quotes within the message.
+        msg = f"Filename extension suggests an unsupported file type: {file_path!r}"
+        raise ValueError(msg)
+
+    text = get_sample_data_text(file_path, encoding=encoding)
+    return parse(text)
diff --git a/tests/test_data.py b/tests/test_data.py
new file mode 100644
index 0000000000..b7e84770bd
--- /dev/null
+++ b/tests/test_data.py
@@ -0,0 +1,97 @@
+"""Tests targeting functions that expose sample data."""
+
+import json
+from collections.abc import Generator
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import pytest
+import yaml
+
+# TODO: Consider reconfiguring the package so that people can import the sample data
+#       getter functions from the familiar `nmdc_schema` package, rather than from `src`.
+from src.data import get_sample_data, get_sample_data_file_paths, get_sample_data_text
+
+
+@pytest.fixture
+def sample_json_content() -> str:
+    """Fixture that returns the text content of a sample JSON file."""
+    return r"""
+{
+    "id": "001",
+    "name": "foo bar",
+    "primary email": "foo.bar@example.com",
+    "age_in_years": 33
+}
+"""
+
+
+@pytest.fixture
+def sample_yaml_content() -> str:
+    """Fixture that returns the text content of a sample YAML file."""
+    return r"""
+# Some YAML documents begin with "front matter".
+---
+id: "001"
+name: foo bar
+primary email: foo.bar@example.com
+age_in_years: 33
+"""
+
+
+@pytest.fixture(autouse=True)
+def mock__get_traversable(
+    monkeypatch: pytest.MonkeyPatch,
+    sample_yaml_content: str,
+    sample_json_content: str,
+) -> Generator[None, None, None]:
+    """Fixture that mocks the `src.data._get_traversable` helper function.
+
+    This fixture (a) creates a temporary directory, (b) populates it with sample data files,
+    and (c) patches the `_get_traversable` function so it returns a `Path` object pointing
+    to that temporary directory. This decouples the tests from the contents of the real
+    `data/` directory that the module-under-test accesses in production.
+
+    Note: All `Path` objects are also `Traversable` objects.
+    """
+    with TemporaryDirectory() as temp_dir:
+        temp_dir_path = Path(temp_dir)
+        (temp_dir_path / "data.json").write_text(sample_json_content, encoding="utf-8")
+        (temp_dir_path / "data.yaml").write_text(sample_yaml_content, encoding="utf-8")
+        (temp_dir_path / "data.yml").write_text(sample_yaml_content, encoding="utf-8")
+        (temp_dir_path / "data.txt").write_text("some text", encoding="utf-8")  # unsupported suffix
+        monkeypatch.setattr("src.data._get_traversable", lambda: temp_dir_path)
+        yield None
+
+
+def test_get_sample_data_file_paths_returns_list_of_file_paths_supported() -> None:
+    """Test that `get_sample_data_file_paths` returns a list of the file paths we support."""
+    assert get_sample_data_file_paths() == ["data.json", "data.yaml", "data.yml"]
+
+
+def test_get_sample_data_text_returns_expected_sample_data_as_string(
+    sample_json_content: str,
+    sample_yaml_content: str,
+) -> None:
+    """Test that `get_sample_data_text` returns the sample data we expect, as a string."""
+    # Request every file by name so that no assertion can be silently skipped.
+    assert get_sample_data_text("data.json") == sample_json_content
+    for yaml_path in ("data.yaml", "data.yml"):
+        assert get_sample_data_text(yaml_path) == sample_yaml_content
+
+
+def test_get_sample_data_returns_sample_data_as_python_object(
+    sample_json_content: str,
+    sample_yaml_content: str,
+) -> None:
+    """Test that `get_sample_data` returns sample data as a Python object."""
+    # Request every file by name so that no assertion can be silently skipped.
+    assert get_sample_data("data.json") == json.loads(sample_json_content)
+    for yaml_path in ("data.yaml", "data.yml"):
+        assert get_sample_data(yaml_path) == yaml.safe_load(sample_yaml_content)
+
+
+def test_get_sample_data_rejects_unsupported_filename_extensions() -> None:
+    """Test that `get_sample_data` raises an exception for an unsupported filename extension."""
+    with pytest.raises(ValueError, match=r"^Filename extension"):
+        get_sample_data("my_file.txt")