Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -149,11 +149,15 @@ test-schema:
--include rdf \
-d tmp $(SOURCE_SCHEMA_PATH)

# Run the Python tests: `unittest` discovery, then doctests for selected modules,
# then the `pytest`-based tests in `tests/test_data.py`.
#
# Note: The final command in this recipe uses `pytest` to run tests. However,
#       `pytest` is not currently listed as a dependency in `pyproject.toml`.
# TODO: Add `pytest` as a development dependency in `pyproject.toml`.
test-python:
	$(RUN) python -m unittest discover
	$(RUN) python -m doctest nmdc_schema/nmdc_data.py
	$(RUN) python -m doctest nmdc_schema/id_helpers.py
	$(RUN) python -m doctest src/scripts/make_typecode_to_class_map.py
	$(RUN) python -m pytest -v tests/test_data.py

lint:
$(RUN) linkml-lint $(SOURCE_SCHEMA_PATH) > local/lint.log
Expand Down
115 changes: 115 additions & 0 deletions src/data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
"""Sample data, and functions that facilitate accessing that sample data.

Note: This module was initialized as a copy/paste/adapt of the `src/sample_data/__init__.py` module in
another repository containing a LinkML schema—i.e. the `bertron-schema` repository—linked here:
https://github.com/ber-data/bertron-schema/blob/445b359ee6dcd7472dfe19e86db50f56536bf645/src/sample_data/__init__.py
"""

import json
from importlib import resources
from importlib.abc import Traversable
from pathlib import Path
from typing import Any

import yaml

# Import path of the Python package that _contains_ the `invalid/` and `valid/`
# directories (i.e. the `{something}` in `import {something}`). At present, that
# package is the directory holding this `__init__.py` file.
PACKAGE_IMPORT_PATH: str = "data"

# Lookup table: file extension (without the leading dot) -> function that parses
# the text content of a file having that extension. Both YAML spellings share
# one parser.
PARSERS_BY_FILE_EXTENSION: dict[str, Any] = {
    **dict.fromkeys(("yaml", "yml"), yaml.safe_load),
    "json": json.loads,
}


def _get_traversable() -> Traversable:
    """Return the root `Traversable` of the `data/` package.

    Resources bundled inside the package are reached through the returned
    object. It is resolved via `importlib.resources.files`, which is intended to
    work both from this package's source tree and from an installed distribution.

    Returns:
        A `Traversable` object for the package named by `PACKAGE_IMPORT_PATH`.

    References:
        - https://docs.python.org/3/library/importlib.resources.html#importlib.resources.files
        - https://docs.python.org/3/library/importlib.resources.abc.html#importlib.resources.abc.Traversable

    """
    # The result is suitable for passing to `resources.as_file()`.
    package_root = resources.files(PACKAGE_IMPORT_PATH)
    return package_root


def get_sample_data_file_paths() -> list[str]:
    """List the paths to all available sample data files.

    These are the paths that can be passed to the `get_sample_data` and
    `get_sample_data_text` functions.

    The `data/` package is searched recursively for entries whose extension is
    one of the keys of `PARSERS_BY_FILE_EXTENSION`. Only regular files are
    reported; a directory whose name happens to end in a supported extension
    is ignored.

    Returns:
        A sorted list of file paths (relative to the `data/` directory) to all
        ".yaml", ".yml", and ".json" files residing within `data/`.

    """
    traversable = _get_traversable()
    with resources.as_file(traversable) as root:
        # Build the relative paths inside the `with` block: the path yielded by
        # `as_file` is only guaranteed to be usable while the context is open.
        relative_paths = [
            str(candidate.relative_to(root))
            for extension in PARSERS_BY_FILE_EXTENSION
            for candidate in root.glob(f"**/*.{extension}")
            # `glob` matches directories too; keep only actual files.
            if candidate.is_file()
        ]
    return sorted(relative_paths)


def get_sample_data_text(file_path: str, encoding: str = "utf-8") -> str:
    """Read a sample data file and return its content as text.

    Args:
        file_path: The path to the sample data file, relative to the `data/` directory.
        encoding: The text encoding to use when reading the file.

    Returns:
        The text content of the specified sample data file.

    References:
        - https://docs.python.org/3/library/importlib.resources.html#importlib.resources.as_file

    """
    package_root = _get_traversable()
    resource = package_root.joinpath(file_path)
    # `as_file` hands us a concrete filesystem path for the resource.
    with resources.as_file(resource) as concrete_path:
        text = concrete_path.read_text(encoding=encoding)
    return text


def get_sample_data(file_path: str, encoding: str = "utf-8") -> Any:  # noqa: ANN401
    """Parse a JSON/YAML-formatted sample data file into a Python value.

    The parser is chosen from `PARSERS_BY_FILE_EXTENSION` using the file's
    extension; the file is only read once a parser has been found.

    Args:
        file_path: The path to the sample data file, relative to the `data/` directory.
        encoding: The text encoding to use when reading the file.

    Returns:
        The content of the specified sample data file, parsed into a Python value
        (in practice this is typically a dictionary or list).

    Raises:
        ValueError: If the file's extension has no registered parser.

    """
    # Look up the parser registered for this extension (".yaml" -> "yaml").
    extension = Path(file_path).suffix.lstrip(".")
    parser = PARSERS_BY_FILE_EXTENSION.get(extension)

    # Guard clause: bail out before touching the file when the type is unsupported.
    if parser is None:
        # Note: `!r` applies `repr()` to the value, so the path appears in quotes.
        msg = f"Filename extension suggests an unsupported file type: {file_path!r}"
        raise ValueError(msg)

    return parser(get_sample_data_text(file_path, encoding=encoding))
97 changes: 97 additions & 0 deletions tests/test_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
"""Tests targeting functions that expose sample data."""

import json
from collections.abc import Generator
from pathlib import Path
from tempfile import TemporaryDirectory

import pytest
import yaml

# TODO: Consider reconfiguring the package so that people can import the sample data
# getter functions from the familiar `nmdc_schema` package, rather than from `src`.
from src.data import get_sample_data, get_sample_data_file_paths, get_sample_data_text


@pytest.fixture
def sample_json_content() -> str:
    """Fixture that returns the text content of a sample JSON file.

    The text holds a single JSON object with three string-valued keys
    ("id", "name", "primary email") and one integer-valued key ("age_in_years").
    The newline right after the opening quotes and the one before the closing
    quotes are part of the returned text.
    """
    return r"""
{
"id": "001",
"name": "foo bar",
"primary email": "[email protected]",
"age_in_years": 33
}
"""

@pytest.fixture
def sample_yaml_content() -> str:
    """Fixture that returns the text content of a sample YAML file.

    The text consists of a YAML comment line, a `---` document-start marker,
    and four top-level keys mirroring those of the sample JSON fixture. The
    newline right after the opening quotes is part of the returned text.
    """
    return r"""
# Some YAML documents begin with "front matter".
---
id: "001"
name: foo bar
primary email: [email protected]
age_in_years: 33
"""

@pytest.fixture(autouse=True)
def mock__get_traversable(
    monkeypatch: pytest.MonkeyPatch,
    sample_yaml_content: str,
    sample_json_content: str,
) -> Generator[None, None, None]:
    """Fixture that mocks the `src.data._get_traversable` helper function.

    This fixture (a) creates a temporary directory, (b) populates it with sample data files,
    and (c) patches the `_get_traversable` function so it returns a `Path` object pointing
    to that temporary directory. This decouples the tests from the contents of the real
    `data/` directory that the module-under-test accesses in production.

    Because the fixture is declared with `autouse=True`, it applies to every test in this
    module. The temporary directory is removed after the test finishes.

    Args:
        monkeypatch: Pytest's built-in fixture used to replace `_get_traversable`.
        sample_yaml_content: Text written to both `data.yaml` and `data.yml`.
        sample_json_content: Text written to `data.json`.

    Yields:
        Nothing; the fixture exists only for its setup and teardown side effects.

    Note: Every `Path` object is also a `Traversable` object.
    """
    with TemporaryDirectory() as temp_dir:
        temp_dir_path = Path(temp_dir)
        # Write using UTF-8 explicitly, matching the default encoding the getters read
        # with, instead of relying on the platform's locale encoding.
        (temp_dir_path / "data.json").write_text(sample_json_content, encoding="utf-8")
        (temp_dir_path / "data.yaml").write_text(sample_yaml_content, encoding="utf-8")
        (temp_dir_path / "data.yml").write_text(sample_yaml_content, encoding="utf-8")
        # A file with an unsupported suffix, which the path-listing getter must ignore.
        (temp_dir_path / "data.txt").write_text("some text", encoding="utf-8")
        monkeypatch.setattr("src.data._get_traversable", lambda: temp_dir_path)
        # Yield inside the `with` block so the directory outlives the test body.
        yield


def test_get_sample_data_file_paths_returns_list_of_file_paths_supported() -> None:
    """Check that `get_sample_data_file_paths` lists exactly the supported files.

    The autouse fixture also creates `data.txt`; it must not appear in the result.
    """
    expected_paths = ["data.json", "data.yaml", "data.yml"]
    assert get_sample_data_file_paths() == expected_paths


def test_get_sample_data_text_returns_expected_sample_data_as_string(
    sample_json_content: str,
    sample_yaml_content: str,
) -> None:
    """Test that `get_sample_data_text` returns the sample data we expect, as a string.

    Each supported file written by the autouse fixture is checked explicitly, so the
    test cannot pass vacuously if file discovery were to return no (or other) paths.
    """
    expected_text_by_path = {
        "data.json": sample_json_content,
        "data.yaml": sample_yaml_content,
        "data.yml": sample_yaml_content,
    }
    for path, expected_text in expected_text_by_path.items():
        assert get_sample_data_text(path) == expected_text


def test_get_sample_data_returns_sample_data_as_python_object(
    sample_json_content: str,
    sample_yaml_content: str,
) -> None:
    """Test that `get_sample_data` returns sample data as a Python object.

    Each supported file written by the autouse fixture is checked explicitly, so the
    test cannot pass vacuously if file discovery were to return no (or other) paths.
    The expected values are produced by parsing the fixture text directly.
    """
    expected_yaml_value = yaml.safe_load(sample_yaml_content)
    expected_value_by_path = {
        "data.json": json.loads(sample_json_content),
        "data.yaml": expected_yaml_value,
        "data.yml": expected_yaml_value,
    }
    for path, expected_value in expected_value_by_path.items():
        assert get_sample_data(path) == expected_value


def test_get_sample_data_rejects_unsupported_filename_extensions() -> None:
    """Check that `get_sample_data` raises `ValueError` for an unsupported extension."""
    unsupported_file_path = "my_file.txt"
    with pytest.raises(ValueError, match=r"^Filename extension"):
        get_sample_data(unsupported_file_path)