Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ test-schema:
-d tmp $(SOURCE_SCHEMA_PATH)

test-python:
$(RUN) python -m unittest discover
$(RUN) pytest
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make sure it's (pytest) declared as a dev dependency.

$(RUN) python -m doctest nmdc_schema/nmdc_data.py
$(RUN) python -m doctest nmdc_schema/id_helpers.py
$(RUN) python -m doctest src/scripts/make_typecode_to_class_map.py
Expand Down
4 changes: 2 additions & 2 deletions assets/yq-for-mixs_subset_modified.txt
Original file line number Diff line number Diff line change
Expand Up @@ -551,6 +551,8 @@ make: *** [Makefile:102: gen-project] Error 1
'.slots.occup_samp.annotations.storage_units = {"tag": "storage_units", "value": "1"}'
'.slots.freq_clean.annotations.storage_units = {"tag": "storage_units", "value": "1/d"}'
'.slots.freq_cook.annotations.storage_units = {"tag": "storage_units", "value": "1/d"}'
'.slots.exp_pipe.annotations.storage_units = {"tag": "storage_units", "value": "1"}'
'.slots.occup_density_samp.annotations.storage_units = {"tag": "storage_units", "value": "1/[sft_i]"}'

# Units alignment excuses for problematic MIxS specifications
'.slots.efficiency_percent.annotations.units_alignment_excuse = {"tag": "units_alignment_excuse", "value": "mixs_inconsistent"}'
Expand All @@ -560,8 +562,6 @@ make: *** [Makefile:102: gen-project] Error 1
'.slots.api.annotations.units_alignment_excuse = {"tag": "units_alignment_excuse", "value": "non_ucum_unit"}'

# Units alignment excuses for MIxS slots with zero occurrences in production data
'.slots.exp_pipe.annotations.units_alignment_excuse = {"tag": "units_alignment_excuse", "value": "pending_analysis"}'
'.slots.occup_density_samp.annotations.units_alignment_excuse = {"tag": "units_alignment_excuse", "value": "pending_analysis"}'
'.slots.soil_text_measure.annotations.units_alignment_excuse = {"tag": "units_alignment_excuse", "value": "pending_analysis"}'

# Complex measurement slots that combine numeric and text data
Expand Down
16 changes: 7 additions & 9 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -148,15 +148,13 @@ schema-pattern-linting = "src.scripts.schema_pattern_linting:main"
schema-view-relation-graph = "src.scripts.schema_view_relation_graph:cli"
scrutinize-elements = "src.scripts.scrutinize_elements:process_schema_elements"
slot-range-type-reporter = "src.scripts.slot_range_type_reporter:cli"
# Units analysis CLI tools (consolidated in units/ directory)
units-schema-extract = "units.scripts.schema_extract_preferred_units:main"
units-schema-convert = "units.scripts.schema_convert_to_ucum:main"
units-schema-generate = "units.scripts.schema_generate_yq_commands:main"
units-testdata-extract = "units.scripts.testdata_extract_quantity_values:main"
units-testdata-check = "units.scripts.testdata_check_has_unit:main"
units-ucum-validate = "units.scripts.ucum_validate_units:main"
units-mongodb-analyze = "units.scripts.mongodb_analyze_units:main"
units-schema-extract-slot-unit-pairs = "units.scripts.schema_expand_storage_units:main"
# Units analysis uses yq queries in units/Makefile (Python scripts removed)

[tool.pytest.ini_options]
# Only run tests in tests/ directory to match original unittest discover behavior
# (excludes MongoDB integration tests and other tests outside tests/)
# Module independence is checked by check_schema_self_containment.py instead
testpaths = ["tests"]

[tool.deptry]
extend_exclude = [
Expand Down
4 changes: 2 additions & 2 deletions src/data/valid/Biosample-possibly-exhaustive.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ exp_duct:
exp_pipe:
type: nmdc:QuantityValue
has_raw_value: xxx
has_unit: m2
has_unit: '1'
experimental_factor:
type: nmdc:ControlledTermValue
has_raw_value: unconstrained text, unlike the MIxS environmental triad
Expand Down Expand Up @@ -864,7 +864,7 @@ number_resident:
occup_density_samp: # Float
type: nmdc:QuantityValue
has_raw_value: xxx
has_unit: '1'
has_unit: '1/[sft_i]'
occup_document: estimate
occup_samp:
type: nmdc:QuantityValue
Expand Down
4 changes: 2 additions & 2 deletions src/data/valid/Database-interleaved.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1904,7 +1904,7 @@ biosample_set:
exp_pipe:
type: nmdc:QuantityValue
has_raw_value: xxx
has_unit: m2
has_unit: '1'
experimental_factor:
type: nmdc:ControlledTermValue
has_raw_value: unconstrained text, unlike the MIxS environmental triad
Expand Down Expand Up @@ -2292,7 +2292,7 @@ biosample_set:
occup_density_samp: # Float
type: nmdc:QuantityValue
has_raw_value: xxx
has_unit: '1'
has_unit: '1/[sft_i]'
occup_document: estimate
occup_samp:
type: nmdc:QuantityValue
Expand Down
6 changes: 6 additions & 0 deletions src/schema/attribute_values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -770,6 +770,12 @@ enums:
- per day
- daily frequency
description: The Unified Code for Units of Measure (UCUM) representation of per day (frequency).
"1/[sft_i]":
title: per square foot
aliases:
- per square foot
- occupants per square foot
description: The Unified Code for Units of Measure (UCUM) representation of per square foot.
"kHz":
aliases:
- kHz
Expand Down
5 changes: 3 additions & 2 deletions src/schema/core.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1524,10 +1524,11 @@ slots:
range: SubstanceRoleEnum
description: The role of a substance in a process
concentration:
abstract: true
range: QuantityValue
description: The concentration of a substance used in a process
annotations:
units_alignment_excuse: pending_analysis
comments:
- Union of child units from source_concentration and final_concentration is %|mmol/L|umol/L|mg/L|g/L
source_concentration:
is_a: concentration
description: When solutions A (containing substance X) and B are combined together, this slot captures the concentration of X in solution A
Expand Down
14 changes: 7 additions & 7 deletions src/schema/mixs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3955,9 +3955,9 @@ slots:
occurrence:
tag: occurrence
value: '1'
units_alignment_excuse:
tag: units_alignment_excuse
value: pending_analysis
storage_units:
tag: storage_units
value: "1"
description: The number of exposed pipes in the room
title: exposed pipes
examples:
Expand Down Expand Up @@ -6334,7 +6334,7 @@ slots:
occurrence:
tag: occurrence
value: m
description: Structured miscellaneous property assertions for this Biosample. Use when a value cannot cleanly fit an existing, policy-governed slot.
description: Structured miscellaneous property assertions. Use when a value cannot cleanly fit an existing, policy-governed slot.
title: miscellaneous parameter
examples:
- value: Bicarbonate ion concentration;2075 micromole per kilogram
Expand Down Expand Up @@ -6579,9 +6579,9 @@ slots:
occurrence:
tag: occurrence
value: '1'
units_alignment_excuse:
tag: units_alignment_excuse
value: pending_analysis
storage_units:
tag: storage_units
value: 1/[sft_i]
description: Average number of occupants at time of sampling per square footage
title: occupant density at sampling
examples:
Expand Down
4 changes: 2 additions & 2 deletions src/schema/nmdc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1037,9 +1037,9 @@ enums:

slots:
biomaterial_purity:
abstract: true
range: QuantityValue
annotations:
units_alignment_excuse: pending_analysis
description: A measure of the purity of a biomaterial sample

generates_calibration:
range: CalibrationInformation
Expand Down
37 changes: 26 additions & 11 deletions tests/test_units_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,37 +18,52 @@ class TestUnitsAlignment(unittest.TestCase):

def test_quantityvalue_slots_have_storage_units_or_excuse(self):
    """Test that all concrete QuantityValue slots have either storage_units or units_alignment_excuse.

    Expanded constraint: All concrete QuantityValue slots must have storage_units OR units_alignment_excuse.
    This ensures complete coverage of unit constraints for data validation.

    Abstract slots MUST NOT have storage_units or units_alignment_excuse since they are not
    directly instantiated - only their concrete children are used in actual data.

    Consumer behavior: units_alignment_excuse means "skip storage_units validation for documented reason".
    """
    schema_view = SchemaView(SCHEMA_FILE)
    problematic_slots = []
    abstract_slots_with_annotations = []

    for slot_name in schema_view.all_slots():
        slot = schema_view.get_slot(slot_name)
        if not slot:
            continue

        # Only slots whose declared range is QuantityValue are subject to this check.
        if slot.range != 'QuantityValue':
            continue

        # A slot without annotations is treated the same as one with an empty mapping.
        annotations = slot.annotations or {}
        has_storage_units = 'storage_units' in annotations
        has_units_excuse = 'units_alignment_excuse' in annotations

        # Abstract slots MUST NOT have storage_units or units_alignment_excuse;
        # they are recorded separately and never counted as "missing" below.
        if slot.abstract:
            if has_storage_units or has_units_excuse:
                abstract_slots_with_annotations.append(slot_name)
            continue

        # All concrete QuantityValue slots must have either storage_units or excuse.
        if not has_storage_units and not has_units_excuse:
            problematic_slots.append(slot_name)

    # Sort for consistent output.
    problematic_slots.sort()
    abstract_slots_with_annotations.sort()

    self.assertEqual([], abstract_slots_with_annotations,
                     msg=f"Found {len(abstract_slots_with_annotations)} abstract QuantityValue slots with storage_units or units_alignment_excuse annotations. "
                         f"Abstract slots should not have these annotations.")

    self.assertEqual([], problematic_slots,
                     msg=f"Found {len(problematic_slots)} concrete QuantityValue slots missing both storage_units and units_alignment_excuse annotations")

def test_units_alignment_excuse_values_are_valid(self):
"""Test that units_alignment_excuse annotations use approved excuse values."""
Expand Down
74 changes: 24 additions & 50 deletions units/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,32 +15,27 @@ clean:
# Clean and rebuild everything
rebuild: clean all

# Extract preferred_unit annotations from schema (fast: ~1-2 seconds)
output/schema_preferred_units.tsv: ../nmdc_schema/nmdc_materialized_patterns.yaml
@mkdir -p output
poetry run units-schema-extract --schema-file $< --output $@

# Convert preferred units to UCUM notation (fast: ~1-2 seconds)
output/schema_ucum_input.tsv output/schema_ucum_detailed.tsv: output/schema_preferred_units.tsv
poetry run units-schema-convert --input $< --output output/schema_ucum_input.tsv --detailed output/schema_ucum_detailed.tsv

# Generate yq commands for storage_units annotations (fast: ~1-2 seconds)
output/yq_commands_single_unit.txt output/yq_commands_multi_unit.txt: output/schema_ucum_input.tsv
poetry run units-schema-generate --input $< --output-dir output
# Removed misleading preferred_unit analysis targets (schema_extract, schema_convert, schema_generate)
# These reported "problem units" that were already addressed by storage_units annotations
# See: https://github.com/microbiomedata/nmdc-schema/issues/2687


# Production data validation (uses MongoDB YAML dump)
# ⚠️ SLOWEST TARGET: Processes 32,000+ QuantityValue instances from GB-sized MongoDB dump
# ⚠️ UNIMPLEMENTED: Script units-production-validate does not exist
# TODO: Create units/scripts/production_validate.py and add entry point to pyproject.toml
# Expected implementation: Processes 32,000+ QuantityValue instances from GB-sized MongoDB dump
# Expected time: Minutes to hours depending on dump size
#
# Set SCHEMA_FILE to control which schema version to validate against:
# ENV=dev (default): ../nmdc_schema/nmdc_materialized_patterns.yaml (current development schema)
# ENV=prod: ../local/nmdc_schema_last_release.yaml (latest release schema)
SCHEMA_FILE ?= $(if $(filter prod,$(ENV)),../local/nmdc_schema_last_release.yaml,../nmdc_schema/nmdc_materialized_patterns.yaml)

slow-outputs/production_validation_results.tsv: ../local/mongo_via_api_as_unvalidated_nmdc_database.yaml
@mkdir -p slow-outputs
poetry run units-production-validate --input $< --output $@ --schema-file $(SCHEMA_FILE)
@echo "Note: Requires MongoDB dump from: make local/mongo_via_api_as_unvalidated_nmdc_database.yaml"
# DISABLED - missing script units-production-validate
# slow-outputs/production_validation_results.tsv: ../local/mongo_via_api_as_unvalidated_nmdc_database.yaml
# @mkdir -p slow-outputs
# poetry run units-production-validate --input $< --output $@ --schema-file $(SCHEMA_FILE)
# @echo "Note: Requires MongoDB dump from: make local/mongo_via_api_as_unvalidated_nmdc_database.yaml"


# Report slots with units excuses (very fast: <1 second)
Expand All @@ -53,32 +48,14 @@ output/user_friendly_units.tsv: $(SCHEMA_FILE)
@mkdir -p output
yq eval '.enums.UnitEnum.permissible_values | to_entries | map(select(.value | has("title"))) | .[] | .key + " " + .value.title' $< > $@

# Extract slot-to-storage-unit pairs (expands pipe-separated values)
output/schema_storage_units_expanded.tsv: $(SCHEMA_FILE)
poetry run units-schema-extract-slot-unit-pairs --schema-file $< --output $@

# Legacy MongoDB workflow (requires external production data) (moderate: ~30 seconds)
output/mongodb_analysis_results.csv: semi-static-inputs/mongodb-slots-to-units.csv
@mkdir -p output
poetry run units-mongodb-analyze --input $< --output $@

# Legacy MongoDB unit validation analysis (uses production SPARQL query results)
# Note: Analyzes semi-static-inputs/mongodb-slots-to-units.csv from production RDF/SPARQL queries


# Python script targets - newly consolidated units analysis tools
# Validate UCUM compliance of units from test data (moderate: ~10-30 seconds)
output/ucum_validation_results.csv: output/testdata_quantity_values.tsv
poetry run units-ucum-validate --input $< --output $@

# Check QuantityValue has_unit completeness (very fast: ~1-2 seconds)
output/testdata_has_unit_check.tsv: ../src/data/valid/Biosample-possibly-exhaustive.yaml
poetry run units-testdata-check --file-path $< --output $@

# Extract QuantityValue structures from test data (very fast: ~1-2 seconds)
output/testdata_quantity_values.tsv: ../src/data/valid/Biosample-possibly-exhaustive.yaml
@mkdir -p output
poetry run units-testdata-extract --input $< --output $@
# All Python script targets removed - they produced outputs with no consumers
# Deleted scripts:
# - testdata_extract_quantity_values.py (orphaned after UCUM validator removal)
# - testdata_check_has_unit.py (better tested with pytest)
# - mongodb_analyze_units.py (stale data, no consumers)
# - schema_expand_storage_units.py (no consumers)
#
# For UCUM validation, use: pytest tests/test_has_unit_enum.py (uses ucumvert)

# yq query targets from STORAGE_UNITS_STATUS.md analysis (all fast: 1-3 seconds each)

Expand All @@ -105,8 +82,8 @@ output/schema_qv_storage_counts.tsv: $(SCHEMA_FILE)
@echo "qv_slots_without_storage_units $$(yq '.slots | to_entries | map(select(.value.range == "QuantityValue")) | .[] | .key + " | " + (.value.annotations.storage_units.value // "MISSING")' $< | grep -c "MISSING")" >> $@
@echo "qv_slots_total $$(yq '.slots | to_entries | map(select(.value.range == "QuantityValue")) | length' $<)" >> $@

# Fast target - comprehensive analysis pipeline
fast: output/schema_preferred_units.tsv output/schema_ucum_input.tsv output/schema_ucum_detailed.tsv output/yq_commands_single_unit.txt output/yq_commands_multi_unit.txt output/testdata_quantity_values.tsv output/ucum_validation_results.csv output/schema_qv_with_storage.txt output/schema_qv_without_storage.txt output/schema_qv_complete_table.txt output/schema_qv_storage_counts.tsv output/testdata_has_unit_check.tsv output/schema_units_excuses.tsv output/user_friendly_units.tsv output/schema_storage_units_expanded.tsv
# Fast target - comprehensive analysis pipeline (yq queries only)
fast: output/schema_qv_with_storage.txt output/schema_qv_without_storage.txt output/schema_qv_complete_table.txt output/schema_qv_storage_counts.tsv output/schema_units_excuses.tsv output/user_friendly_units.tsv

# Clean all output files (preserves slow-outputs/ directory)
clean-fast:
Expand All @@ -120,11 +97,8 @@ help:
@echo ""
@echo "Key Targets:"
@echo " all - Alias for fast (comprehensive analysis)"
@echo " fast - Comprehensive analysis except slow production validation"
@echo " fast - All fast targets (schema + testdata analysis)"
@echo " clean - Remove all generated files"
@echo " clean-fast - Clean output/ (preserves slow-outputs/)"
@echo ""
@echo "Slow Targets:"
@echo " slow-outputs/production_validation_results.tsv - Production data validation"
@echo ""
@echo "Environment: ENV=dev|prod (controls schema version for production validation)"
@echo "Note: MongoDB production validation target is currently disabled (missing script)"
Loading