Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
fe626df
add reading option for fhir
crisely09 May 28, 2025
4501421
little reformatting
crisely09 May 28, 2025
5d82a63
add fhir dataset example
crisely09 May 28, 2025
1dd393f
small addition to metadata
crisely09 May 28, 2025
92a9c75
added output for serviceRequest loading record-set
crisely09 May 28, 2025
cc18426
simplify a bit the metadata file
crisely09 May 28, 2025
265e93f
Read JSON files faster
crisely09 May 28, 2025
1ee3986
bring back previous definition of the parse_json_content
crisely09 May 28, 2025
30fde7c
few format fixes
crisely09 May 28, 2025
350e9a5
align dataset metadata example
crisely09 May 28, 2025
ed78906
fall back to jsonpath_rw when there is recursive-descent
crisely09 May 28, 2025
a4dce21
fix flake8
crisely09 May 28, 2025
3fb1277
Black format fixes, add tests for classes, other suggested changes
crisely09 May 30, 2025
bf76353
updated output from dataset
crisely09 May 30, 2025
3504bf6
fix isort
crisely09 May 30, 2025
5e5b9b2
fix test expectations
crisely09 May 30, 2025
062ab96
fix format
crisely09 May 30, 2025
5c790b0
fix flakes
crisely09 May 30, 2025
d0f36f6
fix expectation of tests
crisely09 May 30, 2025
c331ae3
if not replaced to if is None
crisely09 Jun 6, 2025
238bedd
read bounding boxes all at once
crisely09 Jul 15, 2025
469e870
lazy load orjson
crisely09 Jul 15, 2025
d88f892
remove imports of orjson
crisely09 Jul 15, 2025
a351116
fix python format black
crisely09 Jul 23, 2025
9b94d70
run black again
crisely09 Jul 23, 2025
7f73bc6
update bounding_box parsing to pass the test
crisely09 Jul 23, 2025
741bdfa
trying to include all cases for bounding boxes
crisely09 Jul 23, 2025
18895c0
fix format and pytype
crisely09 Jul 23, 2025
15c49b0
trying to fix format errors
crisely09 Jul 23, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
610 changes: 610 additions & 0 deletions datasets/1.1/pharmaccess-momcare-fhir/metadata.json

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{"serviceRequest/authoredOn": "2019-06-04", "serviceRequest/identifier_value": "1", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "0", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Ngarenairobi RC Health centre", "serviceRequest/requester_reference": "2", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "0QH6R84NZEVZ6FD87G94UDQ1NT1HWK", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-04", "serviceRequest/identifier_value": "2", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "2", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Ngarenairobi RC Health centre", "serviceRequest/requester_reference": "2", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "DA8DV5VNC520V4AW0DD4PY0TVFJLXG", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-12", "serviceRequest/identifier_value": "3", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "0", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Ngarenairobi RC Health centre", "serviceRequest/requester_reference": "2", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "9PAXV9DHENCMAL0MD9WCLGF6DUALRZ", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-14", "serviceRequest/identifier_value": "4", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "2", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Magugu Health Centre", "serviceRequest/requester_reference": "6", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "RV98DJ47NE093WQZYUNYR5MKD8RAL2", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-20", "serviceRequest/identifier_value": "5", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Dareda Hospital", "serviceRequest/requester_reference": "3", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "YYJM3EF040EDD1Z3Q4DE3RPEXCM9G9", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-27", "serviceRequest/identifier_value": "6", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Galapo Health centre", "serviceRequest/requester_reference": "7", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "5K8JQWC7DM4X2RJM3XYQWRU9EJ8V8P", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-27", "serviceRequest/identifier_value": "7", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Galapo Health centre", "serviceRequest/requester_reference": "7", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "NURM0TRUZV8MC8WFQZPUDUC7PLR2ER", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-07-02", "serviceRequest/identifier_value": "8", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Bashnet Hospital", "serviceRequest/requester_reference": "4", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "8H5N62X4V95G8Q2THPD6X08Y745MAY", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-07-02", "serviceRequest/identifier_value": "9", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Galapo Health centre", "serviceRequest/requester_reference": "7", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "HYTGMF1Z301UJZE3J3ULPN4PYG5VLL", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-07-02", "serviceRequest/identifier_value": "10", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Galapo Health centre", "serviceRequest/requester_reference": "7", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "P27Q2AK2U1FDC1455WWHC32PLY763V", "serviceRequest/subject_type": "Patient"}
1 change: 1 addition & 0 deletions python/mlcroissant/mlcroissant/_src/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ class EncodingFormat:
JPG = "image/jpeg"
JSON = "application/json"
JSON_LINES = "application/jsonlines"
FHIR = "application/fhir+json"
MP3 = "audio/mpeg"
PARQUET = "application/x-parquet"
TEXT = "text/plain"
Expand Down
104 changes: 78 additions & 26 deletions python/mlcroissant/mlcroissant/_src/core/ml/bounding_box.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,90 @@
"""Module to manage "bounding boxes" annotations on images."""

from typing import Any
from typing import Any, List, Union
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you please split the PR into 2 PRs:

  • One for FHIR files (this one)
  • One for bounding boxes

?



def parse(value: Any) -> list[float]:
"""Parses a value to a machine-readable bounding box.
def _parse_one(value: Union[str, List[Any]]) -> List[float]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

str | list[Any]

"""Parse a single bounding box representation into a list of four floats."""
processed_value = []
if isinstance(value, str):
processed_value = value.split()
elif isinstance(value, list):
processed_value = value

Args:
value: The value to parse can be either a single space-separated string or a
list of float-compatible elements.
Returns:
The 4-float list that composes the bounding box.
"""
if isinstance(value, list):
pass
elif isinstance(value, str):
value = value.split(" ")
else:
if len(processed_value) != 4:
raise ValueError(
"Wrong format for a bounding box. Expected: str | list. Got:"
f" {type(value)}. If you need to support more format, feel free to create"
" an issue on GitHub."
f"Input should have a length of 4, but has length {len(processed_value)}."
)

try:
value = [float(element) for element in value]
return [float(coord) for coord in processed_value]
except ValueError as e:
raise ValueError(
"Bounding boxes should have coordinates that can be converted to floats."
f" Got {value}"
"All bounding box coordinates can be converted to floats. "
f"Got: {processed_value}"
) from e
if len(value) != 4:


def _parse_all(value: List) -> List[List[float]]:
"""Parse a list containing multiple bounding boxes."""
# Case 1: List of lists, e.g., [[box1], [box2]]
if isinstance(value[0], list):
return [_parse_one(item) for item in value]

# Case 2: Flat list, e.g., [x1, y1, w1, h1, x2, y2, w2, h2]
# This case is handled by the main parse function's dispatch logic.
# We chunk the flat list into a list of 4-element lists.
try:
coords = [float(v) for v in value]
return [coords[i : i + 4] for i in range(0, len(coords), 4)]
except ValueError as e:
raise ValueError(
"Bounding box could not be parsed. Bounding boxes should have a length of"
f" 4. Got {len(value)}"
)
return value
f"All bounding box coordinates can be converted to floats. Got: {value}"
) from e


def parse(value: Any) -> Union[List[float], List[List[float]]]:
"""Parse a value into one or more bounding boxes.
The return type depends on the input:
- A single bounding box returns a List[float].
- Multiple bounding boxes returns a List[List[float]].
Args:
value: The value to parse. Can be a string, a list of 4 elements,
a list of lists, or a flat list of 4*N elements.
Returns:
A list of four floats, or a list of such lists.
Raises:
ValueError: If the input format is invalid.
"""
if isinstance(value, str):
return _parse_one(value)

if isinstance(value, list):
if not value:
return []

# Decide if we're parsing one or multiple boxes.
if isinstance(value[0], list):
# A list of lists is always multiple bounding boxes.
return _parse_all(value)
else:
# A flat list. Check length to decide.
if len(value) % 4 == 0:
if len(value) == 4:
# A list of 4 items is a single bounding box.
return _parse_one(value)
else:
# A list of 4*N items is multiple bounding boxes.
return _parse_all(value)

# If the input is not a string or a list, or if it's a list with
# an invalid length (e.g., 5), we let _parse_one raise the
# appropriate, specific error.
if isinstance(value, list) and len(value) != 4:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you have a test case for those errors?

return _parse_one(value)

raise ValueError(f"Wrong format. Expected str or list, but got {type(value)}.")
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@ def test_parse():
assert bounding_box.parse([1, 2, 3, 4]) == [1.0, 2.0, 3.0, 4.0]
assert bounding_box.parse("1 2 3 4") == [1.0, 2.0, 3.0, 4.0]
assert bounding_box.parse("1.0 2 3.0 4.0") == [1.0, 2.0, 3.0, 4.0]
assert bounding_box.parse([[1, 2, 3, 4], [5, 6, 7, 8]]) == [
[1.0, 2.0, 3.0, 4.0],
[5.0, 6.0, 7.0, 8.0],
]
assert bounding_box.parse([1, 2, 3, 4, 5, 6, 7, 8]) == [
[1.0, 2.0, 3.0, 4.0],
[5.0, 6.0, 7.0, 8.0],
]
with pytest.raises(ValueError, match="Wrong format"):
bounding_box.parse(42)
with pytest.raises(ValueError, match="should have a length of"):
Expand Down
5 changes: 5 additions & 0 deletions python/mlcroissant/mlcroissant/_src/core/optional.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@ def librosa(cls) -> types.ModuleType: # pylint: disable=invalid-name
"""Cached librosa module."""
return _try_import("librosa", package_name="librosa")

@cached_class_property
def orjson(cls) -> types.ModuleType:  # pylint: disable=invalid-name
    """Return the orjson module as resolved by `_try_import`."""
    module = _try_import("orjson", package_name="orjson")
    return module

@cached_class_property
def scipy(cls) -> types.ModuleType: # pylint: disable=invalid-name
"""Cached scipy module."""
Expand Down
Loading