Commits (29 total, all by crisely09; the diff below shows changes from 20 of them)
fe626df - add reading option for fhir (May 28, 2025)
4501421 - little reformatting (May 28, 2025)
5d82a63 - add fhir dataset example (May 28, 2025)
1dd393f - small addition to metadata (May 28, 2025)
92a9c75 - added output for serviceRequest loading record-set (May 28, 2025)
cc18426 - simplify a bit the metadata file (May 28, 2025)
265e93f - Read JSON files faster (May 28, 2025)
1ee3986 - bring back previous definition of the parse_json_content (May 28, 2025)
30fde7c - few format fixes (May 28, 2025)
350e9a5 - align dataset metadata example (May 28, 2025)
ed78906 - fall back to jsonpath_rw when there is recursive-descent (May 28, 2025)
a4dce21 - fix flake8 (May 28, 2025)
3fb1277 - Black format fixes, add tests for classes, other suggested changes (May 30, 2025)
bf76353 - updated output from dataset (May 30, 2025)
3504bf6 - fix isort (May 30, 2025)
5e5b9b2 - fix test expectations (May 30, 2025)
062ab96 - fix format (May 30, 2025)
5c790b0 - fix flakes (May 30, 2025)
d0f36f6 - fix expectation of tests (May 30, 2025)
c331ae3 - if not replaced to if is None (Jun 6, 2025)
238bedd - read bounding boxes all at once (Jul 15, 2025)
469e870 - lazy load orjson (Jul 15, 2025)
d88f892 - remove imports of orjson (Jul 15, 2025)
a351116 - fix python format black (Jul 23, 2025)
9b94d70 - run black again (Jul 23, 2025)
7f73bc6 - update bounding_box parsing to pass the test (Jul 23, 2025)
741bdfa - trying to include all cases for bounding boxes (Jul 23, 2025)
18895c0 - fix format and pytype (Jul 23, 2025)
15c49b0 - trying to fix format errors (Jul 23, 2025)
610 changes: 610 additions & 0 deletions datasets/1.1/pharmaccess-momcare-fhir/metadata.json

Large diffs are not rendered by default.

@@ -0,0 +1,10 @@
{"serviceRequest/authoredOn": "2019-06-04", "serviceRequest/identifier_value": "1", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "0", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Ngarenairobi RC Health centre", "serviceRequest/requester_reference": "2", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "0QH6R84NZEVZ6FD87G94UDQ1NT1HWK", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-04", "serviceRequest/identifier_value": "2", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "2", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Ngarenairobi RC Health centre", "serviceRequest/requester_reference": "2", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "DA8DV5VNC520V4AW0DD4PY0TVFJLXG", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-12", "serviceRequest/identifier_value": "3", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "0", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Ngarenairobi RC Health centre", "serviceRequest/requester_reference": "2", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "9PAXV9DHENCMAL0MD9WCLGF6DUALRZ", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-14", "serviceRequest/identifier_value": "4", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "2", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Magugu Health Centre", "serviceRequest/requester_reference": "6", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "RV98DJ47NE093WQZYUNYR5MKD8RAL2", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-20", "serviceRequest/identifier_value": "5", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Dareda Hospital", "serviceRequest/requester_reference": "3", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "YYJM3EF040EDD1Z3Q4DE3RPEXCM9G9", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-27", "serviceRequest/identifier_value": "6", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Galapo Health centre", "serviceRequest/requester_reference": "7", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "5K8JQWC7DM4X2RJM3XYQWRU9EJ8V8P", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-27", "serviceRequest/identifier_value": "7", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Galapo Health centre", "serviceRequest/requester_reference": "7", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "NURM0TRUZV8MC8WFQZPUDUC7PLR2ER", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-07-02", "serviceRequest/identifier_value": "8", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Bashnet Hospital", "serviceRequest/requester_reference": "4", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "8H5N62X4V95G8Q2THPD6X08Y745MAY", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-07-02", "serviceRequest/identifier_value": "9", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Galapo Health centre", "serviceRequest/requester_reference": "7", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "HYTGMF1Z301UJZE3J3ULPN4PYG5VLL", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-07-02", "serviceRequest/identifier_value": "10", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Galapo Health centre", "serviceRequest/requester_reference": "7", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "P27Q2AK2U1FDC1455WWHC32PLY763V", "serviceRequest/subject_type": "Patient"}
1 change: 1 addition & 0 deletions python/mlcroissant/mlcroissant/_src/core/constants.py
@@ -222,6 +222,7 @@ class EncodingFormat:
JPG = "image/jpeg"
JSON = "application/json"
JSON_LINES = "application/jsonlines"
FHIR = "application/fhir+json"
MP3 = "audio/mpeg"
PARQUET = "application/x-parquet"
TEXT = "text/plain"
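The new constant registers the official FHIR JSON media type so that Croissant metadata can declare FHIR resources as a supported encoding. A hypothetical FileObject entry using it might look like this (written as a Python dict; the @id, name and contentUrl are illustrative, not copied from the example dataset):

# Hypothetical Croissant FileObject declaring FHIR-encoded JSON content.
file_object = {
    "@type": "cr:FileObject",
    "@id": "service_requests",
    "name": "service_requests",
    "contentUrl": "data/service_requests.json",  # illustrative path
    "encodingFormat": "application/fhir+json",   # matches EncodingFormat.FHIR
}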
@@ -1,20 +1,223 @@
"""Parse JSON operation."""

from typing import Any, TextIO

import jsonpath_rw
import orjson
import pandas as pd

from mlcroissant._src.core.types import Json
from mlcroissant._src.structure_graph.nodes.field import Field
from mlcroissant._src.structure_graph.nodes.source import FileProperty


def _unwrap_single_item(value: Any) -> Any:
"""Unwraps a single-item list to its value, or returns the value as is."""
if isinstance(value, list) and len(value) == 1:
if value[0] is None:
return None
return value[0]
return value


-def parse_json_content(json: Json, fields: tuple[Field, ...]) -> pd.DataFrame:
+def parse_json_content(json_obj, fields):
    """Parsed all JSONs defined in the fields of RecordSet and outputs a pd.DF."""
    series = {}
    for field in fields:
        json_path = field.source.extract.json_path
-        if json_path is None:
+        if not json_path:
            continue
-        jsonpath_expression = jsonpath_rw.parse(json_path)
-        values = [match.value for match in jsonpath_expression.find(json)]
-        series[json_path] = values
+        expr = jsonpath_rw.parse(json_path)
+        vals = []
+        for match in expr.find(json_obj):
+            v = match.value
+            # If we got back a one-item list, unwrap it.
+            if isinstance(v, list) and len(v) == 1:
+                v = v[0]
+            vals.append(v)
+        series[json_path] = vals
    return pd.DataFrame(series)


class JsonReader:
"""Parser for JSON files, supporting both JSONPath and JMESPath expressions."""

def __init__(self, fields: tuple[Field, ...]):
"""Initializes the parser with a tuple of Field objects.
Args:
fields (tuple[Field, ...]): A tuple of Field objects, each containing
a source with a JSON path to extract.
The constructor builds a list of tuples for each field with a valid
JSON path:
- The original JSON path string.
- The engine used for evaluation ("jsonpath" for recursive-descent
paths, "jmespath" for simple direct paths).
- The compiled expression object for efficient evaluation.
Fields without a JSON path are skipped.
"""
import jmespath

# Build a list of (original_jsonpath, engine, compiled_expr).
self.exprs: list[tuple[str, str, Any]] = []
for field in fields:
json_path = field.source.extract.json_path
if not json_path:
continue

# Decide whether this path can be JMESPath or needs full JSONPath.
stripped = json_path.lstrip("$.")
if ".." in json_path:
# Uses recursive‐descent → fall back to jsonpath_ng.
expr = jsonpath_rw.parse(json_path)
engine = "jsonpath"
else:
# Simple direct path → use JMESPath.
expr = jmespath.compile(stripped)
engine = "jmespath"

self.exprs.append((json_path, engine, expr))
self.fields = fields

def parse(self, fh: TextIO) -> pd.DataFrame:
"""Parses a JSON file-like object and extracts data into a pandas DataFrame.
Args:
fh (TextIO): A file-like object containing JSON data.
Returns:
pd.DataFrame: DataFrame with extracted data,
where each column corresponds to an expression.
"""
import orjson

# Load entire JSON file (could be a list or a single dict).
raw = fh.read()
data = orjson.loads(raw)
Reviewer comment (Contributor): You can see here an example of how to lazily load a library: 4fbd358


# Always treat as list of records.
records = data if isinstance(data, list) else [data]

series: dict[str, list] = {}
for jp, engine, expr in self.exprs:
vals: list = []
for rec in records:
if engine == "jmespath":
out = expr.search(rec)
# Unwrap single‐item lists.
out = _unwrap_single_item(out)
else: # Engine jsonpath_ng.
matches = expr.find(rec)
out = [m.value for m in matches]
out = _unwrap_single_item(out)

vals.append(out)

series[jp] = vals

return pd.DataFrame(series)

def raw(self, fh: TextIO) -> pd.DataFrame:
"""Reads a JSON file-like object and returns a single-cell pandas DataFrame.
The entire JSON content is loaded and placed in a DataFrame with one row
and one column, where the column name is specified by `FileProperty.content`.
Args:
fh (TextIO): A file-like object opened for reading JSON data.
Returns:
pd.DataFrame: A DataFrame containing the JSON content in a single cell.
"""
# Raw JSON fallback: one‐cell DataFrame.
raw = fh.read()
content = orjson.loads(raw)
return pd.DataFrame({FileProperty.content: [content]})


class JsonlReader:
"""Parser for JSON Lines files, supporting both JSONPath and JMESPath."""

def __init__(self, fields):
"""Initializes the parser with a list of fields.
Args:
fields (list): A list of field objects, each expected to have a
`source.extract.json_path` attribute.
The constructor processes each field's JSON path:
- If the path is a simple JSONPath (starts with "$." and does not
contain ".."), it is converted to a JMESPath expression and
compiled.
- Otherwise, the path is parsed and compiled using jsonpath_rw.
Compiled expressions, along with their original paths and the engine
used, are stored in `self.exprs`. The original list of fields is stored
in `self.fields`.
"""
import jmespath

self.exprs = [] # list of (orig_path, engine, compiled_expr)
for field in fields:
json_path = field.source.extract.json_path
if not json_path:
continue

if json_path.startswith("$.") and ".." not in json_path:
# simple JSONPath → JMESPath
jm = json_path.lstrip("$.") # drop the "$."
expr = jmespath.compile(jm)
engine = "jmespath"
Reviewer comment (Contributor): OoC, why do we need both JSONPath and JMESPath? Why not use JSONPath everywhere?

else:
# anything with recursive‐descent or complex filters
expr = jsonpath_rw.parse(json_path)
engine = "jsonpath"

self.exprs.append((json_path, engine, expr))
self.fields = fields

def parse(self, fh):
"""Parses a file-like object containing JSON objects (one per line).
Args:
fh: A file-like object to read from, where each line is a JSON object.
Returns:
pd.DataFrame: A DataFrame where each row corresponds to a parsed
JSON object with extracted fields.
Notes:
- The extraction expressions are defined in self.exprs as tuples of
(json_path, engine, expr).
- For JMESPath, single-item lists are unwrapped to their value.
- For JSONPath, values are extracted from Match objects and
single-item lists are unwrapped.
"""
import orjson

rows = []
for line in fh:
line = line.strip()
if not line:
continue
rec = orjson.loads(line)
row: dict[str, object] = {}
for json_path, engine, expr in self.exprs:
if engine == "jmespath":
out = expr.search(rec)
# Unwrap single‐item lists.
out = _unwrap_single_item(out)
else:
matches = expr.find(rec)
temp = [m.value for m in matches]
# Unwrap single‐item lists.
out = _unwrap_single_item(temp)
row[json_path] = out
rows.append(row)
return pd.DataFrame(rows)

def raw(self, fh: TextIO) -> pd.DataFrame:
"""Reads a JSON Lines file-like object and returns a DataFrame."""
fh.seek(0)
return pd.read_json(fh, lines=True)
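The reviewer question above (why both JSONPath and JMESPath?) and the commit "fall back to jsonpath_rw when there is recursive-descent" describe the same trade-off: JMESPath compiles simple direct paths and evaluates them quickly, but it has no recursive-descent operator, so any path containing ".." still needs jsonpath_rw. A minimal comparison sketch, assuming only that the jmespath and jsonpath_rw packages are installed:

import jmespath
import jsonpath_rw

doc = {"a": {"b": [{"value": 1}, {"value": 2}]}, "meta": {"value": 3}}

# A simple direct path can be evaluated by either engine.
print(jmespath.compile("a.b[*].value").search(doc))                      # [1, 2]
print([m.value for m in jsonpath_rw.parse("$.a.b[*].value").find(doc)])  # [1, 2]

# Recursive descent has no JMESPath equivalent, so paths containing ".."
# fall back to jsonpath_rw (all nested "value" fields, in traversal order).
print([m.value for m in jsonpath_rw.parse("$..value").find(doc)])

Note that the readers also defer their jmespath and orjson imports until first use, which is the lazy-loading pattern the earlier reviewer comment points to.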
@@ -1,7 +1,12 @@
"""parse_json_test module."""

import io
import json

import pandas as pd

from mlcroissant._src.operation_graph.operations.parse_json import JsonlReader
from mlcroissant._src.operation_graph.operations.parse_json import JsonReader
from mlcroissant._src.operation_graph.operations.parse_json import parse_json_content
from mlcroissant._src.structure_graph.nodes.source import Extract
from mlcroissant._src.structure_graph.nodes.source import Source
@@ -27,3 +32,77 @@ def test_parse_json():
data={"$.annotations[*].id": [1, 2], "$.annotations[*].value": [3, 4]}
)
pd.testing.assert_frame_equal(parse_json_content(json, fields), expected_df)


def test_jsonreader_parse():
# JsonReader.parse should extract values according to JSONPath
field = create_test_field(
source=Source(extract=Extract(json_path="$.item[*].value"))
)
fields = (field,)
data = [{"item": [{"value": 10}]}, {"item": [{"value": 20}, {"value": 30}]}]
raw_str = json.dumps(data)
fh = io.StringIO(raw_str)
reader = JsonReader(fields=fields)
df = reader.parse(fh)
expected = pd.DataFrame({"$.item[*].value": [10, [20, 30]]})
pd.testing.assert_frame_equal(df, expected)


def test_jsonreader_parse_deep():
import orjson

# Test nested JSONPath ($.level1.level2[*].value)
field = create_test_field(
source=Source(extract=Extract(json_path="$.level1.level2[*].value"))
)
fields = (field,)
json_obj = {"level1": {"level2": [{"value": 100}, {"value": 200}]}}
expected_df = pd.DataFrame({"$.level1.level2[*].value": [[100, 200]]})
raw_str = orjson.dumps(json_obj).decode("utf-8")
fh = io.StringIO(raw_str)
reader = JsonReader(fields=fields)
df = reader.parse(fh)
pd.testing.assert_frame_equal(df, expected_df)


def test_jsonlreader_raw():
# JsonlReader.raw should read JSON Lines into a DataFrame
lines = [{"a": 1}, {"a": 2}]
raw_text = "\n".join(json.dumps(rec) for rec in lines)
fh = io.StringIO(raw_text)
reader = JsonlReader(fields=())
df = reader.raw(fh)
expected = pd.DataFrame(lines)
pd.testing.assert_frame_equal(df, expected)


def test_jsonlreader_parse():
# JsonlReader.parse should extract values across lines
field = create_test_field(source=Source(extract=Extract(json_path="$.x")))
fields = (field,)
lines = [{"x": 5}, {"x": 6}]
raw_text = "\n".join(json.dumps(rec) for rec in lines)
fh = io.StringIO(raw_text)
reader = JsonlReader(fields=fields)
df = reader.parse(fh)
expected = pd.DataFrame({"$.x": [5, 6]})
pd.testing.assert_frame_equal(df, expected)


def test_jsonlreader_deeper_path():
# JsonlReader.parse should handle nested deeper JSONPath
field = create_test_field(
source=Source(extract=Extract(json_path="$.meta.detail[*].info"))
)
fields = (field,)
records = [
{"meta": {"detail": [{"info": "a"}, {"info": "b"}]}},
{"meta": {"detail": [{"info": "c"}]}},
]
raw_text = "\n".join(json.dumps(rec) for rec in records)
fh = io.StringIO(raw_text)
reader = JsonlReader(fields=fields)
df = reader.parse(fh)
expected = pd.DataFrame({"$.meta.detail[*].info": [["a", "b"], "c"]})
pd.testing.assert_frame_equal(df, expected)
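Given the FHIR focus of this PR, one more test along the same lines could exercise a Bundle-shaped document. The following is a hypothetical sketch (not part of the change) that reuses the helpers already imported above:

def test_jsonreader_parse_fhir_bundle():
    # Hypothetical: FHIR Bundles keep resources under entry[*].resource.
    field = create_test_field(
        source=Source(extract=Extract(json_path="$.entry[*].resource.status"))
    )
    data = {
        "entry": [
            {"resource": {"status": "unknown"}},
            {"resource": {"status": "active"}},
        ]
    }
    fh = io.StringIO(json.dumps(data))
    df = JsonReader(fields=(field,)).parse(fh)
    expected = pd.DataFrame({"$.entry[*].resource.status": [["unknown", "active"]]})
    pd.testing.assert_frame_equal(df, expected)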