Commits (29 total, all by crisely09; the diff below shows changes from 20 of them)
fe626df - add reading option for fhir (May 28, 2025)
4501421 - little reformatting (May 28, 2025)
5d82a63 - add fhir dataset example (May 28, 2025)
1dd393f - small addition to metadata (May 28, 2025)
92a9c75 - added output for serviceRequest loading record-set (May 28, 2025)
cc18426 - simplify a bit the metadata file (May 28, 2025)
265e93f - Read JSON files faster (May 28, 2025)
1ee3986 - bring back previous definition of the parse_json_content (May 28, 2025)
30fde7c - few format fixes (May 28, 2025)
350e9a5 - align dataset metadata example (May 28, 2025)
ed78906 - fall back to jsonpath_rw when there is recursive-descent (May 28, 2025)
a4dce21 - fix flake8 (May 28, 2025)
3fb1277 - Black format fixes, add tests for classes, other suggested changes (May 30, 2025)
bf76353 - updated output from dataset (May 30, 2025)
3504bf6 - fix isort (May 30, 2025)
5e5b9b2 - fix test expectations (May 30, 2025)
062ab96 - fix format (May 30, 2025)
5c790b0 - fix flakes (May 30, 2025)
d0f36f6 - fix expectation of tests (May 30, 2025)
c331ae3 - if not replaced to if is None (Jun 6, 2025)
238bedd - read bounding boxes all at once (Jul 15, 2025)
469e870 - lazy load orjson (Jul 15, 2025)
d88f892 - remove imports of orjson (Jul 15, 2025)
a351116 - fix python format black (Jul 23, 2025)
9b94d70 - run black again (Jul 23, 2025)
7f73bc6 - update bounding_box parsing to pass the test (Jul 23, 2025)
741bdfa - trying to include all cases for bounding boxes (Jul 23, 2025)
18895c0 - fix format and pytype (Jul 23, 2025)
15c49b0 - trying to fix format errors (Jul 23, 2025)
610 changes: 610 additions & 0 deletions datasets/1.1/pharmaccess-momcare-fhir/metadata.json

Large diffs are not rendered by default.

@@ -0,0 +1,10 @@
{"serviceRequest/authoredOn": "2019-06-04", "serviceRequest/identifier_value": "1", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "0", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Ngarenairobi RC Health centre", "serviceRequest/requester_reference": "2", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "0QH6R84NZEVZ6FD87G94UDQ1NT1HWK", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-04", "serviceRequest/identifier_value": "2", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "2", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Ngarenairobi RC Health centre", "serviceRequest/requester_reference": "2", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "DA8DV5VNC520V4AW0DD4PY0TVFJLXG", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-12", "serviceRequest/identifier_value": "3", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "0", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Ngarenairobi RC Health centre", "serviceRequest/requester_reference": "2", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "9PAXV9DHENCMAL0MD9WCLGF6DUALRZ", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-14", "serviceRequest/identifier_value": "4", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "2", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Magugu Health Centre", "serviceRequest/requester_reference": "6", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "RV98DJ47NE093WQZYUNYR5MKD8RAL2", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-20", "serviceRequest/identifier_value": "5", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Dareda Hospital", "serviceRequest/requester_reference": "3", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "YYJM3EF040EDD1Z3Q4DE3RPEXCM9G9", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-27", "serviceRequest/identifier_value": "6", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Galapo Health centre", "serviceRequest/requester_reference": "7", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "5K8JQWC7DM4X2RJM3XYQWRU9EJ8V8P", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-27", "serviceRequest/identifier_value": "7", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Galapo Health centre", "serviceRequest/requester_reference": "7", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "NURM0TRUZV8MC8WFQZPUDUC7PLR2ER", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-07-02", "serviceRequest/identifier_value": "8", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Bashnet Hospital", "serviceRequest/requester_reference": "4", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "8H5N62X4V95G8Q2THPD6X08Y745MAY", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-07-02", "serviceRequest/identifier_value": "9", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Galapo Health centre", "serviceRequest/requester_reference": "7", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "HYTGMF1Z301UJZE3J3ULPN4PYG5VLL", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-07-02", "serviceRequest/identifier_value": "10", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Galapo Health centre", "serviceRequest/requester_reference": "7", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "P27Q2AK2U1FDC1455WWHC32PLY763V", "serviceRequest/subject_type": "Patient"}
1 change: 1 addition & 0 deletions python/mlcroissant/mlcroissant/_src/core/constants.py
@@ -222,6 +222,7 @@ class EncodingFormat:
JPG = "image/jpeg"
JSON = "application/json"
JSON_LINES = "application/jsonlines"
FHIR = "application/fhir+json"
MP3 = "audio/mpeg"
PARQUET = "application/x-parquet"
TEXT = "text/plain"
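The new constant registers the official FHIR JSON media type so that Croissant metadata can declare FHIR resources as a supported encoding. A hypothetical FileObject entry using it might look like this (written as a Python dict; the @id, name and contentUrl are illustrative, not copied from the example dataset):

# Hypothetical Croissant FileObject declaring FHIR-encoded JSON content.
file_object = {
    "@type": "cr:FileObject",
    "@id": "service_requests",
    "name": "service_requests",
    "contentUrl": "data/service_requests.json",  # illustrative path
    "encodingFormat": "application/fhir+json",   # matches EncodingFormat.FHIR
}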
@@ -1,20 +1,223 @@
"""Parse JSON operation."""

from typing import Any, TextIO

import jsonpath_rw
import orjson
import pandas as pd

from mlcroissant._src.core.types import Json
from mlcroissant._src.structure_graph.nodes.field import Field
from mlcroissant._src.structure_graph.nodes.source import FileProperty


def _unwrap_single_item(value: Any) -> Any:
"""Unwraps a single-item list to its value, or returns the value as is."""
if isinstance(value, list) and len(value) == 1:
if value[0] is None:
return None
return value[0]
return value


-def parse_json_content(json: Json, fields: tuple[Field, ...]) -> pd.DataFrame:
+def parse_json_content(json_obj, fields):
    """Parsed all JSONs defined in the fields of RecordSet and outputs a pd.DF."""
    series = {}
    for field in fields:
        json_path = field.source.extract.json_path
-        if json_path is None:
+        if not json_path:
            continue
-        jsonpath_expression = jsonpath_rw.parse(json_path)
-        values = [match.value for match in jsonpath_expression.find(json)]
-        series[json_path] = values
+        expr = jsonpath_rw.parse(json_path)
+        vals = []
+        for match in expr.find(json_obj):
+            v = match.value
+            # If we got back a one-item list, unwrap it.
+            if isinstance(v, list) and len(v) == 1:
+                v = v[0]
+            vals.append(v)
+        series[json_path] = vals
    return pd.DataFrame(series)


class JsonReader:
"""Parser for JSON files, supporting both JSONPath and JMESPath expressions."""

def __init__(self, fields: tuple[Field, ...]):
"""Initializes the parser with a tuple of Field objects.
Args:
fields (tuple[Field, ...]): A tuple of Field objects, each containing
a source with a JSON path to extract.
The constructor builds a list of tuples for each field with a valid
JSON path:
- The original JSON path string.
- The engine used for evaluation ("jsonpath" for recursive-descent
paths, "jmespath" for simple direct paths).
- The compiled expression object for efficient evaluation.
Fields without a JSON path are skipped.
"""
import jmespath

# Build a list of (original_jsonpath, engine, compiled_expr).
self.exprs: list[tuple[str, str, Any]] = []
for field in fields:
json_path = field.source.extract.json_path
if not json_path:
continue

# Decide whether this path can be JMESPath or needs full JSONPath.
stripped = json_path.lstrip("$.")
if ".." in json_path:
# Uses recursive‐descent → fall back to jsonpath_ng.
expr = jsonpath_rw.parse(json_path)
engine = "jsonpath"
else:
# Simple direct path → use JMESPath.
expr = jmespath.compile(stripped)
engine = "jmespath"

self.exprs.append((json_path, engine, expr))
self.fields = fields

def parse(self, fh: TextIO) -> pd.DataFrame:
"""Parses a JSON file-like object and extracts data into a pandas DataFrame.
Args:
fh (TextIO): A file-like object containing JSON data.
Returns:
pd.DataFrame: DataFrame with extracted data,
where each column corresponds to an expression.
"""
import orjson

# Load entire JSON file (could be a list or a single dict).
raw = fh.read()
data = orjson.loads(raw)
Reviewer comment (Contributor): You can see here an example of how to lazily load a library: 4fbd358


# Always treat as list of records.
records = data if isinstance(data, list) else [data]

series: dict[str, list] = {}
for jp, engine, expr in self.exprs:
vals: list = []
for rec in records:
if engine == "jmespath":
out = expr.search(rec)
# Unwrap single‐item lists.
out = _unwrap_single_item(out)
else: # Engine jsonpath_ng.
matches = expr.find(rec)
out = [m.value for m in matches]
out = _unwrap_single_item(out)

vals.append(out)

series[jp] = vals

return pd.DataFrame(series)

def raw(self, fh: TextIO) -> pd.DataFrame:
"""Reads a JSON file-like object and returns a single-cell pandas DataFrame.
The entire JSON content is loaded and placed in a DataFrame with one row
and one column, where the column name is specified by `FileProperty.content`.
Args:
fh (TextIO): A file-like object opened for reading JSON data.
Returns:
pd.DataFrame: A DataFrame containing the JSON content in a single cell.
"""
# Raw JSON fallback: one‐cell DataFrame.
raw = fh.read()
content = orjson.loads(raw)
return pd.DataFrame({FileProperty.content: [content]})


class JsonlReader:
"""Parser for JSON Lines files, supporting both JSONPath and JMESPath."""

def __init__(self, fields):
"""Initializes the parser with a list of fields.
Args:
fields (list): A list of field objects, each expected to have a
`source.extract.json_path` attribute.
The constructor processes each field's JSON path:
- If the path is a simple JSONPath (starts with "$." and does not
contain ".."), it is converted to a JMESPath expression and
compiled.
- Otherwise, the path is parsed and compiled using jsonpath_rw.
Compiled expressions, along with their original paths and the engine
used, are stored in `self.exprs`. The original list of fields is stored
in `self.fields`.
"""
import jmespath

self.exprs = [] # list of (orig_path, engine, compiled_expr)
for field in fields:
json_path = field.source.extract.json_path
if not json_path:
continue

if json_path.startswith("$.") and ".." not in json_path:
# simple JSONPath → JMESPath
jm = json_path.lstrip("$.") # drop the "$."
expr = jmespath.compile(jm)
engine = "jmespath"
Reviewer comment (Contributor): OoC, why do we need both JSONPath and JMESPath? Why not use JSONPath everywhere?

else:
# anything with recursive‐descent or complex filters
expr = jsonpath_rw.parse(json_path)
engine = "jsonpath"

self.exprs.append((json_path, engine, expr))
self.fields = fields

def parse(self, fh):
"""Parses a file-like object containing JSON objects (one per line).
Args:
fh: A file-like object to read from, where each line is a JSON object.
Returns:
pd.DataFrame: A DataFrame where each row corresponds to a parsed
JSON object with extracted fields.
Notes:
- The extraction expressions are defined in self.exprs as tuples of
(json_path, engine, expr).
- For JMESPath, single-item lists are unwrapped to their value.
- For JSONPath, values are extracted from Match objects and
single-item lists are unwrapped.
"""
import orjson

rows = []
for line in fh:
line = line.strip()
if not line:
continue
rec = orjson.loads(line)
row: dict[str, object] = {}
for json_path, engine, expr in self.exprs:
if engine == "jmespath":
out = expr.search(rec)
# Unwrap single‐item lists.
out = _unwrap_single_item(out)
else:
matches = expr.find(rec)
temp = [m.value for m in matches]
# Unwrap single‐item lists.
out = _unwrap_single_item(temp)
row[json_path] = out
rows.append(row)
return pd.DataFrame(rows)

def raw(self, fh: TextIO) -> pd.DataFrame:
"""Reads a JSON Lines file-like object and returns a DataFrame."""
fh.seek(0)
return pd.read_json(fh, lines=True)
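The reviewer question above (why both JSONPath and JMESPath?) and the commit "fall back to jsonpath_rw when there is recursive-descent" describe the same trade-off: JMESPath compiles simple direct paths and evaluates them quickly, but it has no recursive-descent operator, so any path containing ".." still needs jsonpath_rw. A minimal comparison sketch, assuming only that the jmespath and jsonpath_rw packages are installed:

import jmespath
import jsonpath_rw

doc = {"a": {"b": [{"value": 1}, {"value": 2}]}, "meta": {"value": 3}}

# A simple direct path can be evaluated by either engine.
print(jmespath.compile("a.b[*].value").search(doc))                      # [1, 2]
print([m.value for m in jsonpath_rw.parse("$.a.b[*].value").find(doc)])  # [1, 2]

# Recursive descent has no JMESPath equivalent, so paths containing ".."
# fall back to jsonpath_rw (all nested "value" fields, in traversal order).
print([m.value for m in jsonpath_rw.parse("$..value").find(doc)])

Note that the readers also defer their jmespath and orjson imports until first use, which is the lazy-loading pattern the earlier reviewer comment points to.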
@@ -1,7 +1,12 @@
"""parse_json_test module."""

import io
import json

import pandas as pd

from mlcroissant._src.operation_graph.operations.parse_json import JsonlReader
from mlcroissant._src.operation_graph.operations.parse_json import JsonReader
from mlcroissant._src.operation_graph.operations.parse_json import parse_json_content
from mlcroissant._src.structure_graph.nodes.source import Extract
from mlcroissant._src.structure_graph.nodes.source import Source
@@ -27,3 +32,77 @@ def test_parse_json():
data={"$.annotations[*].id": [1, 2], "$.annotations[*].value": [3, 4]}
)
pd.testing.assert_frame_equal(parse_json_content(json, fields), expected_df)


def test_jsonreader_parse():
# JsonReader.parse should extract values according to JSONPath
field = create_test_field(
source=Source(extract=Extract(json_path="$.item[*].value"))
)
fields = (field,)
data = [{"item": [{"value": 10}]}, {"item": [{"value": 20}, {"value": 30}]}]
raw_str = json.dumps(data)
fh = io.StringIO(raw_str)
reader = JsonReader(fields=fields)
df = reader.parse(fh)
expected = pd.DataFrame({"$.item[*].value": [10, [20, 30]]})
pd.testing.assert_frame_equal(df, expected)


def test_jsonreader_parse_deep():
import orjson

# Test nested JSONPath ($.level1.level2[*].value)
field = create_test_field(
source=Source(extract=Extract(json_path="$.level1.level2[*].value"))
)
fields = (field,)
json_obj = {"level1": {"level2": [{"value": 100}, {"value": 200}]}}
expected_df = pd.DataFrame({"$.level1.level2[*].value": [[100, 200]]})
raw_str = orjson.dumps(json_obj).decode("utf-8")
fh = io.StringIO(raw_str)
reader = JsonReader(fields=fields)
df = reader.parse(fh)
pd.testing.assert_frame_equal(df, expected_df)


def test_jsonlreader_raw():
# JsonlReader.raw should read JSON Lines into a DataFrame
lines = [{"a": 1}, {"a": 2}]
raw_text = "\n".join(json.dumps(rec) for rec in lines)
fh = io.StringIO(raw_text)
reader = JsonlReader(fields=())
df = reader.raw(fh)
expected = pd.DataFrame(lines)
pd.testing.assert_frame_equal(df, expected)


def test_jsonlreader_parse():
# JsonlReader.parse should extract values across lines
field = create_test_field(source=Source(extract=Extract(json_path="$.x")))
fields = (field,)
lines = [{"x": 5}, {"x": 6}]
raw_text = "\n".join(json.dumps(rec) for rec in lines)
fh = io.StringIO(raw_text)
reader = JsonlReader(fields=fields)
df = reader.parse(fh)
expected = pd.DataFrame({"$.x": [5, 6]})
pd.testing.assert_frame_equal(df, expected)


def test_jsonlreader_deeper_path():
# JsonlReader.parse should handle nested deeper JSONPath
field = create_test_field(
source=Source(extract=Extract(json_path="$.meta.detail[*].info"))
)
fields = (field,)
records = [
{"meta": {"detail": [{"info": "a"}, {"info": "b"}]}},
{"meta": {"detail": [{"info": "c"}]}},
]
raw_text = "\n".join(json.dumps(rec) for rec in records)
fh = io.StringIO(raw_text)
reader = JsonlReader(fields=fields)
df = reader.parse(fh)
expected = pd.DataFrame({"$.meta.detail[*].info": [["a", "b"], "c"]})
pd.testing.assert_frame_equal(df, expected)
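Given the FHIR focus of this PR, one more test along the same lines could exercise a Bundle-shaped document. The following is a hypothetical sketch (not part of the change) that reuses the helpers already imported above:

def test_jsonreader_parse_fhir_bundle():
    # Hypothetical: FHIR Bundles keep resources under entry[*].resource.
    field = create_test_field(
        source=Source(extract=Extract(json_path="$.entry[*].resource.status"))
    )
    data = {
        "entry": [
            {"resource": {"status": "unknown"}},
            {"resource": {"status": "active"}},
        ]
    }
    fh = io.StringIO(json.dumps(data))
    df = JsonReader(fields=(field,)).parse(fh)
    expected = pd.DataFrame({"$.entry[*].resource.status": [["unknown", "active"]]})
    pd.testing.assert_frame_equal(df, expected)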