diff --git a/datasets/1.1/pharmaccess-momcare-fhir/metadata.json b/datasets/1.1/pharmaccess-momcare-fhir/metadata.json new file mode 100644 index 000000000..9b98cf5f7 --- /dev/null +++ b/datasets/1.1/pharmaccess-momcare-fhir/metadata.json @@ -0,0 +1,610 @@ +{ + "@context": { + "Audience": { + "@id": "dv:Audience" + }, + "Boolean": { + "@id": "schema:Boolean" + }, + "CreativeWork": { + "@id": "schema:CreativeWork" + }, + "Date": { + "@id": "schema:Date" + }, + "DataDownload": { + "@id": "schema:DataDownload" + }, + "Dataset": { + "@id": "schema:Dataset" + }, + "DateTime": { + "@id": "schema:DateTime" + }, + "DescriptiveStatistics": { + "@id": "dv:DescriptiveStatistics" + }, + "Duration": { + "@id": "schema:Duration" + }, + "ExperimentDataset": { + "@id": "dv:ExperimentDataset", + "@type": "schema:Dataset" + }, + "Field": { + "@id": "cr:Field" + }, + "FileObject": { + "@id": "cr:FileObject" + }, + "Float": { + "@id": "schema:Float" + }, + "HowTo": { + "@id": "schema:HowTo" + }, + "HowToSection": { + "@id": "schema:HowToSection" + }, + "HowToStep": { + "@id": "schema:HowToStep" + }, + "Integer": { + "@id": "schema:Integer" + }, + "Location": { + "@id": "schema:Location" + }, + "Number": { + "@id": "schema:Number" + }, + "OpenDataArticle": { + "@id": "dv:OpenDataArticle", + "@type": "schema:ScholarlyArticle" + }, + "OpenDataArticleSection": { + "@id": "dv:OpenDataArticleSection", + "@type": "schema:Text" + }, + "Organization": { + "@id": "schema:Organization" + }, + "Person": { + "@id": "schema:Person" + }, + "Rating": { + "@id": "schema:Rating" + }, + "RecordSet": { + "@id": "cr:RecordSet" + }, + "Role": { + "@id": "dv:Role" + }, + "ScholarlyArticle": { + "@id": "schema:ScholarlyArticle" + }, + "Step": { + "@id": "dv:Step" + }, + "Submission": { + "@id": "dv:Submission" + }, + "Substep": { + "@id": "dv:Substep" + }, + "Section": { + "@id": "dv:Section" + }, + "Text": { + "@id": "schema:Text" + }, + "Visualization": { + "@id": "dv:Visualization" + }, + 
"VisualizationDataset": { + "@id": "dv:VisualizationDataset" + }, + "activities": { + "@id": "dv:activities" + }, + "affiliation": { + "@id": "schema:affiliation" + }, + "algorithm": { + "@id": "schema:algorithm" + }, + "annotationsPerItem": { + "@id": "rai:annotationsPerItem" + }, + "annotatorDemographics": { + "@id": "rai:annotatorDemographics" + }, + "atLocation": { + "@id": "schema:atLocation" + }, + "attachment": { + "@id": "dv:attachment" + }, + "author": { + "@id": "schema:author" + }, + "builds": { + "@id": "dv:builds" + }, + "citation": { + "@id": "schema:citation" + }, + "citeAs": { + "@id": "cr:citeAs" + }, + "column": { + "@id": "cr:column" + }, + "conformsTo": { + "@id": "dct:conformsTo" + }, + "contentSize": { + "@id": "schema:contentSize" + }, + "contentUrl": { + "@id": "schema:contentUrl" + }, + "contributor": { + "@id": "schema:contributor" + }, + "contributorRole": { + "@id": "dv:contributorRole" + }, + "cr": "http://mlcommons.org/croissant/", + "credit": "http://www.niso.org/publications/z39104-2022-credit#", + "dataAnnotationAnalysis": { + "@id": "rai:dataAnnotationAnalysis" + }, + "dataAnnotationPlatform": { + "@id": "rai:dataAnnotationPlatform" + }, + "dataAnnotationProtocol": { + "@id": "rai:dataAnnotationProtocol" + }, + "dataBiases": { + "@id": "rai:dataBiases" + }, + "dataCollection": { + "@id": "rai:dataCollection" + }, + "dataCollectionMissingData": { + "@id": "rai:dataCollectionMissingData" + }, + "dataCollectionRawData": { + "@id": "rai:dataCollectionRawData" + }, + "dataCollectionTimeframe": { + "@id": "rai:dataCollectionTimeframe" + }, + "dataCollectionType": { + "@id": "rai:dataCollectionType" + }, + "dataImputationProtocol": { + "@id": "rai:dataImputationProtocol" + }, + "dataLimitations": { + "@id": "rai:dataLimitations" + }, + "dataManipulationProtocol": { + "@id": "rai:dataManipulationProtocol" + }, + "dataPreprocessingProtocol": { + "@id": "rai:dataPreprocessingProtocol" + }, + "dataReleaseMaintenancePlan": { + "@id": 
"rai:dataReleaseMaintenancePlan" + }, + "dataSocialImpact": { + "@id": "rai:dataSocialImpact" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dataUseCases": { + "@id": "rai:dataUseCases" + }, + "dataset": { + "@id": "dv:dataset" + }, + "datePublished": { + "@id": "schema:datePublished" + }, + "dct": "http://purl.org/dc/terms/", + "description": { + "@id": "schema:description" + }, + "digest": { + "@id": "dv:digest" + }, + "distribution": { + "@id": "schema:distribution" + }, + "dv": "http://senscience.ai/", + "encoding": { + "@id": "schema:encoding" + }, + "encodingFormat": { + "@id": "schema:encodingFormat" + }, + "extract": { + "@id": "cr:extract" + }, + "field": { + "@id": "cr:field" + }, + "fileObject": { + "@id": "cr:fileObject" + }, + "funding": { + "@id": "schema:funding" + }, + "hasPart": { + "@id": "schema:hasPart" + }, + "isBasedOn": { + "@id": "schema:isBasedOn" + }, + "jsonPath": { + "@id": "cr:jsonPath" + }, + "license": { + "@id": "schema:license" + }, + "machineAnnotationTools": { + "@id": "rai:machineAnnotationTools" + }, + "manuscript": { + "@id": "dv:manuscript" + }, + "name": { + "@id": "schema:name" + }, + "personalSensitiveInformation": { + "@id": "rai:personalSensitiveInformation" + }, + "prov": "http://www.w3.org/ns/prov#", + "rai": "http://mlcommons.org/croissant/RAI/", + "recordSet": { + "@id": "cr:recordSet" + }, + "roleName": { + "@id": "dv:roleName" + }, + "schema": "https://schema.org/", + "sh": "http://www.w3.org/ns/shacl#", + "sha256": { + "@id": "schema:sha256" + }, + "source": { + "@id": "cr:source" + }, + "statistics": { + "@id": "dv:statistics" + }, + "steps": { + "@id": "dv:steps" + }, + "store": { + "@id": "dv:store" + }, + "unitCode": { + "@id": "schema:unitCode" + }, + "value": { + "@id": "schema:value" + }, + "variables": { + "@id": "dv:variables", + "@type": "schema:DefinedTermSet" + }, + "visualization": { + "@id": "dv:visualization" + }, + "version": { + "@id": "schema:version" + } + }, + "@type": 
"schema:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "MomCare_dataset_FHIR_patient_serviceRequest", + "description": "This dataset is part of the MomCare dataset, that captures maternal health journeys from the MomCare program in Tanzania, structured as HL7 FHIR v4 resources. It includes over 430,000 records across nine resource types—Patient, Observation, Condition, EpisodeOfCare, Location, Organization, Questionnaire, QuestionnaireResponse, and ServiceRequest. Extracted from a relational point-of-service system and transformed using SQL-based methods, the dataset models enrollment, antenatal care, diagnoses, risk profiling, and follow-up across more than 70 clinics. All data are pseudonymized, machine-actionable, and exported in NDJSON format for interoperability, analytics, and AI integration.", + "recordSet": [ + { + "@id": "serviceRequest", + "@type": "cr:RecordSet", + "name": "Servicerequest", + "description": "Automatically extracted fields from serviceRequest.ndjson", + "field": [ + { + "@id": "serviceRequest/authoredOn", + "@type": "cr:Field", + "name": "authoredOn", + "dataType": "schema:Text", + "source": { + "extract": { + "jsonPath": "$.authoredOn" + }, + "fileObject": { + "@id": "resources/serviceRequest" + } + } + }, + { + "@id": "serviceRequest/identifier_value", + "@type": "cr:Field", + "name": "value", + "dataType": "schema:Text", + "source": { + "extract": { + "jsonPath": "$.identifier[*].value" + }, + "fileObject": { + "@id": "resources/serviceRequest" + } + } + }, + { + "@id": "serviceRequest/intent", + "@type": "cr:Field", + "name": "intent", + "dataType": "schema:Text", + "source": { + "extract": { + "jsonPath": "$.intent" + }, + "fileObject": { + "@id": "resources/serviceRequest" + } + } + }, + { + "@id": "serviceRequest/performer_display", + "@type": "cr:Field", + "name": "display", + "dataType": "schema:Text", + "source": { + "extract": { + "jsonPath": "$.performer[*].display" + }, + "fileObject": { + "@id": 
"resources/serviceRequest" + } + } + }, + { + "@id": "serviceRequest/performer_reference", + "@type": "cr:Field", + "name": "reference", + "dataType": "schema:Text", + "source": { + "extract": { + "jsonPath": "$.performer[*].reference" + }, + "fileObject": { + "@id": "resources/serviceRequest" + } + } + }, + { + "@id": "serviceRequest/performer_type", + "@type": "cr:Field", + "name": "type", + "dataType": "schema:Text", + "source": { + "extract": { + "jsonPath": "$.performer[*].type" + }, + "fileObject": { + "@id": "resources/serviceRequest" + } + } + }, + { + "@id": "serviceRequest/reasonCode_text", + "@type": "cr:Field", + "name": "text", + "dataType": "schema:Text", + "source": { + "extract": { + "jsonPath": "$.reasonCode[*].text" + }, + "fileObject": { + "@id": "resources/serviceRequest" + } + } + }, + { + "@id": "serviceRequest/requester_display", + "@type": "cr:Field", + "name": "display", + "dataType": "schema:Text", + "source": { + "extract": { + "jsonPath": "$.requester.display" + }, + "fileObject": { + "@id": "resources/serviceRequest" + } + } + }, + { + "@id": "serviceRequest/requester_reference", + "@type": "cr:Field", + "name": "reference", + "dataType": "schema:Text", + "source": { + "extract": { + "jsonPath": "$.requester.reference" + }, + "fileObject": { + "@id": "resources/serviceRequest" + } + } + }, + { + "@id": "serviceRequest/requester_type", + "@type": "cr:Field", + "name": "type", + "dataType": "schema:Text", + "source": { + "extract": { + "jsonPath": "$.requester.type" + }, + "fileObject": { + "@id": "resources/serviceRequest" + } + } + }, + { + "@id": "serviceRequest/resourceType", + "@type": "cr:Field", + "name": "resourceType", + "dataType": "schema:Text", + "source": { + "extract": { + "jsonPath": "$.resourceType" + }, + "fileObject": { + "@id": "resources/serviceRequest" + } + } + }, + { + "@id": "serviceRequest/status", + "@type": "cr:Field", + "name": "status", + "dataType": "schema:Text", + "source": { + "extract": { + "jsonPath": 
"$.status" + }, + "fileObject": { + "@id": "resources/serviceRequest" + } + } + }, + { + "@id": "serviceRequest/subject_reference", + "@type": "cr:Field", + "name": "reference", + "dataType": "schema:Text", + "source": { + "extract": { + "jsonPath": "$.subject.reference" + }, + "fileObject": { + "@id": "resources/serviceRequest" + } + } + }, + { + "@id": "serviceRequest/subject_type", + "@type": "cr:Field", + "name": "type", + "dataType": "schema:Text", + "source": { + "extract": { + "jsonPath": "$.subject.type" + }, + "fileObject": { + "@id": "resources/serviceRequest" + } + } + } + ] + }, + { + "@id": "patient", + "@type": "cr:RecordSet", + "name": "Patient", + "description": "Automatically extracted fields from patient.ndjson", + "field": [ + { + "@id": "patient/active", + "@type": "cr:Field", + "name": "active", + "dataType": "schema:Text", + "source": { + "extract": { + "jsonPath": "$.active" + }, + "fileObject": { + "@id": "resources/patient" + } + } + }, + { + "@id": "patient/birthDate", + "@type": "cr:Field", + "name": "birthDate", + "dataType": "schema:Text", + "source": { + "extract": { + "jsonPath": "$.birthDate" + }, + "fileObject": { + "@id": "resources/patient" + } + } + }, + { + "@id": "patient/gender", + "@type": "cr:Field", + "name": "gender", + "dataType": "schema:Text", + "source": { + "extract": { + "jsonPath": "$.gender" + }, + "fileObject": { + "@id": "resources/patient" + } + } + }, + { + "@id": "patient/resourceType", + "@type": "cr:Field", + "name": "resourceType", + "dataType": "schema:Text", + "source": { + "extract": { + "jsonPath": "$.resourceType" + }, + "fileObject": { + "@id": "resources/patient" + } + } + } + ] + } + ], + "distribution": [ + { + "@id": "resources/patient", + "@type": "cr:FileObject", + "contentSize": "13259393", + "contentUrl": "https://storage.googleapis.com/hanang-anonymized-maternal-care-data/patient.ndjson", + "description": "Raw FHIR patient resource exported as NDJSON.", + "encodingFormat": 
"application/fhir+json", + "name": "patient.ndjson", + "sha256": "e208bfb1e4b93750f48c6e406f94f9ead7e3e7d6b5b267d6077785fec5de0986" + }, + { + "@id": "resources/serviceRequest", + "@type": "cr:FileObject", + "contentSize": "190572", + "contentUrl": "https://storage.googleapis.com/hanang-anonymized-maternal-care-data/serviceRequest.ndjson", + "description": "Raw FHIR serviceRequest resource exported as NDJSON.", + "encodingFormat": "application/fhir+json", + "name": "serviceRequest.ndjson", + "sha256": "da4aaffd4f57b9717a9984399bce85d764e8dc5f3b1ad5bea96acef4d4673b2d" + } + ] +} diff --git a/datasets/1.1/pharmaccess-momcare-fhir/output/serviceRequest.jsonl b/datasets/1.1/pharmaccess-momcare-fhir/output/serviceRequest.jsonl new file mode 100644 index 000000000..7cf786de3 --- /dev/null +++ b/datasets/1.1/pharmaccess-momcare-fhir/output/serviceRequest.jsonl @@ -0,0 +1,10 @@ +{"serviceRequest/authoredOn": "2019-06-04", "serviceRequest/identifier_value": "1", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "0", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Ngarenairobi RC Health centre", "serviceRequest/requester_reference": "2", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "0QH6R84NZEVZ6FD87G94UDQ1NT1HWK", "serviceRequest/subject_type": "Patient"} +{"serviceRequest/authoredOn": "2019-06-04", "serviceRequest/identifier_value": "2", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "2", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Ngarenairobi RC Health centre", "serviceRequest/requester_reference": "2", "serviceRequest/requester_type": "Organization", 
"serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "DA8DV5VNC520V4AW0DD4PY0TVFJLXG", "serviceRequest/subject_type": "Patient"} +{"serviceRequest/authoredOn": "2019-06-12", "serviceRequest/identifier_value": "3", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "0", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Ngarenairobi RC Health centre", "serviceRequest/requester_reference": "2", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "9PAXV9DHENCMAL0MD9WCLGF6DUALRZ", "serviceRequest/subject_type": "Patient"} +{"serviceRequest/authoredOn": "2019-06-14", "serviceRequest/identifier_value": "4", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "2", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Magugu Health Centre", "serviceRequest/requester_reference": "6", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "RV98DJ47NE093WQZYUNYR5MKD8RAL2", "serviceRequest/subject_type": "Patient"} +{"serviceRequest/authoredOn": "2019-06-20", "serviceRequest/identifier_value": "5", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Dareda Hospital", "serviceRequest/requester_reference": "3", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", 
"serviceRequest/status": "unknown", "serviceRequest/subject_reference": "YYJM3EF040EDD1Z3Q4DE3RPEXCM9G9", "serviceRequest/subject_type": "Patient"} +{"serviceRequest/authoredOn": "2019-06-27", "serviceRequest/identifier_value": "6", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Galapo Health centre", "serviceRequest/requester_reference": "7", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "5K8JQWC7DM4X2RJM3XYQWRU9EJ8V8P", "serviceRequest/subject_type": "Patient"} +{"serviceRequest/authoredOn": "2019-06-27", "serviceRequest/identifier_value": "7", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Galapo Health centre", "serviceRequest/requester_reference": "7", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "NURM0TRUZV8MC8WFQZPUDUC7PLR2ER", "serviceRequest/subject_type": "Patient"} +{"serviceRequest/authoredOn": "2019-07-02", "serviceRequest/identifier_value": "8", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Bashnet Hospital", "serviceRequest/requester_reference": "4", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", 
"serviceRequest/subject_reference": "8H5N62X4V95G8Q2THPD6X08Y745MAY", "serviceRequest/subject_type": "Patient"} +{"serviceRequest/authoredOn": "2019-07-02", "serviceRequest/identifier_value": "9", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Galapo Health centre", "serviceRequest/requester_reference": "7", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "HYTGMF1Z301UJZE3J3ULPN4PYG5VLL", "serviceRequest/subject_type": "Patient"} +{"serviceRequest/authoredOn": "2019-07-02", "serviceRequest/identifier_value": "10", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Galapo Health centre", "serviceRequest/requester_reference": "7", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "P27Q2AK2U1FDC1455WWHC32PLY763V", "serviceRequest/subject_type": "Patient"} diff --git a/python/mlcroissant/mlcroissant/_src/core/constants.py b/python/mlcroissant/mlcroissant/_src/core/constants.py index 19a4145aa..5bf3f3457 100644 --- a/python/mlcroissant/mlcroissant/_src/core/constants.py +++ b/python/mlcroissant/mlcroissant/_src/core/constants.py @@ -222,6 +222,7 @@ class EncodingFormat: JPG = "image/jpeg" JSON = "application/json" JSON_LINES = "application/jsonlines" + FHIR = "application/fhir+json" MP3 = "audio/mpeg" PARQUET = "application/x-parquet" TEXT = "text/plain" diff --git a/python/mlcroissant/mlcroissant/_src/core/ml/bounding_box.py 
"""Module to manage "bounding boxes" annotations on images."""

from typing import Any, List, Union


def _to_floats(coords: List[Any]) -> List[float]:
    """Convert every coordinate to ``float``.

    Args:
        coords: Sequence of float-compatible elements.

    Returns:
        The coordinates as a list of floats.

    Raises:
        ValueError: If any element cannot be converted to a float.
    """
    try:
        return [float(coord) for coord in coords]
    except (TypeError, ValueError) as e:
        # float() raises TypeError for None / nested containers and ValueError for
        # non-numeric strings. Both are reported as ValueError so that callers only
        # ever have to handle a single exception type.
        raise ValueError(
            "All bounding box coordinates must be values that can be converted to"
            f" floats. Got: {coords}"
        ) from e


def _parse_one(value: Union[str, List[Any]]) -> List[float]:
    """Parse a single bounding box representation into a list of four floats.

    Args:
        value: Either a whitespace-separated string or a list of four
            float-compatible elements.

    Returns:
        The 4-float list that composes the bounding box.

    Raises:
        ValueError: If `value` is neither a str nor a list, does not contain exactly
            four elements, or contains an element that is not float-compatible.
    """
    if isinstance(value, str):
        coords = value.split()
    elif isinstance(value, list):
        coords = value
    else:
        raise ValueError(f"Wrong format. Expected str or list, but got {type(value)}.")

    if len(coords) != 4:
        raise ValueError(
            f"Input should have a length of 4, but has length {len(coords)}."
        )
    return _to_floats(coords)


def _parse_all(value: List) -> List[List[float]]:
    """Parse a list containing multiple bounding boxes.

    Args:
        value: Either a list of boxes (``[[x, y, w, h], ...]``) or a flat list of
            ``4 * N`` coordinates.

    Returns:
        One 4-float list per bounding box.

    Raises:
        ValueError: If a box is malformed or the flat list cannot be split into
            groups of four.
    """
    if not value:
        return []

    # Case 1: list of boxes. Each box is validated on its own.
    if isinstance(value[0], list):
        return [_parse_one(item) for item in value]

    # Case 2: flat list that is chunked into 4-element boxes. Reject lengths that
    # would leave a short trailing box instead of silently emitting it.
    if len(value) % 4 != 0:
        raise ValueError(
            "Input should have a length of a multiple of 4, but has length"
            f" {len(value)}."
        )
    coords = _to_floats(value)
    return [coords[i : i + 4] for i in range(0, len(coords), 4)]


def parse(value: Any) -> Union[List[float], List[List[float]]]:
    """Parse a value into one or more bounding boxes.

    The return type depends on the input:
    - A single bounding box returns a List[float].
    - Multiple bounding boxes returns a List[List[float]].
    - An empty list returns an empty list.

    Args:
        value: The value to parse. Can be a string, a list of 4 elements,
            a list of lists, or a flat list of 4*N elements.

    Returns:
        A list of four floats, or a list of such lists.

    Raises:
        ValueError: If the input format is invalid.
    """
    if isinstance(value, str):
        return _parse_one(value)
    if not isinstance(value, list):
        raise ValueError(f"Wrong format. Expected str or list, but got {type(value)}.")
    if not value:
        return []

    nested = isinstance(value[0], list)
    flat_multiple = len(value) > 4 and len(value) % 4 == 0
    if nested or flat_multiple:
        return _parse_all(value)

    # Exactly four elements is a single box; any other length is reported by
    # _parse_one with the specific "length" error.
    return _parse_one(value)
"""Parse JSON operation."""

import json
from typing import Any, TextIO

import jsonpath_rw
import pandas as pd

from mlcroissant._src.core.optional import deps
from mlcroissant._src.structure_graph.nodes.field import Field
from mlcroissant._src.structure_graph.nodes.source import FileProperty

try:
    orjson = deps.orjson
except ImportError:
    # ImportError also covers ModuleNotFoundError (its subclass), so the stdlib
    # fallback is used whichever of the two the optional-deps helper raises.
    orjson = None

_JMESPATH = "jmespath"
_JSONPATH = "jsonpath"
_ROOT_PREFIX = "$."


def _loads(raw: Any) -> Any:
    """Deserializes JSON with orjson when it is installed, else with stdlib json."""
    return orjson.loads(raw) if orjson else json.loads(raw)


def _unwrap_single_item(value: Any) -> Any:
    """Unwraps a single-item list to its value, or returns the value as is."""
    if isinstance(value, list) and len(value) == 1:
        return value[0]
    return value


def _compile_json_path(json_path: str) -> tuple[str, Any]:
    """Compiles a JSON path with the engine able to evaluate it.

    Paths that start with "$." and do not use recursive descent ("..") are first
    tried as JMESPath expressions. Everything else, including paths that JMESPath
    cannot compile, is compiled with jsonpath_rw.

    Args:
        json_path: The JSON path declared on the field.

    Returns:
        A `(engine, compiled_expression)` tuple where engine is "jmespath" or
        "jsonpath".
    """
    import jmespath
    from jmespath import exceptions as jmespath_exceptions

    if json_path.startswith(_ROOT_PREFIX) and ".." not in json_path:
        # Slice the prefix off. `str.lstrip("$.")` strips a *set of characters*,
        # so it would also eat a leading "$" or "." that belongs to the first key.
        try:
            return _JMESPATH, jmespath.compile(json_path[len(_ROOT_PREFIX) :])
        except jmespath_exceptions.JMESPathError:
            # Not expressible in JMESPath: fall through to the JSONPath engine.
            pass
    return _JSONPATH, jsonpath_rw.parse(json_path)


def _evaluate(engine: str, expr: Any, record: Any) -> Any:
    """Evaluates a compiled expression on one record and unwraps 1-item lists."""
    if engine == _JMESPATH:
        out = expr.search(record)
    else:
        out = [match.value for match in expr.find(record)]
    return _unwrap_single_item(out)


def parse_json_content(json_obj: Any, fields: tuple[Field, ...]) -> pd.DataFrame:
    """Parses all JSON paths defined in the fields of a RecordSet into a pd.DF.

    Args:
        json_obj: The already-deserialized JSON content.
        fields: The fields whose `source.extract.json_path` should be extracted.
            Fields without a JSON path are skipped.

    Returns:
        A DataFrame with one column per JSON path. A path with several matches
        yields one row per match; when every path has a single scalar match the
        result is a one-row DataFrame.
    """
    series: dict[str, Any] = {}
    for field in fields:
        json_path = field.source.extract.json_path
        if not json_path:
            continue
        matches = jsonpath_rw.parse(json_path).find(json_obj)
        values = [_unwrap_single_item(match.value) for match in matches]
        series[json_path] = _unwrap_single_item(values)

    # pandas refuses to build a DataFrame from scalars only ("If using all scalar
    # values, you must pass an index"), so promote them to a single row.
    if series and all(pd.api.types.is_scalar(value) for value in series.values()):
        series = {path: [value] for path, value in series.items()}
    return pd.DataFrame(series)


class _JsonPathReader:
    """Shared field-to-expression compilation for the JSON and JSON Lines readers."""

    def __init__(self, fields: tuple[Field, ...]):
        """Compiles the JSON path of every field that declares one.

        Args:
            fields: Field objects, each with a `source.extract.json_path`
                attribute. Fields without a JSON path are skipped.

        `self.exprs` holds one `(json_path, engine, compiled_expr)` tuple per
        retained field; `self.fields` keeps the fields as given.
        """
        self.exprs: list[tuple[str, str, Any]] = []
        for field in fields:
            json_path = field.source.extract.json_path
            if not json_path:
                continue
            engine, expr = _compile_json_path(json_path)
            self.exprs.append((json_path, engine, expr))
        self.fields = fields


class JsonReader(_JsonPathReader):
    """Parser for JSON files, supporting both JSONPath and JMESPath expressions."""

    def parse(self, fh: TextIO) -> pd.DataFrame:
        """Parses a JSON file-like object and extracts data into a pandas DataFrame.

        Args:
            fh (TextIO): A file-like object containing JSON data.

        Returns:
            pd.DataFrame: DataFrame with one row per top-level record and one
            column per JSON path.
        """
        data = _loads(fh.read())
        # A top-level list is a list of records; anything else is one record.
        records = data if isinstance(data, list) else [data]
        series: dict[str, list] = {
            json_path: [_evaluate(engine, expr, record) for record in records]
            for json_path, engine, expr in self.exprs
        }
        return pd.DataFrame(series)

    def raw(self, fh: TextIO) -> pd.DataFrame:
        """Reads a JSON file-like object and returns a single-cell pandas DataFrame.

        Args:
            fh (TextIO): A file-like object opened for reading JSON data.

        Returns:
            pd.DataFrame: One row and one column (named `FileProperty.content`)
            holding the whole deserialized JSON content.
        """
        return pd.DataFrame({FileProperty.content: [_loads(fh.read())]})


class JsonlReader(_JsonPathReader):
    """Parser for JSON Lines files, supporting both JSONPath and JMESPath."""

    def parse(self, fh: TextIO) -> pd.DataFrame:
        """Parses a file-like object containing JSON objects (one per line).

        Args:
            fh: A file-like object to read from, where each line is a JSON object.
                Blank lines are ignored.

        Returns:
            pd.DataFrame: A DataFrame where each row corresponds to a parsed
            JSON object and each column to a JSON path.
        """
        rows = []
        for line in fh:
            line = line.strip()
            if not line:
                continue
            record = _loads(line)
            rows.append({
                json_path: _evaluate(engine, expr, record)
                for json_path, engine, expr in self.exprs
            })
        return pd.DataFrame(rows)

    def raw(self, fh: TextIO) -> pd.DataFrame:
        """Reads a JSON Lines file-like object and returns a DataFrame."""
        # Rewind when possible so a handle that was already consumed is re-read;
        # non-seekable streams are read from their current position.
        if fh.seekable():
            fh.seek(0)
        return pd.read_json(fh, lines=True)
source=Source(extract=Extract(json_path="$.level1.level2[*].value")) + ) + fields = (field,) + json_obj = {"level1": {"level2": [{"value": 100}, {"value": 200}]}} + expected_df = pd.DataFrame({"$.level1.level2[*].value": [[100, 200]]}) + raw_str = orjson.dumps(json_obj).decode("utf-8") + fh = io.StringIO(raw_str) + reader = JsonReader(fields=fields) + df = reader.parse(fh) + pd.testing.assert_frame_equal(df, expected_df) + + +def test_jsonlreader_raw(): + # JsonlReader.raw should read JSON Lines into a DataFrame + lines = [{"a": 1}, {"a": 2}] + raw_text = "\n".join(json.dumps(rec) for rec in lines) + fh = io.StringIO(raw_text) + reader = JsonlReader(fields=()) + df = reader.raw(fh) + expected = pd.DataFrame(lines) + pd.testing.assert_frame_equal(df, expected) + + +def test_jsonlreader_parse(): + # JsonlReader.parse should extract values across lines + field = create_test_field(source=Source(extract=Extract(json_path="$.x"))) + fields = (field,) + lines = [{"x": 5}, {"x": 6}] + raw_text = "\n".join(json.dumps(rec) for rec in lines) + fh = io.StringIO(raw_text) + reader = JsonlReader(fields=fields) + df = reader.parse(fh) + expected = pd.DataFrame({"$.x": [5, 6]}) + pd.testing.assert_frame_equal(df, expected) + + +def test_jsonlreader_deeper_path(): + # JsonlReader.parse should handle nested deeper JSONPath + field = create_test_field( + source=Source(extract=Extract(json_path="$.meta.detail[*].info")) + ) + fields = (field,) + records = [ + {"meta": {"detail": [{"info": "a"}, {"info": "b"}]}}, + {"meta": {"detail": [{"info": "c"}]}}, + ] + raw_text = "\n".join(json.dumps(rec) for rec in records) + fh = io.StringIO(raw_text) + reader = JsonlReader(fields=fields) + df = reader.parse(fh) + expected = pd.DataFrame({"$.meta.detail[*].info": [["a", "b"], "c"]}) + pd.testing.assert_frame_equal(df, expected) diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/read.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/read.py index 
96ef7a881..de9ae6208 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/read.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/read.py @@ -4,7 +4,6 @@ import enum import gzip import io -import json import pathlib from etils import epath @@ -18,7 +17,8 @@ from mlcroissant._src.core.path import Path from mlcroissant._src.operation_graph.base_operation import Operation from mlcroissant._src.operation_graph.operations.download import is_url -from mlcroissant._src.operation_graph.operations.parse_json import parse_json_content +from mlcroissant._src.operation_graph.operations.parse_json import JsonlReader +from mlcroissant._src.operation_graph.operations.parse_json import JsonReader from mlcroissant._src.structure_graph.nodes.field import Field from mlcroissant._src.structure_graph.nodes.file_object import FileObject from mlcroissant._src.structure_graph.nodes.file_set import FileSet @@ -116,6 +116,7 @@ def _read_file_content( if EncodingFormat.ARFF in encoding_formats: return _read_arff_file(filepath) + reader: JsonReader | JsonlReader | None = None with filepath.open("rb") as file: for encoding_format in encoding_formats: # TODO(https://github.com/mlcommons/croissant/issues/635). @@ -126,16 +127,19 @@ def _read_file_content( elif encoding_format == EncodingFormat.TSV: return pd.read_csv(file, sep="\t") elif encoding_format == EncodingFormat.JSON: - json_content = json.load(file) + reader = JsonReader(self.fields) if reading_method == ReadingMethod.JSON: - return parse_json_content(json_content, self.fields) - else: - # Raw files are returned as a one-line pd.DataFrame. 
- return pd.DataFrame({ - FileProperty.content: [json_content], - }) - elif encoding_format == EncodingFormat.JSON_LINES: - return pd.read_json(file, lines=True) + return reader.parse(file) + return reader.raw(file) + elif encoding_format in ( + EncodingFormat.JSON_LINES, + EncodingFormat.FHIR, + ): + # JSON_LINES and FHIR do the same thing + reader = JsonlReader(self.fields) + if reading_method == ReadingMethod.JSON: + return reader.parse(file) + return reader.raw(file) elif encoding_format == EncodingFormat.PARQUET: try: df = pd.read_parquet(file) diff --git a/python/mlcroissant/pyproject.toml b/python/mlcroissant/pyproject.toml index 9600712d8..d224bcbaf 100644 --- a/python/mlcroissant/pyproject.toml +++ b/python/mlcroissant/pyproject.toml @@ -21,11 +21,13 @@ authors = [ dependencies = [ "absl-py", "etils[epath]>=1.7.0", + "jmespath", "jsonpath-rw", "networkx", "pandas", "pandas-stubs", "python-dateutil", + "orjson", "rdflib", "requests", "scipy", @@ -94,9 +96,11 @@ module = [ "absl", "apache-beam", "etils.*", + "jmespath", "jsonpath_rw", "librosa", "networkx", + "orjson", "pandas", "pillow", "scipy", diff --git a/python/mlcroissant/recipes/bounding-boxes.ipynb b/python/mlcroissant/recipes/bounding-boxes.ipynb index 383106ef8..237fc22b8 100644 --- a/python/mlcroissant/recipes/bounding-boxes.ipynb +++ b/python/mlcroissant/recipes/bounding-boxes.ipynb @@ -165,7 +165,10 @@ "metadata": {}, "outputs": [], "source": [ - "image_id, bbox = record[\"images_with_bounding_box/image_id\"], record[\"images_with_bounding_box/bbox\"]\n", + "image_id, bbox = (\n", + " record[\"images_with_bounding_box/image_id\"][0],\n", + " record[\"images_with_bounding_box/bbox\"][0],\n", + ")\n", "url = f\"http://images.cocodataset.org/val2014/COCO_val2014_{image_id:012d}.jpg\"\n", "\n", "# Download the image\n", @@ -179,19 +182,11 @@ "draw.rectangle((x1, y1, x1 + w, y1 + h), outline=(0, 255, 0), width=2)\n", "display(image)" ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"id": "3ea44fc8-bdd2-44d8-84de-7b6195ff5409", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "croissant", "language": "python", "name": "python3" }, @@ -205,7 +200,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.10.16" } }, "nbformat": 4, diff --git a/python/mlcroissant/recipes/flores200_datapipes.ipynb b/python/mlcroissant/recipes/flores200_datapipes.ipynb index d7451570b..f114860f6 100644 --- a/python/mlcroissant/recipes/flores200_datapipes.ipynb +++ b/python/mlcroissant/recipes/flores200_datapipes.ipynb @@ -92,7 +92,9 @@ "metadata": {}, "outputs": [], "source": [ - "ta_factory = mlc.torch.LoaderFactory(jsonld=\"../../../datasets/1.0/flores-200/metadata.json\")\n", + "ta_factory = mlc.torch.LoaderFactory(\n", + " jsonld=\"../../../datasets/1.0/flores-200/metadata.json\"\n", + ")\n", "specification = {\n", " \"translation\": mlc.torch.LoaderSpecificationDataType.INFER,\n", " \"language\": mlc.torch.LoaderSpecificationDataType.INFER,\n", @@ -273,6 +275,7 @@ " tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)\n", " return tokenizer\n", "\n", + "\n", "def unpack_row(x):\n", " tokenizer = get_tokenizer()\n", " tokenized = tokenizer(\n", @@ -695,7 +698,9 @@ "# We allow fractional epochs to speed up testing\n", "# So 1.5 epochs would be 1 epoch plus half the next epoch\n", "whole_epochs = int(NUM_EPOCHS)\n", - "remainder_train_steps = (NUM_EPOCHS - whole_epochs) * NUM_TRAIN_SAMPLES / TRAIN_BATCH_SIZE\n", + "remainder_train_steps = (\n", + " (NUM_EPOCHS - whole_epochs) * NUM_TRAIN_SAMPLES / TRAIN_BATCH_SIZE\n", + ")\n", "max_test_steps = None\n", "if FRACTION_TEST != 1.0:\n", " max_test_steps = FRACTION_TEST * NUM_TEST_SAMPLES / TEST_BATCH_SIZE\n", @@ -709,7 +714,9 @@ " f\"test_loss: {test_loss:.2f}, test_accuracy: {test_accuracy:.1%}\"\n", " )\n", "if remainder_train_steps:\n", - " train_loss = 
train(model, optimizer, train_dataloader, max_steps=remainder_train_steps)\n", + " train_loss = train(\n", + " model, optimizer, train_dataloader, max_steps=remainder_train_steps\n", + " )\n", " test_loss, test_accuracy = test(model, test_dataloader, max_steps=max_test_steps)\n", " print(\n", " f\"Epoch {epoch} train_loss: {train_loss:.2f}, \"\n", diff --git a/python/mlcroissant/recipes/introduction.ipynb b/python/mlcroissant/recipes/introduction.ipynb index e5c3dbac8..712dd200d 100644 --- a/python/mlcroissant/recipes/introduction.ipynb +++ b/python/mlcroissant/recipes/introduction.ipynb @@ -1,323 +1,322 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "AriH9CP6AKhs" - }, - "source": [ - "# Tutorial for `mlcroissant` 🥐" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Hh-0cehIAErA" - }, - "source": [ - "## Introduction\n", - "\n", - "Croissant 🥐 is a high-level format for machine learning datasets that combines metadata, resource file descriptions, data structure, and default ML semantics into a single file.\n", - "\n", - "Croissant builds on schema.org, and its `sc:Dataset` vocabulary, a widely used format to represent datasets on the Web, and make them searchable.\n", - "\n", - "The [`mlcroissant`](https://github.com/mlcommons/croissant/python/mlcroissant) Python library empowers developers to interact with Croissant:\n", - "\n", - "- Programmatically write your JSON-LD Croissant files.\n", - "- Verify your JSON-LD Croissant files.\n", - "- Load data from Croissant datasets." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0qpWrlwV-x52" - }, - "outputs": [], - "source": [ - "# Install mlcroissant from the source\n", - "!apt-get install -y python3-dev graphviz libgraphviz-dev pkg-config\n", - "!pip install \"git+https://github.com/${GITHUB_REPOSITORY:-mlcommons/croissant}.git@${GITHUB_HEAD_REF:-main}#subdirectory=python/mlcroissant&egg=mlcroissant[dev]\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Xwrol5JR_GTY" - }, - "source": [ - "## Example\n", - "\n", - "Let's try on a very concrete dataset: OpenAI's [`gpt-3`](https://github.com/openai/gpt-3) dataset for LLMs!\n", - "\n", - "In the tutorial, we will generate programmatically the Croissant JSON-LD file describing the dataset. Then we will verify the file and yield data from the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7OyQffJv-zso" - }, - "outputs": [], - "source": [ - "import mlcroissant as mlc\n", - "\n", - "# FileObjects and FileSets define the resources of the dataset.\n", - "distribution = [\n", - " # gpt-3 is hosted on a GitHub repository:\n", - " mlc.FileObject(\n", - " id=\"github-repository\",\n", - " name=\"github-repository\",\n", - " description=\"OpenAI repository on GitHub.\",\n", - " content_url=\"https://github.com/openai/gpt-3\",\n", - " encoding_formats=[\"git+https\"],\n", - " sha256=\"main\",\n", - " ),\n", - " # Within that repository, a FileSet lists all JSONL files:\n", - " mlc.FileSet(\n", - " id=\"jsonl-files\",\n", - " name=\"jsonl-files\",\n", - " description=\"JSONL files are hosted on the GitHub repository.\",\n", - " contained_in=[\"github-repository\"],\n", - " encoding_formats=[\"application/jsonlines\"],\n", - " includes=\"data/*.jsonl\",\n", - " ),\n", - "]\n", - "record_sets = [\n", - " # RecordSets contains records in the dataset.\n", - " mlc.RecordSet(\n", - " id=\"jsonl\",\n", - " name=\"jsonl\",\n", - " # Each record has one or 
many fields...\n", - " fields=[\n", - " # Fields can be extracted from the FileObjects/FileSets.\n", - " mlc.Field(\n", - " id=\"jsonl/context\",\n", - " name=\"context\",\n", - " description=\"\",\n", - " data_types=mlc.DataType.TEXT,\n", - " source=mlc.Source(\n", - " file_set=\"jsonl-files\",\n", - " # Extract the field from the column of a FileObject/FileSet:\n", - " extract=mlc.Extract(column=\"context\"),\n", - " ),\n", - " ),\n", - " mlc.Field(\n", - " id=\"jsonl/completion\",\n", - " name=\"completion\",\n", - " description=\"The expected completion of the promt.\",\n", - " data_types=mlc.DataType.TEXT,\n", - " source=mlc.Source(\n", - " file_set=\"jsonl-files\",\n", - " extract=mlc.Extract(column=\"completion\"),\n", - " ),\n", - " ),\n", - " mlc.Field(\n", - " id=\"jsonl/task\",\n", - " name=\"task\",\n", - " description=(\n", - " \"The machine learning task appearing as the name of the\"\n", - " \" file.\"\n", - " ),\n", - " data_types=mlc.DataType.TEXT,\n", - " source=mlc.Source(\n", - " file_set=\"jsonl-files\",\n", - " extract=mlc.Extract(\n", - " file_property=mlc._src.structure_graph.nodes.source.FileProperty.filename\n", - " ),\n", - " # Extract the field from a regex on the filename:\n", - " transforms=[mlc.Transform(regex=\"^(.*)\\\\.jsonl$\")],\n", - " ),\n", - " ),\n", - " ],\n", - " )\n", - "]\n", - "\n", - "# Metadata contains information about the dataset.\n", - "metadata = mlc.Metadata(\n", - " name=\"gpt-3\",\n", - " # Descriptions can contain plain text or markdown.\n", - " description=(\n", - " \"Recent work has demonstrated substantial gains on many NLP tasks and\"\n", - " \" benchmarks by pre-training on a large corpus of text followed by\"\n", - " \" fine-tuning on a specific task. While typically task-agnostic in\"\n", - " \" architecture, this method still requires task-specific fine-tuning\"\n", - " \" datasets of thousands or tens of thousands of examples. 
By contrast,\"\n", - " \" humans can generally perform a new language task from only a few\"\n", - " \" examples or from simple instructions \\u2013 something which current\"\n", - " \" NLP systems still largely struggle to do. Here we show that scaling\"\n", - " \" up language models greatly improves task-agnostic, few-shot\"\n", - " \" performance, sometimes even reaching competitiveness with prior\"\n", - " \" state-of-the-art fine-tuning approaches. Specifically, we train\"\n", - " \" GPT-3, an autoregressive language model with 175 billion parameters,\"\n", - " \" 10x more than any previous non-sparse language model, and test its\"\n", - " \" performance in the few-shot setting. For all tasks, GPT-3 is applied\"\n", - " \" without any gradient updates or fine-tuning, with tasks and few-shot\"\n", - " \" demonstrations specified purely via text interaction with the model.\"\n", - " \" GPT-3 achieves strong performance on many NLP datasets, including\"\n", - " \" translation, question-answering, and cloze tasks, as well as several\"\n", - " \" tasks that require on-the-fly reasoning or domain adaptation, such as\"\n", - " \" unscrambling words, using a novel word in a sentence, or performing\"\n", - " \" 3-digit arithmetic. At the same time, we also identify some datasets\"\n", - " \" where GPT-3's few-shot learning still struggles, as well as some\"\n", - " \" datasets where GPT-3 faces methodological issues related to training\"\n", - " \" on large web corpora. Finally, we find that GPT-3 can generate\"\n", - " \" samples of news articles which human evaluators have difficulty\"\n", - " \" distinguishing from articles written by humans. We discuss broader\"\n", - " \" societal impacts of this finding and of GPT-3 in general.\"\n", - " ),\n", - " cite_as=(\n", - " \"@article{brown2020language, title={Language Models are Few-Shot\"\n", - " \" Learners}, author={Tom B. 
Brown and Benjamin Mann and Nick Ryder and\"\n", - " \" Melanie Subbiah and Jared Kaplan and Prafulla Dhariwal and Arvind\"\n", - " \" Neelakantan and Pranav Shyam and Girish Sastry and Amanda Askell and\"\n", - " \" Sandhini Agarwal and Ariel Herbert-Voss and Gretchen Krueger and Tom\"\n", - " \" Henighan and Rewon Child and Aditya Ramesh and Daniel M. Ziegler and\"\n", - " \" Jeffrey Wu and Clemens Winter and Christopher Hesse and Mark Chen and\"\n", - " \" Eric Sigler and Mateusz Litwin and Scott Gray and Benjamin Chess and\"\n", - " \" Jack Clark and Christopher Berner and Sam McCandlish and Alec Radford\"\n", - " \" and Ilya Sutskever and Dario Amodei}, year={2020},\"\n", - " \" eprint={2005.14165}, archivePrefix={arXiv}, primaryClass={cs.CL} }\"\n", - " ),\n", - " url=\"https://github.com/openai/gpt-3\",\n", - " distribution=distribution,\n", - " record_sets=record_sets,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2RUVgWI-DldZ" - }, - "source": [ - "When creating `Metadata`:\n", - "- We also check for errors in the configuration.\n", - "- We generate warnings if the configuration doesn't follow guidelines and best practices.\n", - "\n", - "For instance, in this case:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "AENcJUwMCd1B" - }, - "outputs": [], - "source": [ - "print(metadata.issues.report())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vES3KHaND4P2" - }, - "source": [ - "`Property \"https://schema.org/license\" is recommended`...\n", - "\n", - "We can see at a glance that we miss an important metadata to build datasets for responsible AI: the license!" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "S0BEhzqiEjd0" - }, - "source": [ - "## Build the Croissant file and yield data\n", - "\n", - "Let's write the Croissant JSON-LD to a file on disk!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-XCycu81ECVq" - }, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open(\"croissant.json\", \"w\") as f:\n", - " content = metadata.to_json()\n", - " content = json.dumps(content, indent=2)\n", - " print(content)\n", - " f.write(content)\n", - " f.write(\"\\n\") # Terminate file with newline" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Ypb_ll3SE6UU" - }, - "source": [ - "From this JSON-LD file, we can easily create a dataset..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_JNyQFuAEiIs" - }, - "outputs": [], - "source": [ - "dataset = mlc.Dataset(jsonld=\"croissant.json\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ldwdIGPoFT_p" - }, - "source": [ - "...and yield records from this dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MHdVY4TBEqZ8" - }, - "outputs": [], - "source": [ - "records = dataset.records(record_set=\"jsonl\")\n", - "\n", - "for i, record in enumerate(records):\n", - " print(record)\n", - " if i > 10:\n", - " break" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8a2sCy0GFYCQ" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "AriH9CP6AKhs" + }, + "source": [ + "# Tutorial for `mlcroissant` 🥐" + ] }, - "nbformat": 4, - "nbformat_minor": 0 + { + "cell_type": "markdown", + "metadata": { + "id": "Hh-0cehIAErA" + }, + "source": [ + "## Introduction\n", + "\n", + "Croissant 🥐 is a high-level format for machine learning datasets that combines metadata, resource file descriptions, data structure, and default ML semantics into a single file.\n", + 
"\n", + "Croissant builds on schema.org, and its `sc:Dataset` vocabulary, a widely used format to represent datasets on the Web, and make them searchable.\n", + "\n", + "The [`mlcroissant`](https://github.com/mlcommons/croissant/python/mlcroissant) Python library empowers developers to interact with Croissant:\n", + "\n", + "- Programmatically write your JSON-LD Croissant files.\n", + "- Verify your JSON-LD Croissant files.\n", + "- Load data from Croissant datasets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0qpWrlwV-x52" + }, + "outputs": [], + "source": [ + "# Install mlcroissant from the source\n", + "!apt-get install -y python3-dev graphviz libgraphviz-dev pkg-config\n", + "!pip install \"git+https://github.com/${GITHUB_REPOSITORY:-mlcommons/croissant}.git@${GITHUB_HEAD_REF:-main}#subdirectory=python/mlcroissant&egg=mlcroissant[dev]\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Xwrol5JR_GTY" + }, + "source": [ + "## Example\n", + "\n", + "Let's try on a very concrete dataset: OpenAI's [`gpt-3`](https://github.com/openai/gpt-3) dataset for LLMs!\n", + "\n", + "In the tutorial, we will generate programmatically the Croissant JSON-LD file describing the dataset. Then we will verify the file and yield data from the dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7OyQffJv-zso" + }, + "outputs": [], + "source": [ + "import mlcroissant as mlc\n", + "\n", + "# FileObjects and FileSets define the resources of the dataset.\n", + "distribution = [\n", + " # gpt-3 is hosted on a GitHub repository:\n", + " mlc.FileObject(\n", + " id=\"github-repository\",\n", + " name=\"github-repository\",\n", + " description=\"OpenAI repository on GitHub.\",\n", + " content_url=\"https://github.com/openai/gpt-3\",\n", + " encoding_formats=[\"git+https\"],\n", + " sha256=\"main\",\n", + " ),\n", + " # Within that repository, a FileSet lists all JSONL files:\n", + " mlc.FileSet(\n", + " id=\"jsonl-files\",\n", + " name=\"jsonl-files\",\n", + " description=\"JSONL files are hosted on the GitHub repository.\",\n", + " contained_in=[\"github-repository\"],\n", + " encoding_formats=[\"application/jsonlines\"],\n", + " includes=\"data/*.jsonl\",\n", + " ),\n", + "]\n", + "record_sets = [\n", + " # RecordSets contains records in the dataset.\n", + " mlc.RecordSet(\n", + " id=\"jsonl\",\n", + " name=\"jsonl\",\n", + " # Each record has one or many fields...\n", + " fields=[\n", + " # Fields can be extracted from the FileObjects/FileSets.\n", + " mlc.Field(\n", + " id=\"jsonl/context\",\n", + " name=\"context\",\n", + " description=\"\",\n", + " data_types=mlc.DataType.TEXT,\n", + " source=mlc.Source(\n", + " file_set=\"jsonl-files\",\n", + " # Extract the field from the column of a FileObject/FileSet:\n", + " extract=mlc.Extract(column=\"context\"),\n", + " ),\n", + " ),\n", + " mlc.Field(\n", + " id=\"jsonl/completion\",\n", + " name=\"completion\",\n", + " description=\"The expected completion of the promt.\",\n", + " data_types=mlc.DataType.TEXT,\n", + " source=mlc.Source(\n", + " file_set=\"jsonl-files\",\n", + " extract=mlc.Extract(column=\"completion\"),\n", + " ),\n", + " ),\n", + " mlc.Field(\n", + " id=\"jsonl/task\",\n", + " name=\"task\",\n", + " 
description=(\n", + " \"The machine learning task appearing as the name of the file.\"\n", + " ),\n", + " data_types=mlc.DataType.TEXT,\n", + " source=mlc.Source(\n", + " file_set=\"jsonl-files\",\n", + " extract=mlc.Extract(\n", + " file_property=mlc._src.structure_graph.nodes.source.FileProperty.filename\n", + " ),\n", + " # Extract the field from a regex on the filename:\n", + " transforms=[mlc.Transform(regex=\"^(.*)\\\\.jsonl$\")],\n", + " ),\n", + " ),\n", + " ],\n", + " )\n", + "]\n", + "\n", + "# Metadata contains information about the dataset.\n", + "metadata = mlc.Metadata(\n", + " name=\"gpt-3\",\n", + " # Descriptions can contain plain text or markdown.\n", + " description=(\n", + " \"Recent work has demonstrated substantial gains on many NLP tasks and\"\n", + " \" benchmarks by pre-training on a large corpus of text followed by\"\n", + " \" fine-tuning on a specific task. While typically task-agnostic in\"\n", + " \" architecture, this method still requires task-specific fine-tuning\"\n", + " \" datasets of thousands or tens of thousands of examples. By contrast,\"\n", + " \" humans can generally perform a new language task from only a few\"\n", + " \" examples or from simple instructions \\u2013 something which current\"\n", + " \" NLP systems still largely struggle to do. Here we show that scaling\"\n", + " \" up language models greatly improves task-agnostic, few-shot\"\n", + " \" performance, sometimes even reaching competitiveness with prior\"\n", + " \" state-of-the-art fine-tuning approaches. Specifically, we train\"\n", + " \" GPT-3, an autoregressive language model with 175 billion parameters,\"\n", + " \" 10x more than any previous non-sparse language model, and test its\"\n", + " \" performance in the few-shot setting. 
For all tasks, GPT-3 is applied\"\n", + " \" without any gradient updates or fine-tuning, with tasks and few-shot\"\n", + " \" demonstrations specified purely via text interaction with the model.\"\n", + " \" GPT-3 achieves strong performance on many NLP datasets, including\"\n", + " \" translation, question-answering, and cloze tasks, as well as several\"\n", + " \" tasks that require on-the-fly reasoning or domain adaptation, such as\"\n", + " \" unscrambling words, using a novel word in a sentence, or performing\"\n", + " \" 3-digit arithmetic. At the same time, we also identify some datasets\"\n", + " \" where GPT-3's few-shot learning still struggles, as well as some\"\n", + " \" datasets where GPT-3 faces methodological issues related to training\"\n", + " \" on large web corpora. Finally, we find that GPT-3 can generate\"\n", + " \" samples of news articles which human evaluators have difficulty\"\n", + " \" distinguishing from articles written by humans. We discuss broader\"\n", + " \" societal impacts of this finding and of GPT-3 in general.\"\n", + " ),\n", + " cite_as=(\n", + " \"@article{brown2020language, title={Language Models are Few-Shot\"\n", + " \" Learners}, author={Tom B. Brown and Benjamin Mann and Nick Ryder and\"\n", + " \" Melanie Subbiah and Jared Kaplan and Prafulla Dhariwal and Arvind\"\n", + " \" Neelakantan and Pranav Shyam and Girish Sastry and Amanda Askell and\"\n", + " \" Sandhini Agarwal and Ariel Herbert-Voss and Gretchen Krueger and Tom\"\n", + " \" Henighan and Rewon Child and Aditya Ramesh and Daniel M. 
Ziegler and\"\n", + " \" Jeffrey Wu and Clemens Winter and Christopher Hesse and Mark Chen and\"\n", + " \" Eric Sigler and Mateusz Litwin and Scott Gray and Benjamin Chess and\"\n", + " \" Jack Clark and Christopher Berner and Sam McCandlish and Alec Radford\"\n", + " \" and Ilya Sutskever and Dario Amodei}, year={2020},\"\n", + " \" eprint={2005.14165}, archivePrefix={arXiv}, primaryClass={cs.CL} }\"\n", + " ),\n", + " url=\"https://github.com/openai/gpt-3\",\n", + " distribution=distribution,\n", + " record_sets=record_sets,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2RUVgWI-DldZ" + }, + "source": [ + "When creating `Metadata`:\n", + "- We also check for errors in the configuration.\n", + "- We generate warnings if the configuration doesn't follow guidelines and best practices.\n", + "\n", + "For instance, in this case:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "AENcJUwMCd1B" + }, + "outputs": [], + "source": [ + "print(metadata.issues.report())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vES3KHaND4P2" + }, + "source": [ + "`Property \"https://schema.org/license\" is recommended`...\n", + "\n", + "We can see at a glance that we miss an important metadata to build datasets for responsible AI: the license!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "S0BEhzqiEjd0" + }, + "source": [ + "## Build the Croissant file and yield data\n", + "\n", + "Let's write the Croissant JSON-LD to a file on disk!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-XCycu81ECVq" + }, + "outputs": [], + "source": [ + "import json\n", + "\n", + "with open(\"croissant.json\", \"w\") as f:\n", + " content = metadata.to_json()\n", + " content = json.dumps(content, indent=2)\n", + " print(content)\n", + " f.write(content)\n", + " f.write(\"\\n\") # Terminate file with newline" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ypb_ll3SE6UU" + }, + "source": [ + "From this JSON-LD file, we can easily create a dataset..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_JNyQFuAEiIs" + }, + "outputs": [], + "source": [ + "dataset = mlc.Dataset(jsonld=\"croissant.json\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ldwdIGPoFT_p" + }, + "source": [ + "...and yield records from this dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MHdVY4TBEqZ8" + }, + "outputs": [], + "source": [ + "records = dataset.records(record_set=\"jsonl\")\n", + "\n", + "for i, record in enumerate(records):\n", + " print(record)\n", + " if i > 10:\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8a2sCy0GFYCQ" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/python/mlcroissant/recipes/tfds_croissant_builder.ipynb b/python/mlcroissant/recipes/tfds_croissant_builder.ipynb index 18e207283..a01bf5d59 100644 --- a/python/mlcroissant/recipes/tfds_croissant_builder.ipynb +++ b/python/mlcroissant/recipes/tfds_croissant_builder.ipynb @@ -212,9 +212,9 @@ "# Download the JSON and write it to `local_croissant_file`.\n", "response = requests.get(api_url, headers=None).json()\n", "with 
local_croissant_file.open(\"w\") as f:\n", - " jsonld = json.dumps(response, indent=2)\n", - " f.write(jsonld)\n", - " print(jsonld)" + " jsonld = json.dumps(response, indent=2)\n", + " f.write(jsonld)\n", + " print(jsonld)" ] }, { @@ -254,7 +254,7 @@ "builder = tfds.core.dataset_builders.CroissantBuilder(\n", " jsonld=local_croissant_file,\n", " record_set_ids=[\"fashion_mnist\"],\n", - " file_format='array_record',\n", + " file_format=\"array_record\",\n", " data_dir=data_dir,\n", ")" ] @@ -418,7 +418,7 @@ }, "outputs": [], "source": [ - "train, test = builder.as_data_source(split=['train', 'test'])" + "train, test = builder.as_data_source(split=[\"train\", \"test\"])" ] }, { @@ -513,14 +513,15 @@ ], "source": [ "class LinearClassifier(torch.nn.Module):\n", - " def __init__(self, shape, num_classes):\n", - " super(LinearClassifier, self).__init__()\n", - " height, width, channels = shape\n", - " self.classifier = torch.nn.Linear(height * width * channels, num_classes)\n", + " def __init__(self, shape, num_classes):\n", + " super(LinearClassifier, self).__init__()\n", + " height, width, channels = shape\n", + " self.classifier = torch.nn.Linear(height * width * channels, num_classes)\n", + "\n", + " def forward(self, image):\n", + " image = image.view(image.size()[0], -1).to(torch.float32)\n", + " return self.classifier(image)\n", "\n", - " def forward(self, image):\n", - " image = image.view(image.size()[0], -1).to(torch.float32)\n", - " return self.classifier(image)\n", "\n", "shape = train[0][\"image\"].shape\n", "num_classes = 10\n", @@ -528,29 +529,29 @@ "optimizer = torch.optim.Adam(model.parameters())\n", "loss_function = torch.nn.CrossEntropyLoss()\n", "\n", - "print('Training...')\n", + "print(\"Training...\")\n", "model.train()\n", "for example in tqdm(train_loader):\n", - " image = example['image']\n", - " label = example['label']\n", - " prediction = model(image)\n", - " loss = loss_function(prediction, label)\n", - " optimizer.zero_grad()\n", - " 
loss.backward()\n", - " optimizer.step()\n", + " image = example[\"image\"]\n", + " label = example[\"label\"]\n", + " prediction = model(image)\n", + " loss = loss_function(prediction, label)\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", "\n", - "print('Testing...')\n", + "print(\"Testing...\")\n", "model.eval()\n", "num_examples = 0\n", "true_positives = 0\n", "for example in tqdm(test_loader):\n", - " image = example['image']\n", - " label = example['label']\n", - " prediction = model(image)\n", - " num_examples += image.shape[0]\n", - " predicted_label = prediction.argmax(dim=1)\n", - " true_positives += (predicted_label == label).sum().item()\n", - "print(f'\\nAccuracy: {true_positives/num_examples * 100:.2f}%')" + " image = example[\"image\"]\n", + " label = example[\"label\"]\n", + " prediction = model(image)\n", + " num_examples += image.shape[0]\n", + " predicted_label = prediction.argmax(dim=1)\n", + " true_positives += (predicted_label == label).sum().item()\n", + "print(f\"\\nAccuracy: {true_positives/num_examples * 100:.2f}%\")" ] }, {