From 1b4d1d304f850094677ec12fb4661a6fb9bcad08 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Wed, 27 Aug 2025 17:32:37 +0300 Subject: [PATCH 1/2] Fix memory issue in excel_parser --- .../file_based/file_types/excel_parser.py | 27 ++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_types/excel_parser.py b/airbyte_cdk/sources/file_based/file_types/excel_parser.py index 5a0332171..b1acd356d 100644 --- a/airbyte_cdk/sources/file_based/file_types/excel_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/excel_parser.py @@ -5,7 +5,11 @@ import logging from io import IOBase from pathlib import Path +import sys +import tempfile +import io from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union +from collections import deque import orjson import pandas as pd @@ -31,6 +35,23 @@ from airbyte_cdk.sources.file_based.schema_helpers import SchemaType + +def iter_records_via_tempfile(df: pd.DataFrame): + """ + Stream records using Pandas' to_json (so datetime strings match exactly), + without building a giant string in RAM. + + - Writes NDJSON to a temporary file (text-wrapped over a binary file) + - Reads back line-by-line and yields parsed dicts + """ + with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8", delete=True) as f: + df.to_json(f, orient="records", lines=True, date_format="iso", date_unit="us") + f.seek(0) + for line in f: # line is str + if line.strip(): + yield orjson.loads(line) + + class ExcelParser(FileTypeParser): ENCODING = None @@ -118,9 +139,9 @@ def parse_records( # DataFrame.to_dict() method returns datetime values in pandas.Timestamp values, which are not serializable by orjson # DataFrame.to_json() returns string with datetime values serialized to iso8601 with microseconds to align with pydantic behavior # see PR description: https://github.com/airbytehq/airbyte/pull/44444/ - yield from orjson.loads( - df.to_json(orient="records", date_format="iso", date_unit="us") - ) + for index, row in df.iterrows(): + # Convert each row (as a Series) to a JSON string + yield orjson.loads(row.to_json(date_format="iso", date_unit="us")) except Exception as exc: # Raise a RecordParseError if any exception occurs during parsing From 7424cc67fa840280604e30535e17b43b2a189528 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Wed, 27 Aug 2025 17:34:52 +0300 Subject: [PATCH 2/2] Delete unused code --- .../file_based/file_types/excel_parser.py | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_types/excel_parser.py b/airbyte_cdk/sources/file_based/file_types/excel_parser.py index b1acd356d..9026799ae 100644 --- a/airbyte_cdk/sources/file_based/file_types/excel_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/excel_parser.py @@ -5,11 +5,7 @@ import logging from io import IOBase from pathlib import Path -import sys -import tempfile -import io from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union -from collections import deque import orjson import pandas as pd @@ -35,23 +31,6 @@ from airbyte_cdk.sources.file_based.schema_helpers import SchemaType - -def iter_records_via_tempfile(df: pd.DataFrame): - """ - Stream records using Pandas' to_json (so datetime strings match exactly), - without building a giant string in RAM. - - - Writes NDJSON to a temporary file (text-wrapped over a binary file) - - Reads back line-by-line and yields parsed dicts - """ - with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8", delete=True) as f: - df.to_json(f, orient="records", lines=True, date_format="iso", date_unit="us") - f.seek(0) - for line in f: # line is str - if line.strip(): - yield orjson.loads(line) - - class ExcelParser(FileTypeParser): ENCODING = None