diff --git a/benchmark.py b/benchmark.py
index 3d71e22..4f1cb53 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -25,6 +25,8 @@ from pdf_benchmark.data_structures import Cache, Document, Library
 
 from pdf_benchmark.library_code import (
     borb_get_text,
+    pdf_oxide_get_text,
+    pdf_oxide_image_extraction,
     pdfium_get_text,
     pdfminer_image_extraction,
     pdfplubmer_get_text,
@@ -228,6 +230,17 @@ def write_single_result(
         #     license="AGPL/Commercial",
         #     last_release_date="2023-06-23",
         # ),
+        "pdf_oxide": Library(
+            "pdf_oxide",
+            "pdf_oxide",
+            "https://pypi.org/project/pdf-oxide/",
+            text_extraction_function=pdf_oxide_get_text,
+            version="0.3.6",
+            image_extraction_function=pdf_oxide_image_extraction,
+            license="MIT OR Apache-2.0",
+            last_release_date="2026-02-16",
+            dependencies="Rust core via PyO3",
+        ),
         "pdfium": Library(
             "pypdfium2",
             "pdfium",
diff --git a/pdf_benchmark/library_code.py b/pdf_benchmark/library_code.py
index 32938c2..80badbe 100644
--- a/pdf_benchmark/library_code.py
+++ b/pdf_benchmark/library_code.py
@@ -225,6 +225,56 @@ def pdfrw_watermarking(watermark_data: bytes, data: bytes) -> bytes:
     return out_buffer.read()
 
 
+def pdf_oxide_get_text(data: bytes) -> str:
+    """Extract the text of every page with pdf_oxide.
+
+    The PDF bytes are written to a temporary file and the document is opened
+    from that path. The text of each page is followed by a newline.
+    """
+    import pdf_oxide
+
+    fd, filename = tempfile.mkstemp(suffix=".pdf")
+    try:
+        # Write through the descriptor mkstemp already opened; closing the
+        # file object also closes the descriptor.
+        with os.fdopen(fd, "wb") as fp:
+            fp.write(data)
+        doc = pdf_oxide.PdfDocument(filename)
+        # NOTE(review): assumes page_count is an attribute (not a method) in
+        # the pinned pdf_oxide release -- confirm.
+        return "".join(doc.extract_text(i) + "\n" for i in range(doc.page_count))
+    finally:
+        os.remove(filename)
+
+
+def pdf_oxide_image_extraction(data: bytes) -> list[tuple[str, bytes]]:
+    """Extract the images of every page with pdf_oxide.
+
+    Returns ``(name, data)`` tuples named
+    ``page-<page>-image-<index>.<format>`` with 1-based page and image
+    numbers. This is best effort: on any error the failure is printed and
+    the images collected so far are returned.
+    """
+    import pdf_oxide
+
+    images = []
+    fd, filename = tempfile.mkstemp(suffix=".pdf")
+    try:
+        with os.fdopen(fd, "wb") as fp:
+            fp.write(data)
+        doc = pdf_oxide.PdfDocument(filename)
+        for page_index in range(doc.page_count):
+            page_images = doc.extract_images(page_index)
+            for img_index, img in enumerate(page_images, start=1):
+                name = f"page-{page_index + 1}-image-{img_index}.{img.format}"
+                images.append((name, img.data))
+    except Exception as exc:
+        print(f"pdf_oxide Image extraction failure: {exc}")
+    finally:
+        os.remove(filename)
+    return images
+
+
 def tika_get_text(data: bytes) -> str:
     from tika import parser
 
diff --git a/requirements/main.in b/requirements/main.in
index d277654..db76fd9 100644
--- a/requirements/main.in
+++ b/requirements/main.in
@@ -1,5 +1,6 @@
 borb
 numpy
+pdf-oxide
 pdfminer.six
 pdfplumber
 pypdf