py-pdf · yfedoseev · Feb 16, 2026
diff --git a/benchmark.py b/benchmark.py
@@ -25,6 +25,8 @@
 from pdf_benchmark.data_structures import Cache, Document, Library
 from pdf_benchmark.library_code import (
     borb_get_text,
+    pdf_oxide_get_text,
+    pdf_oxide_image_extraction,
     pdfium_get_text,
     pdfminer_image_extraction,
     pdfplubmer_get_text,
@@ -228,6 +230,17 @@ def write_single_result(
         #     license="AGPL/Commercial",
         #     last_release_date="2023-06-23",
         # ),
+        "pdf_oxide": Library(
+            "pdf_oxide",
+            "pdf_oxide",
+            "https://pypi.org/project/pdf-oxide/",
+            text_extraction_function=pdf_oxide_get_text,
+            version="0.3.6",
+            image_extraction_function=pdf_oxide_image_extraction,
+            license="MIT OR Apache-2.0",
+            last_release_date="2026-02-16",
+            dependencies="Rust core via PyO3",
+        ),
         "pdfium": Library(
             "pypdfium2",
             "pdfium",

diff --git a/pdf_benchmark/library_code.py b/pdf_benchmark/library_code.py
@@ -225,6 +225,41 @@ def pdfrw_watermarking(watermark_data: bytes, data: bytes) -> bytes:
     return out_buffer.read()
 
 
+def pdf_oxide_get_text(data: bytes) -> str:
+    """Return the text of every PDF page, each page followed by a newline."""
+    import pdf_oxide
+    fd, path = tempfile.mkstemp(suffix=".pdf")  # PdfDocument is given a path
+    try:
+        with open(path, "wb") as fp:
+            fp.write(data)
+        doc = pdf_oxide.PdfDocument(path)
+        # Join once instead of growing a str page by page (quadratic copying).
+        pages = [doc.extract_text(i) + "\n" for i in range(doc.page_count)]
+    finally:
+        os.close(fd)
+        os.remove(path)
+    return "".join(pages)
+
+
+def pdf_oxide_image_extraction(data: bytes) -> list[tuple[str, bytes]]:
+    import pdf_oxide
+    fd, pdf_path = tempfile.mkstemp(suffix=".pdf")  # PdfDocument is given a path
+    found = []  # (file name, image payload) pairs; kept even after a failure
+    try:
+        with open(pdf_path, "wb") as sink:
+            sink.write(data)
+        doc = pdf_oxide.PdfDocument(pdf_path)
+        for i in range(doc.page_count):  # i is 0-based; names count from 1
+            for img_index, img in enumerate(doc.extract_images(i), start=1):
+                found.append((f"page-{i+1}-image-{img_index}.{img.format}", img.data))
+    except Exception as exc:  # best effort: report, then fall through to return
+        print(f"pdf_oxide Image extraction failure: {exc}")
+    finally:
+        os.close(fd)  # descriptor returned by mkstemp; the file was written by path
+        os.remove(pdf_path)
+    return found
+
+
 def tika_get_text(data: bytes) -> str:
     from tika import parser
 

diff --git a/requirements/main.in b/requirements/main.in
@@ -1,5 +1,6 @@
 borb
 numpy
+pdf-oxide
 pdfminer.six
 pdfplumber
 pypdf