def _move_converted_pdf(input_path, output_path) -> None:
    """Move the PDF a converter wrote as ``<input stem>.pdf`` onto ``output_path``.

    The directory-based converters below are expected to write their result
    next to ``output_path`` using the input file's stem. Both paths are
    normalised to ``Path`` before comparing, so a ``str`` and a ``Path`` that
    name the same file are recognised as equal and no rename is attempted.
    """
    target = Path(output_path)
    produced = target.parent / (Path(input_path).stem + ".pdf")
    if produced != target:
        os.replace(produced, target)


def get_docx_to_pdf_converter() -> Optional[Callable]:
    """Detect an available DOCX to PDF tool and return a conversion function.

    Tools are probed in this order: LibreOffice (``libreoffice`` or
    ``soffice`` on PATH), the ``docx2pdf`` package, then ``pypandoc`` together
    with a ``pandoc`` binary on PATH.

    Returns:
        A callable accepting ``(input_path, output_path)`` (``str`` or
        path-like) that writes the PDF to ``output_path``, or ``None`` if no
        tool is available.
    """
    # Try LibreOffice
    libreoffice_cmd = shutil.which("libreoffice") or shutil.which("soffice")
    if libreoffice_cmd:

        def convert_with_libreoffice(input_path, output_path):
            # Path("x.pdf").parent is ".", so a bare filename never turns
            # into an empty --outdir argument.
            out_dir = Path(output_path).parent
            subprocess.run(
                [
                    libreoffice_cmd,
                    "--headless",
                    "--convert-to",
                    "pdf",
                    "--outdir",
                    str(out_dir),
                    os.fspath(input_path),
                ],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True,
            )
            _move_converted_pdf(input_path, output_path)

        return convert_with_libreoffice

    # Try docx2pdf (MS Word required)
    try:
        import docx2pdf  # type: ignore  # noqa: F401  (availability probe)
    except ImportError:
        pass
    else:

        def convert_with_docx2pdf(input_path, output_path):
            from docx2pdf import convert  # type: ignore

            convert(os.fspath(input_path), str(Path(output_path).parent))
            _move_converted_pdf(input_path, output_path)

        return convert_with_docx2pdf

    # Try Pandoc
    try:
        import pypandoc  # type: ignore
    except ImportError:
        pass
    else:
        if shutil.which("pandoc"):

            def convert_with_pandoc(input_path, output_path):
                pypandoc.convert_file(
                    os.fspath(input_path),
                    "pdf",
                    outputfile=os.fspath(output_path),
                )

            return convert_with_pandoc

    # No tools found
    return None


def crop_whitespace(
    image: "Image.Image", bg_color=None, padding: int = 0
) -> "Image.Image":
    """Crop the uniform background border around the content of ``image``.

    Args:
        image: Image to crop.
        bg_color: Colour treated as background. Defaults to the colour of the
            top-left pixel.
        padding: Number of pixels to keep around the detected content,
            clamped to the image bounds.

    Returns:
        The cropped image, or ``image`` itself when every pixel equals the
        background colour.
    """
    if bg_color is None:
        bg_color = image.getpixel((0, 0))

    background = Image.new(image.mode, image.size, bg_color)
    bbox = ImageChops.difference(image, background).getbbox()
    if not bbox:
        return image

    left, upper, right, lower = bbox
    return image.crop(
        (
            max(0, left - padding),
            max(0, upper - padding),
            min(image.width, right + padding),
            min(image.height, lower + padding),
        )
    )


def get_pil_from_dml_docx(
    docx: "Document", converter: Optional[Callable]
) -> "Optional[Image.Image]":
    """Render the first page of ``docx`` to a whitespace-cropped PIL image.

    The document is saved to a temporary directory, converted to PDF with
    ``converter`` and the first PDF page is rasterised at ``scale=2``.

    Args:
        docx: Document to render; only its ``save`` method is used here.
        converter: Callable ``(input_path, output_path)`` as returned by
            ``get_docx_to_pdf_converter``, or ``None``.

    Returns:
        The cropped image, or ``None`` when ``converter`` is ``None`` or the
        produced PDF has no pages.

    Raises:
        Whatever ``docx.save``, ``converter`` or the PDF rendering raise. The
        temporary directory and the PDF handles are released in every case.
    """
    if converter is None:
        return None

    temp_dir = Path(mkdtemp())
    try:
        temp_docx = temp_dir / "drawing_only.docx"
        temp_pdf = temp_dir / "drawing_only.pdf"

        # 1) Save docx temporarily
        docx.save(str(temp_docx))

        # 2) Export to PDF
        converter(temp_docx, temp_pdf)

        # 3) Load PDF as PNG
        pdf = pypdfium2.PdfDocument(temp_pdf)
        try:
            if len(pdf) == 0:
                return None
            page = pdf[0]
            try:
                return crop_whitespace(page.render(scale=2).to_pil())
            finally:
                page.close()
        finally:
            pdf.close()
    finally:
        # Runs on success and on failure, so a failing converter no longer
        # leaves the temporary directory behind.
        shutil.rmtree(temp_dir, ignore_errors=True)
@staticmethod
def load_msword_file(
    path_or_stream: Union[BytesIO, Path], document_hash: str
) -> "Optional[DocxDocument]":
    """Open a Word document from an in-memory stream or a filesystem path.

    Args:
        path_or_stream: ``BytesIO`` holding the file contents, or a ``Path``
            to the file.
        document_hash: Identifier used only in the error message.

    Returns:
        The loaded document, or ``None`` when ``path_or_stream`` is neither a
        ``BytesIO`` nor a ``Path``.

    Raises:
        RuntimeError: If the document cannot be loaded; the original
            exception is chained as the cause.
    """
    try:
        if isinstance(path_or_stream, BytesIO):
            return Document(path_or_stream)
        if isinstance(path_or_stream, Path):
            return Document(str(path_or_stream))
    except Exception as e:
        raise RuntimeError(
            f"MsWordDocumentBackend could not load document with hash {document_hash}"
        ) from e

    # Unsupported source type: callers must handle the missing document.
    return None
def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any):
    """Render DrawingML elements to an image and add it to ``doc`` as a picture.

    The source document is reloaded and its body emptied, the given drawing
    elements are copied into a single new run, and that document is exported
    DOCX->PDF->PNG through ``self.docx_to_pdf_converter``. If no image can be
    produced, a picture without image data is added instead, so the drawing
    still has an entry in the output.

    Args:
        doc: Document that receives the picture.
        drawingml_els: Iterable of ``w:drawing`` elements to render together.
    """
    # 1) Make an empty copy of the original document
    dml_doc = self.load_msword_file(self.path_or_stream, self.document_hash)
    if dml_doc is not None:
        body = dml_doc._element.body
        for child in list(body):
            body.remove(child)

        # 2) Add DrawingML to empty document
        new_run = dml_doc.add_paragraph().add_run()
        for dml in drawingml_els:
            # deepcopy: appending the original node would detach it from the
            # tree it currently belongs to.
            new_run._r.append(deepcopy(dml))

    # 3) Export DOCX->PDF->PNG and save it in DoclingDocument
    level = self._get_level()
    parent = self.parents[level - 1]

    failure: Optional[str] = None
    if dml_doc is None:
        failure = "the source document could not be reloaded"
    else:
        try:
            pil_image = get_pil_from_dml_docx(
                dml_doc, converter=self.docx_to_pdf_converter
            )
            if pil_image is None:
                failure = "the converter produced no image"
            else:
                doc.add_picture(
                    parent=parent,
                    image=ImageRef.from_pil(image=pil_image, dpi=72),
                    caption=None,
                )
        except Exception as exc:
            # Rendering is best-effort and depends on external tools. A failed
            # export (e.g. a non-zero LibreOffice exit status or a PDF
            # rendering error) must not abort the whole document conversion.
            failure = f"{type(exc).__name__}: {exc}"

    if failure is not None:
        _log.warning(
            "DrawingML image could not be rendered (%s); "
            "adding an empty picture placeholder",
            failure,
        )
        doc.add_picture(
            parent=parent,
            caption=None,
        )