diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py
index c91345daa..72d8a6a3a 100644
--- a/libs/community/langchain_community/document_loaders/__init__.py
+++ b/libs/community/langchain_community/document_loaders/__init__.py
@@ -532,6 +532,9 @@
     from langchain_community.document_loaders.yuque import (
         YuqueLoader,
     )
+    from langchain_community.document_loaders.ocr_pdf import (
+        OCRPDFLoader,
+    )
 
 
 _module_lookup = {
@@ -732,6 +735,7 @@
     "YoutubeAudioLoader": "langchain_community.document_loaders.blob_loaders",
     "YoutubeLoader": "langchain_community.document_loaders.youtube",
     "YuqueLoader": "langchain_community.document_loaders.yuque",
+    "OCRPDFLoader": "langchain_community.document_loaders.ocr_pdf",
 }
 
 
@@ -940,4 +944,5 @@ def __getattr__(name: str) -> Any:
     "YoutubeAudioLoader",
     "YoutubeLoader",
     "YuqueLoader",
+    "OCRPDFLoader",
 ]
diff --git a/libs/community/langchain_community/document_loaders/ocr_pdf.py b/libs/community/langchain_community/document_loaders/ocr_pdf.py
new file mode 100644
index 000000000..7a7be4918
--- /dev/null
+++ b/libs/community/langchain_community/document_loaders/ocr_pdf.py
@@ -0,0 +1,190 @@
+"""Loader for extracting text from scanned PDFs using OCR."""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import Any, Dict, Iterator, List, Optional
+
+from langchain_core.documents import Document
+
+from langchain_community.document_loaders.base import BaseLoader
+
+logger = logging.getLogger(__name__)
+
+
+class OCRPDFLoader(BaseLoader):
+    """Load scanned PDF files using OCR (Optical Character Recognition).
+
+    This loader converts PDF pages to images and applies Tesseract OCR
+    to extract text from scanned documents. One ``Document`` is produced
+    for every page on which OCR finds non-empty text.
+
+    Note:
+        pdf2image rasterises every selected page before OCR starts, so
+        memory use grows with the page count and ``dpi``. Use ``first_page``
+        and ``last_page`` to bound the work on large files.
+
+    Setup:
+        Install required packages:
+        ```bash
+        pip install pdf2image pytesseract
+        ```
+
+        Install system dependencies:
+        - **Linux**: `sudo apt-get install poppler-utils tesseract-ocr`
+        - **macOS**: `brew install poppler tesseract`
+        - **Windows**: Download and install Poppler and Tesseract, add to PATH
+
+    Example:
+        ```python
+        from langchain_community.document_loaders import OCRPDFLoader
+
+        loader = OCRPDFLoader("scanned_document.pdf")
+        documents = loader.load()
+
+        # Access extracted text and metadata
+        for doc in documents:
+            print(f"Page {doc.metadata['page']}: {doc.page_content[:100]}...")
+        ```
+    """
+
+    def __init__(
+        self,
+        file_path: str | Path,
+        *,
+        tesseract_config: str = "",
+        poppler_path: Optional[str] = None,
+        first_page: Optional[int] = None,
+        last_page: Optional[int] = None,
+        dpi: int = 200,
+        fmt: str = "JPEG",
+    ) -> None:
+        """Initialize the OCR PDF loader.
+
+        Args:
+            file_path: Path to the PDF file to load.
+            tesseract_config: Additional configuration options for Tesseract OCR.
+                Example: "--psm 6" for uniform text blocks.
+            poppler_path: Directory containing the Poppler binaries. Only
+                needed when Poppler is not on PATH (typically on Windows).
+            first_page: First page to process (1-indexed). If None, starts
+                from page 1.
+            last_page: Last page to process (1-indexed). If None, processes
+                through the final page.
+            dpi: Resolution for PDF to image conversion. Higher values improve
+                OCR accuracy but increase processing time.
+            fmt: Image format for conversion ("JPEG", "PNG", etc.).
+
+        Raises:
+            FileNotFoundError: If ``file_path`` is not an existing file.
+            ImportError: If required dependencies are not installed.
+        """
+        try:
+            import pdf2image  # noqa: F401
+            import pytesseract  # noqa: F401
+        except ImportError as e:
+            raise ImportError(
+                "OCRPDFLoader requires pdf2image and pytesseract. "
+                "Install with: pip install pdf2image pytesseract"
+            ) from e
+
+        self.file_path = Path(file_path)
+        # is_file() rather than exists(): a directory is not a loadable PDF.
+        if not self.file_path.is_file():
+            raise FileNotFoundError(f"PDF file not found: {self.file_path}")
+
+        self.tesseract_config = tesseract_config
+        self.poppler_path = poppler_path
+        self.first_page = first_page
+        self.last_page = last_page
+        self.dpi = dpi
+        self.fmt = fmt
+
+    def load(self) -> List[Document]:
+        """Load all pages and return as a list of Documents.
+
+        Returns:
+            List of Document objects, one per page with extracted text.
+        """
+        return list(self.lazy_load())
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Run OCR page by page, yielding a Document for each page with text.
+
+        Yields:
+            Document objects whose metadata holds ``source``, ``page``
+            (1-indexed page number in the PDF), ``total_pages`` (number of
+            pages converted in this run, i.e. the selected range), ``loader``
+            and ``ocr_engine``.
+
+        Raises:
+            ImportError: If pdf2image or pytesseract cannot be imported.
+            RuntimeError: If the PDF cannot be converted to images.
+            pytesseract.TesseractNotFoundError: If the Tesseract binary is
+                missing; every page would fail, so this is not swallowed.
+        """
+        try:
+            import pytesseract
+            from pdf2image import convert_from_path
+        except ImportError as e:
+            raise ImportError(
+                "Required dependencies not found. "
+                "Install with: pip install pdf2image pytesseract"
+            ) from e
+
+        conversion_kwargs: Dict[str, Any] = {
+            "pdf_path": self.file_path,
+            "dpi": self.dpi,
+            "fmt": self.fmt,
+        }
+        if self.poppler_path:
+            conversion_kwargs["poppler_path"] = self.poppler_path
+        if self.first_page:
+            conversion_kwargs["first_page"] = self.first_page
+        if self.last_page:
+            conversion_kwargs["last_page"] = self.last_page
+
+        try:
+            pages = convert_from_path(**conversion_kwargs)
+        except Exception as e:
+            raise RuntimeError(f"Failed to convert PDF to images: {e}") from e
+
+        total_pages = len(pages)
+        logger.info("Processing %d pages from %s", total_pages, self.file_path)
+
+        ocr_kwargs: Dict[str, Any] = {}
+        if self.tesseract_config:
+            ocr_kwargs["config"] = self.tesseract_config
+
+        # pdf2image treats a first_page below 1 as page 1; mirror that here so
+        # the reported page numbers match the pages actually converted.
+        start_page = max(self.first_page or 1, 1)
+
+        for page_number, page_image in enumerate(pages, start=start_page):
+            # Keep the try body to the OCR call alone so that only OCR
+            # failures are handled here.
+            try:
+                text = pytesseract.image_to_string(image=page_image, **ocr_kwargs)
+            except pytesseract.TesseractNotFoundError:
+                raise
+            except Exception as e:
+                # Best effort: one unreadable page should not abort the rest.
+                logger.error("OCR failed for page %d: %s", page_number, e)
+                continue
+
+            text = text.strip()
+            if not text:
+                logger.warning("No text extracted from page %d", page_number)
+                continue
+
+            yield Document(
+                page_content=text,
+                metadata={
+                    "source": str(self.file_path),
+                    "page": page_number,
+                    "total_pages": total_pages,
+                    "loader": "OCRPDFLoader",
+                    "ocr_engine": "tesseract",
+                },
+            )
diff --git a/libs/community/tests/unit_tests/document_loaders/test_ocr_pdf.py b/libs/community/tests/unit_tests/document_loaders/test_ocr_pdf.py
new file mode 100644
index 000000000..b2798c7fd
--- /dev/null
+++ b/libs/community/tests/unit_tests/document_loaders/test_ocr_pdf.py
@@ -0,0 +1,254 @@
+"""Tests for OCR PDF Loader."""
+
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+from langchain_core.documents import Document
+
+from langchain_community.document_loaders.ocr_pdf import OCRPDFLoader
+
+
+@pytest.fixture
+def pdf_file(tmp_path: Path) -> Path:
+    """Create a placeholder PDF; conversion is mocked so content is unused."""
+    path = tmp_path / "test.pdf"
+    path.write_bytes(b"fake pdf content")
+    return path
+
+
+def test_missing_dependencies(pdf_file: Path) -> None:
+    """Test error handling when dependencies are missing."""
+    # A None entry in sys.modules makes ``import <name>`` raise ImportError,
+    # so this test runs whether or not the optional packages are installed.
+    with patch.dict("sys.modules", {"pdf2image": None, "pytesseract": None}):
+        with pytest.raises(ImportError, match="OCRPDFLoader requires pdf2image"):
+            OCRPDFLoader(str(pdf_file))
+
+
+@pytest.mark.requires("pdf2image", "pytesseract")
+class TestOCRPDFLoader:
+    """Test suite for OCRPDFLoader."""
+
+    def test_initialization_with_valid_path(self, pdf_file: Path) -> None:
+        """Test loader initialization with valid file path."""
+        loader = OCRPDFLoader(file_path=str(pdf_file))
+
+        assert loader.file_path == pdf_file
+        assert loader.tesseract_config == ""
+        assert loader.dpi == 200
+        assert loader.fmt == "JPEG"
+
+    def test_initialization_with_custom_params(self, pdf_file: Path) -> None:
+        """Test loader initialization with custom parameters."""
+        loader = OCRPDFLoader(
+            file_path=str(pdf_file),
+            tesseract_config="--psm 6",
+            dpi=300,
+            fmt="PNG",
+            first_page=2,
+            last_page=5,
+        )
+
+        assert loader.tesseract_config == "--psm 6"
+        assert loader.dpi == 300
+        assert loader.fmt == "PNG"
+        assert loader.first_page == 2
+        assert loader.last_page == 5
+
+    def test_initialization_file_not_found(self) -> None:
+        """Test loader initialization with non-existent file."""
+        with pytest.raises(FileNotFoundError, match="PDF file not found"):
+            OCRPDFLoader("nonexistent.pdf")
+
+    def test_initialization_rejects_directory(self, tmp_path: Path) -> None:
+        """Test that a directory path is not accepted as a PDF file."""
+        with pytest.raises(FileNotFoundError, match="PDF file not found"):
+            OCRPDFLoader(tmp_path)
+
+    @patch("pdf2image.convert_from_path")
+    @patch("pytesseract.image_to_string")
+    def test_load_with_mocked_ocr(
+        self, mock_ocr: MagicMock, mock_convert: MagicMock, pdf_file: Path
+    ) -> None:
+        """Test load() method with mocked OCR results."""
+        mock_convert.return_value = [MagicMock(), MagicMock(), MagicMock()]
+        mock_ocr.side_effect = [
+            "Text from page 1",
+            "Text from page 2",
+            "",  # Empty text (should be skipped)
+        ]
+
+        documents = OCRPDFLoader(str(pdf_file)).load()
+
+        # Should only return 2 documents (skipping empty text)
+        assert len(documents) == 2
+
+        assert isinstance(documents[0], Document)
+        assert documents[0].page_content == "Text from page 1"
+        assert documents[0].metadata == {
+            "source": str(pdf_file),
+            "page": 1,
+            "total_pages": 3,
+            "loader": "OCRPDFLoader",
+            "ocr_engine": "tesseract",
+        }
+
+        assert documents[1].page_content == "Text from page 2"
+        assert documents[1].metadata["page"] == 2
+
+    @patch("pdf2image.convert_from_path")
+    @patch("pytesseract.image_to_string")
+    def test_lazy_load_with_mocked_ocr(
+        self, mock_ocr: MagicMock, mock_convert: MagicMock, pdf_file: Path
+    ) -> None:
+        """Test lazy_load() method with mocked OCR results."""
+        mock_convert.return_value = [MagicMock(), MagicMock()]
+        mock_ocr.side_effect = ["Page 1 content", "Page 2 content"]
+
+        documents = list(OCRPDFLoader(str(pdf_file)).lazy_load())
+
+        assert len(documents) == 2
+        assert all(isinstance(doc, Document) for doc in documents)
+
+    @patch("pdf2image.convert_from_path")
+    @patch("pytesseract.image_to_string")
+    def test_load_with_tesseract_config(
+        self, mock_ocr: MagicMock, mock_convert: MagicMock, pdf_file: Path
+    ) -> None:
+        """Test OCR with custom Tesseract configuration."""
+        mock_page = MagicMock()
+        mock_convert.return_value = [mock_page]
+        mock_ocr.return_value = "OCR result"
+
+        loader = OCRPDFLoader(str(pdf_file), tesseract_config="--psm 6")
+        list(loader.lazy_load())
+
+        mock_ocr.assert_called_once_with(image=mock_page, config="--psm 6")
+
+    @patch("pdf2image.convert_from_path")
+    @patch("pytesseract.image_to_string")
+    def test_load_with_page_range(
+        self, mock_ocr: MagicMock, mock_convert: MagicMock, pdf_file: Path
+    ) -> None:
+        """Test loading with specific page range."""
+        mock_convert.return_value = [MagicMock(), MagicMock()]
+        mock_ocr.side_effect = ["Page 3 content", "Page 4 content"]
+
+        loader = OCRPDFLoader(str(pdf_file), first_page=3, last_page=4)
+        documents = loader.load()
+
+        mock_convert.assert_called_once()
+        call_kwargs = mock_convert.call_args[1]
+        assert call_kwargs["first_page"] == 3
+        assert call_kwargs["last_page"] == 4
+
+        # Document metadata reflects PDF page numbers, not list positions
+        assert documents[0].metadata["page"] == 3
+        assert documents[1].metadata["page"] == 4
+
+    @patch("pdf2image.convert_from_path")
+    def test_conversion_error_handling(
+        self, mock_convert: MagicMock, pdf_file: Path
+    ) -> None:
+        """Test error handling during PDF to image conversion."""
+        mock_convert.side_effect = RuntimeError("Conversion failed")
+
+        loader = OCRPDFLoader(str(pdf_file))
+
+        with pytest.raises(RuntimeError, match="Failed to convert PDF to images"):
+            list(loader.lazy_load())
+
+    @patch("pdf2image.convert_from_path")
+    @patch("pytesseract.image_to_string")
+    def test_ocr_error_handling(
+        self,
+        mock_ocr: MagicMock,
+        mock_convert: MagicMock,
+        pdf_file: Path,
+        caplog: pytest.LogCaptureFixture,
+    ) -> None:
+        """Test error handling during OCR processing."""
+        mock_convert.return_value = [MagicMock(), MagicMock()]
+        mock_ocr.side_effect = ["Successful OCR", RuntimeError("OCR failed")]
+
+        documents = OCRPDFLoader(str(pdf_file)).load()
+
+        # Should return only the successful document
+        assert len(documents) == 1
+        assert documents[0].page_content == "Successful OCR"
+
+        # Should log error for failed page
+        assert "OCR failed for page 2" in caplog.text
+
+    @patch("pdf2image.convert_from_path")
+    @patch("pytesseract.image_to_string")
+    def test_tesseract_not_found_propagates(
+        self, mock_ocr: MagicMock, mock_convert: MagicMock, pdf_file: Path
+    ) -> None:
+        """Test that a missing Tesseract binary is raised, not logged per page."""
+        import pytesseract
+
+        mock_convert.return_value = [MagicMock()]
+        mock_ocr.side_effect = pytesseract.TesseractNotFoundError()
+
+        with pytest.raises(pytesseract.TesseractNotFoundError):
+            OCRPDFLoader(str(pdf_file)).load()
+
+    @patch("pdf2image.convert_from_path")
+    @patch("pytesseract.image_to_string")
+    def test_empty_text_filtering(
+        self,
+        mock_ocr: MagicMock,
+        mock_convert: MagicMock,
+        pdf_file: Path,
+        caplog: pytest.LogCaptureFixture,
+    ) -> None:
+        """Test that pages with empty OCR results are filtered out."""
+        mock_convert.return_value = [MagicMock(), MagicMock()]
+        mock_ocr.side_effect = ["Good content", " \n\t "]  # Second is whitespace
+
+        documents = OCRPDFLoader(str(pdf_file)).load()
+
+        # Should only return document with actual content
+        assert len(documents) == 1
+        assert documents[0].page_content == "Good content"
+
+        # Should log warning for empty page
+        assert "No text extracted from page 2" in caplog.text
+
+    def test_pathlib_path_support(self, pdf_file: Path) -> None:
+        """Test that loader accepts pathlib.Path objects."""
+        loader = OCRPDFLoader(pdf_file)
+        assert loader.file_path == pdf_file
+
+    @patch("pdf2image.convert_from_path")
+    @patch("pytesseract.image_to_string")
+    def test_poppler_path_parameter(
+        self, mock_ocr: MagicMock, mock_convert: MagicMock, pdf_file: Path
+    ) -> None:
+        """Test that poppler_path is passed to convert_from_path."""
+        mock_convert.return_value = [MagicMock()]
+        mock_ocr.return_value = "Test content"
+
+        loader = OCRPDFLoader(str(pdf_file), poppler_path="/custom/poppler/path")
+        list(loader.lazy_load())
+
+        call_kwargs = mock_convert.call_args[1]
+        assert call_kwargs["poppler_path"] == "/custom/poppler/path"
+
+    @patch("pdf2image.convert_from_path")
+    @patch("pytesseract.image_to_string")
+    def test_dpi_and_format_parameters(
+        self, mock_ocr: MagicMock, mock_convert: MagicMock, pdf_file: Path
+    ) -> None:
+        """Test that DPI and format parameters are passed correctly."""
+        mock_convert.return_value = [MagicMock()]
+        mock_ocr.return_value = "Test content"
+
+        loader = OCRPDFLoader(str(pdf_file), dpi=300, fmt="PNG")
+        list(loader.lazy_load())
+
+        call_kwargs = mock_convert.call_args[1]
+        assert call_kwargs["dpi"] == 300
+        assert call_kwargs["fmt"] == "PNG"