py-pdf · papametis · Jan 19, 2026 · Jan 19, 2026 · Jan 20, 2026 · Jan 20, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,7 +7,10 @@
 
 ### New Features (ENH)
 - `pagemeta` now displays the name of a known page format that is close to the page dimensions
+- `extract-text` now has an `--output-pattern` option: the full output can be stored in a single file, or each page's text can be stored in a separate file whose name is derived from the output pattern and the page index
 
+### Testing (TST)
+- Added unit tests for `extract-text`
 
 ## Version 0.5.1, 2025-10-13
 
@@ -18,6 +21,7 @@
 ### Bug Fixes (BUG)
 - `requests` is now a dependency, to prevent a `ModuleNotFoundError` when running with `uv`
 
+### Testing (TST)
 
 ## Version 0.5.0, 2025-10-13
 

diff --git a/docs/user/subcommand-extract-text.md b/docs/user/subcommand-extract-text.md
@@ -9,23 +9,36 @@ $ pdfly extract-text --help
 
  Extract text from a PDF file.
 
-
-╭─ Arguments ──────────────────────────────────────────────────────────────────╮
-│ *    pdf      FILE  [default: None] [required]                               │
-╰──────────────────────────────────────────────────────────────────────────────╯
-╭─ Options ────────────────────────────────────────────────────────────────────╮
-│ --help          Show this message and exit.                                  │
-╰──────────────────────────────────────────────────────────────────────────────╯
+ Offers an option to store the whole output in a single file, or each page's text in a different file,
+ allowing custom naming patterns for the output files.
+
+┌─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────┐
+│ *    pdf      FILE  [required]                                                                                        │
+└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+┌─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
+│ --output-pattern  -o      TEXT  Naming pattern for output files. If none is entered, output is echoed. If it contains │
+│                                 "[]" substrings, each page's text is output in a different file and the "[]"          │
+│                                 substrings in the filename are replaced by the page's index. If there are no "[]"     │
+│                                 substrings, the output is stored in one file.                                         │
+│ --help                          Show this message and exit.                                                           │
+└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
 
 ```
 
 ## Examples
 
-Extract the text from the 10th page of `document.pdf`, redirecting the output into `page.txt`.
+Extract the text from the 10th page of `document.pdf`.
 
 ```
 pdfly cat document.pdf 9 -o page.pdf
 
 pdfly extract-text page.pdf
 
 ```
+
+Extract the text from `document.pdf` and store each page's text in a file called `page-X.txt`, where X is the zero-based index of the page.
+
+```
+pdfly extract-text document.pdf -o page-[].txt
+
+```
diff --git a/pdfly/cli.py b/pdfly/cli.py
@@ -228,13 +228,36 @@ def extract_text(
             resolve_path=True,
         ),
     ],
+    output_pattern: Annotated[
+        str | None,
+        typer.Option(
+            "--output-pattern",
+            "-o",
+            dir_okay=False,
+            writable=True,
+            help="""Naming pattern for output files. If none is entered, output is echoed.
+If it contains "[]" substrings, each page's text is output in a different file
+and the "[]" substrings in the filename are replaced by the page's index.
+If there are no "[]" substrings, the output is stored in one file.""",
+        ),
+    ] = None,
 ) -> None:
-    """Extract text from a PDF file."""
+    """Extract text from a PDF file.
+
+    Offers an option to store the whole output in a single file, or each page's text in a different file,
+    allowing custom naming patterns for the output files."""
     from pypdf import PdfReader
 
     reader = PdfReader(str(pdf))
-    for page in reader.pages:
-        typer.echo(page.extract_text())
+
+    if not output_pattern:
+        for page in reader.pages:
+            typer.echo(page.extract_text())
+    else:
+        for page in reader.pages:
+            filename = output_pattern.replace("[]", str(page.page_number))
+            with open(filename, "a") as file:
+                file.write(page.extract_text() + "\n")
 
 
 @entry_point.command(name="meta", help=pdfly.metadata.__doc__)  # type: ignore[misc]

diff --git a/tests/test_extract_text.py b/tests/test_extract_text.py
@@ -0,0 +1,57 @@
+from pathlib import Path
+
+import pytest
+
+from .conftest import RESOURCES_ROOT, chdir, run_cli
+
+
+def test_extract_text_echo(  # without -o, extract-text should echo the text to stdout
+    capsys: pytest.CaptureFixture, tmp_path: Path
+) -> None:
+    with chdir(tmp_path):  # chdir comes from conftest; presumably runs the CLI from tmp_path
+        run_cli(
+            [
+                "extract-text",
+                str(RESOURCES_ROOT / "input8.pdf"),
+            ]
+        )
+    captured = capsys.readouterr()
+    assert not captured.err  # nothing is expected on stderr
+    assert (  # the texts "1" through "8" must appear in order, one per line
+        """1
+2
+3
+4
+5
+6
+7
+8"""
+        in captured.out
+    )
+
+
+def test_extract_text_with_pattern(
+    capsys: pytest.CaptureFixture, tmp_path: Path
+) -> None:
+    """Check that ``-o page-[].txt`` writes each page's text to its own file.
+
+    Output files are named after the zero-based page index, while page N of
+    the ``input8.pdf`` fixture is expected to contain the text ``N``.
+    """
+    with chdir(tmp_path):
+        run_cli(
+            [
+                "extract-text",
+                str(RESOURCES_ROOT / "input8.pdf"),
+                "-o",
+                "page-[].txt",
+            ]
+        )
+    captured = capsys.readouterr()
+    assert not captured.err
+    for i in range(8):
+        page_file = tmp_path / f"page-{i}.txt"
+        # Assert per page so a failure names the offending file instead of
+        # being hidden behind a bare ``except`` and a summary flag.
+        assert page_file.is_file(), f"{page_file.name} was not created"
+        assert page_file.read_text() == f"{i + 1}\n"