Skip to content
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@

### New Features (ENH)
- `pagemeta` now displays the name of a known page format that is close to the page dimensions
- `extract-text` now has an `--output-pattern` option, which stores either the full output in a single file or each page's text in a separate file whose name is built from the output pattern and the page index

### Testing (TST)
- Added unit tests for `extract-text`

## Version 0.5.1, 2025-10-13

Expand All @@ -18,6 +21,7 @@
### Bug Fixes (BUG)
- `requests` is now a dependency, to prevent a `ModuleNotFoundError` when running with `uv`

### Testing (TST)

## Version 0.5.0, 2025-10-13

Expand Down
29 changes: 21 additions & 8 deletions docs/user/subcommand-extract-text.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,36 @@ $ pdfly extract-text --help

Extract text from a PDF file.


╭─ Arguments ──────────────────────────────────────────────────────────────────╮
│ * pdf FILE [default: None] [required] │
╰──────────────────────────────────────────────────────────────────────────────╯
╭─ Options ────────────────────────────────────────────────────────────────────╮
│ --help Show this message and exit. │
╰──────────────────────────────────────────────────────────────────────────────╯
Offers an option to store the whole output in a single file, or each page's text in a different file,
allowing custom naming patterns for the output files.

┌─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ * pdf FILE [required] │
└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
┌─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ --output-pattern -o TEXT Naming pattern for output files. If none is entered, output is echoed. If it contains │
│ "[]" substrings, each page's text is output in a different file and the "[]" │
│ substrings in the filename are replaced by the page's index. If there are no "[]" │
│ substrings, the output is stored in one file. │
│ --help Show this message and exit. │
└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘

```

## Examples

Extract the text from the 10th page of `document.pdf`, redirecting the output into `page.txt`.
Extract the text from the 10th page of `document.pdf`.

```
pdfly cat document.pdf 9 -o page.pdf

pdfly extract-text page.pdf

```

Extract the text from `document.pdf` and store each page's text in a file called `page-X.txt`, where X is the zero-based index of the page (the first page is written to `page-0.txt`).

```
pdfly extract-text document.pdf -o page-[].txt

```
29 changes: 26 additions & 3 deletions pdfly/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,13 +228,36 @@ def extract_text(
resolve_path=True,
),
],
output_pattern: Annotated[
str | None,
typer.Option(
"--output-pattern",
"-o",
dir_okay=False,
writable=True,
help="""Naming pattern for output files. If none is entered, output is echoed.
If it contains "[]" substrings, each page's text is output in a different file
and the "[]" substrings in the filename are replaced by the page's index.
If there are no "[]" substrings, the output is stored in one file.""",
),
] = None,
) -> None:
"""Extract text from a PDF file."""
"""Extract text from a PDF file.

Offers an option to store the whole output in a single file, or each page's text in a different file,
allowing custom naming patterns for the output files."""
from pypdf import PdfReader

reader = PdfReader(str(pdf))
for page in reader.pages:
typer.echo(page.extract_text())

if not output_pattern:
for page in reader.pages:
typer.echo(page.extract_text())
else:
for page in reader.pages:
filename = output_pattern.replace("[]", str(page.page_number))
with open(filename, "a") as file:
file.write(page.extract_text() + "\n")


@entry_point.command(name="meta", help=pdfly.metadata.__doc__) # type: ignore[misc]
Expand Down
57 changes: 57 additions & 0 deletions tests/test_extract_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from pathlib import Path

import pytest

from .conftest import RESOURCES_ROOT, chdir, run_cli


def test_extract_text_echo(
    capsys: pytest.CaptureFixture, tmp_path: Path
) -> None:
    """Check that `extract-text` without `-o` echoes the extracted text.

    Runs the command on `input8.pdf` from inside a temporary directory and
    asserts that nothing is written to stderr and that stdout contains the
    digits 1 through 8 on consecutive lines.
    """
    pdf_argument = str(RESOURCES_ROOT / "input8.pdf")
    # One digit per line, with no trailing newline after the last one.
    expected_text = "1\n2\n3\n4\n5\n6\n7\n8"

    with chdir(tmp_path):
        run_cli(["extract-text", pdf_argument])

    streams = capsys.readouterr()
    assert not streams.err
    assert expected_text in streams.out


def test_extract_text_with_pattern(
    capsys: pytest.CaptureFixture, tmp_path: Path
) -> None:
    """Check that `-o page-[].txt` writes one text file per page.

    Runs `extract-text` on `input8.pdf` from inside a temporary directory
    and asserts that, for each index 0 through 7, the file `page-<index>.txt`
    exists there and contains exactly `<index + 1>` followed by a newline.
    """
    with chdir(tmp_path):
        run_cli(
            [
                "extract-text",
                str(RESOURCES_ROOT / "input8.pdf"),
                "-o",
                "page-[].txt",
            ]
        )
    captured = capsys.readouterr()
    assert not captured.err

    for index in range(8):
        page_file = tmp_path / f"page-{index}.txt"
        # Assert per file so a failure names the exact page, instead of
        # collapsing every error into a single boolean flag.
        assert page_file.is_file(), f"missing output file {page_file.name}"
        # read_text() opens and closes the file, so no handle is leaked.
        assert page_file.read_text() == f"{index + 1}\n"
Loading