diff --git a/CHANGELOG.md b/CHANGELOG.md index 0feb20b..5f204ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,10 @@ ### New Features (ENH) - `pagemeta` now displays the name of a known page format that is close to the page dimensions +- `extract-text` now has an option `--output-pattern` which allows either for the full output to be stored in a single file, or for each page's text to be stored in a different file whose name depends on the output pattern and the page index +### Testing (TST) +- Added unit tests for `extract-text` ## Version 0.5.1, 2025-10-13 @@ -18,6 +21,7 @@ ### Bug Fixes (BUG) - `requests` is now a dependency, to prevent a `ModuleNotFoundError` when running with `uv` +### Testing (TST) ## Version 0.5.0, 2025-10-13 diff --git a/docs/user/subcommand-extract-text.md b/docs/user/subcommand-extract-text.md index ff84c58..e569497 100644 --- a/docs/user/subcommand-extract-text.md +++ b/docs/user/subcommand-extract-text.md @@ -9,19 +9,25 @@ $ pdfly extract-text --help Extract text from a PDF file. - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -│ * pdf FILE [default: None] [required] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ + Offers an option to store the whole output in a single file, or each page's text in a different file, + allowing custom naming patterns for the output files. 
+ +┌─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ * pdf FILE [required] │ +└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +┌─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ --output-pattern -o TEXT Naming pattern for output files. If none is entered, output is echoed. If it contains │ +│ "[]" substrings, each page's text is output in a different file and the "[]" │ +│ substrings in the filename are replaced by the page's index. If there are no "[]" │ +│ substrings, the output is stored in one file. │ +│ --help Show this message and exit. │ +└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` ## Examples -Extract the text from the 10th page of `document.pdf`, redirecting the output into `page.txt`. +Extract the text from the 10th page of `document.pdf`. ``` pdfly cat document.pdf 9 -o page.pdf @@ -29,3 +35,10 @@ pdfly cat document.pdf 9 -o page.pdf pdfly extract-text page.pdf ``` + +Extract the text from `document.pdf` and store each page's text in a file called `page-X.txt`, where X is the index of the page. + +``` +pdfly extract-text document.pdf -o page-[].txt + +``` diff --git a/pdfly/cli.py b/pdfly/cli.py index 548251e..844ab1c 100644 --- a/pdfly/cli.py +++ b/pdfly/cli.py @@ -228,13 +228,36 @@ def extract_text( resolve_path=True, ), ], + output_pattern: Annotated[ + str | None, + typer.Option( + "--output-pattern", + "-o", + dir_okay=False, + writable=True, + help="""Naming pattern for output files. If none is entered, output is echoed. +If it contains "[]" substrings, each page's text is output in a different file +and the "[]" substrings in the filename are replaced by the page's index. 
+If there are no "[]" substrings, the output is stored in one file.""",
+        ),
+    ] = None,
 ) -> None:
-    """Extract text from a PDF file."""
+    """Extract text from a PDF file.
+
+    Offers an option to store the whole output in a single file, or each page's text in a different file,
+    allowing custom naming patterns for the output files."""
     from pypdf import PdfReader
 
     reader = PdfReader(str(pdf))
-    for page in reader.pages:
-        typer.echo(page.extract_text())
+
+    if not output_pattern:
+        for page in reader.pages:
+            typer.echo(page.extract_text())
+    else:
+        for page in reader.pages:
+            filename = output_pattern.replace("[]", str(page.page_number))
+            with open(filename, "a") as file:
+                file.write(page.extract_text() + "\n")
 
 
 @entry_point.command(name="meta", help=pdfly.metadata.__doc__)  # type: ignore[misc]
diff --git a/tests/test_extract_text.py b/tests/test_extract_text.py
new file mode 100644
index 0000000..29bb87c
--- /dev/null
+++ b/tests/test_extract_text.py
@@ -0,0 +1,55 @@
+from pathlib import Path
+
+import pytest
+
+from .conftest import RESOURCES_ROOT, chdir, run_cli
+
+
+def test_extract_text_echo(
+    capsys: pytest.CaptureFixture, tmp_path: Path
+) -> None:
+    """Without `-o`, the text of every page is echoed to stdout."""
+    with chdir(tmp_path):
+        run_cli(
+            [
+                "extract-text",
+                str(RESOURCES_ROOT / "input8.pdf"),
+            ]
+        )
+    captured = capsys.readouterr()
+    assert not captured.err
+    assert (
+        """1
+2
+3
+4
+5
+6
+7
+8"""
+        in captured.out
+    )
+
+
+def test_extract_text_with_pattern(
+    capsys: pytest.CaptureFixture, tmp_path: Path
+) -> None:
+    """With a "[]" pattern, each page's text goes to its own indexed file."""
+    with chdir(tmp_path):
+        run_cli(
+            [
+                "extract-text",
+                str(RESOURCES_ROOT / "input8.pdf"),
+                "-o",
+                "page-[].txt",
+            ]
+        )
+    captured = capsys.readouterr()
+    assert not captured.err
+    # File names carry the zero-based index i; that file holds the text "i+1".
+    for i in range(8):
+        output_file = tmp_path / f"page-{i}.txt"
+        # Assert per page so a failure names the offending file, and read via
+        # Path.read_text so no file handle is left open.
+        assert output_file.is_file(), f"missing output file {output_file.name}"
+        assert output_file.read_text() == f"{i + 1}\n"