Skip to content

Commit

Permalink
Implement uncompress functionality for PDF files (#75)
Browse files Browse the repository at this point in the history
Co-authored-by: Lucas Cimon <[email protected]>
  • Loading branch information
Kaos599 and Lucas-C authored Nov 15, 2024
1 parent 32f68fd commit bff4b8e
Show file tree
Hide file tree
Showing 7 changed files with 125 additions and 8 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ $ pdfly --help
│ 2-up Create a booklet-style PDF from a single input. │
│ cat Concatenate pages from PDF files into a single PDF file. │
│ compress Compress a PDF. │
│ uncompress Uncompress a PDF. │
│ extract-images Extract images from PDF without resampling or altering. │
│ extract-text Extract text from a PDF file. │
│ meta Show metadata of a PDF file │
Expand Down
25 changes: 25 additions & 0 deletions pdfly/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import pdfly.metadata
import pdfly.pagemeta
import pdfly.rm
import pdfly.uncompress
import pdfly.up2
import pdfly.update_offsets
import pdfly.x2pdf
Expand Down Expand Up @@ -205,6 +206,30 @@ def compress(
pdfly.compress.main(pdf, output)


@entry_point.command(name="uncompress", help=pdfly.uncompress.__doc__)  # type: ignore[misc]
def uncompress(
    pdf: Annotated[
        Path,
        # Input PDF: validated by typer as an existing, readable regular file
        # (directories rejected) and resolved to an absolute path.
        typer.Argument(
            resolve_path=True,
            readable=True,
            writable=False,
            exists=True,
            dir_okay=False,
            file_okay=True,
        ),
    ],
    output: Annotated[
        Path,
        # Destination path: no existence check is requested; if the path
        # already exists it has to be writable.
        typer.Argument(
            writable=True,
            exists=False,
        ),
    ],
) -> None:
    """CLI entry for ``pdfly uncompress``; delegates to ``pdfly.uncompress.main``.

    The help text shown on the command line comes from the ``help=`` argument
    of the decorator (the ``pdfly.uncompress`` module docstring), not from
    this docstring.
    """
    pdfly.uncompress.main(pdf, output)


@entry_point.command(name="update-offsets", help=pdfly.update_offsets.__doc__) # type: ignore[misc]
def update_offsets(
file_in: Annotated[
Expand Down
52 changes: 52 additions & 0 deletions pdfly/uncompress.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""Module for uncompressing PDF content streams."""

import zlib
from pathlib import Path
from typing import Optional

from pypdf import PdfReader, PdfWriter
from pypdf.generic import IndirectObject, PdfObject


def main(pdf: Path, output: Path) -> None:
    """Copy *pdf* to *output* after uncompressing its page content streams.

    Every page of the input is inspected: each stream referenced by its
    ``/Contents`` entry (either a single stream or an array of streams) is
    handed to :func:`decompress_content_stream`, then the page is added to a
    fresh writer. Once the output file is written, the sizes of both files
    are printed.

    Args:
        pdf: Path of the PDF file to read.
        output: Path the resulting PDF is written to.
    """
    reader = PdfReader(pdf)
    writer = PdfWriter()

    for page in reader.pages:
        if "/Contents" in page:
            # raw_get() returns the entry as stored in the page dictionary,
            # without following indirect references. The reference has to be
            # kept: once it is dereferenced, a single content stream can no
            # longer be recognised as an IndirectObject, which is why the
            # single-stream case used to be silently skipped.
            contents_ref = page.raw_get("/Contents")
            contents: Optional[PdfObject] = contents_ref
            if isinstance(contents_ref, IndirectObject):
                contents = contents_ref.get_object()

            if isinstance(contents, list):
                # Array of content streams: process every referenced stream.
                for content in contents:
                    if isinstance(content, IndirectObject):
                        decompress_content_stream(content)
            elif contents is not None and isinstance(
                contents_ref, IndirectObject
            ):
                # Single content stream, addressed through its reference.
                decompress_content_stream(contents_ref)
        writer.add_page(page)

    with open(output, "wb") as fp:
        writer.write(fp)

    orig_size = pdf.stat().st_size
    uncomp_size = output.stat().st_size

    print(f"Original Size  : {orig_size:,}")
    print(
        f"Uncompressed Size: {uncomp_size:,} ({(uncomp_size / orig_size) * 100:.1f}% of original)"
    )


def decompress_content_stream(content: IndirectObject) -> None:
    """Inflate *content* in place when its ``/Filter`` is ``/FlateDecode``.

    Streams whose ``/Filter`` entry is anything else (or missing) are left
    untouched. On success the stream data is replaced by the inflated bytes
    and the ``/Filter`` entry is removed. A ``zlib.error`` raised along the
    way is reported on stdout and the function returns normally.
    """
    if content.get("/Filter") != "/FlateDecode":
        return

    try:
        # NOTE(review): this assumes get_data() returns the still-compressed
        # bytes. If pypdf already applies the stream filters in get_data(),
        # zlib.decompress() receives plain data and raises - TODO confirm.
        inflated = zlib.decompress(content.get_data())
        content.set_data(inflated)
        # NOTE(review): the deletion goes through the reference object;
        # verify that it supports item deletion (and whether a leftover
        # /DecodeParms entry should be dropped as well).
        del content["/Filter"]
    except zlib.error as error:
        print(
            f"Some content stream with /FlateDecode failed to be decompressed: {error}"
        )
9 changes: 5 additions & 4 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
import os
from pathlib import Path

from fpdf import FPDF
import pytest
from fpdf import FPDF

from pdfly.cli import entry_point

try:
Expand Down Expand Up @@ -35,7 +36,7 @@ def run_cli(args):
return error.code


@pytest.fixture
@pytest.fixture()
def two_pages_pdf_filepath(tmp_path):
"A PDF with 2 pages, and a different image on each page"
# Note: prior to v2.7.9, fpdf2 produced incorrect /Resources dicts for each page (cf. fpdf2 PR #1133),
Expand All @@ -50,7 +51,7 @@ def two_pages_pdf_filepath(tmp_path):
return pdf_filepath


@pytest.fixture
@pytest.fixture()
def pdf_file_100(tmp_path):
"""A PDF with 100 pages; each has only the page index on it."""
pdf = FPDF()
Expand All @@ -65,7 +66,7 @@ def pdf_file_100(tmp_path):
return pdf_filepath


@pytest.fixture
@pytest.fixture()
def pdf_file_abc(tmp_path):
"""A PDF with 100 pages; each has only the page index on it."""
pdf = FPDF()
Expand Down
2 changes: 0 additions & 2 deletions tests/test_extract_images.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import pytest

from .conftest import RESOURCES_ROOT, chdir, run_cli


Expand Down
40 changes: 40 additions & 0 deletions tests/test_uncompress.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""Tests for the `uncompress` command."""

from pathlib import Path

import pytest
from pypdf import PdfReader
from typer.testing import CliRunner

from pdfly.cli import entry_point

runner = CliRunner()


# NOTE(review): the glob is relative to the current working directory and is
# not recursive. When it matches nothing, pytest marks the test as skipped by
# default instead of failing - confirm the layout of ``sample-files``.
@pytest.mark.parametrize(
    "input_pdf_filepath",
    # Sorted so that the parametrisation order (and the generated test IDs)
    # does not depend on directory listing order.
    sorted(Path("sample-files").glob("*.pdf")),
)
def test_uncompress_all_sample_files(
    input_pdf_filepath: Path, tmp_path: Path
) -> None:
    """Run ``uncompress`` on a sample PDF and check no content stream keeps a filter."""
    output_pdf_filepath = tmp_path / "uncompressed_output.pdf"

    result = runner.invoke(
        entry_point,
        ["uncompress", str(input_pdf_filepath), str(output_pdf_filepath)],
    )

    assert (
        result.exit_code == 0
    ), f"Error in uncompressing {input_pdf_filepath}: {result.output}"
    assert (
        output_pdf_filepath.exists()
    ), f"Output PDF {output_pdf_filepath} does not exist."

    reader = PdfReader(str(output_pdf_filepath))
    for page in reader.pages:
        contents = page.get("/Contents")
        if not contents:
            continue
        # /Contents may be one stream or an array of streams. Testing
        # membership on the array itself never inspects the streams, so each
        # one is resolved and checked individually.
        streams = contents if isinstance(contents, list) else [contents]
        for stream in streams:
            assert (
                "/Filter" not in stream.get_object()
            ), "Content stream is still compressed"
4 changes: 2 additions & 2 deletions tests/test_update_offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
Here should only be end-to-end tests.
"""

import re
from pathlib import Path

import pytest
import re

from .conftest import RESOURCES_ROOT, chdir, run_cli
from .conftest import RESOURCES_ROOT, run_cli


def test_update_offsets(capsys, tmp_path: Path) -> None:
Expand Down

0 comments on commit bff4b8e

Please sign in to comment.