Skip to content

Commit

Permalink
Use pypdfium2 for PDF rasterizing when possible
Browse files Browse the repository at this point in the history
  • Loading branch information
Belval authored Jun 20, 2024
2 parents 3d9ee20 + e16cade commit e830731
Show file tree
Hide file tree
Showing 5 changed files with 174 additions and 41 deletions.
93 changes: 93 additions & 0 deletions .github/workflows/lambda_layers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -379,3 +379,96 @@ jobs:
with:
name: textractor-lambda-p312-pdf
path: lambda_layer/
# Builds the Textractor Lambda layer for Python 3.8 with pypdfium2 bundled in.
lambda-build-p38-pdfium:
runs-on: ubuntu-latest
container: public.ecr.aws/sam/build-python3.8:latest
steps:
# Checks out the master branch explicitly.
- uses: actions/checkout@v3
with:
ref: 'master'
# Installs the checked-out package and pypdfium2 into lambda_layer/python.
# NOTE(review): the other *-pdfium jobs also run `zip -r /textractor.zip python/`;
# this job does not - confirm the difference is intentional.
- name: Install and build
run: |
mkdir -p lambda_layer/python && \
cd lambda_layer/python && \
pip3 install ../../ --target=. && \
pip3 install pypdfium2 --target=.
# Publishes the lambda_layer/ directory as the build artifact.
- uses: actions/upload-artifact@v3
with:
name: textractor-lambda-p38-pdfium
path: lambda_layer/
# Builds the Textractor Lambda layer for Python 3.9 with pypdfium2 bundled in.
lambda-build-p39-pdfium:
runs-on: ubuntu-latest
container: public.ecr.aws/sam/build-python3.9:latest
steps:
# Checks out the master branch explicitly.
- uses: actions/checkout@v3
with:
ref: 'master'
# Installs the checked-out package and pypdfium2 into lambda_layer/python,
# then archives python/ to /textractor.zip.
# NOTE(review): /textractor.zip lives outside lambda_layer/, which is the only
# path the upload step publishes - confirm the archive is actually consumed.
- name: Install and build
run: |
mkdir -p lambda_layer/python && \
cd lambda_layer/python && \
pip3 install ../../ --target=. && \
pip3 install pypdfium2 --target=. && \
cd .. \
&& zip -r /textractor.zip python/
# Publishes the lambda_layer/ directory as the build artifact.
- uses: actions/upload-artifact@v3
with:
name: textractor-lambda-p39-pdfium
path: lambda_layer/
# Builds the Textractor Lambda layer for Python 3.10 with pypdfium2 bundled in.
lambda-build-p310-pdfium:
runs-on: ubuntu-latest
container: public.ecr.aws/sam/build-python3.10:latest
steps:
# Checks out the master branch explicitly.
- uses: actions/checkout@v3
with:
ref: 'master'
# Installs the checked-out package and pypdfium2 into lambda_layer/python,
# then archives python/ to /textractor.zip.
# NOTE(review): /textractor.zip lives outside lambda_layer/, which is the only
# path the upload step publishes - confirm the archive is actually consumed.
- name: Install and build
run: |
mkdir -p lambda_layer/python && \
cd lambda_layer/python && \
pip3 install ../../ --target=. && \
pip3 install pypdfium2 --target=. && \
cd .. \
&& zip -r /textractor.zip python/
# Publishes the lambda_layer/ directory as the build artifact.
- uses: actions/upload-artifact@v3
with:
name: textractor-lambda-p310-pdfium
path: lambda_layer/
# Builds the Textractor Lambda layer for Python 3.11 with pypdfium2 bundled in.
lambda-build-p311-pdfium:
runs-on: ubuntu-latest
container: public.ecr.aws/sam/build-python3.11:latest
steps:
# Checks out the master branch explicitly.
- uses: actions/checkout@v3
with:
ref: 'master'
# Installs the checked-out package and pypdfium2 into lambda_layer/python,
# then archives python/ to /textractor.zip.
# NOTE(review): /textractor.zip lives outside lambda_layer/, which is the only
# path the upload step publishes - confirm the archive is actually consumed.
- name: Install and build
run: |
mkdir -p lambda_layer/python && \
cd lambda_layer/python && \
pip3 install ../../ --target=. && \
pip3 install pypdfium2 --target=. && \
cd .. \
&& zip -r /textractor.zip python/
# Publishes the lambda_layer/ directory as the build artifact.
- uses: actions/upload-artifact@v3
with:
name: textractor-lambda-p311-pdfium
path: lambda_layer/
# Builds the Textractor Lambda layer for Python 3.12 with pypdfium2 bundled in.
lambda-build-p312-pdfium:
runs-on: ubuntu-latest
container: public.ecr.aws/sam/build-python3.12:latest
steps:
# Checks out the master branch explicitly.
- uses: actions/checkout@v3
with:
ref: 'master'
# Installs the checked-out package and pypdfium2 into lambda_layer/python,
# then archives python/ to /textractor.zip.
# NOTE(review): /textractor.zip lives outside lambda_layer/, which is the only
# path the upload step publishes - confirm the archive is actually consumed.
- name: Install and build
run: |
mkdir -p lambda_layer/python && \
cd lambda_layer/python && \
pip3 install ../../ --target=. && \
pip3 install pypdfium2 --target=. && \
cd .. \
&& zip -r /textractor.zip python/
# Publishes the lambda_layer/ directory as the build artifact.
- uses: actions/upload-artifact@v3
with:
name: textractor-lambda-p312-pdfium
path: lambda_layer/
1 change: 1 addition & 0 deletions extras/pdfium.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pypdfium2
5 changes: 5 additions & 0 deletions textractor/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,8 @@ class UnhandledCaseException(Exception):
"""Raised when no statement matched the condition"""

pass

class UnsupportedDocumentException(Exception):
    """Signals that the Textract API rejected the document and could not process it."""
83 changes: 42 additions & 41 deletions textractor/textractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,15 @@
from textractcaller.t_call import Textract_Call_Mode, Textract_API, get_full_json

try:
from pdf2image import convert_from_bytes, convert_from_path
try:
import pypdfium2
except:
import pdf2image

IS_PDF2IMAGE_INSTALLED = True
IS_PDF_RENDERING_ENABLED = True
except ImportError:
IS_PDF2IMAGE_INSTALLED = False
logging.info("pdf2image is not installed, client-side PDF rasterizing is disabled")
IS_PDF_RENDERING_ENABLED = False
logging.info("pypdfium2 and pdf2image are both not installed, client-side PDF rasterizing is disabled")

from textractor.data.constants import (
TextractAPI,
Expand All @@ -51,12 +54,14 @@
from textractor.entities.lazy_document import LazyDocument
from textractor.parsers import response_parser
from textractor.utils.s3_utils import upload_to_s3, s3_path_to_bucket_and_prefix
from textractor.utils.pdf_utils import rasterize_pdf
from textractor.exceptions import (
InputError,
RegionMismatchError,
IncorrectMethodException,
MissingDependencyException,
UnhandledCaseException,
UnsupportedDocumentException,
)


Expand Down Expand Up @@ -129,8 +134,8 @@ def _get_document_images_from_path(self, filepath: str) -> List[Image.Image]:
)
file_obj = s3_client.get_object(Bucket=bucket, Key=key).get("Body").read()
if filepath.lower().endswith(".pdf"):
if IS_PDF2IMAGE_INSTALLED:
images = convert_from_bytes(bytearray(file_obj))
if IS_PDF_RENDERING_ENABLED:
images = rasterize_pdf(bytearray(file_obj))
else:
raise MissingDependencyException(
"pdf2image is not installed. If you do not plan on using visualizations you can skip image generation using save_image=False in your function call."
Expand All @@ -140,8 +145,8 @@ def _get_document_images_from_path(self, filepath: str) -> List[Image.Image]:

else:
if filepath.lower().endswith(".pdf"):
if IS_PDF2IMAGE_INSTALLED:
images = convert_from_path(filepath)
if IS_PDF_RENDERING_ENABLED:
images = rasterize_pdf(filepath)
else:
raise MissingDependencyException(
"pdf2image is not installed. If you do not plan on using visualizations you can skip image generation using save_image=False in your function call."
Expand All @@ -155,16 +160,14 @@ def _get_document_images_from_path(self, filepath: str) -> List[Image.Image]:
return images

def detect_document_text(
self, file_source, s3_output_path: str = "", save_image: bool = True
self, file_source, save_image: bool = True
) -> Document:
"""
Make a call to the SYNC DetectDocumentText API, implicitly parses the response and produces a :class:`Document` object.
This function is ideal for single page PDFs or single images.
:param file_source: Path to a file stored locally, on an S3 bucket or PIL Image
:type file_source: str or PIL.Image, required
:param s3_output_path: S3 path to store the output.
:type s3_output_path: str, optional
:param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional
and necessary only if the customer wants to visualize bounding boxes for their document entities.
:type save_image: bool
Expand All @@ -181,12 +184,15 @@ def detect_document_text(

elif isinstance(file_source, str):
logging.debug("Filepath given.")
images = self._get_document_images_from_path(file_source)
if len(images) > 1:
raise IncorrectMethodException(
"Input contains more than 1 page. Call start_document_text_detection instead."
)
file_source = _image_to_byte_array(images[0])
if not save_image and file_source.lower().endswith(".pdf"):
images = []
else:
images = self._get_document_images_from_path(file_source)
if len(images) > 1:
raise IncorrectMethodException(
"Input contains more than 1 page. Call start_document_analysis() instead."
)
file_source = _image_to_byte_array(images[0])

elif isinstance(file_source, Image.Image):
logging.debug("PIL Image given.")
Expand All @@ -202,18 +208,12 @@ def detect_document_text(
images = []
raise InputError("Input file_source format not supported.")

if not s3_output_path:
output_config = None
else:
bucket, prefix = s3_path_to_bucket_and_prefix(s3_output_path)
output_config = OutputConfig(s3_bucket=bucket, s3_prefix=prefix)

try:
response = call_textract(
input_document=file_source,
features=[],
queries_config=None, # not supported yet
output_config=output_config,
output_config=None,
kms_key_id=self.kms_key_id,
job_tag="",
notification_channel=None, # not supported yet
Expand All @@ -229,6 +229,10 @@ def detect_document_text(
raise RegionMismatchError(
"Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."
)
elif exception.__class__.__name__ == "UnsupportedDocumentException":
raise UnsupportedDocumentException(
"Textract returned an UnsupportedDocumentException, if file_source is a PDF, make sure that it only has one page or use start_document_text_detection. If your file_source is an image, make sure that it is not larger than 5MB."
)
raise exception

document = response_parser.parse(response)
Expand Down Expand Up @@ -343,7 +347,6 @@ def analyze_document(
file_source,
features,
queries: Union[QueriesConfig, List[Query], List[str]] = None,
s3_output_path: str = "",
save_image: bool = True,
) -> Document:
"""
Expand All @@ -356,8 +359,6 @@ def analyze_document(
:type features: list, required
:param queries: Queries to run on the document
:type features: Union[QueriesConfig, List[Query], List[str]]
:param s3_output_path: Prefix to store the output on the S3 bucket (passed as param to Textractor).
:type s3_output_path: str, optional
:param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional
and necessary only if the customer wants to visualize bounding boxes for their document entities.
:type save_image: bool
Expand All @@ -373,13 +374,15 @@ def analyze_document(

elif isinstance(file_source, str):
logging.debug("Filepath given.")
images = self._get_document_images_from_path(file_source)
if len(images) > 1:
raise IncorrectMethodException(
"Input contains more than 1 page. Call start_document_analysis() instead."
)
file_source = _image_to_byte_array(images[0])

if not save_image and file_source.lower().endswith(".pdf"):
images = []
else:
images = self._get_document_images_from_path(file_source)
if len(images) > 1:
raise IncorrectMethodException(
"Input contains more than 1 page. Call start_document_analysis() instead."
)
file_source = _image_to_byte_array(images[0])
elif isinstance(file_source, Image.Image):
logging.debug("PIL Image given.")
images = [file_source]
Expand All @@ -394,12 +397,6 @@ def analyze_document(
images = []
raise InputError("Input file_source format not supported.")

if not s3_output_path:
output_config = None
else:
bucket, prefix = s3_path_to_bucket_and_prefix(s3_output_path)
output_config = OutputConfig(s3_bucket=bucket, s3_prefix=prefix)

if not isinstance(features, list):
features = [features]

Expand Down Expand Up @@ -429,7 +426,7 @@ def analyze_document(
input_document=file_source,
features=features,
queries_config=queries, # not supported yet
output_config=output_config,
output_config=None,
kms_key_id=self.kms_key_id,
job_tag="",
notification_channel=None, # not supported yet
Expand All @@ -445,6 +442,10 @@ def analyze_document(
raise RegionMismatchError(
"Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."
)
elif exception.__class__.__name__ == "UnsupportedDocumentException":
raise UnsupportedDocumentException(
"Textract returned an UnsupportedDocumentException, if file_source is a PDF, make sure that it only has one page or use start_document_analysis. If your file_source is an image, make sure that it is not larger than 5MB."
)
raise exception

document = response_parser.parse(response)
Expand Down
33 changes: 33 additions & 0 deletions textractor/utils/pdf_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import re
from typing import List, Union
from PIL import Image

try:
import pypdfium2
PYPDFIUM2_IS_INSTALLED = True
except ImportError:
PYPDFIUM2_IS_INSTALLED = False

try:
from pdf2image import convert_from_bytes, convert_from_path, pdfinfo_from_bytes, pdfinfo_from_path
PDF2IMAGE_IS_INSTALLED = True
except ImportError:
PDF2IMAGE_IS_INSTALLED = False


def rasterize_pdf(pdf: Union[str, bytes, bytearray, memoryview]) -> List[Image.Image]:
    """
    Convert a PDF into a list of PIL images, one per page, rendered at 250 DPI.

    pypdfium2 is preferred when it is installed; pdf2image is used as a fallback.

    :param pdf: Path to a PDF file, or the raw PDF content. ``bytearray`` and
                ``memoryview`` inputs are copied into ``bytes`` before rendering.
    :type pdf: str, bytes, bytearray or memoryview
    :return: One image per page, in the order the rendering backend yields them.
    :rtype: List[Image.Image]
    :raises TypeError: If the pdf2image backend is used and ``pdf`` is neither a path nor bytes.
    :raises Exception: If neither pypdfium2 nor pdf2image is installed.
    """
    # Callers hand over in-memory documents wrapped in a bytearray; the type
    # checks below (and the rendering backends) expect immutable bytes, so
    # normalise once here instead of failing on an otherwise valid payload.
    if isinstance(pdf, (bytearray, memoryview)):
        pdf = bytes(pdf)

    if PYPDFIUM2_IS_INSTALLED:
        document = pypdfium2.PdfDocument(pdf)
        # pypdfium2 scales relative to 72 DPI, so 250 / 72 renders at 250 DPI
        # and matches the pdf2image branch below.
        return [page.render(scale=250 / 72).to_pil() for page in document]

    if PDF2IMAGE_IS_INSTALLED:
        if isinstance(pdf, str):
            return convert_from_path(pdf, dpi=250, fmt="jpeg")
        if isinstance(pdf, bytes):
            return convert_from_bytes(pdf, dpi=250, fmt="jpeg")
        raise TypeError(f"{type(pdf)} is not a supported type, should be str or bytes")

    raise Exception("PDF rasterization is not possible if neither pypdfium2 nor pdf2image are installed")

0 comments on commit e830731

Please sign in to comment.