Use pypdfium2 for PDF rasterizing when possible

aws-samples · Jun 20, 2024 · e830731 · e830731
2 parents 3d9ee20 + e16cade
commit e830731
Show file tree

Hide file tree

Showing 5 changed files with 174 additions and 41 deletions.
diff --git a/.github/workflows/lambda_layers.yml b/.github/workflows/lambda_layers.yml
@@ -379,3 +379,96 @@ jobs:
         with:
           name: textractor-lambda-p312-pdf
           path: lambda_layer/
+  lambda-build-p38-pdfium:
+    # Builds the Python 3.8 Lambda layer variant that bundles pypdfium2 and
+    # uploads it as the "textractor-lambda-p38-pdfium" artifact.
+    runs-on: ubuntu-latest
+    container: public.ecr.aws/sam/build-python3.8:latest
+    steps:
+      # The explicit ref makes this job check out the master branch.
+      - uses: actions/checkout@v3
+        with:
+          ref: 'master'
+      # Installs the checked-out project (../../ as seen from
+      # lambda_layer/python) and pypdfium2 into lambda_layer/python.
+      # NOTE(review): the p39-p312 pdfium jobs end this step with
+      # `cd .. && zip -r /textractor.zip python/`, which this job omits;
+      # confirm whether the difference is intentional.
+      - name: Install and build
+        run: |
+          mkdir -p lambda_layer/python && \
+          cd lambda_layer/python && \
+          pip3 install ../../ --target=. && \
+          pip3 install pypdfium2 --target=.
+      # Publishes the whole lambda_layer/ directory as the job artifact.
+      - uses: actions/upload-artifact@v3
+        with:
+          name: textractor-lambda-p38-pdfium
+          path: lambda_layer/
+  lambda-build-p39-pdfium:
+    # Builds the Python 3.9 Lambda layer variant that bundles pypdfium2 and
+    # uploads it as the "textractor-lambda-p39-pdfium" artifact.
+    runs-on: ubuntu-latest
+    container: public.ecr.aws/sam/build-python3.9:latest
+    steps:
+      # The explicit ref makes this job check out the master branch.
+      - uses: actions/checkout@v3
+        with:
+          ref: 'master'
+      # Installs the checked-out project (../../ as seen from
+      # lambda_layer/python) and pypdfium2 into lambda_layer/python, then
+      # zips python/ into /textractor.zip.
+      # NOTE(review): /textractor.zip is written outside lambda_layer/, the
+      # only path the upload step below publishes; confirm the archive is
+      # still needed.
+      - name: Install and build
+        run: |
+          mkdir -p lambda_layer/python && \
+          cd lambda_layer/python && \
+          pip3 install ../../ --target=. && \
+          pip3 install pypdfium2 --target=. && \
+          cd .. \
+          && zip -r /textractor.zip python/
+      # Publishes the whole lambda_layer/ directory as the job artifact.
+      - uses: actions/upload-artifact@v3
+        with:
+          name: textractor-lambda-p39-pdfium
+          path: lambda_layer/
+  lambda-build-p310-pdfium:
+    # Builds the Python 3.10 Lambda layer variant that bundles pypdfium2 and
+    # uploads it as the "textractor-lambda-p310-pdfium" artifact.
+    runs-on: ubuntu-latest
+    container: public.ecr.aws/sam/build-python3.10:latest
+    steps:
+      # The explicit ref makes this job check out the master branch.
+      - uses: actions/checkout@v3
+        with:
+          ref: 'master'
+      # Installs the checked-out project (../../ as seen from
+      # lambda_layer/python) and pypdfium2 into lambda_layer/python, then
+      # zips python/ into /textractor.zip.
+      # NOTE(review): /textractor.zip is written outside lambda_layer/, the
+      # only path the upload step below publishes; confirm the archive is
+      # still needed.
+      - name: Install and build
+        run: |
+          mkdir -p lambda_layer/python && \
+          cd lambda_layer/python && \
+          pip3 install ../../ --target=. && \
+          pip3 install pypdfium2 --target=. && \
+          cd .. \
+          && zip -r /textractor.zip python/
+      # Publishes the whole lambda_layer/ directory as the job artifact.
+      - uses: actions/upload-artifact@v3
+        with:
+          name: textractor-lambda-p310-pdfium
+          path: lambda_layer/
+  lambda-build-p311-pdfium:
+    # Builds the Python 3.11 Lambda layer variant that bundles pypdfium2 and
+    # uploads it as the "textractor-lambda-p311-pdfium" artifact.
+    runs-on: ubuntu-latest
+    container: public.ecr.aws/sam/build-python3.11:latest
+    steps:
+      # The explicit ref makes this job check out the master branch.
+      - uses: actions/checkout@v3
+        with:
+          ref: 'master'
+      # Installs the checked-out project (../../ as seen from
+      # lambda_layer/python) and pypdfium2 into lambda_layer/python, then
+      # zips python/ into /textractor.zip.
+      # NOTE(review): /textractor.zip is written outside lambda_layer/, the
+      # only path the upload step below publishes; confirm the archive is
+      # still needed.
+      - name: Install and build
+        run: |
+          mkdir -p lambda_layer/python && \
+          cd lambda_layer/python && \
+          pip3 install ../../ --target=. && \
+          pip3 install pypdfium2 --target=. && \
+          cd .. \
+          && zip -r /textractor.zip python/
+      # Publishes the whole lambda_layer/ directory as the job artifact.
+      - uses: actions/upload-artifact@v3
+        with:
+          name: textractor-lambda-p311-pdfium
+          path: lambda_layer/
+  lambda-build-p312-pdfium:
+    # Builds the Python 3.12 Lambda layer variant that bundles pypdfium2 and
+    # uploads it as the "textractor-lambda-p312-pdfium" artifact.
+    runs-on: ubuntu-latest
+    container: public.ecr.aws/sam/build-python3.12:latest
+    steps:
+      # The explicit ref makes this job check out the master branch.
+      - uses: actions/checkout@v3
+        with:
+          ref: 'master'
+      # Installs the checked-out project (../../ as seen from
+      # lambda_layer/python) and pypdfium2 into lambda_layer/python, then
+      # zips python/ into /textractor.zip.
+      # NOTE(review): /textractor.zip is written outside lambda_layer/, the
+      # only path the upload step below publishes; confirm the archive is
+      # still needed.
+      - name: Install and build
+        run: |
+          mkdir -p lambda_layer/python && \
+          cd lambda_layer/python && \
+          pip3 install ../../ --target=. && \
+          pip3 install pypdfium2 --target=. && \
+          cd .. \
+          && zip -r /textractor.zip python/
+      # Publishes the whole lambda_layer/ directory as the job artifact.
+      - uses: actions/upload-artifact@v3
+        with:
+          name: textractor-lambda-p312-pdfium
+          path: lambda_layer/
diff --git a/extras/pdfium.txt b/extras/pdfium.txt
@@ -0,0 +1 @@
+pypdfium2
diff --git a/textractor/exceptions.py b/textractor/exceptions.py
@@ -53,3 +53,8 @@ class UnhandledCaseException(Exception):
     """Raised when no statement matched the condition"""
 
     pass
+
+class UnsupportedDocumentException(Exception):
+    """Raised when a Textract API call fails with an UnsupportedDocumentException, i.e. the document could not be processed"""
+
+    pass
diff --git a/textractor/textractor.py b/textractor/textractor.py
@@ -36,12 +36,15 @@
 from textractcaller.t_call import Textract_Call_Mode, Textract_API, get_full_json
 
 try:
-    from pdf2image import convert_from_bytes, convert_from_path
+    # pypdfium2 is the preferred rasterizing backend; pdf2image is only
+    # imported when pypdfium2 is missing. Catch ImportError specifically so
+    # that unrelated failures (and KeyboardInterrupt/SystemExit) are not
+    # silently swallowed by the fallback.
+    try:
+        import pypdfium2
+    except ImportError:
+        import pdf2image
 
-    IS_PDF2IMAGE_INSTALLED = True
+    IS_PDF_RENDERING_ENABLED = True
 except ImportError:
-    IS_PDF2IMAGE_INSTALLED = False
-    logging.info("pdf2image is not installed, client-side PDF rasterizing is disabled")
+    # Reached when the pdf2image fallback import fails as well.
+    IS_PDF_RENDERING_ENABLED = False
+    logging.info("pypdfium2 and pdf2image are both not installed, client-side PDF rasterizing is disabled")
 
 from textractor.data.constants import (
     TextractAPI,
@@ -51,12 +54,14 @@
 from textractor.entities.lazy_document import LazyDocument
 from textractor.parsers import response_parser
 from textractor.utils.s3_utils import upload_to_s3, s3_path_to_bucket_and_prefix
+from textractor.utils.pdf_utils import rasterize_pdf
 from textractor.exceptions import (
     InputError,
     RegionMismatchError,
     IncorrectMethodException,
     MissingDependencyException,
     UnhandledCaseException,
+    UnsupportedDocumentException,
 )
 
 
@@ -129,8 +134,8 @@ def _get_document_images_from_path(self, filepath: str) -> List[Image.Image]:
             )
             file_obj = s3_client.get_object(Bucket=bucket, Key=key).get("Body").read()
             if filepath.lower().endswith(".pdf"):
-                if IS_PDF2IMAGE_INSTALLED:
-                    images = convert_from_bytes(bytearray(file_obj))
+                if IS_PDF_RENDERING_ENABLED:
+                    images = rasterize_pdf(bytearray(file_obj))
                 else:
                     raise MissingDependencyException(
                         "pdf2image is not installed. If you do not plan on using visualizations you can skip image generation using save_image=False in your function call."
@@ -140,8 +145,8 @@ def _get_document_images_from_path(self, filepath: str) -> List[Image.Image]:
 
         else:
             if filepath.lower().endswith(".pdf"):
-                if IS_PDF2IMAGE_INSTALLED:
-                    images = convert_from_path(filepath)
+                if IS_PDF_RENDERING_ENABLED:
+                    images = rasterize_pdf(filepath)
                 else:
                     raise MissingDependencyException(
                         "pdf2image is not installed. If you do not plan on using visualizations you can skip image generation using save_image=False in your function call."
@@ -155,16 +160,14 @@ def _get_document_images_from_path(self, filepath: str) -> List[Image.Image]:
         return images
 
     def detect_document_text(
-        self, file_source, s3_output_path: str = "", save_image: bool = True
+        self, file_source, save_image: bool = True
     ) -> Document:
         """
         Make a call to the SYNC DetectDocumentText API, implicitly parses the response and produces a :class:`Document` object.
         This function is ideal for single page PDFs or single images.
 
         :param file_source: Path to a file stored locally, on an S3 bucket or PIL Image
         :type file_source: str or PIL.Image, required
-        :param s3_output_path: S3 path to store the output.
-        :type s3_output_path: str, optional
         :param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional
                             and necessary only if the customer wants to visualize bounding boxes for their document entities.
         :type save_image: bool
@@ -181,12 +184,15 @@ def detect_document_text(
 
         elif isinstance(file_source, str):
             logging.debug("Filepath given.")
-            images = self._get_document_images_from_path(file_source)
-            if len(images) > 1:
-                raise IncorrectMethodException(
-                    "Input contains more than 1 page. Call start_document_text_detection instead."
-                )
-            file_source = _image_to_byte_array(images[0])
+            if not save_image and file_source.lower().endswith(".pdf"):
+                images = []
+            else:
+                images = self._get_document_images_from_path(file_source)
+                if len(images) > 1:
+                    raise IncorrectMethodException(
+                        "Input contains more than 1 page. Call start_document_analysis() instead."
+                    )
+                file_source = _image_to_byte_array(images[0])    
 
         elif isinstance(file_source, Image.Image):
             logging.debug("PIL Image given.")
@@ -202,18 +208,12 @@ def detect_document_text(
             images = []
             raise InputError("Input file_source format not supported.")
 
-        if not s3_output_path:
-            output_config = None
-        else:
-            bucket, prefix = s3_path_to_bucket_and_prefix(s3_output_path)
-            output_config = OutputConfig(s3_bucket=bucket, s3_prefix=prefix)
-
         try:
             response = call_textract(
                 input_document=file_source,
                 features=[],
                 queries_config=None,  # not supported yet
-                output_config=output_config,
+                output_config=None,
                 kms_key_id=self.kms_key_id,
                 job_tag="",
                 notification_channel=None,  # not supported yet
@@ -229,6 +229,10 @@ def detect_document_text(
                 raise RegionMismatchError(
                     "Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."
                 )
+            elif exception.__class__.__name__ == "UnsupportedDocumentException":
+                raise UnsupportedDocumentException(
+                    "Textract returned an UnsupportedDocumentException, if file_source is a PDF, make sure that it only has one page or use start_document_text_detection. If your file_source is an image, make sure that it is not larger than 5MB."
+                )
             raise exception
 
         document = response_parser.parse(response)
@@ -343,7 +347,6 @@ def analyze_document(
         file_source,
         features,
         queries: Union[QueriesConfig, List[Query], List[str]] = None,
-        s3_output_path: str = "",
         save_image: bool = True,
     ) -> Document:
         """
@@ -356,8 +359,6 @@ def analyze_document(
         :type features: list, required
         :param queries: Queries to run on the document
         :type features: Union[QueriesConfig, List[Query], List[str]]
-        :param s3_output_path: Prefix to store the output on the S3 bucket (passed as param to Textractor).
-        :type s3_output_path: str, optional
         :param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional
                             and necessary only if the customer wants to visualize bounding boxes for their document entities.
         :type save_image: bool
@@ -373,13 +374,15 @@ def analyze_document(
 
         elif isinstance(file_source, str):
             logging.debug("Filepath given.")
-            images = self._get_document_images_from_path(file_source)
-            if len(images) > 1:
-                raise IncorrectMethodException(
-                    "Input contains more than 1 page. Call start_document_analysis() instead."
-                )
-            file_source = _image_to_byte_array(images[0])
-
+            if not save_image and file_source.lower().endswith(".pdf"):
+                images = []
+            else:
+                images = self._get_document_images_from_path(file_source)
+                if len(images) > 1:
+                    raise IncorrectMethodException(
+                        "Input contains more than 1 page. Call start_document_analysis() instead."
+                    )
+                file_source = _image_to_byte_array(images[0])    
         elif isinstance(file_source, Image.Image):
             logging.debug("PIL Image given.")
             images = [file_source]
@@ -394,12 +397,6 @@ def analyze_document(
             images = []
             raise InputError("Input file_source format not supported.")
 
-        if not s3_output_path:
-            output_config = None
-        else:
-            bucket, prefix = s3_path_to_bucket_and_prefix(s3_output_path)
-            output_config = OutputConfig(s3_bucket=bucket, s3_prefix=prefix)
-
         if not isinstance(features, list):
             features = [features]
 
@@ -429,7 +426,7 @@ def analyze_document(
                 input_document=file_source,
                 features=features,
                 queries_config=queries,  # not supported yet
-                output_config=output_config,
+                output_config=None,
                 kms_key_id=self.kms_key_id,
                 job_tag="",
                 notification_channel=None,  # not supported yet
@@ -445,6 +442,10 @@ def analyze_document(
                 raise RegionMismatchError(
                     "Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."
                 )
+            elif exception.__class__.__name__ == "UnsupportedDocumentException":
+                raise UnsupportedDocumentException(
+                    "Textract returned an UnsupportedDocumentException, if file_source is a PDF, make sure that it only has one page or use start_document_analysis. If your file_source is an image, make sure that it is not larger than 5MB."
+                )
             raise exception
 
         document = response_parser.parse(response)

diff --git a/textractor/utils/pdf_utils.py b/textractor/utils/pdf_utils.py
@@ -0,0 +1,33 @@
+import re
+from typing import List, Union
+from PIL import Image
+
+try:
+    import pypdfium2
+    PYPDFIUM2_IS_INSTALLED = True
+except ImportError:
+    PYPDFIUM2_IS_INSTALLED = False
+
+try:
+    from pdf2image import convert_from_bytes, convert_from_path, pdfinfo_from_bytes, pdfinfo_from_path
+    PDF2IMAGE_IS_INSTALLED = True
+except ImportError:
+    PDF2IMAGE_IS_INSTALLED = False
+
+
+def rasterize_pdf(pdf: Union[str, bytes, bytearray]) -> List[Image.Image]:
+    """
+    Convert a pdf into a list of images, one per page, rendered at 250 DPI.
+
+    pypdfium2 is used when it is installed, otherwise pdf2image.
+
+    :param pdf: Path to a PDF file, or the raw content of a PDF file
+    :type pdf: str, bytes or bytearray
+    :return: The rendered pages
+    :rtype: List[Image.Image]
+    :raises TypeError: If the pdf2image backend is used and pdf is neither a path nor raw bytes
+    :raises ImportError: If neither pypdfium2 nor pdf2image is installed
+    """
+    # Callers may hand over a bytearray (textractor.py wraps S3 downloads in
+    # bytearray()), which the bytes check in the pdf2image branch does not
+    # recognise. Normalise once so both backends receive plain bytes.
+    if isinstance(pdf, bytearray):
+        pdf = bytes(pdf)
+
+    if PYPDFIUM2_IS_INSTALLED:
+        document = pypdfium2.PdfDocument(pdf)
+        # scale is pixels per PDF canvas unit (usually 1/72 inch), so
+        # 250 / 72 matches the 250 DPI used by the pdf2image branch.
+        return [page.render(scale=250 / 72).to_pil() for page in document]
+    elif PDF2IMAGE_IS_INSTALLED:
+        if isinstance(pdf, str):
+            return convert_from_path(pdf, dpi=250, fmt="jpeg")
+        elif isinstance(pdf, bytes):
+            return convert_from_bytes(pdf, dpi=250, fmt="jpeg")
+        else:
+            raise TypeError(f"{type(pdf)} is not a supported type, should be str or bytes")
+    else:
+        raise ImportError("PDF rasterization is not possible if neither pypdfium2 nor pdf2image are installed")