feat: add download_image function

openfoodfacts · Jul 1, 2024 · e58c46b · e58c46b
1 parent d6aff4b
commit e58c46b
Show file tree

Hide file tree

Showing 2 changed files with 69 additions and 7 deletions.
diff --git a/openfoodfacts/images.py b/openfoodfacts/images.py
@@ -1,12 +1,17 @@
 import re
 from pathlib import Path
-from typing import List, Optional
+from typing import List, Optional, Tuple, Union
 from urllib.parse import urlparse
 
+import requests
+from PIL import Image
+
 from openfoodfacts.types import Environment, Flavor
-from openfoodfacts.utils import URLBuilder
+from openfoodfacts.utils import URLBuilder, get_image_from_url
 
 BARCODE_PATH_REGEX = re.compile(r"^(...)(...)(...)(.*)$")
+# Base URL of the public Open Food Facts S3 bucket
+AWS_S3_BASE_URL = "https://openfoodfacts-images.s3.eu-west-3.amazonaws.com/data"
 
 
 def split_barcode(barcode: str) -> List[str]:
@@ -156,3 +161,51 @@ def extract_source_from_url(url: str) -> str:
         url_path = str(Path(url_path).with_suffix(".jpg"))
 
     return url_path
+
+
+def download_image(
+    image: Union[str, Tuple[str, str]],
+    use_cache: bool = True,
+    error_raise: bool = True,
+    session: Optional[requests.Session] = None,
+    return_bytes: bool = False,
+) -> Union[None, Image.Image, Tuple[Optional[Image.Image], bytes]]:
+    """Download an Open Food Facts image.
+
+    :param image: the image URL or a tuple containing the barcode and the
+        image ID
+    :param use_cache: whether to use the S3 dataset cache, defaults to True
+    :param error_raise: whether to raise an error if the download fails,
+        defaults to True
+    :param session: the requests session to use, defaults to None
+    :param return_bytes: if True, return the image bytes as well, defaults to
+        False.
+    :return: the loaded image or None if an error occured. If `return_bytes`
+        is True, a tuple with the image and the image bytes is returned.
+
+    >>> download_image("https://images.openfoodfacts.org/images/products/324/227/210/2359/4.jpg")
+    <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1244x1500>
+
+    >>> download_image(("3242272102359", "4"))
+    <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1244x1500>
+    """
+    if isinstance(image, str):
+        if use_cache and image.startswith("http"):
+            image_path = extract_source_from_url(image)
+            image_url = f"{AWS_S3_BASE_URL}{image_path}"
+        else:
+            image_url = image
+
+    if isinstance(image, tuple):
+        if use_cache:
+            image_path = generate_image_path(*image)
+            image_url = f"{AWS_S3_BASE_URL}{image_path}"
+        else:
+            image_url = generate_image_url(*image)
+
+    return get_image_from_url(
+        image_url,
+        error_raise=error_raise,
+        session=session,
+        return_bytes=return_bytes,
+    )
diff --git a/openfoodfacts/utils.py b/openfoodfacts/utils.py
@@ -5,7 +5,7 @@
 import time
 from io import BytesIO
 from pathlib import Path
-from typing import Callable, Dict, Iterable, List, Optional, Union
+from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import requests
 import tqdm
@@ -318,16 +318,22 @@ def get_asset_from_url(
 
 
 def get_image_from_url(
-    image_url: str, error_raise: bool = True, session: Optional[requests.Session] = None
-) -> Optional["Image.Image"]:
+    image_url: str,
+    error_raise: bool = True,
+    session: Optional[requests.Session] = None,
+    return_bytes: bool = False,
+) -> Union[None, "Image.Image", Tuple[Optional["Image.Image"], bytes]]:
     """Fetch an image from `image_url` and load it.
 
     :param image_url: URL of the image to load
     :param error_raise: if True, raises an `AssetLoadingException` if an error
       occurred, defaults to True. If False, None is returned if an error
       occurred.
     :param session: requests Session to use, by default no session is used.
-    :return: the Pillow Image or None.
+    :param return_bytes: if True, return the image bytes as well, defaults to
+        False.
+    :return: the loaded image or None if an error occurred. If `return_bytes`
+        is True, a tuple with the image and the image bytes is returned.
     """
     if not _pillow_available:
         raise ImportError("Pillow is required to load images")
@@ -338,7 +344,10 @@ def get_image_from_url(
     content_bytes = r.content
 
     try:
-        return Image.open(BytesIO(content_bytes))
+        image = Image.open(BytesIO(content_bytes))
+        if return_bytes:
+            return image, content_bytes
+        return image
     except PIL.UnidentifiedImageError:
         error_message = f"Cannot identify image {image_url}"
         if error_raise: