diff --git a/openfoodfacts/images.py b/openfoodfacts/images.py
index 3333072..4d1ac52 100644
--- a/openfoodfacts/images.py
+++ b/openfoodfacts/images.py
@@ -1,12 +1,17 @@
 import re
 from pathlib import Path
-from typing import List, Optional
+from typing import List, Optional, Tuple, Union
 from urllib.parse import urlparse
 
+import requests
+from PIL import Image
+
 from openfoodfacts.types import Environment, Flavor
-from openfoodfacts.utils import URLBuilder
+from openfoodfacts.utils import URLBuilder, get_image_from_url
 
 BARCODE_PATH_REGEX = re.compile(r"^(...)(...)(...)(.*)$")
+# Base URL of the public Open Food Facts S3 bucket
+AWS_S3_BASE_URL = "https://openfoodfacts-images.s3.eu-west-3.amazonaws.com/data"
 
 
 def split_barcode(barcode: str) -> List[str]:
@@ -156,3 +161,58 @@ def extract_source_from_url(url: str) -> str:
         url_path = str(Path(url_path).with_suffix(".jpg"))
 
     return url_path
+
+
+def download_image(
+    image: Union[str, Tuple[str, str]],
+    use_cache: bool = True,
+    error_raise: bool = True,
+    session: Optional[requests.Session] = None,
+    return_bytes: bool = False,
+) -> Union[None, Image.Image, Tuple[Optional[Image.Image], bytes]]:
+    """Download an Open Food Facts image.
+
+    :param image: the image URL or a tuple containing the barcode and the
+        image ID
+    :param use_cache: whether to use the S3 dataset cache, defaults to True
+    :param error_raise: whether to raise an error if the download fails,
+        defaults to True
+    :param session: the requests session to use, defaults to None
+    :param return_bytes: if True, return the image bytes as well, defaults to
+        False.
+    :return: the loaded image or None if an error occurred. If `return_bytes`
+        is True, a tuple with the image and the image bytes is returned.
+    :raises TypeError: if `image` is neither a string nor a tuple
+
+    >>> download_image("https://images.openfoodfacts.org/images/products/324/227/210/2359/4.jpg")
+    <PIL.JpegImagePlugin.JpegImageFile image ...>
+
+    >>> download_image(("3242272102359", "4"))
+    <PIL.JpegImagePlugin.JpegImageFile image ...>
+    """
+    if isinstance(image, str):
+        if use_cache and image.startswith("http"):
+            # Rewrite the public URL so that it targets the S3 bucket copy
+            image_path = extract_source_from_url(image)
+            image_url = f"{AWS_S3_BASE_URL}{image_path}"
+        else:
+            image_url = image
+    elif isinstance(image, tuple):
+        if use_cache:
+            image_path = generate_image_path(*image)
+            image_url = f"{AWS_S3_BASE_URL}{image_path}"
+        else:
+            image_url = generate_image_url(*image)
+    else:
+        # Fail early with a clear message instead of an UnboundLocalError
+        raise TypeError(
+            "image must be a URL (str) or a (barcode, image_id) tuple, "
+            f"got {type(image).__name__}"
+        )
+
+    return get_image_from_url(
+        image_url,
+        error_raise=error_raise,
+        session=session,
+        return_bytes=return_bytes,
+    )
diff --git a/openfoodfacts/utils.py b/openfoodfacts/utils.py
index a4fbb1a..f5a0905 100644
--- a/openfoodfacts/utils.py
+++ b/openfoodfacts/utils.py
@@ -5,7 +5,7 @@
 import time
 from io import BytesIO
 from pathlib import Path
-from typing import Callable, Dict, Iterable, List, Optional, Union
+from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import requests
 import tqdm
@@ -318,8 +318,11 @@ def get_asset_from_url(
 
 
 def get_image_from_url(
-    image_url: str, error_raise: bool = True, session: Optional[requests.Session] = None
-) -> Optional["Image.Image"]:
+    image_url: str,
+    error_raise: bool = True,
+    session: Optional[requests.Session] = None,
+    return_bytes: bool = False,
+) -> Union[None, "Image.Image", Tuple[Optional["Image.Image"], bytes]]:
     """Fetch an image from `image_url` and load it.
 
     :param image_url: URL of the image to load
@@ -327,7 +330,10 @@ def get_image_from_url(
       occured, defaults to False. If False, None is returned if an error
       occured.
     :param session: requests Session to use, by default no session is used.
-    :return: the Pillow Image or None.
+    :param return_bytes: if True, return the image bytes as well, defaults to
+        False.
+    :return: the loaded image or None if an error occurred. If `return_bytes`
+        is True, a tuple with the image and the image bytes is returned.
     """
     if not _pillow_available:
         raise ImportError("Pillow is required to load images")
@@ -338,7 +344,10 @@ def get_image_from_url(
     content_bytes = r.content
 
     try:
-        return Image.open(BytesIO(content_bytes))
+        image = Image.open(BytesIO(content_bytes))
+        if return_bytes:
+            return image, content_bytes
+        return image
     except PIL.UnidentifiedImageError:
         error_message = f"Cannot identify image {image_url}"
         if error_raise: