Merge branch 'main' into dalle-image-gen

deepset-ai · Oct 31, 2024 · 54590da · 54590da
2 parents a1e83e8 + 294a67e
commit 54590da
Show file tree

Hide file tree

Showing 50 changed files with 2,307 additions and 444 deletions.
diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml
@@ -16,7 +16,7 @@ on:
       - ".github/workflows/e2e.yml"
 
 env:
-  PYTHON_VERSION: "3.8"
+  PYTHON_VERSION: "3.9"
   OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
   HATCH_VERSION: "1.13.0"
 

diff --git a/.github/workflows/minor_version_release.yml b/.github/workflows/minor_version_release.yml
@@ -4,7 +4,7 @@ on:
   workflow_dispatch:
 
 env:
-  PYTHON_VERSION: "3.8"
+  PYTHON_VERSION: "3.9"
 
 jobs:
   sync:

diff --git a/.github/workflows/promote_unstable_docs.yml b/.github/workflows/promote_unstable_docs.yml
@@ -8,7 +8,7 @@ on:
       # Exclude 1.x tags
       - "!v1.[0-9]+.[0-9]+"
 env:
-  PYTHON_VERSION: "3.8"
+  PYTHON_VERSION: "3.9"
 
 jobs:
   promote:

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -29,7 +29,7 @@ env:
   AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
   GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
   HF_API_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
-  PYTHON_VERSION: "3.8"
+  PYTHON_VERSION: "3.9"
   HATCH_VERSION: "1.13.0"
 
 jobs:

diff --git a/README.md b/README.md
@@ -22,18 +22,23 @@ The simplest way to get Haystack is via pip:
 pip install haystack-ai
 ```
 
+Install from the `main` branch to try the newest features:
+```sh
+pip install git+https://github.com/deepset-ai/haystack.git@main
+```
+
 Haystack supports multiple installation methods including Docker images. For a comprehensive guide please refer
-to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/installation).
+to the [documentation](https://docs.haystack.deepset.ai/docs/installation).
 
 ## Documentation
 
 If you're new to the project, check out ["What is Haystack?"](https://haystack.deepset.ai/overview/intro) then go
 through the ["Get Started Guide"](https://haystack.deepset.ai/overview/quick-start) and build your first LLM application
-in a matter of minutes. Keep learning with the [tutorials](https://haystack.deepset.ai/tutorials?v=2.0). For more advanced
+in a matter of minutes. Keep learning with the [tutorials](https://haystack.deepset.ai/tutorials). For more advanced
 use cases, or just to get some inspiration, you can browse our Haystack recipes in the
 [Cookbook](https://haystack.deepset.ai/cookbook).
 
-At any given point, hit the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/intro) to learn more about Haystack, what can it do for you and the technology behind.
+At any given point, hit the [documentation](https://docs.haystack.deepset.ai/docs/intro) to learn more about Haystack, what it can do for you, and the technology behind it.
 
 ## Features
 
@@ -74,7 +79,7 @@ Use **deepset Studio** to visually create and export your Haystack pipeline arch
 
 Haystack collects **anonymous** usage statistics of pipeline components. We receive an event every time these components are initialized. This way, we know which components are most relevant to our community.
 
-Read more about telemetry in Haystack or how you can opt out in [Haystack docs](https://docs.haystack.deepset.ai/v2.0/docs/telemetry).
+Read more about telemetry in Haystack or how you can opt out in [Haystack docs](https://docs.haystack.deepset.ai/docs/telemetry).
 
 ## 🖖 Community
 

diff --git a/VERSION.txt b/VERSION.txt
@@ -1 +1 @@
-2.7.0-rc0
+2.8.0-rc0
diff --git a/docs/pydoc/config/joiners_api.yml b/docs/pydoc/config/joiners_api.yml
@@ -1,7 +1,7 @@
 loaders:
   - type: haystack_pydoc_tools.loaders.CustomPythonLoader
     search_path: [../../../haystack/components/joiners]
-    modules: ["document_joiner", "branch", "answer_joiner"]
+    modules: ["document_joiner", "branch", "answer_joiner", "string_joiner"]
     ignore_when_discovered: ["__init__"]
 processors:
   - type: filter

diff --git a/haystack/components/converters/docx.py b/haystack/components/converters/docx.py
@@ -2,12 +2,15 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import csv
 import io
 from dataclasses import dataclass
+from enum import Enum
+from io import StringIO
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
-from haystack import Document, component, logging
+from haystack import Document, component, default_from_dict, default_to_dict, logging
 from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
 from haystack.dataclasses import ByteStream
 from haystack.lazy_imports import LazyImport
@@ -17,6 +20,7 @@
 with LazyImport("Run 'pip install python-docx'") as docx_import:
     import docx
     from docx.document import Document as DocxDocument
+    from docx.table import Table
     from docx.text.paragraph import Paragraph
 
 
@@ -59,6 +63,30 @@ class DOCXMetadata:
     version: str
 
 
+class DOCXTableFormat(Enum):
+    """
+    Supported formats for storing DOCX tabular data in a Document.
+    """
+
+    MARKDOWN = "markdown"
+    CSV = "csv"
+
+    def __str__(self):
+        return self.value
+
+    @staticmethod
+    def from_str(string: str) -> "DOCXTableFormat":
+        """
+        Convert a string to a DOCXTableFormat enum.
+        """
+        enum_map = {e.value: e for e in DOCXTableFormat}
+        table_format = enum_map.get(string.lower())
+        if table_format is None:
+            msg = f"Unknown table format '{string}'. Supported formats are: {list(enum_map.keys())}"
+            raise ValueError(msg)
+        return table_format
+
+
 @component
 class DOCXToDocument:
     """
@@ -69,21 +97,48 @@ class DOCXToDocument:
 
     Usage example:
     ```python
-    from haystack.components.converters.docx import DOCXToDocument
+    from haystack.components.converters.docx import DOCXToDocument, DOCXTableFormat
 
-    converter = DOCXToDocument()
+    converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
     results = converter.run(sources=["sample.docx"], meta={"date_added": datetime.now().isoformat()})
     documents = results["documents"]
     print(documents[0].content)
     # 'This is a text from the DOCX file.'
     ```
     """
 
-    def __init__(self):
+    def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV):
         """
         Create a DOCXToDocument component.
+
+        :param table_format: The format for table output. Can be either DOCXTableFormat.MARKDOWN,
+            DOCXTableFormat.CSV, "markdown", or "csv". Defaults to DOCXTableFormat.CSV.
         """
         docx_import.check()
+        self.table_format = DOCXTableFormat.from_str(table_format) if isinstance(table_format, str) else table_format
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        """
+        return default_to_dict(self, table_format=str(self.table_format))
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "DOCXToDocument":
+        """
+        Deserializes the component from a dictionary.
+
+        :param data:
+            The dictionary to deserialize from.
+        :returns:
+            The deserialized component.
+        """
+        if "table_format" in data["init_parameters"]:
+            data["init_parameters"]["table_format"] = DOCXTableFormat.from_str(data["init_parameters"]["table_format"])
+        return default_from_dict(cls, data)
 
     @component.output_types(documents=List[Document])
     def run(
@@ -118,9 +173,9 @@ def run(
                 logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                 continue
             try:
-                file = docx.Document(io.BytesIO(bytestream.data))
-                paragraphs = self._extract_paragraphs_with_page_breaks(file.paragraphs)
-                text = "\n".join(paragraphs)
+                docx_document = docx.Document(io.BytesIO(bytestream.data))
+                elements = self._extract_elements(docx_document)
+                text = "\n".join(elements)
             except Exception as e:
                 logger.warning(
                     "Could not read {source} and convert it to a DOCX Document, skipping. Error: {error}",
@@ -129,52 +184,116 @@ def run(
                 )
                 continue
 
-            docx_metadata = self._get_docx_metadata(document=file)
+            docx_metadata = self._get_docx_metadata(document=docx_document)
             merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata}
             document = Document(content=text, meta=merged_metadata)
             documents.append(document)
 
         return {"documents": documents}
 
-    def _extract_paragraphs_with_page_breaks(self, paragraphs: List["Paragraph"]) -> List[str]:
+    def _extract_elements(self, document: "DocxDocument") -> List[str]:
         """
-        Extracts paragraphs from a DOCX file, including page breaks.
+        Extracts elements from a DOCX file.
 
-        Page breaks (both soft and hard page breaks) are not automatically extracted by python-docx as '\f' chars.
-        This means we need to add them in ourselves, as done here. This allows the correct page number
-        to be associated with each document if the file contents are split, e.g. by DocumentSplitter.
+        :param document: The DOCX Document object.
+        :returns: List of strings (paragraph texts and table representations) with page breaks as '\\f' characters.
+        """
+        elements = []
+        for element in document.element.body:
+            if element.tag.endswith("p"):
+                paragraph = Paragraph(element, document)
+                if paragraph.contains_page_break:
+                    para_text = self._process_paragraph_with_page_breaks(paragraph)
+                else:
+                    para_text = paragraph.text
+                elements.append(para_text)
+            elif element.tag.endswith("tbl"):
+                table = docx.table.Table(element, document)
+                table_str = (
+                    self._table_to_markdown(table)
+                    if self.table_format == DOCXTableFormat.MARKDOWN
+                    else self._table_to_csv(table)
+                )
+                elements.append(table_str)
 
-        :param paragraphs:
-            List of paragraphs from a DOCX file.
+        return elements
 
-        :returns:
-            List of strings (paragraph text fields) with all page breaks added in as '\f' characters.
-        """
-        paragraph_texts = []
-        for para in paragraphs:
-            if para.contains_page_break:
-                para_text_w_page_breaks = ""
-                # Usually, just 1 page break exists, but could be more if paragraph is really long, so we loop over them
-                for pb_index, page_break in enumerate(para.rendered_page_breaks):
-                    # Can only extract text from first paragraph page break, unfortunately
-                    if pb_index == 0:
-                        if page_break.preceding_paragraph_fragment:
-                            para_text_w_page_breaks += page_break.preceding_paragraph_fragment.text
-                        para_text_w_page_breaks += "\f"
-                        if page_break.following_paragraph_fragment:
-                            # following_paragraph_fragment contains all text for remainder of paragraph.
-                            # However, if the remainder of the paragraph spans multiple page breaks, it won't include
-                            # those later page breaks so we have to add them at end of text in the `else` block below.
-                            # This is not ideal, but this case should be very rare and this is likely good enough.
-                            para_text_w_page_breaks += page_break.following_paragraph_fragment.text
-                    else:
-                        para_text_w_page_breaks += "\f"
-
-                paragraph_texts.append(para_text_w_page_breaks)
+    def _process_paragraph_with_page_breaks(self, paragraph: "Paragraph") -> str:
+        """
+        Processes a paragraph with page breaks.
+
+        :param paragraph: The DOCX paragraph to process.
+        :returns: A string with page breaks added as '\\f' characters.
+        """
+        para_text = ""
+        # Usually, just 1 page break exists, but could be more if paragraph is really long, so we loop over them
+        for pb_index, page_break in enumerate(paragraph.rendered_page_breaks):
+            # Can only extract text from first paragraph page break, unfortunately
+            if pb_index == 0:
+                if page_break.preceding_paragraph_fragment:
+                    para_text += page_break.preceding_paragraph_fragment.text
+                para_text += "\f"
+                if page_break.following_paragraph_fragment:
+                    # following_paragraph_fragment contains all text for remainder of paragraph.
+                    # However, if the remainder of the paragraph spans multiple page breaks, it won't include
+                    # those later page breaks so we have to add them at end of text in the `else` block below.
+                    # This is not ideal, but this case should be very rare and this is likely good enough.
+                    para_text += page_break.following_paragraph_fragment.text
             else:
-                paragraph_texts.append(para.text)
+                para_text += "\f"
+        return para_text
+
+    def _table_to_markdown(self, table: "Table") -> str:
+        """
+        Converts a DOCX table to a Markdown string.
+
+        :param table: The DOCX table to convert.
+        :returns: A Markdown string representation of the table.
+        """
+        markdown: List[str] = []
+        max_col_widths: List[int] = []
+
+        # Calculate max width for each column
+        for row in table.rows:
+            for i, cell in enumerate(row.cells):
+                cell_text = cell.text.strip()
+                if i >= len(max_col_widths):
+                    max_col_widths.append(len(cell_text))
+                else:
+                    max_col_widths[i] = max(max_col_widths[i], len(cell_text))
+
+        # Process rows
+        for i, row in enumerate(table.rows):
+            md_row = [cell.text.strip().ljust(max_col_widths[j]) for j, cell in enumerate(row.cells)]
+            markdown.append("| " + " | ".join(md_row) + " |")
+
+            # Add separator after header row
+            if i == 0:
+                separator = ["-" * max_col_widths[j] for j in range(len(row.cells))]
+                markdown.append("| " + " | ".join(separator) + " |")
+
+        return "\n".join(markdown)
+
+    def _table_to_csv(self, table: "Table") -> str:
+        """
+        Converts a DOCX table to a CSV string.
+
+        :param table: The DOCX table to convert.
+        :returns: A CSV string representation of the table.
+        """
+        csv_output = StringIO()
+        csv_writer = csv.writer(csv_output, quoting=csv.QUOTE_MINIMAL)
+
+        # Process rows
+        for row in table.rows:
+            csv_row = [cell.text.strip() for cell in row.cells]
+            csv_writer.writerow(csv_row)
+
+        # Get the CSV as a string and strip any trailing newlines
+        csv_string = csv_output.getvalue().strip()
+        csv_output.close()
 
-        return paragraph_texts
+        return csv_string
 
     def _get_docx_metadata(self, document: "DocxDocument") -> DOCXMetadata:
         """
@@ -191,15 +310,15 @@ def _get_docx_metadata(self, document: "DocxDocument") -> DOCXMetadata:
             category=document.core_properties.category,
             comments=document.core_properties.comments,
             content_status=document.core_properties.content_status,
-            created=document.core_properties.created.isoformat() if document.core_properties.created else None,
+            created=(document.core_properties.created.isoformat() if document.core_properties.created else None),
             identifier=document.core_properties.identifier,
             keywords=document.core_properties.keywords,
             language=document.core_properties.language,
             last_modified_by=document.core_properties.last_modified_by,
-            last_printed=document.core_properties.last_printed.isoformat()
-            if document.core_properties.last_printed
-            else None,
-            modified=document.core_properties.modified.isoformat() if document.core_properties.modified else None,
+            last_printed=(
+                document.core_properties.last_printed.isoformat() if document.core_properties.last_printed else None
+            ),
+            modified=(document.core_properties.modified.isoformat() if document.core_properties.modified else None),
             revision=document.core_properties.revision,
             subject=document.core_properties.subject,
             title=document.core_properties.title,