Skip to content

Commit

Permalink
Merge branch 'main' into dalle-image-gen
Browse files Browse the repository at this point in the history
  • Loading branch information
sjrl authored Oct 31, 2024
2 parents a1e83e8 + 294a67e commit 54590da
Show file tree
Hide file tree
Showing 50 changed files with 2,307 additions and 444 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/e2e.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ on:
- ".github/workflows/e2e.yml"

env:
PYTHON_VERSION: "3.8"
PYTHON_VERSION: "3.9"
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
HATCH_VERSION: "1.13.0"

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/minor_version_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ on:
workflow_dispatch:

env:
PYTHON_VERSION: "3.8"
PYTHON_VERSION: "3.9"

jobs:
sync:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/promote_unstable_docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ on:
# Exclude 1.x tags
- "!v1.[0-9]+.[0-9]+"
env:
PYTHON_VERSION: "3.8"
PYTHON_VERSION: "3.9"

jobs:
promote:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ env:
AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
HF_API_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
PYTHON_VERSION: "3.8"
PYTHON_VERSION: "3.9"
HATCH_VERSION: "1.13.0"

jobs:
Expand Down
13 changes: 9 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,23 @@ The simplest way to get Haystack is via pip:
pip install haystack-ai
```

Install from the `main` branch to try the newest features:
```sh
pip install git+https://github.com/deepset-ai/haystack.git@main
```

Haystack supports multiple installation methods including Docker images. For a comprehensive guide please refer
to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/installation).
to the [documentation](https://docs.haystack.deepset.ai/docs/installation).

## Documentation

If you're new to the project, check out ["What is Haystack?"](https://haystack.deepset.ai/overview/intro) then go
through the ["Get Started Guide"](https://haystack.deepset.ai/overview/quick-start) and build your first LLM application
in a matter of minutes. Keep learning with the [tutorials](https://haystack.deepset.ai/tutorials?v=2.0). For more advanced
in a matter of minutes. Keep learning with the [tutorials](https://haystack.deepset.ai/tutorials). For more advanced
use cases, or just to get some inspiration, you can browse our Haystack recipes in the
[Cookbook](https://haystack.deepset.ai/cookbook).

At any given point, hit the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/intro) to learn more about Haystack, what can it do for you and the technology behind.
At any given point, hit the [documentation](https://docs.haystack.deepset.ai/docs/intro) to learn more about Haystack, what it can do for you, and the technology behind it.

## Features

Expand Down Expand Up @@ -74,7 +79,7 @@ Use **deepset Studio** to visually create and export your Haystack pipeline arch

Haystack collects **anonymous** usage statistics of pipeline components. We receive an event every time these components are initialized. This way, we know which components are most relevant to our community.

Read more about telemetry in Haystack or how you can opt out in [Haystack docs](https://docs.haystack.deepset.ai/v2.0/docs/telemetry).
Read more about telemetry in Haystack or how you can opt out in [Haystack docs](https://docs.haystack.deepset.ai/docs/telemetry).

## 🖖 Community

Expand Down
2 changes: 1 addition & 1 deletion VERSION.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.7.0-rc0
2.8.0-rc0
2 changes: 1 addition & 1 deletion docs/pydoc/config/joiners_api.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
loaders:
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
search_path: [../../../haystack/components/joiners]
modules: ["document_joiner", "branch", "answer_joiner"]
modules: ["document_joiner", "branch", "answer_joiner", "string_joiner"]
ignore_when_discovered: ["__init__"]
processors:
- type: filter
Expand Down
211 changes: 165 additions & 46 deletions haystack/components/converters/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@
#
# SPDX-License-Identifier: Apache-2.0

import csv
import io
from dataclasses import dataclass
from enum import Enum
from io import StringIO
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from haystack import Document, component, logging
from haystack import Document, component, default_from_dict, default_to_dict, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream
from haystack.lazy_imports import LazyImport
Expand All @@ -17,6 +20,7 @@
with LazyImport("Run 'pip install python-docx'") as docx_import:
import docx
from docx.document import Document as DocxDocument
from docx.table import Table
from docx.text.paragraph import Paragraph


Expand Down Expand Up @@ -59,6 +63,30 @@ class DOCXMetadata:
version: str


class DOCXTableFormat(Enum):
    """
    Output formats available for rendering DOCX tables inside a Document.

    Each member's value is the lowercase string used to select that format.
    """

    MARKDOWN = "markdown"
    CSV = "csv"

    def __str__(self):
        # Render as the bare value (e.g. "csv") rather than "DOCXTableFormat.CSV".
        return self.value

    @staticmethod
    def from_str(string: str) -> "DOCXTableFormat":
        """
        Convert a string to a DOCXTableFormat enum.

        The lookup is case-insensitive.

        :param string: Name of the table format, e.g. "markdown" or "csv".
        :returns: The matching DOCXTableFormat member.
        :raises ValueError: If the string does not match any supported format.
        """
        wanted = string.lower()
        for member in DOCXTableFormat:
            if member.value == wanted:
                return member

        supported = [member.value for member in DOCXTableFormat]
        msg = f"Unknown table format '{string}'. Supported formats are: {supported}"
        raise ValueError(msg)


@component
class DOCXToDocument:
"""
Expand All @@ -69,21 +97,48 @@ class DOCXToDocument:
Usage example:
```python
from haystack.components.converters.docx import DOCXToDocument
from haystack.components.converters.docx import DOCXToDocument, DOCXTableFormat
converter = DOCXToDocument()
converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
results = converter.run(sources=["sample.docx"], meta={"date_added": datetime.now().isoformat()})
documents = results["documents"]
print(documents[0].content)
# 'This is a text from the DOCX file.'
```
"""

def __init__(self):
def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV):
"""
Create a DOCXToDocument component.
:param table_format: The format for table output. Can be either DOCXTableFormat.MARKDOWN,
DOCXTableFormat.CSV, "markdown", or "csv". Defaults to DOCXTableFormat.CSV.
"""
docx_import.check()
self.table_format = DOCXTableFormat.from_str(table_format) if isinstance(table_format, str) else table_format

def to_dict(self) -> Dict[str, Any]:
    """
    Serializes the component to a dictionary.

    The configured table format is passed on in its string form.

    :returns:
        Dictionary with serialized data.
    """
    serialized_table_format = str(self.table_format)
    return default_to_dict(self, table_format=serialized_table_format)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "DOCXToDocument":
    """
    Deserializes the component from a dictionary.

    If a `table_format` init parameter is present as a string, it is converted to a DOCXTableFormat
    member in place before the dictionary is handed to `default_from_dict`.

    :param data:
        The dictionary to deserialize from.
    :returns:
        The deserialized component.
    :raises ValueError:
        If `table_format` is a string that does not name a supported format.
    """
    # A missing "init_parameters" entry is treated as empty here instead of raising a KeyError.
    init_parameters = data.get("init_parameters", {})
    table_format = init_parameters.get("table_format")
    # Only strings need converting; an already-built DOCXTableFormat is passed through untouched.
    if isinstance(table_format, str):
        init_parameters["table_format"] = DOCXTableFormat.from_str(table_format)
    return default_from_dict(cls, data)

@component.output_types(documents=List[Document])
def run(
Expand Down Expand Up @@ -118,9 +173,9 @@ def run(
logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
continue
try:
file = docx.Document(io.BytesIO(bytestream.data))
paragraphs = self._extract_paragraphs_with_page_breaks(file.paragraphs)
text = "\n".join(paragraphs)
docx_document = docx.Document(io.BytesIO(bytestream.data))
elements = self._extract_elements(docx_document)
text = "\n".join(elements)
except Exception as e:
logger.warning(
"Could not read {source} and convert it to a DOCX Document, skipping. Error: {error}",
Expand All @@ -129,52 +184,116 @@ def run(
)
continue

docx_metadata = self._get_docx_metadata(document=file)
docx_metadata = self._get_docx_metadata(document=docx_document)
merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata}
document = Document(content=text, meta=merged_metadata)
documents.append(document)

return {"documents": documents}

def _extract_paragraphs_with_page_breaks(self, paragraphs: List["Paragraph"]) -> List[str]:
def _extract_elements(self, document: "DocxDocument") -> List[str]:
"""
Extracts paragraphs from a DOCX file, including page breaks.
Extracts elements from a DOCX file.
Page breaks (both soft and hard page breaks) are not automatically extracted by python-docx as '\f' chars.
This means we need to add them in ourselves, as done here. This allows the correct page number
to be associated with each document if the file contents are split, e.g. by DocumentSplitter.
:param document: The DOCX Document object.
:returns: List of strings (paragraph texts and table representations) with page breaks added as '\f' characters.
"""
elements = []
for element in document.element.body:
if element.tag.endswith("p"):
paragraph = Paragraph(element, document)
if paragraph.contains_page_break:
para_text = self._process_paragraph_with_page_breaks(paragraph)
else:
para_text = paragraph.text
elements.append(para_text)
elif element.tag.endswith("tbl"):
table = docx.table.Table(element, document)
table_str = (
self._table_to_markdown(table)
if self.table_format == DOCXTableFormat.MARKDOWN
else self._table_to_csv(table)
)
elements.append(table_str)

:param paragraphs:
List of paragraphs from a DOCX file.
return elements

:returns:
List of strings (paragraph text fields) with all page breaks added in as '\f' characters.
"""
paragraph_texts = []
for para in paragraphs:
if para.contains_page_break:
para_text_w_page_breaks = ""
# Usually, just 1 page break exists, but could be more if paragraph is really long, so we loop over them
for pb_index, page_break in enumerate(para.rendered_page_breaks):
# Can only extract text from first paragraph page break, unfortunately
if pb_index == 0:
if page_break.preceding_paragraph_fragment:
para_text_w_page_breaks += page_break.preceding_paragraph_fragment.text
para_text_w_page_breaks += "\f"
if page_break.following_paragraph_fragment:
# following_paragraph_fragment contains all text for remainder of paragraph.
# However, if the remainder of the paragraph spans multiple page breaks, it won't include
# those later page breaks so we have to add them at end of text in the `else` block below.
# This is not ideal, but this case should be very rare and this is likely good enough.
para_text_w_page_breaks += page_break.following_paragraph_fragment.text
else:
para_text_w_page_breaks += "\f"

paragraph_texts.append(para_text_w_page_breaks)
def _process_paragraph_with_page_breaks(self, paragraph: "Paragraph") -> str:
"""
Processes a paragraph with page breaks.
:param paragraph: The DOCX paragraph to process.
:returns: A string with page breaks added as '\f' characters.
"""
para_text = ""
# Usually, just 1 page break exists, but could be more if paragraph is really long, so we loop over them
for pb_index, page_break in enumerate(paragraph.rendered_page_breaks):
# Can only extract text from first paragraph page break, unfortunately
if pb_index == 0:
if page_break.preceding_paragraph_fragment:
para_text += page_break.preceding_paragraph_fragment.text
para_text += "\f"
if page_break.following_paragraph_fragment:
# following_paragraph_fragment contains all text for remainder of paragraph.
# However, if the remainder of the paragraph spans multiple page breaks, it won't include
# those later page breaks so we have to add them at end of text in the `else` block below.
# This is not ideal, but this case should be very rare and this is likely good enough.
para_text += page_break.following_paragraph_fragment.text
else:
paragraph_texts.append(para.text)
para_text += "\f"
return para_text

def _table_to_markdown(self, table: "Table") -> str:
"""
Converts a DOCX table to a Markdown string.
:param table: The DOCX table to convert.
:returns: A Markdown string representation of the table.
"""
markdown: List[str] = []
max_col_widths: List[int] = []

# Calculate max width for each column
for row in table.rows:
for i, cell in enumerate(row.cells):
cell_text = cell.text.strip()
if i >= len(max_col_widths):
max_col_widths.append(len(cell_text))
else:
max_col_widths[i] = max(max_col_widths[i], len(cell_text))

# Process rows
for i, row in enumerate(table.rows):
md_row = [cell.text.strip().ljust(max_col_widths[j]) for j, cell in enumerate(row.cells)]
markdown.append("| " + " | ".join(md_row) + " |")

# Add separator after header row
if i == 0:
separator = ["-" * max_col_widths[j] for j in range(len(row.cells))]
markdown.append("| " + " | ".join(separator) + " |")

return "\n".join(markdown)

def _table_to_csv(self, table: "Table") -> str:
"""
Converts a DOCX table to a CSV string.
:param table: The DOCX table to convert.
:returns: A CSV string representation of the table.
"""
csv_output = StringIO()
csv_writer = csv.writer(csv_output, quoting=csv.QUOTE_MINIMAL)

# Process rows
for row in table.rows:
csv_row = [cell.text.strip() for cell in row.cells]
csv_writer.writerow(csv_row)

# Get the CSV as a string and strip any trailing newlines
csv_string = csv_output.getvalue().strip()
csv_output.close()

return paragraph_texts
return csv_string

def _get_docx_metadata(self, document: "DocxDocument") -> DOCXMetadata:
"""
Expand All @@ -191,15 +310,15 @@ def _get_docx_metadata(self, document: "DocxDocument") -> DOCXMetadata:
category=document.core_properties.category,
comments=document.core_properties.comments,
content_status=document.core_properties.content_status,
created=document.core_properties.created.isoformat() if document.core_properties.created else None,
created=(document.core_properties.created.isoformat() if document.core_properties.created else None),
identifier=document.core_properties.identifier,
keywords=document.core_properties.keywords,
language=document.core_properties.language,
last_modified_by=document.core_properties.last_modified_by,
last_printed=document.core_properties.last_printed.isoformat()
if document.core_properties.last_printed
else None,
modified=document.core_properties.modified.isoformat() if document.core_properties.modified else None,
last_printed=(
document.core_properties.last_printed.isoformat() if document.core_properties.last_printed else None
),
modified=(document.core_properties.modified.isoformat() if document.core_properties.modified else None),
revision=document.core_properties.revision,
subject=document.core_properties.subject,
title=document.core_properties.title,
Expand Down
Loading

0 comments on commit 54590da

Please sign in to comment.