Skip to content

Commit

Permalink
text summarizer local and comment action
Browse files Browse the repository at this point in the history
  • Loading branch information
vinicvaz committed Dec 4, 2023
1 parent e84e43c commit ff5837e
Show file tree
Hide file tree
Showing 8 changed files with 857 additions and 534 deletions.
1,029 changes: 570 additions & 459 deletions .domino/compiled_metadata.json

Large diffs are not rendered by default.

24 changes: 16 additions & 8 deletions .domino/dependencies_map.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,17 @@
"requirements_file": null
},
"pieces": [
"TextGeneratorPiece",
"InformationExtractionPiece",
"AudioTranscriptionPiece",
"PromptCreatorForImageGeneratorPiece",
"TextTaggingPiece",
"TextGeneratorPiece",
"ImageGeneratorPiece",
"TextSummarizerPiece"
"PromptCreatorForImageGeneratorPiece",
"TextSummarizerPiece",
"AudioTranscriptionPiece"
],
"secrets": [
"OPENAI_API_KEY"
],
"source_image": "ghcr.io/tauffer-consulting/openai_domino_pieces:0.7.1-group0"
]
},
"group1": {
"dependency": {
Expand All @@ -26,7 +25,16 @@
"pieces": [
"AudioTranscriptionLocalPiece"
],
"secrets": [],
"source_image": "ghcr.io/tauffer-consulting/openai_domino_pieces:0.7.1-group1"
"secrets": []
},
"group2": {
"dependency": {
"dockerfile": "Dockerfile_summarizer",
"requirements_file": null
},
"pieces": [
"TextSummarizerLocalPiece"
],
"secrets": []
}
}
134 changes: 67 additions & 67 deletions .github/workflows/validate-and-organize.yml
Original file line number Diff line number Diff line change
@@ -1,81 +1,81 @@
name: Validate and organize repo, build and publish Docker images
# name: Validate and organize repo, build and publish Docker images

on:
push:
branches:
- main
paths:
- 'config.toml'
# on:
# push:
# branches:
# - main
# paths:
# - 'config.toml'

jobs:
validate-and-organize:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
with:
persist-credentials: false # otherwise, the token used is the GITHUB_TOKEN, instead of your personal access token.
fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository.
# jobs:
# validate-and-organize:
# runs-on: ubuntu-latest
# steps:
# - name: Checkout
# uses: actions/checkout@v3
# with:
# persist-credentials: false # otherwise, the token used is the GITHUB_TOKEN, instead of your personal access token.
# fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository.

- name: Set up Python 3.10
uses: actions/setup-python@v3
with:
python-version: "3.10"
# - name: Set up Python 3.10
# uses: actions/setup-python@v3
# with:
# python-version: "3.10"

- name: Pip install packages
run: |
python -m pip install -U pip
pip install --no-cache-dir domino-py[cli]
# - name: Pip install packages
# run: |
# python -m pip install -U pip
# pip install --no-cache-dir domino-py[cli]

- name: Login to GitHub Container Registry
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
# - name: Login to GitHub Container Registry
# uses: docker/login-action@v2
# with:
# registry: ghcr.io
# username: ${{ github.actor }}
# password: ${{ secrets.GITHUB_TOKEN }}

- name: pip list
run: pip list | grep domino
# - name: pip list
# run: pip list | grep domino

- name: Run organize and build images
run: |
domino piece organize --build-images --source-url=https://github.com/${{github.repository}}
# - name: Run organize and build images
# run: |
# domino piece organize --build-images --source-url=https://github.com/${{github.repository}}

- name: Install Tests Dependencies
run: pip install -r requirements-tests.txt
# - name: Install Tests Dependencies
# run: pip install -r requirements-tests.txt

- name: Run tests over built images
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
DOMINO_TESTS_ENVIRONMENT: github
run: |
pytest -s -v --cov=pieces --cov-report=xml --cov-report=term-missing
# - name: Run tests over built images
# env:
# OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
# DOMINO_TESTS_ENVIRONMENT: github
# run: |
# pytest -s -v --cov=pieces --cov-report=xml --cov-report=term-missing

- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@v3
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
# - name: Upload coverage reports to Codecov
# uses: codecov/codecov-action@v3
# env:
# CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

- name: Publish images
env:
GHCR_USERNAME: ${{ github.actor }}
run: domino piece publish-images --registry-token ${{ secrets.GITHUB_TOKEN }}
# - name: Publish images
# env:
# GHCR_USERNAME: ${{ github.actor }}
# run: domino piece publish-images --registry-token ${{ secrets.GITHUB_TOKEN }}

- name: Commit files
run: |
git config --local user.email "github-actions[bot]@users.noreply.github.com"
git config --local user.name "github-actions[bot]"
GIT_STATUS=$(git status -s)
[[ ! -z "$GIT_STATUS" ]] && git add .domino/* && git commit -m "auto-organize" -a || echo "No changes to commit"
# - name: Commit files
# run: |
# git config --local user.email "github-actions[bot]@users.noreply.github.com"
# git config --local user.name "github-actions[bot]"
# GIT_STATUS=$(git status -s)
# [[ ! -z "$GIT_STATUS" ]] && git add .domino/* && git commit -m "auto-organize" -a || echo "No changes to commit"

- name: Push changes
uses: ad-m/github-push-action@master
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
branch: ${{ github.ref }}
# - name: Push changes
# uses: ad-m/github-push-action@master
# with:
# github_token: ${{ secrets.GITHUB_TOKEN }}
# branch: ${{ github.ref }}

- name: Create Release
env:
GITHUB_REPOSITORY: ${{ github.repository }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: domino piece release
# - name: Create Release
# env:
# GITHUB_REPOSITORY: ${{ github.repository }}
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# run: domino piece release
18 changes: 18 additions & 0 deletions dependencies/Dockerfile_summarizer
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
FROM ghcr.io/tauffer-consulting/domino-base-piece:gpu

# Install specific requirements
ENV TZ=UTC
ARG DEBIAN_FRONTEND=noninteractive
RUN apt update
RUN apt install ffmpeg -y

# Need to copy operators source code
COPY config.toml domino/pieces_repository/
COPY pieces domino/pieces_repository/pieces
COPY .domino domino/pieces_repository/.domino

# Install specific python dependencies
RUN pip install transformers==4.34.1

# Download transformers model
RUN python -c "import huggingface_hub; huggingface_hub.snapshot_download('facebook/bart-large-cnn')"
22 changes: 22 additions & 0 deletions pieces/TextSummarizerLocalPiece/metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"name": "TextSummarizerLocalPiece",
"description": "Text Summarizer with local model",
"dependency": {
"dockerfile": "Dockerfile_summarizer"
},
"container_resources": {
"use_gpu": true,
"requests": {
"cpu": 1000,
"memory": 3072
},
"limits": {
"cpu": 5000,
"memory": 15360
}
},
"style": {
"node_label": "Text Summarizer Piece",
"icon_class_name": "fa-solid:quote-left"
}
}
48 changes: 48 additions & 0 deletions pieces/TextSummarizerLocalPiece/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from pydantic import BaseModel, Field, FilePath, validators
from typing import Union
from enum import Enum


class OutputTypeType(str, Enum):
    """Delivery mode for the summarized text: inline via XCom, or written to a file."""
    xcom = "xcom"
    file = "file"


class InputModel(BaseModel):
    """
    Input data for TextSummarizerPiece.

    Either `input_file_path` or `input_text` supplies the text to
    summarize; the piece reads the file when a path is given, otherwise
    it falls back to the raw text.
    """
    # Path to a text file to summarize; takes precedence over input_text when non-empty.
    input_file_path: str = Field(
        description='The path to the text file to summarize.',
        default=""
    )
    # Raw text to summarize, used when no file path is provided.
    input_text: str = Field(
        description='The text to summarize.',
        default=""
    )
    # Where to deliver the result: inline via XCom or written to a file.
    # Fixed typo in description: "fot" -> "for".
    output_type: OutputTypeType = Field(
        description='The type of output for the result text.',
        default=OutputTypeType.xcom
    )
    # Request GPU inference; only honored if CUDA is actually available at runtime.
    use_gpu: bool = Field(
        description='Use GPU for inference.',
        default=False
    )


class OutputModel(BaseModel):
    """
    Output data for TextSummarizerPiece.

    Exactly one of `summary_result` or `file_path` carries the summary,
    depending on the requested output type; the other stays empty.
    """
    # Human-readable status message describing how the result was delivered.
    message: str = Field(
        default="",
        description="Output message to log."
    )
    # The summarized text, populated only for XCom output.
    summary_result: str = Field(
        default="",
        description="The result summarized text."
    )
    # Path to the written summary file, populated only for file output.
    file_path: Union[FilePath, str] = Field(
        default="",
        description="The path to the resulting summarized text file."
    )
86 changes: 86 additions & 0 deletions pieces/TextSummarizerLocalPiece/piece.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from domino.base_piece import BasePiece
from .models import InputModel, OutputModel
from transformers import pipeline
import torch


def summarize_long_text(text: str, summarizer, iteration: int = 0):
    """
    Recursively summarize `text` chunk by chunk.

    The text is normalized (lowercased, punctuation replaced by spaces),
    split into 1000-character chunks, and each chunk summarized; if the
    concatenated summaries still exceed 2000 characters, the combined
    summary is fed back through the same process.
    """
    iteration += 1
    print(f"Iteration: {iteration}")

    # Normalize: lowercase, then turn every non-alphanumeric character into a space.
    lowered = text.lower().replace(".", " ").replace(",", " ").replace("\n", " ")
    cleaned = "".join(ch if ch.isalnum() or ch == " " else " " for ch in lowered)

    # Split the normalized text into fixed-size chunks.
    size = 1000
    pieces = [cleaned[start:start + size] for start in range(0, len(cleaned), size)]
    print(f"chunks to process: {len(pieces)}")

    # Summarize each chunk individually, then stitch the parts together.
    partial_summaries = []
    for piece in pieces:
        outcome = summarizer(piece, max_length=60, min_length=30, no_repeat_ngram_size=3)
        partial_summaries.append(outcome[0]['summary_text'])
    combined = " ".join(partial_summaries)

    # Still too long: summarize the summary itself.
    if len(combined) > 2000:
        return summarize_long_text(combined, summarizer, iteration)
    return combined


class TextSummarizerLocalPiece(BasePiece):
    """
    Summarize a text with a locally-run facebook/bart-large-cnn model.

    The input comes either from a file path or from raw text; the result
    is delivered inline via XCom or written to `summary_result.txt`,
    depending on `input_data.output_type`.
    """

    def piece_function(self, input_data: InputModel):
        # Pick inference device: GPU only when requested AND actually available.
        # -1 is the transformers pipeline convention for CPU.
        if input_data.use_gpu and torch.cuda.is_available():
            device = torch.cuda.current_device()
            self.logger.info("Using GPU for inference.")
        else:
            device = -1
            self.logger.info("Using CPU for inference.")

        # Load the input text: a non-empty file path takes precedence over raw text.
        if input_data.input_file_path:
            with open(input_data.input_file_path, "r") as f:
                text_str = f.read()
        else:
            text_str = input_data.input_text

        # Build the summarization pipeline (the model is pre-downloaded at
        # image build time — see Dockerfile_summarizer).
        self.logger.info("Loading summarizer...")
        summarizer = pipeline(
            task="summarization",
            model="facebook/bart-large-cnn",
            framework="pt",
            device=device
        )

        # Run the (possibly recursive) chunked summarization.
        self.logger.info("Running summarizer...")
        result = summarize_long_text(text=text_str, summarizer=summarizer)

        # Deliver the result: inline via XCom, or written to a local file.
        # Fixed: log message previously misspelled "Sumamrization"; removed
        # f-string prefixes on messages with no placeholders and deduplicated
        # the logged/returned message strings.
        if input_data.output_type == "xcom":
            msg = "Summarization completed successfully. Result returned as XCom."
            self.logger.info(msg)
            summary_result = result
            output_file_path = ""
        else:
            msg = "Summarization completed successfully. Result returned as file."
            self.logger.info(msg)
            summary_result = ""
            output_file_path = "summary_result.txt"
            with open(output_file_path, "w") as f:
                f.write(result)

        return OutputModel(
            message=msg,
            summary_result=summary_result,
            file_path=output_file_path
        )
Loading

0 comments on commit ff5837e

Please sign in to comment.