generated from Tauffer-Consulting/domino_pieces_repository_template
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
text summarizer local and comment action
- Loading branch information
Showing
8 changed files
with
857 additions
and
534 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,81 +1,81 @@ | ||
name: Validate and organize repo, build and publish Docker images | ||
# name: Validate and organize repo, build and publish Docker images | ||
|
||
on: | ||
push: | ||
branches: | ||
- main | ||
paths: | ||
- 'config.toml' | ||
# on: | ||
# push: | ||
# branches: | ||
# - main | ||
# paths: | ||
# - 'config.toml' | ||
|
||
jobs: | ||
validate-and-organize: | ||
runs-on: ubuntu-latest | ||
steps: | ||
- name: Checkout | ||
uses: actions/checkout@v3 | ||
with: | ||
persist-credentials: false # otherwise, the token used is the GITHUB_TOKEN, instead of your personal access token. | ||
fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository. | ||
# jobs: | ||
# validate-and-organize: | ||
# runs-on: ubuntu-latest | ||
# steps: | ||
# - name: Checkout | ||
# uses: actions/checkout@v3 | ||
# with: | ||
# persist-credentials: false # otherwise, the token used is the GITHUB_TOKEN, instead of your personal access token. | ||
# fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository. | ||
|
||
- name: Set up Python 3.10 | ||
uses: actions/setup-python@v3 | ||
with: | ||
python-version: "3.10" | ||
# - name: Set up Python 3.10 | ||
# uses: actions/setup-python@v3 | ||
# with: | ||
# python-version: "3.10" | ||
|
||
- name: Pip install packages | ||
run: | | ||
python -m pip install -U pip | ||
pip install --no-cache-dir domino-py[cli] | ||
# - name: Pip install packages | ||
# run: | | ||
# python -m pip install -U pip | ||
# pip install --no-cache-dir domino-py[cli] | ||
|
||
- name: Login to GitHub Container Registry | ||
uses: docker/login-action@v2 | ||
with: | ||
registry: ghcr.io | ||
username: ${{ github.actor }} | ||
password: ${{ secrets.GITHUB_TOKEN }} | ||
# - name: Login to GitHub Container Registry | ||
# uses: docker/login-action@v2 | ||
# with: | ||
# registry: ghcr.io | ||
# username: ${{ github.actor }} | ||
# password: ${{ secrets.GITHUB_TOKEN }} | ||
|
||
- name: pip list | ||
run: pip list | grep domino | ||
# - name: pip list | ||
# run: pip list | grep domino | ||
|
||
- name: Run organize and build images | ||
run: | | ||
domino piece organize --build-images --source-url=https://github.com/${{github.repository}} | ||
# - name: Run organize and build images | ||
# run: | | ||
# domino piece organize --build-images --source-url=https://github.com/${{github.repository}} | ||
|
||
- name: Install Tests Dependencies | ||
run: pip install -r requirements-tests.txt | ||
# - name: Install Tests Dependencies | ||
# run: pip install -r requirements-tests.txt | ||
|
||
- name: Run tests over built images | ||
env: | ||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} | ||
DOMINO_TESTS_ENVIRONMENT: github | ||
run: | | ||
pytest -s -v --cov=pieces --cov-report=xml --cov-report=term-missing | ||
# - name: Run tests over built images | ||
# env: | ||
# OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} | ||
# DOMINO_TESTS_ENVIRONMENT: github | ||
# run: | | ||
# pytest -s -v --cov=pieces --cov-report=xml --cov-report=term-missing | ||
|
||
- name: Upload coverage reports to Codecov | ||
uses: codecov/codecov-action@v3 | ||
env: | ||
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} | ||
# - name: Upload coverage reports to Codecov | ||
# uses: codecov/codecov-action@v3 | ||
# env: | ||
# CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} | ||
|
||
- name: Publish images | ||
env: | ||
GHCR_USERNAME: ${{ github.actor }} | ||
run: domino piece publish-images --registry-token ${{ secrets.GITHUB_TOKEN }} | ||
# - name: Publish images | ||
# env: | ||
# GHCR_USERNAME: ${{ github.actor }} | ||
# run: domino piece publish-images --registry-token ${{ secrets.GITHUB_TOKEN }} | ||
|
||
- name: Commit files | ||
run: | | ||
git config --local user.email "github-actions[bot]@users.noreply.github.com" | ||
git config --local user.name "github-actions[bot]" | ||
GIT_STATUS=$(git status -s) | ||
[[ ! -z "$GIT_STATUS" ]] && git add .domino/* && git commit -m "auto-organize" -a || echo "No changes to commit" | ||
# - name: Commit files | ||
# run: | | ||
# git config --local user.email "github-actions[bot]@users.noreply.github.com" | ||
# git config --local user.name "github-actions[bot]" | ||
# GIT_STATUS=$(git status -s) | ||
# [[ ! -z "$GIT_STATUS" ]] && git add .domino/* && git commit -m "auto-organize" -a || echo "No changes to commit" | ||
|
||
- name: Push changes | ||
uses: ad-m/github-push-action@master | ||
with: | ||
github_token: ${{ secrets.GITHUB_TOKEN }} | ||
branch: ${{ github.ref }} | ||
# - name: Push changes | ||
# uses: ad-m/github-push-action@master | ||
# with: | ||
# github_token: ${{ secrets.GITHUB_TOKEN }} | ||
# branch: ${{ github.ref }} | ||
|
||
- name: Create Release | ||
env: | ||
GITHUB_REPOSITORY: ${{ github.repository }} | ||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||
run: domino piece release | ||
# - name: Create Release | ||
# env: | ||
# GITHUB_REPOSITORY: ${{ github.repository }} | ||
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||
# run: domino piece release |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
FROM ghcr.io/tauffer-consulting/domino-base-piece:gpu

# Install specific requirements (ffmpeg); noninteractive + fixed TZ so apt
# never blocks on a tzdata prompt during the build.
ENV TZ=UTC
ARG DEBIAN_FRONTEND=noninteractive
# Update and install in a single RUN layer: a cached stand-alone
# "apt update" layer can otherwise be reused with a stale package list.
RUN apt update && apt install -y ffmpeg

# Need to copy operators source code
COPY config.toml domino/pieces_repository/
COPY pieces domino/pieces_repository/pieces
COPY .domino domino/pieces_repository/.domino

# Install specific python dependencies
RUN pip install transformers==4.34.1

# Download transformers model at build time so it is baked into the image
RUN python -c "import huggingface_hub; huggingface_hub.snapshot_download('facebook/bart-large-cnn')"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
{ | ||
"name": "TextSummarizerLocalPiece", | ||
"description": "Text Summarizer with local model", | ||
"dependency": { | ||
"dockerfile": "Dockerfile_summarizer" | ||
}, | ||
"container_resources": { | ||
"use_gpu": true, | ||
"requests": { | ||
"cpu": 1000, | ||
"memory": 3072 | ||
}, | ||
"limits": { | ||
"cpu": 5000, | ||
"memory": 15360 | ||
} | ||
}, | ||
"style": { | ||
"node_label": "Text Summarizer Piece", | ||
"icon_class_name": "fa-solid:quote-left" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
from pydantic import BaseModel, Field, FilePath, validators | ||
from typing import Union | ||
from enum import Enum | ||
|
||
|
||
class OutputTypeType(str, Enum):
    """Where the summarization result is delivered: XCom value or file."""

    xcom = "xcom"
    file = "file"
|
||
|
||
class InputModel(BaseModel):
    """
    Input data for TextSummarizerPiece.

    Either ``input_file_path`` or ``input_text`` should be set; when a file
    path is given it takes precedence over the inline text (the piece reads
    the file and ignores ``input_text``).
    """
    # Path to a plain-text file to summarize; takes precedence over input_text.
    input_file_path: str = Field(
        description='The path to the text file to summarize.',
        default=""
    )
    input_text: str = Field(
        description='The text to summarize.',
        default=""
    )
    # Fixed typo in user-facing description: "fot" -> "for".
    output_type: OutputTypeType = Field(
        description='The type of output for the result text.',
        default=OutputTypeType.xcom
    )
    use_gpu: bool = Field(
        description='Use GPU for inference.',
        default=False
    )
|
||
|
||
class OutputModel(BaseModel):
    """
    Output data for TextSummarizerPiece.
    """
    # Human-readable status message for the logs.
    message: str = Field(
        description="Output message to log.",
        default="",
    )
    # Inline summary text (populated for the XCom output type).
    summary_result: str = Field(
        description="The result summarized text.",
        default="",
    )
    # Path of the written result file (populated for the file output type).
    file_path: Union[FilePath, str] = Field(
        description="The path to the resulting summarized text file.",
        default="",
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
from domino.base_piece import BasePiece | ||
from .models import InputModel, OutputModel | ||
from transformers import pipeline | ||
import torch | ||
|
||
|
||
def summarize_long_text(text: str, summarizer, iteration: int = 0, max_iterations: int = 10):
    """
    Recursively summarize ``text`` by splitting it into fixed-size chunks,
    summarizing each chunk, and concatenating the partial summaries.

    If the concatenated summary is still longer than 2000 characters, the
    function calls itself on the intermediate summary. Previously the
    recursion was unbounded — a summarizer whose output never shrinks below
    the threshold caused infinite recursion; ``max_iterations`` now caps the
    number of passes.

    Args:
        text: The raw text to summarize.
        summarizer: A callable with the HuggingFace summarization-pipeline
            interface: ``summarizer(chunk, max_length=..., min_length=...,
            no_repeat_ngram_size=...)`` returning a list of dicts with a
            ``summary_text`` key.
        iteration: Current recursion depth (incremented on each pass).
        max_iterations: Safety cap on recursion depth; guarantees termination.

    Returns:
        The final concatenated summary string.
    """
    iteration += 1
    print(f"Iteration: {iteration}")

    # Preprocess text: lowercase, then reduce to alphanumeric words
    # separated by spaces (punctuation and newlines become spaces).
    text = text.lower().replace(".", " ").replace(",", " ").replace("\n", " ")
    text = "".join(ch if ch.isalnum() or ch == " " else " " for ch in text)

    # Split the input text into fixed-size character chunks.
    chunk_size = 1000
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    print(f"chunks to process: {len(chunks)}")

    # Generate the summary for each chunk and join the partial results.
    summary_list = [
        summarizer(chunk, max_length=60, min_length=30, no_repeat_ngram_size=3)[0]['summary_text']
        for chunk in chunks
    ]
    summary = " ".join(summary_list)

    # Recurse while the combined summary is still too long, but stop at the
    # iteration cap so a non-shrinking summary cannot loop forever.
    if len(summary) > 2000 and iteration < max_iterations:
        return summarize_long_text(summary, summarizer, iteration, max_iterations)
    return summary
|
||
|
||
class TextSummarizerLocalPiece(BasePiece):
    """
    Summarize a text (inline or read from a file) with a local
    facebook/bart-large-cnn model through the HuggingFace pipeline.
    """

    def piece_function(self, input_data: InputModel):
        """
        Run the summarization.

        Args:
            input_data: Validated piece input; see InputModel.

        Returns:
            OutputModel carrying the log message and either the inline
            summary (output_type == "xcom") or the path of the written
            result file.
        """
        # Set device: GPU only when requested AND actually available.
        if input_data.use_gpu and torch.cuda.is_available():
            device = torch.cuda.current_device()
            self.logger.info("Using GPU for inference.")
        else:
            device = -1  # -1 tells the HF pipeline to run on CPU
            self.logger.info("Using CPU for inference.")

        # Load text: a file path takes precedence over inline text.
        if input_data.input_file_path:
            with open(input_data.input_file_path, "r") as f:
                text_str = f.read()
        else:
            text_str = input_data.input_text

        # Load summarizer
        self.logger.info("Loading summarizer...")
        summarizer = pipeline(
            task="summarization",
            model="facebook/bart-large-cnn",
            framework="pt",
            device=device
        )

        # Run summarizer
        self.logger.info("Running summarizer...")
        result = summarize_long_text(text=text_str, summarizer=summarizer)

        # Return result inline (XCom) or as a text file on disk.
        # Fixed: "Sumamrization" log typo; dropped placeholder-free f-prefixes.
        if input_data.output_type == "xcom":
            self.logger.info("Summarization completed successfully. Result returned as XCom.")
            msg = "Summarization completed successfully. Result returned as XCom."
            summary_result = result
            output_file_path = ""
        else:
            self.logger.info("Summarization completed successfully. Result returned as file.")
            msg = "Summarization completed successfully. Result returned as file."
            summary_result = ""
            output_file_path = "summary_result.txt"
            with open(output_file_path, "w") as f:
                f.write(result)

        return OutputModel(
            message=msg,
            summary_result=summary_result,
            file_path=output_file_path
        )
Oops, something went wrong.