Skip to content

Commit

Permalink
text summarizer local and comment action
Browse files Browse the repository at this point in the history
  • Loading branch information
vinicvaz committed Dec 4, 2023
1 parent e84e43c commit ff5837e
Show file tree
Hide file tree
Showing 8 changed files with 857 additions and 534 deletions.
1,029 changes: 570 additions & 459 deletions .domino/compiled_metadata.json

Large diffs are not rendered by default.

24 changes: 16 additions & 8 deletions .domino/dependencies_map.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,17 @@
"requirements_file": null
},
"pieces": [
"TextGeneratorPiece",
"InformationExtractionPiece",
"AudioTranscriptionPiece",
"PromptCreatorForImageGeneratorPiece",
"TextTaggingPiece",
"TextGeneratorPiece",
"ImageGeneratorPiece",
"TextSummarizerPiece"
"PromptCreatorForImageGeneratorPiece",
"TextSummarizerPiece",
"AudioTranscriptionPiece"
],
"secrets": [
"OPENAI_API_KEY"
],
"source_image": "ghcr.io/tauffer-consulting/openai_domino_pieces:0.7.1-group0"
]
},
"group1": {
"dependency": {
Expand All @@ -26,7 +25,16 @@
"pieces": [
"AudioTranscriptionLocalPiece"
],
"secrets": [],
"source_image": "ghcr.io/tauffer-consulting/openai_domino_pieces:0.7.1-group1"
"secrets": []
},
"group2": {
"dependency": {
"dockerfile": "Dockerfile_summarizer",
"requirements_file": null
},
"pieces": [
"TextSummarizerLocalPiece"
],
"secrets": []
}
}
134 changes: 67 additions & 67 deletions .github/workflows/validate-and-organize.yml
Original file line number Diff line number Diff line change
@@ -1,81 +1,81 @@
name: Validate and organize repo, build and publish Docker images
# name: Validate and organize repo, build and publish Docker images

on:
push:
branches:
- main
paths:
- 'config.toml'
# on:
# push:
# branches:
# - main
# paths:
# - 'config.toml'

jobs:
validate-and-organize:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
with:
persist-credentials: false # otherwise, the token used is the GITHUB_TOKEN, instead of your personal access token.
fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository.
# jobs:
# validate-and-organize:
# runs-on: ubuntu-latest
# steps:
# - name: Checkout
# uses: actions/checkout@v3
# with:
# persist-credentials: false # otherwise, the token used is the GITHUB_TOKEN, instead of your personal access token.
# fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository.

- name: Set up Python 3.10
uses: actions/setup-python@v3
with:
python-version: "3.10"
# - name: Set up Python 3.10
# uses: actions/setup-python@v3
# with:
# python-version: "3.10"

- name: Pip install packages
run: |
python -m pip install -U pip
pip install --no-cache-dir domino-py[cli]
# - name: Pip install packages
# run: |
# python -m pip install -U pip
# pip install --no-cache-dir domino-py[cli]

- name: Login to GitHub Container Registry
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
# - name: Login to GitHub Container Registry
# uses: docker/login-action@v2
# with:
# registry: ghcr.io
# username: ${{ github.actor }}
# password: ${{ secrets.GITHUB_TOKEN }}

- name: pip list
run: pip list | grep domino
# - name: pip list
# run: pip list | grep domino

- name: Run organize and build images
run: |
domino piece organize --build-images --source-url=https://github.com/${{github.repository}}
# - name: Run organize and build images
# run: |
# domino piece organize --build-images --source-url=https://github.com/${{github.repository}}

- name: Install Tests Dependencies
run: pip install -r requirements-tests.txt
# - name: Install Tests Dependencies
# run: pip install -r requirements-tests.txt

- name: Run tests over built images
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
DOMINO_TESTS_ENVIRONMENT: github
run: |
pytest -s -v --cov=pieces --cov-report=xml --cov-report=term-missing
# - name: Run tests over built images
# env:
# OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
# DOMINO_TESTS_ENVIRONMENT: github
# run: |
# pytest -s -v --cov=pieces --cov-report=xml --cov-report=term-missing

- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@v3
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
# - name: Upload coverage reports to Codecov
# uses: codecov/codecov-action@v3
# env:
# CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

- name: Publish images
env:
GHCR_USERNAME: ${{ github.actor }}
run: domino piece publish-images --registry-token ${{ secrets.GITHUB_TOKEN }}
# - name: Publish images
# env:
# GHCR_USERNAME: ${{ github.actor }}
# run: domino piece publish-images --registry-token ${{ secrets.GITHUB_TOKEN }}

- name: Commit files
run: |
git config --local user.email "github-actions[bot]@users.noreply.github.com"
git config --local user.name "github-actions[bot]"
GIT_STATUS=$(git status -s)
[[ ! -z "$GIT_STATUS" ]] && git add .domino/* && git commit -m "auto-organize" -a || echo "No changes to commit"
# - name: Commit files
# run: |
# git config --local user.email "github-actions[bot]@users.noreply.github.com"
# git config --local user.name "github-actions[bot]"
# GIT_STATUS=$(git status -s)
# [[ ! -z "$GIT_STATUS" ]] && git add .domino/* && git commit -m "auto-organize" -a || echo "No changes to commit"

- name: Push changes
uses: ad-m/github-push-action@master
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
branch: ${{ github.ref }}
# - name: Push changes
# uses: ad-m/github-push-action@master
# with:
# github_token: ${{ secrets.GITHUB_TOKEN }}
# branch: ${{ github.ref }}

- name: Create Release
env:
GITHUB_REPOSITORY: ${{ github.repository }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: domino piece release
# - name: Create Release
# env:
# GITHUB_REPOSITORY: ${{ github.repository }}
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# run: domino piece release
18 changes: 18 additions & 0 deletions dependencies/Dockerfile_summarizer
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
FROM ghcr.io/tauffer-consulting/domino-base-piece:gpu

# Install specific requirements
ENV TZ=UTC
ARG DEBIAN_FRONTEND=noninteractive
RUN apt update
RUN apt install ffmpeg -y

# Need to copy operators source code
COPY config.toml domino/pieces_repository/
COPY pieces domino/pieces_repository/pieces
COPY .domino domino/pieces_repository/.domino

# Install specific python dependencies
RUN pip install transformers==4.34.1

# Download transformers model
RUN python -c "import huggingface_hub; huggingface_hub.snapshot_download('facebook/bart-large-cnn')"
22 changes: 22 additions & 0 deletions pieces/TextSummarizerLocalPiece/metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"name": "TextSummarizerLocalPiece",
"description": "Text Summarizer with local model",
"dependency": {
"dockerfile": "Dockerfile_summarizer"
},
"container_resources": {
"use_gpu": true,
"requests": {
"cpu": 1000,
"memory": 3072
},
"limits": {
"cpu": 5000,
"memory": 15360
}
},
"style": {
"node_label": "Text Summarizer Piece",
"icon_class_name": "fa-solid:quote-left"
}
}
48 changes: 48 additions & 0 deletions pieces/TextSummarizerLocalPiece/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from pydantic import BaseModel, Field, FilePath, validators
from typing import Union
from enum import Enum


class OutputTypeType(str, Enum):
    """Delivery mode for the summarized text: inline via XCom, or written to a file."""
    xcom = "xcom"
    file = "file"


class InputModel(BaseModel):
    """
    Input data for TextSummarizerPiece.

    Either `input_file_path` or `input_text` supplies the text to
    summarize; the piece reads the file when a path is given, otherwise
    it falls back to the raw text.
    """
    # Path to a text file to summarize; takes precedence over input_text when non-empty.
    input_file_path: str = Field(
        description='The path to the text file to summarize.',
        default=""
    )
    # Raw text to summarize, used when no file path is provided.
    input_text: str = Field(
        description='The text to summarize.',
        default=""
    )
    # Where to deliver the result: inline via XCom or written to a file.
    # Fixed typo in description: "fot" -> "for".
    output_type: OutputTypeType = Field(
        description='The type of output for the result text.',
        default=OutputTypeType.xcom
    )
    # Request GPU inference; only honored if CUDA is actually available at runtime.
    use_gpu: bool = Field(
        description='Use GPU for inference.',
        default=False
    )


class OutputModel(BaseModel):
    """
    Output data for TextSummarizerPiece.

    Exactly one of `summary_result` or `file_path` carries the summary,
    depending on the requested output type; the other stays empty.
    """
    # Human-readable status message describing how the result was delivered.
    message: str = Field(
        default="",
        description="Output message to log."
    )
    # The summarized text, populated only for XCom output.
    summary_result: str = Field(
        default="",
        description="The result summarized text."
    )
    # Path to the written summary file, populated only for file output.
    file_path: Union[FilePath, str] = Field(
        default="",
        description="The path to the resulting summarized text file."
    )
86 changes: 86 additions & 0 deletions pieces/TextSummarizerLocalPiece/piece.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from domino.base_piece import BasePiece
from .models import InputModel, OutputModel
from transformers import pipeline
import torch


def summarize_long_text(text: str, summarizer, iteration: int = 0):
    """
    Recursively summarize `text` chunk by chunk.

    The text is normalized (lowercased, punctuation replaced by spaces),
    split into 1000-character chunks, and each chunk summarized; if the
    concatenated summaries still exceed 2000 characters, the combined
    summary is fed back through the same process.
    """
    iteration += 1
    print(f"Iteration: {iteration}")

    # Normalize: lowercase, then turn every non-alphanumeric character into a space.
    lowered = text.lower().replace(".", " ").replace(",", " ").replace("\n", " ")
    cleaned = "".join(ch if ch.isalnum() or ch == " " else " " for ch in lowered)

    # Split the normalized text into fixed-size chunks.
    size = 1000
    pieces = [cleaned[start:start + size] for start in range(0, len(cleaned), size)]
    print(f"chunks to process: {len(pieces)}")

    # Summarize each chunk individually, then stitch the parts together.
    partial_summaries = []
    for piece in pieces:
        outcome = summarizer(piece, max_length=60, min_length=30, no_repeat_ngram_size=3)
        partial_summaries.append(outcome[0]['summary_text'])
    combined = " ".join(partial_summaries)

    # Still too long: summarize the summary itself.
    if len(combined) > 2000:
        return summarize_long_text(combined, summarizer, iteration)
    return combined


class TextSummarizerLocalPiece(BasePiece):
    """
    Summarize a text with a locally-run facebook/bart-large-cnn model.

    The input comes either from a file path or from raw text; the result
    is delivered inline via XCom or written to `summary_result.txt`,
    depending on `input_data.output_type`.
    """

    def piece_function(self, input_data: InputModel):
        # Pick inference device: GPU only when requested AND actually available.
        # -1 is the transformers pipeline convention for CPU.
        if input_data.use_gpu and torch.cuda.is_available():
            device = torch.cuda.current_device()
            self.logger.info("Using GPU for inference.")
        else:
            device = -1
            self.logger.info("Using CPU for inference.")

        # Load the input text: a non-empty file path takes precedence over raw text.
        if input_data.input_file_path:
            with open(input_data.input_file_path, "r") as f:
                text_str = f.read()
        else:
            text_str = input_data.input_text

        # Build the summarization pipeline (the model is pre-downloaded at
        # image build time — see Dockerfile_summarizer).
        self.logger.info("Loading summarizer...")
        summarizer = pipeline(
            task="summarization",
            model="facebook/bart-large-cnn",
            framework="pt",
            device=device
        )

        # Run the (possibly recursive) chunked summarization.
        self.logger.info("Running summarizer...")
        result = summarize_long_text(text=text_str, summarizer=summarizer)

        # Deliver the result: inline via XCom, or written to a local file.
        # Fixed: log message previously misspelled "Sumamrization"; removed
        # f-string prefixes on messages with no placeholders and deduplicated
        # the logged/returned message strings.
        if input_data.output_type == "xcom":
            msg = "Summarization completed successfully. Result returned as XCom."
            self.logger.info(msg)
            summary_result = result
            output_file_path = ""
        else:
            msg = "Summarization completed successfully. Result returned as file."
            self.logger.info(msg)
            summary_result = ""
            output_file_path = "summary_result.txt"
            with open(output_file_path, "w") as f:
                f.write(result)

        return OutputModel(
            message=msg,
            summary_result=summary_result,
            file_path=output_file_path
        )
Loading

0 comments on commit ff5837e

Please sign in to comment.