Skip to content

Commit

Permalink
Merge pull request #183 from jupyter-naas/178-schedule-abi-using-gith…
Browse files Browse the repository at this point in the history
…ub-action

WIP: 178 schedule abi using GitHub action
  • Loading branch information
Dr0p42 authored Jul 31, 2024
2 parents 245414c + 50604e1 commit 49f1a21
Show file tree
Hide file tree
Showing 9 changed files with 426 additions and 11 deletions.
97 changes: 97 additions & 0 deletions .github/scripts/generate_schedulers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import yaml, os
import pydash as _

# Base workflow skeleton shared by every generated scheduler workflow.
# `on: {}` is only a placeholder: generate_schedulers() removes it and sets the
# real trigger, then appends the run step and the artifact-upload step after
# the "Pull Docker image" step below.
template_str = """
name: CI/CD Workflow
on: {}
jobs:
  scheduler:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Pull Docker image
        run: docker pull ghcr.io/${{ github.repository }}/abi:latest
      # - name: Run Papermill
      #   run: |
      #     docker run --name abi-execution -i --platform linux/amd64 ghcr.io/${{ github.repository }}/abi:latest ls
      #     mkdir output
      #     docker cp abi-execution:/app/__pipeline__.ipynb ./output/__pipeline__.ipynb
      # - name: Upload output artifacts
      #   uses: actions/upload-artifact@v4
      #   with:
      #     name: output-files
      #     path: ./output
"""

def generate_schedulers(config: dict, template: str) -> None:
    """Write one GitHub Actions workflow file per enabled scheduler.

    For every entry of ``config["schedulers"]`` whose ``enabled`` flag is set,
    the workflow skeleton in *template* is parsed, given the scheduler's name
    and cron trigger, extended with a step that runs the scheduler inside the
    ABI container plus an artifact-upload step, and dumped to
    ``.github/workflows/scheduler__<name>.yaml``.

    Args:
        config: Parsed configuration; must contain a ``schedulers`` list whose
            items provide ``name`` and ``cron`` (and optionally ``enabled``).
        template: YAML text of the base workflow the steps are appended to.
    """
    workflows_dir = '.github/workflows'
    os.makedirs(workflows_dir, exist_ok=True)

    for scheduler in config["schedulers"]:
        # Schedulers are opt-in: a missing "enabled" key means disabled.
        if scheduler.get("enabled", False) is False:
            continue

        # Parse a fresh copy per scheduler so steps appended below never leak
        # from one generated workflow into the next.
        cicd = yaml.safe_load(template)
        # PyYAML follows YAML 1.1 and reads the bare key `on` as boolean True.
        # Drop that placeholder (if the template has one) before setting the
        # real trigger under the string key "on".
        cicd.pop(True, None)

        cicd["name"] = f"Scheduler - {scheduler['name']}"
        cicd["on"] = {
            "schedule": [{"cron": scheduler["cron"]}],
            "workflow_dispatch": {},
        }

        # The GitHub expression is kept in plain (non f-string) literals so
        # that `${{ ... }}` reaches the workflow file with both braces intact;
        # inside an f-string `{{`/`}}` would collapse to single braces.
        run_script = "\n".join([
            "",
            "# Generate unique id",
            'export SCHEDULER_ID=$(python -c "import uuid; print(uuid.uuid4())")',
            "# Execute the Scheduler script",
            "docker run --name $SCHEDULER_ID -i --platform linux/amd64 "
            "ghcr.io/${{ github.repository }}/abi:latest "
            f"python .github/scripts/run_scheduler.py \"{scheduler['name']}\"",
            "# Create the output directory that will be used to store the "
            "output files and save them as artifacts.",
            "mkdir -p outputs/",
            "# Copy the output files from the container to the host.",
            "docker cp $SCHEDULER_ID:/app/outputs ./outputs/",
            "",
        ])

        steps = cicd["jobs"]["scheduler"]["steps"]
        steps.append({'name': scheduler['name'], 'run': run_script})
        steps.append({
            'name': 'Upload output artifacts',
            'uses': 'actions/upload-artifact@v4',
            'with': {
                'name': 'output-files',
                'path': './outputs',
            },
        })

        # Derive the file name from the scheduler name (spaces -> underscores,
        # lower-cased) and write the workflow, closing the file afterwards.
        scheduler_name = scheduler["name"].replace(" ", "_").lower()
        scheduler_path = os.path.join(workflows_dir, f'scheduler__{scheduler_name}.yaml')
        with open(scheduler_path, "w") as workflow_file:
            yaml.dump(cicd, workflow_file)
        print(f'✅ Scheduler file generated: {scheduler_path}')

if __name__ == "__main__":
    # Script entry point: read the project configuration from the current
    # working directory and emit one workflow per enabled scheduler.
    with open("config.yml", "r") as config_file:
        config = yaml.safe_load(config_file)

    generate_schedulers(config, template_str)
92 changes: 92 additions & 0 deletions .github/scripts/run_scheduler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/bin/env python

# This script is used to run a scheduler defined in config.yml

import os
import sys

import papermill
import yaml
import re

class SchedulerNotFoundError(Exception):
    """Raised when no scheduler with the requested name exists in config.yml."""


class UnknownStepTypeError(Exception):
    """Raised when a scheduler step declares a type this script cannot run."""

# Snapshot of the process environment taken at import time; it is the state
# reset_environment_vars() restores before each notebook step.
environment_vars_backup: dict[str, str] = dict(os.environ)


def sanitize_string_to_filename(filename):
    """Turn an arbitrary label into a lower-case, filesystem-friendly name.

    The characters ``\\ / * ? : " < > |`` are removed, surrounding whitespace
    is trimmed, remaining spaces become underscores, and the result is capped
    at 255 characters before being lower-cased.

    Args:
        filename: The raw string (for example a scheduler or step name).

    Returns:
        The sanitized, lower-cased string.
    """
    # Remove invalid characters
    filename = re.sub(r'[\\/*?:"<>|]', '', filename)

    # Trim BEFORE converting spaces: once spaces are underscores, strip() can
    # no longer remove them and names would keep stray leading/trailing "_".
    filename = filename.strip()

    # Replace the remaining (inner) spaces with underscores
    filename = filename.replace(' ', '_')

    # Ensure filename doesn't exceed the max length
    max_filename_length = 255
    return filename[:max_filename_length].lower()

def get_scheduler(scheduler_name: str):
    """Return the entry of ``config.yml`` whose ``name`` is *scheduler_name*.

    The file is read from the current working directory and the first entry
    of its ``schedulers`` list with an exactly matching name is returned.

    Raises:
        SchedulerNotFoundError: If no scheduler has that name.
    """
    with open("config.yml", "r") as config_file:
        schedulers = yaml.safe_load(config_file)["schedulers"]

    found = next(
        (entry for entry in schedulers if entry["name"] == scheduler_name),
        None,
    )
    if found is None:
        raise SchedulerNotFoundError(
            f"Scheduler '{scheduler_name}' not found in config.yml"
        )
    return found


def reset_environment_vars():
    """Restore ``os.environ`` to the snapshot taken when the module loaded."""
    environ = os.environ
    # Wipe everything first so variables added since the snapshot disappear,
    # then put the saved values back.
    environ.clear()
    environ.update(environment_vars_backup)


def run_notebook_step(scheduler_name: str, step: dict):
    """Execute one notebook step with papermill.

    The environment is first reset to its import-time snapshot, then the
    step's ``environment_variables`` (if any) are applied. The executed
    notebook is written to
    ``outputs/scheduler_executions/<scheduler>/<step>/<entrypoint dir>/<notebook>``.

    Args:
        scheduler_name: Name of the scheduler the step belongs to; used only
            to build the output directory.
        step: Step definition providing ``name`` and ``entrypoint`` and,
            optionally, ``environment_variables`` and ``inputs``.
    """
    reset_environment_vars()

    # `or {}` tolerates a key that is present but empty (YAML null).
    for key, value in (step.get("environment_variables") or {}).items():
        # os.environ only accepts strings, while YAML happily yields ints,
        # floats and booleans (e.g. `PORT: 8080`), so coerce explicitly.
        os.environ[key] = str(value)

    # Split "dir/sub/notebook.ipynb" into its directory part and file name;
    # an entrypoint without "/" yields an empty directory part.
    entrypoint_path, _sep, notebook_name = step["entrypoint"].rpartition('/')

    output_path = os.path.join(
        f"outputs/scheduler_executions/{sanitize_string_to_filename(scheduler_name)}/{sanitize_string_to_filename(step['name'])}",
        entrypoint_path,
    )
    os.makedirs(output_path, exist_ok=True)

    papermill.execute_notebook(
        input_path=step["entrypoint"],
        output_path=os.path.join(output_path, notebook_name),
        parameters=step.get("inputs", {}),
    )

def run_scheduler(scheduler_name: str):
    """Run, in order, every enabled step of the scheduler *scheduler_name*.

    Steps are opt-in: a step without an ``enabled`` key is skipped.

    Raises:
        UnknownStepTypeError: If an enabled step has a type other than
            ``notebook``.
    """
    for step in get_scheduler(scheduler_name)["steps"]:
        enabled = step.get("enabled", False)
        if enabled is False:
            continue

        step_type = step.get("type")
        if step_type != "notebook":
            raise UnknownStepTypeError(f"Unknown step type: {step_type}")

        run_notebook_step(scheduler_name, step)


if __name__ == "__main__":
    # The scheduler name is a required positional argument; fail with a usage
    # message instead of a bare IndexError when it is missing.
    if len(sys.argv) < 2:
        print("Usage: python run_scheduler.py <scheduler_name>", file=sys.stderr)
        sys.exit(1)

    scheduler_name = sys.argv[1]
    run_scheduler(scheduler_name)
31 changes: 31 additions & 0 deletions .github/scripts/validate_jsonschema_yaml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env python

import sys
import json
import yaml
from jsonschema import validate, ValidationError

# Exactly two positional arguments are required: the JSON schema file and the
# YAML data file to validate against it.
if len(sys.argv) != 3:
    print("Usage: python validate_jsonschema_yaml.py <schemafile> <datafile>")
    sys.exit(1)

schema_file, data_file = sys.argv[1:]

# The schema is JSON ...
with open(schema_file) as schema_handle:
    schema = json.load(schema_handle)

# ... while the document being checked is YAML.
with open(data_file) as data_handle:
    data = yaml.safe_load(data_handle)

# Exit status 1 on a validation failure, 0 otherwise.
try:
    validate(instance=data, schema=schema)
except ValidationError as error:
    print(error)
    sys.exit(1)
else:
    print("YAML data is valid")
    sys.exit(0)
55 changes: 55 additions & 0 deletions .github/workflows/build_abi_container.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Builds the ABI Docker image on every push to main, pushes it to the GitHub
# Container Registry as `abi:latest`, then runs the CI notebook inside that
# image and uploads the notebook outputs as an artifact.
name: Build ABI Container

on:
  push:
    branches:
      - main
  # pull_request:
  #   branches:
  #     - main

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # NOTE(review): no later step in this workflow references
      # /tmp/.buildx-cache; confirm that `make build` actually reads/writes it.
      - name: Cache Docker layers
        uses: actions/cache@v4
        with:
          path: /tmp/.buildx-cache
          key: ${{ runner.os }}-buildx-${{ github.sha }}
          restore-keys: |
            ${{ runner.os }}-buildx-
      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # `make build` produces the local `abi` image, which is then re-tagged
      # with its registry name.
      - name: Build Docker image
        run: |
          make build
          docker tag abi:latest ghcr.io/${{ github.repository }}/abi:latest
      # NOTE(review): the image is pushed before the Papermill check below
      # runs, so `latest` is published even if that check fails - confirm
      # this ordering is intended.
      - name: Push Docker image to GitHub Container Registry
        run: docker push ghcr.io/${{ github.repository }}/abi:latest

      # Executes tests/ci.ipynb inside the image, then copies /app/outputs
      # out of the stopped container so it can be uploaded.
      - name: Run Papermill
        run: |
          docker run --name abi-execution --platform linux/amd64 ghcr.io/${{ github.repository }}/abi:latest papermill tests/ci.ipynb outputs/ci.output.ipynb
          mkdir execution_outputs
          docker cp abi-execution:/app/outputs ./execution_outputs/
      - name: Upload output artifacts
        uses: actions/upload-artifact@v4
        with:
          name: output-files
          path: ./execution_outputs
50 changes: 50 additions & 0 deletions .github/workflows/scheduler__main.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Workflow for the "main" scheduler: pulls the ABI image, runs the scheduler
# inside it and uploads the produced outputs as an artifact.
jobs:
  scheduler:
    runs-on: ubuntu-latest
    steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - name: Log in to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        password: ${{ secrets.GITHUB_TOKEN }}
        registry: ghcr.io
        username: ${{ github.actor }}
    - name: Pull Docker image
      run: docker pull ghcr.io/${{ github.repository }}/abi:latest
    - name: main
      # Literal block scalar: every command stays on its own line, and the
      # image reference uses a proper `${{ ... }}` expression (single braces
      # are not evaluated by GitHub Actions).
      run: |
        # Generate unique id
        export SCHEDULER_ID=$(python -c "import uuid; print(uuid.uuid4())")
        # Execute the Scheduler script
        docker run --name $SCHEDULER_ID -i --platform linux/amd64 ghcr.io/${{ github.repository }}/abi:latest python .github/scripts/run_scheduler.py "main"
        # Create the output directory that will be used to store the output files and save them as artifacts.
        mkdir -p outputs/
        # Copy the output files from the container to the host.
        docker cp $SCHEDULER_ID:/app/outputs ./outputs/
    - name: Upload output artifacts
      uses: actions/upload-artifact@v4
      with:
        name: output-files
        path: ./outputs
name: Scheduler - main
'on':
  schedule:
  - cron: '*/5 * * * *'
  workflow_dispatch: {}
5 changes: 4 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
{
"python.defaultInterpreterPath": ".abi-config/bin/python"
"python.defaultInterpreterPath": ".abi-config/bin/python",
"conventionalCommits.scopes": [
"CI/CD"
]
}
16 changes: 15 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ Usage:
make usage: Show this message
make build: Build the Docker image to package ABI.

make ci-generate-schedulers: Generate the scheduler files for the CI/CD
make ci-run-scheduler scheduler=<scheduler>: Run a specific scheduler


endef
export usage_str
Expand Down Expand Up @@ -120,4 +123,15 @@ windows-install-conda:
build: build.linux.x86_64

build.linux.x86_64:
docker build . -t abi -f Dockerfile.linux.x86_64 --platform linux/amd64
docker build . -t abi -f Dockerfile.linux.x86_64 --platform linux/amd64

# CI/CD
ci-generate-schedulers:
@ conda run -p .abi-conda python .github/scripts/generate_schedulers.py

# Run one scheduler locally. The name is quoted so that scheduler names
# containing spaces reach the script as a single argument.
ci-run-scheduler:
	@ conda run -p .abi-conda python .github/scripts/run_scheduler.py "$(scheduler)"
# Validations

validate-config:
@ conda run -p .abi-conda python .github/scripts/validate_jsonschema_yaml.py config.schema.json config.yml
Loading

0 comments on commit 49f1a21

Please sign in to comment.