Skip to content

Commit

Permalink
ci: add rook-ceph cluster ci test
Browse files Browse the repository at this point in the history
Signed-off-by: subhamkrai <[email protected]>
  • Loading branch information
subhamkrai committed Jul 5, 2024
1 parent 2bf6beb commit 96c7815
Show file tree
Hide file tree
Showing 4 changed files with 270 additions and 0 deletions.
49 changes: 49 additions & 0 deletions .github/workflows/set_rook_ceph_cluster.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# CI workflow: installs minikube, frees a local disk and deploys a rook-ceph cluster
# by calling the functions in test/scripts/github-action-helper.sh.
# NOTE(review): the name below does not mention rook-ceph, and the concurrency group
# further down is keyed on the workflow name - confirm no other workflow shares this name.
name: Plugin Go test
# Triggered by pull requests that target main or any release-* branch.
on:
pull_request:
branches:
- main
- release-*

# Every `run:` step uses a bash without profile/rc files, with errexit, pipefail and tracing.
defaults:
run:
# reference: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#using-a-specific-shell
shell: bash --noprofile --norc -eo pipefail -x {0}

# cancel the in-progress workflow when PR is refreshed.
# For pull_request events the group is keyed on the head ref, otherwise on the commit SHA.
concurrency:
group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.head_ref || github.sha }}
cancel-in-progress: true

jobs:
rook-ceph-cluster:
runs-on: ubuntu-latest
steps:
# fetch-depth: 0 checks out the full history rather than a shallow clone.
- name: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

# The Go version is taken from the repository's go.mod.
- uses: actions/setup-go@v5
with:
go-version-file: go.mod

# Local composite action; it receives the USE_TMATE secret as its `use-tmate` input.
- name: consider debugging
uses: ./.github/workflows/tmate_debug
with:
use-tmate: ${{ secrets.USE_TMATE }}

# The helper script takes a function name followed by that function's arguments;
# here the argument is the Kubernetes version handed to `minikube start`.
- name: Setup Minikube
run: |
test/scripts/github-action-helper.sh install_minikube_with_none_driver v1.28.4
- name: print k8s cluster status
run: |
minikube status
kubectl get nodes
- name: use local disk
run: test/scripts/github-action-helper.sh use_local_disk

- name: deploy rook cluster
run: test/scripts/github-action-helper.sh deploy_rook
24 changes: 24 additions & 0 deletions .github/workflows/tmate_debug/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Composite action that optionally opens a tmate session for debugging a CI run.
# Step 1 decides whether debugging is wanted and records the decision as USE_TMATE in
# the job environment; step 2 starts tmate only when that variable is set.
name: "Tmate debugging tests"
description: "Setup tmate session if the test fails"
inputs:
  use-tmate:
    description: "boolean for enabling TMATE"
    required: true
runs:
  using: "composite"
  steps:
    - name: consider debugging
      shell: bash --noprofile --norc -eo pipefail -x {0}
      # Only evaluated for debug runs or pull requests labelled 'debug-ci'.
      if: runner.debug || contains(github.event.pull_request.labels.*.name, 'debug-ci')
      env:
        # Hand the input over as an environment variable instead of expanding ${{ }}
        # inside the script text, so its value is never parsed as shell code.
        USE_TMATE_INPUT: ${{ inputs.use-tmate }}
      run: |
        # Enable tmate only in the ceph-csi-operator fork, where the USE_TMATE secret is set in the repo, or if the action is re-run
        if [ "$GITHUB_REPOSITORY_OWNER" = "ceph-csi-operator" ] || [ -n "$USE_TMATE_INPUT" ] || [ "$GITHUB_RUN_ATTEMPT" -gt 1 ]; then
          echo USE_TMATE=1 >> "$GITHUB_ENV"
        fi
    - name: set up tmate session for debugging
      if: env.USE_TMATE
      uses: mxschmitt/action-tmate@v3
      with:
        # NOTE(review): `false` does not restrict the session to the user who triggered
        # the run - confirm this is intended.
        limit-access-to-actor: false
        detached: true
64 changes: 64 additions & 0 deletions test/scripts/collect-logs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env bash
#
# Collects Ceph status, Kubernetes resource dumps and host diagnostics into LOG_DIR.
#
# Environment (all optional):
#   CLUSTER_NAMESPACE   namespace of the Ceph cluster    (default: rook-ceph)
#   OPERATOR_NAMESPACE  namespace of the operator        (default: CLUSTER_NAMESPACE)
#   LOG_DIR             directory that receives the logs (default: test)
#
# Only tracing (-x) is enabled, not errexit: a command that fails does not stop the
# remaining collection steps.

set -x

# User parameters
: "${CLUSTER_NAMESPACE:="rook-ceph"}"
: "${OPERATOR_NAMESPACE:="$CLUSTER_NAMESPACE"}"
: "${LOG_DIR:="test"}"

LOG_DIR="${LOG_DIR%/}" # remove trailing slash if necessary
mkdir -p "${LOG_DIR}"

# Kept as an array so that every argument (including the namespace) stays one word,
# instead of relying on word splitting of an unquoted string.
CEPH_CMD=(kubectl -n "${CLUSTER_NAMESPACE}" exec deploy/rook-ceph-tools -- ceph --connect-timeout 10)

"${CEPH_CMD[@]}" -s >"${LOG_DIR}"/ceph-status.txt
"${CEPH_CMD[@]}" osd dump >"${LOG_DIR}"/ceph-osd-dump.txt
"${CEPH_CMD[@]}" report >"${LOG_DIR}"/ceph-report.txt

# Collect from the operator namespace as well when it differs from the cluster namespace.
NAMESPACES=("$CLUSTER_NAMESPACE")
if [[ "$OPERATOR_NAMESPACE" != "$CLUSTER_NAMESPACE" ]]; then
  NAMESPACES+=("$OPERATOR_NAMESPACE")
fi

for NAMESPACE in "${NAMESPACES[@]}"; do
  # each namespace is a sub-directory for easier debugging
  NS_DIR="${LOG_DIR}"/namespace-"${NAMESPACE}"
  # -p: do not fail when the directory is left over from an earlier run
  mkdir -p "${NS_DIR}"

  # describe every one of the k8s resources in the namespace which rook commonly uses
  for KIND in 'pod' 'deployment' 'job' 'daemonset' 'cm'; do
    kubectl -n "$NAMESPACE" get "$KIND" -o wide >"${NS_DIR}"/"$KIND"-list.txt
    # The jsonpath output is a space-separated list of names, so the unquoted
    # expansion is split into one word per resource on purpose.
    for resource in $(kubectl -n "$NAMESPACE" get "$KIND" -o jsonpath='{.items[*].metadata.name}'); do
      kubectl -n "$NAMESPACE" describe "$KIND" "$resource" >"${NS_DIR}"/"$KIND"-describe--"$resource".txt

      # collect logs for pods along the way
      if [[ "$KIND" == 'pod' ]]; then
        kubectl -n "$NAMESPACE" logs --all-containers "$resource" >"${NS_DIR}"/logs--"$resource".txt
      fi
    done
  done

  # Secrets are dumped with `-o yaml` rather than `describe`, because `kubectl describe`
  # does not print the secret values; hence the separate loop.
  for secret in $(kubectl -n "$NAMESPACE" get secrets -o jsonpath='{.items[*].metadata.name}'); do
    kubectl -n "$NAMESPACE" get -o yaml secret "$secret" >"${NS_DIR}"/secret-describe--"$secret".txt
  done

  # dump every one of the custom resources in the namespace since all should be rook-related and
  # they aren't captured by 'kubectl get all'
  for CRD in $(kubectl get crds -o jsonpath='{.items[*].metadata.name}'); do
    for resource in $(kubectl -n "$NAMESPACE" get "$CRD" -o jsonpath='{.items[*].metadata.name}'); do
      crd_main_type="${CRD%%.*}" # e.g., for cephclusters.ceph.rook.io, only use 'cephclusters'
      kubectl -n "$NAMESPACE" get -o yaml "$CRD" "$resource" >"${NS_DIR}"/"$crd_main_type"-describe--"$resource".txt
    done
  done

  # do simple 'get all' calls for resources we don't often want to look at
  kubectl get all -n "$NAMESPACE" -o wide >"${NS_DIR}"/all-wide.txt
  kubectl get all -n "$NAMESPACE" -o yaml >"${NS_DIR}"/all-yaml.txt
done

# Host-level diagnostics. `tee -a` appends to lsblk.txt and also echoes the output.
sudo lsblk | sudo tee -a "${LOG_DIR}"/lsblk.txt
journalctl -o short-precise --dmesg >"${LOG_DIR}"/dmesg.txt
journalctl >"${LOG_DIR}"/journalctl.txt
133 changes: 133 additions & 0 deletions test/scripts/github-action-helper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#!/usr/bin/env bash
# Helper functions used by the CI workflow. The script is invoked as
#   github-action-helper.sh <function> [args...]
# and the MAIN section at the end of the file runs the named function.

# -x: trace commands, -e: exit on error, -E: let functions and subshells inherit an
# ERR trap, -o pipefail: a pipeline fails when any of its stages fails.
set -xeEo pipefail

#############
# VARIABLES #
#############
# Default FUNCTION to the first positional argument unless it is already set and non-empty.
# NOTE(review): the MAIN section assigns FUNCTION from "$1" again, so a value supplied
# through the environment appears to be overridden - confirm this default is still needed.
: "${FUNCTION:=${1}}"

# source https://github.com/rook/rook
#######################################
# Print the kernel name (e.g. "sdb") of the first disk that is neither a loop device
# nor the disk holding the boot partition.
# Outputs:
#   stdout - the device name, or nothing when no device was identified
#   stderr - the full lsblk listing and debug messages
# Returns:
#   0 when a device was found; 1 when the boot disk or an extra disk could not be
#   determined (previously this case printed an empty name and returned 0)
#######################################
function find_extra_block_dev() {
  local boot_dev extra_dev

  # shellcheck disable=SC2005 # redirect doesn't work with sudo, so use echo
  echo "$(sudo lsblk)" >&2 # print lsblk output to stderr for debugging in case of future errors
  # relevant lsblk --pairs example: (MOUNTPOINT identifies boot partition)(PKNAME is Parent dev ID)
  # NAME="sda15" SIZE="106M" TYPE="part" MOUNTPOINT="/boot/efi" PKNAME="sda"
  # NAME="sdb" SIZE="75G" TYPE="disk" MOUNTPOINT="" PKNAME=""
  # NAME="sdb1" SIZE="75G" TYPE="part" MOUNTPOINT="/mnt" PKNAME="sdb"
  boot_dev="$(sudo lsblk --noheading --list --output MOUNTPOINT,PKNAME | awk '/boot/ {print $2}')"
  echo " == find_extra_block_dev(): boot_dev='$boot_dev'" >&2 # debug in case of future errors

  # An empty pattern would make the exclusion below match (and drop) every device,
  # so stop here with a clear message instead.
  if [[ -z "$boot_dev" ]]; then
    echo " == find_extra_block_dev(): could not identify the boot device" >&2
    return 1
  fi

  # --nodeps ignores partitions.
  # awk drops loop devices and strips padding; grep then removes the boot disk using
  # exact, literal line matches so that e.g. "sda" does not also hide "sdaa".
  # `|| true`: grep exits 1 when nothing is left; the empty result is handled below.
  extra_dev="$(
    sudo lsblk --noheading --list --nodeps --output KNAME |
      awk '!/loop/ {print $1}' |
      grep --invert-match --line-regexp --fixed-strings -- "$boot_dev" |
      head -n 1
  )" || true
  echo " == find_extra_block_dev(): extra_dev='$extra_dev'" >&2 # debug in case of future errors

  if [[ -z "$extra_dev" ]]; then
    echo " == find_extra_block_dev(): no extra block device found" >&2
    return 1
  fi
  echo "$extra_dev" # output of function
}

# Default BLOCK to the device printed by find_extra_block_dev unless BLOCK is already
# set and non-empty. This statement runs on every invocation of the script, whichever
# function is requested, so `sudo lsblk` is always executed when BLOCK is not provided.
: "${BLOCK:=$(find_extra_block_dev)}"

# source https://github.com/rook/rook
#######################################
# Free the first partition of the extra disk so it can be consumed as raw storage:
# purge snapd, turn swap off, unmount /mnt and wipe the partition's signatures.
# Globals:
#   BLOCK           (read)    disk name, with or without a leading /dev/; it is looked
#                             up at run time since it keeps changing between sda and sdb
#   BLOCK_DATA_PART (written) path of the partition that gets wiped
# Returns:
#   1 when BLOCK is empty, otherwise the status of the wipefs call
#######################################
use_local_disk() {
  # deploy_rook accepts BLOCK with a /dev/ prefix, so accept both spellings here too.
  local disk="${BLOCK#/dev/}"
  if [[ -z "$disk" ]]; then
    echo "use_local_disk: BLOCK is empty, no extra block device to prepare" >&2
    return 1
  fi

  # Disks whose kernel name ends in a digit (nvme0n1, mmcblk0) name their
  # partitions with a "p" separator: nvme0n1p1 rather than nvme0n11.
  if [[ "$disk" =~ [0-9]$ ]]; then
    BLOCK_DATA_PART="/dev/${disk}p1"
  else
    BLOCK_DATA_PART="/dev/${disk}1"
  fi

  sudo apt purge snapd -y
  sudo dmsetup version || true
  sudo swapoff --all --verbose
  # Only unmount when something is mounted, so an unmounted /mnt is not an error.
  if mountpoint -q /mnt; then
    sudo umount /mnt
  fi
  sudo wipefs --all --force "$BLOCK_DATA_PART"
}

#######################################
# Deploy the rook operator, a three-mon test cluster restricted to the BLOCK device,
# and the toolbox, using the example manifests from the rook repository.
# Globals:
#   BLOCK        (read) device name for the cluster's deviceFilter; a /dev/ prefix is removed
#   ROOK_VERSION (read) optional branch or tag of rook/rook to take the manifests from
#                       (default: master)
# Outputs:
#   writes the edited cluster manifest to ./cluster-test.yaml
# Returns:
#   1 when the cluster manifest cannot be downloaded, otherwise the status of the
#   last kubectl call
#######################################
deploy_rook() {
  local examples_url="https://raw.githubusercontent.com/rook/rook/${ROOK_VERSION:-master}/deploy/examples"
  local manifest

  for manifest in common crds operator; do
    kubectl create -f "${examples_url}/${manifest}.yaml"
  done
  wait_for_operator_pod_to_be_ready_state

  # --fail: without it an HTTP error page would be saved as the manifest and only
  # surface later as a confusing kubectl parse error.
  if ! curl --fail --silent --show-error --location \
    --output cluster-test.yaml "${examples_url}/cluster-test.yaml"; then
    echo "deploy_rook: failed to download ${examples_url}/cluster-test.yaml" >&2
    return 1
  fi

  # Enable the commented-out deviceFilter and point it at the extra disk.
  sed -i "s|#deviceFilter:|deviceFilter: ${BLOCK#/dev/}|g" cluster-test.yaml
  # Change only the first "count: 1" in the file to 3; wait_for_three_mons below
  # waits for three mon deployments.
  sed -i '0,/count: 1/ s/count: 1/count: 3/' cluster-test.yaml
  kubectl create -f cluster-test.yaml

  wait_for_three_mons
  wait_for_pod_to_be_ready_state
  kubectl create -f "${examples_url}/toolbox.yaml"
}

#######################################
# Poll once per second until exactly one rook-ceph-osd pod in the rook-ceph namespace
# reports a ready container; give up after 200 seconds.
# Returns:
#   0 once the pod is ready; a timeout is handed to timeout_command_exit_code.
#######################################
wait_for_pod_to_be_ready_state() {
  # The loop runs in a child bash so `timeout` can stop it as a whole.
  # `|| timeout_command_exit_code` passes timeout's status to the helper through $?
  # and keeps working when errexit is active, which would otherwise abort on the
  # failing `timeout` before the helper gets to run.
  # The custom-columns spec is quoted because it contains the glob characters `[*]`;
  # it prints one READY line per pod, which grep -c then counts.
  timeout 200 bash <<'EOF' || timeout_command_exit_code
until [ "$(kubectl get pod -l app=rook-ceph-osd -n rook-ceph -o 'custom-columns=READY:status.containerStatuses[*].ready' | grep -c true)" -eq 1 ]; do
  echo "waiting for the pods to be in ready state"
  sleep 1
done
EOF
}

#######################################
# Poll once per second until exactly one rook-ceph-operator pod in the rook-ceph
# namespace reports a ready container; give up after 100 seconds.
# Returns:
#   0 once the pod is ready; a timeout is handed to timeout_command_exit_code.
#######################################
wait_for_operator_pod_to_be_ready_state() {
  # The loop runs in a child bash so `timeout` can stop it as a whole.
  # `|| timeout_command_exit_code` passes timeout's status to the helper through $?
  # and keeps working when errexit is active, which would otherwise abort on the
  # failing `timeout` before the helper gets to run.
  # The custom-columns spec is quoted because it contains the glob characters `[*]`;
  # it prints one READY line per pod, which grep -c then counts.
  timeout 100 bash <<'EOF' || timeout_command_exit_code
until [ "$(kubectl get pod -l app=rook-ceph-operator -n rook-ceph -o 'custom-columns=READY:status.containerStatuses[*].ready' | grep -c true)" -eq 1 ]; do
  echo "waiting for the operator to be in ready state"
  sleep 1
done
EOF
}

#######################################
# Poll every two seconds until exactly three non-canary rook-ceph-mon deployments
# exist in the rook-ceph namespace; give up after 150 seconds.
# Returns:
#   0 once three deployments exist; a timeout is handed to timeout_command_exit_code.
#######################################
wait_for_three_mons() {
  # The loop runs in a child bash so `timeout` can stop it as a whole.
  # `|| timeout_command_exit_code` passes timeout's status to the helper through $?
  # and keeps working when errexit is active, which would otherwise abort on the
  # failing `timeout` before the helper gets to run.
  timeout 150 bash <<'EOF' || timeout_command_exit_code
until [ "$(kubectl -n rook-ceph get deploy -l app=rook-ceph-mon,mon_canary!=true | grep -c rook-ceph-mon)" -eq 3 ]; do
  echo "$(date) waiting for three mon deployments to exist"
  sleep 2
done
EOF
}

#######################################
# Turn a timed-out `timeout` invocation into a script failure.
# Must be called directly after the command whose status is to be inspected,
# because that status is read from $?.
# Outputs: "Timeout reached" on stdout when the previous status was 124
# Returns: 0 for any other previous status; exits the shell with 1 on timeout
#######################################
timeout_command_exit_code() {
  # Has to be the first statement: $? still carries the caller's last exit status.
  local previous_status=$?

  # `timeout` reports 124 when the command ran out of time; nothing to do otherwise.
  [[ "$previous_status" -eq 124 ]] || return 0

  echo "Timeout reached"
  exit 1
}

#######################################
# Install minikube together with cri-dockerd, crictl and the CNI plugins, then start a
# cluster with the "none" driver.
# Arguments:
#   $1 - Kubernetes version passed to `minikube start` (e.g. v1.28.4); required
# Globals:
#   exports MINIKUBE_HOME, CHANGE_MINIKUBE_NONE_USER and KUBECONFIG
# Returns:
#   1 when the version argument is missing, otherwise the status of `minikube start`
#######################################
install_minikube_with_none_driver() {
  local kubernetes_version="${1:-}"
  if [[ -z "$kubernetes_version" ]]; then
    echo "usage: install_minikube_with_none_driver <kubernetes-version>" >&2
    return 1
  fi

  local -r crictl_version="v1.28.0"
  local -r minikube_version="v1.31.2"
  local -r minikube_deb="minikube_latest_amd64.deb"
  local -r cri_dockerd_deb="cri-dockerd_0.3.4.3-0.ubuntu-focal_amd64.deb"
  local -r crictl_tar="crictl-${crictl_version}-linux-amd64.tar.gz"
  local -r cni_plugin_version="v1.3.0"
  local -r cni_plugin_tar="cni-plugins-linux-amd64-${cni_plugin_version}.tgz" # change arch if not on amd64
  local -r cni_plugin_install_dir="/opt/cni/bin"

  sudo apt update
  sudo apt install -y conntrack socat

  # Every download uses --fail so that an HTTP error stops here instead of leaving an
  # error page on disk for dpkg/tar to trip over.
  curl --fail --location --remote-name \
    "https://storage.googleapis.com/minikube/releases/${minikube_version}/${minikube_deb}"
  sudo dpkg -i "$minikube_deb"
  rm -f "$minikube_deb"

  curl --fail --location --remote-name \
    "https://github.com/Mirantis/cri-dockerd/releases/download/v0.3.4/${cri_dockerd_deb}"
  sudo dpkg -i "$cri_dockerd_deb"
  rm -f "$cri_dockerd_deb"

  curl --fail --location --remote-name \
    "https://github.com/kubernetes-sigs/cri-tools/releases/download/${crictl_version}/${crictl_tar}"
  sudo tar zxvf "$crictl_tar" -C /usr/local/bin
  rm -f "$crictl_tar"
  # NOTE(review): presumably needed by minikube's none driver - confirm.
  sudo sysctl fs.protected_regular=0

  curl --fail --location --remote-name \
    "https://github.com/containernetworking/plugins/releases/download/${cni_plugin_version}/${cni_plugin_tar}"
  sudo mkdir -p "$cni_plugin_install_dir"
  sudo tar -xf "$cni_plugin_tar" -C "$cni_plugin_install_dir"
  rm -f "$cni_plugin_tar"

  # Exported so that `sudo -E` forwards them to minikube.
  export MINIKUBE_HOME=$HOME CHANGE_MINIKUBE_NONE_USER=true KUBECONFIG=$HOME/.kube/config
  sudo -E minikube start --kubernetes-version="$kubernetes_version" --driver=none --memory 6g --cpus=2 --addons ingress --cni=calico
}

########
# MAIN #
########

# Usage: github-action-helper.sh <function> [args...]
# The first argument names one of the functions defined in this file; the remaining
# arguments are passed to it unchanged.
FUNCTION="${1:-}"
if [[ -z "$FUNCTION" ]] || ! declare -F -- "$FUNCTION" >/dev/null; then
  echo "Usage: $0 <function> [args...] ('${FUNCTION}' is not a function of this script)" >&2
  exit 1
fi
shift # remove function arg now that we've recorded it

# Report failures through an ERR trap instead of wrapping the call in `if ! ...`:
# bash ignores `set -e` for everything executed inside an `if` condition, so with the
# wrapper a failing step in the middle of a function was silently skipped and only the
# function's last command decided the outcome. `set -E` (enabled at the top of the
# script) makes functions inherit this trap.
trap 'echo "Call to $FUNCTION was not successful" >&2; exit 1' ERR

# call the function with the remainder of the user-provided args
"$FUNCTION" "$@"

0 comments on commit 96c7815

Please sign in to comment.