K8s #14 (Closed)
wants to merge 1 commit

3 changes: 3 additions & 0 deletions .bazelrc
@@ -68,3 +68,6 @@ build:lre --define=EXECUTOR=remote

# See: https://github.com/bazelbuild/bazel/issues/19714#issuecomment-1745604978
build:lre --repo_env=BAZEL_DO_NOT_DETECT_CPP_TOOLCHAIN=1

# Allow user-side customization.
try-import %workspace%/.bazelrc.user
59 changes: 59 additions & 0 deletions .github/workflows/lre.yaml
@@ -38,3 +38,62 @@ jobs:
            --config=lre \
            --verbose_failures \
            //local-remote-execution/examples:hello_lre"
  remote:
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-22.04]
    name: Remote / ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout
        uses: >- # v4.1.1
          actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
      - name: Install Nix
        uses: >- # v7
          DeterminateSystems/nix-installer-action@5620eb4af6b562c53e4d4628c0b6e4f9d9ae8612
      - name: Cache Nix derivations
        uses: >- # Custom commit, last pinned at 2023-11-17.
          DeterminateSystems/magic-nix-cache-action@a04e6275a6bea232cd04fc6f3cbf20d4cb02a3e1
      - name: Start Kubernetes cluster
        run: >
          nix develop --impure --command
          bash -c "cd deployment-examples/kubernetes \
            && ./00_infra.sh \
            && ./01_operations.sh \
            && ./02_application.sh"
      - name: Get gateway IPs
        id: gateway-ips
        run: |
          echo "cache_ip=$(kubectl get gtw cache -o=jsonpath='{.status.addresses[0].value}')" >> "$GITHUB_ENV"
          echo "scheduler_ip=$(kubectl get gtw scheduler -o=jsonpath='{.status.addresses[0].value}')" >> "$GITHUB_ENV"
      - name: Print cluster state
        run: |
          kubectl get svc -A
          kubectl get pod -A
          kubectl get svc -A
          kubectl get deployments -A
          kubectl describe gtw
          echo "cas"
          kubectl logs -l app=nativelink-cas
          echo "scheduler"
          kubectl logs -l app=nativelink-scheduler
          echo "worker"
          kubectl logs -l app=nativelink-worker
      - name: Build hello_lre with LRE toolchain.
        run: >
          nix develop --impure --command
          bash -c "bazel run \
            --config=lre \
            --remote_instance_name=main \
            --remote_cache=grpc://$cache_ip:50051 \
            --remote_executor=grpc://$scheduler_ip:50052 \
            --verbose_failures \
            //local-remote-execution/examples:hello_lre"
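
The `Get gateway IPs` step hands the addresses to the final step through `$GITHUB_ENV`: every `name=value` line appended to that file becomes an environment variable for the remaining steps of the job, which is how `$cache_ip` and `$scheduler_ip` reach the `bazel run` flags above. A minimal sketch of the mechanism (the IP is illustrative):

```bash
# Earlier step: append name=value pairs to the file $GITHUB_ENV points at.
echo "cache_ip=172.20.255.200" >> "$GITHUB_ENV"

# Any later step in the same job sees the pair as a plain environment variable.
echo "remote cache: grpc://$cache_ip:50051"
```
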
1 change: 1 addition & 0 deletions .gitignore
@@ -11,3 +11,4 @@ __pycache__
.DS_Store
.pre-commit-config.yaml
result
.bazelrc.user
1 change: 1 addition & 0 deletions deployment-examples/kubernetes/.gitignore
@@ -0,0 +1 @@
worker.json
127 changes: 127 additions & 0 deletions deployment-examples/kubernetes/00_infra.sh
@@ -0,0 +1,127 @@
# This script sets up a local development cluster. It's roughly equivalent to
# a managed K8s setup.

# For ease of development and to save disk space we pipe a local container
# registry through to kind.
#
# See https://kind.sigs.k8s.io/docs/user/local-registry/.

reg_name='kind-registry'
reg_port='5001'
if [ "$(docker inspect -f '{{.State.Running}}' "${reg_name}" 2>/dev/null || true)" != 'true' ]; then
  docker run \
    -d --restart=always -p "127.0.0.1:${reg_port}:5000" --network bridge --name "${reg_name}" \
    registry:2
fi

# Start a basic cluster. We use cilium's CNI and eBPF kube-proxy replacement.

cat <<EOF | kind create cluster --config -
---
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
- role: worker
- role: worker
networking:
  disableDefaultCNI: true
  kubeProxyMode: none
containerdConfigPatches:
  - |-
    [plugins."io.containerd.grpc.v1.cri".registry]
      config_path = "/etc/containerd/certs.d"
EOF

# Enable the registry on the nodes.

REGISTRY_DIR="/etc/containerd/certs.d/localhost:${reg_port}"
for node in $(kind get nodes); do
  docker exec "${node}" mkdir -p "${REGISTRY_DIR}"
  cat <<EOF | docker exec -i "${node}" cp /dev/stdin "${REGISTRY_DIR}/hosts.toml"
[host."http://${reg_name}:5000"]
EOF
done

# Connect the registry to the cluster network.

if [ "$(docker inspect -f='{{json .NetworkSettings.Networks.kind}}' "${reg_name}")" = 'null' ]; then
  docker network connect "kind" "${reg_name}"
fi

# Advertise the registry location.

cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: local-registry-hosting
  namespace: kube-public
data:
  localRegistryHosting.v1: |
    host: "localhost:${reg_port}"
    help: "https://kind.sigs.k8s.io/docs/user/local-registry/"
EOF

# Prepare Gateway API CRDs. These MUST be available before we start cilium.

kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.0.0/experimental-install.yaml

kubectl wait --for condition=Established crd/gatewayclasses.gateway.networking.k8s.io
kubectl wait --for condition=Established crd/gateways.gateway.networking.k8s.io
kubectl wait --for condition=Established crd/httproutes.gateway.networking.k8s.io
kubectl wait --for condition=Established crd/tlsroutes.gateway.networking.k8s.io
kubectl wait --for condition=Established crd/grpcroutes.gateway.networking.k8s.io
kubectl wait --for condition=Established crd/referencegrants.gateway.networking.k8s.io

# Start cilium.

helm repo add cilium https://helm.cilium.io

helm upgrade \
  --install cilium cilium/cilium \
  --version 1.15.0-pre.3 \
  --namespace kube-system \
  --set k8sServiceHost=kind-control-plane \
  --set k8sServicePort=6443 \
  --set kubeProxyReplacement=strict \
  --set gatewayAPI.enabled=true \
  --wait

# Set up MetalLB. Kind's nodes are containers running on the local docker
# network. We reuse that network for LB-IPAM so that LoadBalancers are available
# via "real" local IPs.

KIND_NET_CIDR=$(docker network inspect kind -f '{{(index .IPAM.Config 0).Subnet}}')
METALLB_IP_START=$(echo ${KIND_NET_CIDR} | sed "s@0.0/16@255.200@")
METALLB_IP_END=$(echo ${KIND_NET_CIDR} | sed "s@0.0/16@255.250@")
METALLB_IP_RANGE="${METALLB_IP_START}-${METALLB_IP_END}"

helm install --namespace metallb-system --create-namespace \
  --repo https://metallb.github.io/metallb metallb metallb \
  --version 0.13.12 \
  --wait

cat <<EOF | kubectl apply -f -
---
apiVersion: metallb.io/v1beta1
kind: L2Advertisement
metadata:
  name: l2-ip
  namespace: metallb-system
spec:
  ipAddressPools:
    - default-pool
---
apiVersion: metallb.io/v1beta1
kind: IPAddressPool
metadata:
  name: default-pool
  namespace: metallb-system
spec:
  addresses:
    - ${METALLB_IP_RANGE}
EOF

# At this point we have a similar setup to the one that we'd get with a cloud
# provider. Move on to `01_operations.sh` for the cluster setup.
28 changes: 28 additions & 0 deletions deployment-examples/kubernetes/01_operations.sh
@@ -0,0 +1,28 @@
# This script configures a cluster with a few standard deployments.

# TODO(aaronmondal): Add Grafana, OpenTelemetry and the various other standard
# deployments one would expect in a cluster.

kubectl apply -f gateway.yaml

IMAGE_TAG=$(nix eval .#image.imageTag --raw)
$(nix build .#image --print-build-logs --verbose) \
  && ./result \
  | skopeo \
    copy \
    --dest-tls-verify=false \
    docker-archive:/dev/stdin \
    docker://localhost:5001/nativelink:local
IMAGE_TAG=$(nix eval .#lre.imageTag --raw)
echo $IMAGE_TAG
$(nix build .#lre --print-build-logs --verbose) \
  && ./result \
  | skopeo \
    copy \
    --dest-tls-verify=false \
    docker-archive:/dev/stdin \
    docker://localhost:5001/nativelink-toolchain:local
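
If both copies succeed, the images should be visible in the local registry that `00_infra.sh` exposed on `127.0.0.1:5001`; a quick check against the standard registry v2 API (a sketch, not part of the script):

```bash
# Expect a catalog listing that includes "nativelink" and "nativelink-toolchain".
curl -s http://localhost:5001/v2/_catalog
```
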
15 changes: 15 additions & 0 deletions deployment-examples/kubernetes/02_application.sh
@@ -0,0 +1,15 @@
# Get the nix derivation hash from the toolchain container, replace the
# `__NATIVELINK_TOOLCHAIN_TAG__` placeholder in `worker.json.template` with that
# hash and apply the configuration.

KUSTOMIZE_DIR=$(git rev-parse --show-toplevel)/deployment-examples/kubernetes

sed "s/__NATIVELINK_TOOLCHAIN_TAG__/$(nix eval .#lre.imageTag --raw)/g" \
"$KUSTOMIZE_DIR/worker.json.template" \
> "$KUSTOMIZE_DIR/worker.json"
kubectl apply -k "$KUSTOMIZE_DIR"
kubectl rollout status deploy/nativelink-cas
kubectl rollout status deploy/nativelink-scheduler
kubectl rollout status deploy/nativelink-worker
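
To confirm the substitution worked before debugging anything else, you can check that the placeholder is gone from the generated file and that the worker pods exist (a hypothetical spot check; the `app` label matches the one used in the CI workflow):

```bash
# Prints nothing if the toolchain tag was substituted correctly.
grep __NATIVELINK_TOOLCHAIN_TAG__ "$KUSTOMIZE_DIR/worker.json"
kubectl get pods -l app=nativelink-worker
```
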
11 changes: 11 additions & 0 deletions deployment-examples/kubernetes/03_delete_application.sh
@@ -0,0 +1,11 @@
# Get the nix derivation hash from the toolchain container, replace the
# `__NATIVELINK_TOOLCHAIN_TAG__` placeholder in `worker.json.template` with that
# hash and delete the configuration.

KUSTOMIZE_DIR=$(git rev-parse --show-toplevel)/deployment-examples/kubernetes

sed "s/__NATIVELINK_TOOLCHAIN_TAG__/$(nix eval .#lre.imageTag --raw)/g" \
"$KUSTOMIZE_DIR/worker.json.template" \
> "$KUSTOMIZE_DIR/worker.json"
kubectl delete -k "$KUSTOMIZE_DIR"
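
After the delete completes, no nativelink pods should remain; a hypothetical check using the same labels as the other scripts:

```bash
# Expect "No resources found" once everything has been torn down.
kubectl get pods -l 'app in (nativelink-cas,nativelink-scheduler,nativelink-worker)'
```
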
76 changes: 76 additions & 0 deletions deployment-examples/kubernetes/README.md
@@ -0,0 +1,76 @@
# Kubernetes example

This deployment sets up a 3-container deployment with separate CAS, scheduler
and worker. Don't use this example deployment in production. It's insecure.

In this example we're using `kind` to set up the cluster and `cilium` with
`metallb` to provide a `LoadBalancer` and `GatewayController`.

First set up a local development cluster:

```bash
./00_infra.sh
```
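
The script waits on several Helm releases, so it can take a few minutes. Once it returns, a quick sanity check might look like this (a sketch; labels and names are taken from `00_infra.sh` and the cilium chart):

```bash
kubectl get nodes                                  # three kind nodes, all Ready
kubectl -n kube-system get pods -l k8s-app=cilium  # one cilium agent per node
kubectl get gatewayclass                           # cilium's GatewayClass
kubectl -n metallb-system get ipaddresspools       # the default-pool address range
```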

Next start a few standard deployments. This part also builds the remote
execution containers and makes them available to the cluster:

```bash
./01_operations.sh
```

Finally deploy NativeLink:

```bash
./02_application.sh
```
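
The script blocks on `kubectl rollout status`, so once it returns the three deployments should be ready and the `cache` and `scheduler` gateways queried below should exist (names taken from the scripts; output shape will vary):

```bash
kubectl get deployments nativelink-cas nativelink-scheduler nativelink-worker
kubectl get gtw
```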

> [!TIP]
> You can use `./03_delete_application.sh` to remove just the NativeLink
> deployments but leave the rest of the cluster intact.

This demo setup creates two gateways to expose the `cas` and `scheduler`
deployments via your local docker network:

```bash
CACHE=$(kubectl get gtw cache -o=jsonpath='{.status.addresses[0].value}')
SCHEDULER=$(kubectl get gtw scheduler -o=jsonpath='{.status.addresses[0].value}')

echo "Cache IP: $CACHE"
echo "Scheduler IP: $SCHEDULER"

# Prints something like:
#
# Cache IP: 172.20.255.200
# Scheduler IP: 172.20.255.201
```

You can now pass these IPs to your bazel invocation to use the remote cache and
executor:

```bash
bazel test \
  --config=lre \
  --remote_instance_name=main \
  --remote_cache=grpc://$CACHE:50051 \
  --remote_executor=grpc://$SCHEDULER:50052 \
  //:dummy_test
```

> [!TIP]
> You can add these flags to a `.bazelrc.user` file in the workspace root.
> Note that you'll need to pass in explicit IPs as this file can't resolve
> environment variables:
> ```bash
> # .bazelrc.user
> build --config=lre
> build --remote_instance_name=main
> build --remote_cache=grpc://172.20.255.200:50051
> build --remote_executor=grpc://172.20.255.201:50052
> ```

When you're done testing, delete the cluster:

```bash
kind delete cluster
```
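
Note that `kind delete cluster` removes the nodes but not the local registry container that `00_infra.sh` started with `--restart=always`. To clean that up as well:

```bash
docker rm -f kind-registry
```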