From d781c38aabd1e1f3992c327ccc55b8142910f4d0 Mon Sep 17 00:00:00 2001 From: Commelina Date: Fri, 7 Jul 2023 23:26:36 +0800 Subject: [PATCH] updates: logs, sshd opts and strict-key-check (#20) * optimize logs: only print the control node's output and redirect the other nodes' output to files * use `try+` in nemesis * minor fixes: set `strict-host-key-checking` to true & increase the number of concurrent sshd connections --- .github/workflows/ci.yml | 1 + .github/workflows/ci_repo_dispatch.yml | 1 + docker/admin/Dockerfile | 5 +++-- docker/base/init-ssh.sh | 2 ++ docker/control/Dockerfile | 4 +++- docker/docker-compose.yml | 4 ++++ docker/ld/Dockerfile | 3 ++- docker/node/start-server.sh | 2 +- docker/zk/Dockerfile | 2 +- src/jepsen/hstream/nemesis.clj | 15 ++++++++------- src/jepsen/husky_test.clj | 3 +-- src/jepsen/write_then_read.clj | 3 +-- 12 files changed, 28 insertions(+), 17 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 864bf6a..83b4975 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -56,5 +56,6 @@ jobs: name: jepsen-test-result path: | store/HStream + /tmp/*.log !store/HStream/latest retention-days: 7 diff --git a/.github/workflows/ci_repo_dispatch.yml b/.github/workflows/ci_repo_dispatch.yml index 7a75636..e13ed20 100644 --- a/.github/workflows/ci_repo_dispatch.yml +++ b/.github/workflows/ci_repo_dispatch.yml @@ -54,6 +54,7 @@ jobs: name: jepsen-test-result path: | store/HStream + /tmp/*.log !store/HStream/latest - name: Post to Slack channel diff --git a/docker/admin/Dockerfile b/docker/admin/Dockerfile index 2a3dc69..ed4625a 100644 --- a/docker/admin/Dockerfile +++ b/docker/admin/Dockerfile @@ -27,10 +27,11 @@ CMD /usr/local/bin/init-ssh && \ --config-path zk:zookeeper:2181/logdevice.conf \ --enable-maintenance-manager \ --maintenance-log-snapshotting \ - --enable-safety-check-periodic-metadata-update & \ + --enable-safety-check-periodic-metadata-update \ + >> /tmp/$HOSTNAME.log 2>&1 & \ /usr/local/bin/wait-hstore && \ hadmin store nodes-config bootstrap
--metadata-replicate-across node:3 && \ /usr/local/bin/wait-hservers && \ - hadmin server --host hserver-1 init && \ + hadmin server --host hserver-1 init >> /tmp/$HOSTNAME.log 2>&1 && \ echo "Bootstraped" > /var/jepsen/shared/hserver-cluster-started && \ tail -f /dev/null diff --git a/docker/base/init-ssh.sh b/docker/base/init-ssh.sh index e38c830..2365d1d 100644 --- a/docker/base/init-ssh.sh +++ b/docker/base/init-ssh.sh @@ -1,7 +1,9 @@ #!/bin/bash # Configure sshd +# We allow root login, and we increase the number of concurrent connections sed -i "s/#PermitRootLogin prohibit-password/PermitRootLogin yes/g" /etc/ssh/sshd_config +sed -i "s/#MaxStartups 10:30:100/MaxStartups 100:30:100/g" /etc/ssh/sshd_config # Start ssh server mkdir -p /run/sshd diff --git a/docker/control/Dockerfile b/docker/control/Dockerfile index 5e4d5d6..80c7e43 100644 --- a/docker/control/Dockerfile +++ b/docker/control/Dockerfile @@ -56,6 +56,7 @@ CMD /init-ssh.sh && \ # lein with-profile write-then-read run test \ # --nodes "ld1,ld2,ld3,n1,n2,n3,n4,n5,zk" \ # --ssh-private-key "/root/.ssh/id_rsa" \ + # --strict-host-key-checking true \ # --dummy false \ # --grpc-timeout 20 \ # --rate 100 \ @@ -72,6 +73,7 @@ CMD /init-ssh.sh && \ lein with-profile husky run test \ --nodes "ld1,ld2,ld3,n1,n2,n3,n4,n5,zk" \ --ssh-private-key "/root/.ssh/id_rsa" \ + --strict-host-key-checking true \ --dummy false \ --grpc-timeout 60 \ --rate 50 \ @@ -81,7 +83,7 @@ CMD /init-ssh.sh && \ --fetching-number 25 \ --max-streams 10 \ --write-timeout 60 \ - --nemesis-interval 30 \ + --nemesis-interval 20 \ --nemesis-on true \ --max-partitions 5 && \ exit diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 1a924c3..1c9dd6c 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -14,6 +14,7 @@ x-node: volumes: - "jepsen-shared:/var/jepsen/shared" - "/sys/fs/cgroup:/sys/fs/cgroup:ro" + - "/tmp:/tmp:rw" networks: - jepsen cap_add: @@ -36,6 +37,7 @@ x-ld: volumes: - 
"jepsen-shared:/var/jepsen/shared" - "/sys/fs/cgroup:/sys/fs/cgroup:ro" + - "/tmp:/tmp:rw" networks: - jepsen cap_add: @@ -84,6 +86,7 @@ services: volumes: - "jepsen-shared:/var/jepsen/shared" - "/sys/fs/cgroup:/sys/fs/cgroup:ro" + - "/tmp:/tmp:rw" zookeeper: container_name: jepsen-zookeeper @@ -106,6 +109,7 @@ services: volumes: - "jepsen-shared:/var/jepsen/shared" - "/sys/fs/cgroup:/sys/fs/cgroup:ro" + - "/tmp:/tmp:rw" control: container_name: jepsen-control diff --git a/docker/ld/Dockerfile b/docker/ld/Dockerfile index 76e53de..e87af87 100644 --- a/docker/ld/Dockerfile +++ b/docker/ld/Dockerfile @@ -17,4 +17,5 @@ CMD /usr/local/bin/init-ssh && \ --name $HOSTNAME \ --address $MY_IP \ --local-log-store-path /data/store \ - --num-shards 1 + --num-shards 1 \ + >> /tmp/$HOSTNAME.log 2>&1 diff --git a/docker/node/start-server.sh b/docker/node/start-server.sh index b2a440f..73b04bd 100644 --- a/docker/node/start-server.sh +++ b/docker/node/start-server.sh @@ -19,4 +19,4 @@ hstream-server \ --log-level debug \ --log-with-color \ --seed-nodes hserver-1,hserver-2,hserver-3,hserver-4,hserver-5 \ - >/tmp/node.log 2>&1 & + >>/tmp/$HOSTNAME.log 2>&1 & diff --git a/docker/zk/Dockerfile b/docker/zk/Dockerfile index 9763a6b..22ae2d2 100644 --- a/docker/zk/Dockerfile +++ b/docker/zk/Dockerfile @@ -53,4 +53,4 @@ RUN chmod +x /usr/local/bin/init-ssh EXPOSE 22 2181 2888 3888 CMD /usr/local/bin/init-ssh && \ - zkServer.sh start-foreground + zkServer.sh start-foreground >> /tmp/$HOSTNAME.log 2>&1 diff --git a/src/jepsen/hstream/nemesis.clj b/src/jepsen/hstream/nemesis.clj index f50a056..216c3c9 100644 --- a/src/jepsen/hstream/nemesis.clj +++ b/src/jepsen/hstream/nemesis.clj @@ -1,6 +1,7 @@ (ns jepsen.hstream.nemesis (:gen-class) (:require [clojure.tools.logging :refer :all] + [slingshot.slingshot :refer [throw+ try+]] [jepsen [db :as db] [cli :as cli] [checker :as checker] [client :as client] [control :as c] [generator :as gen] [independent :as independent] [nemesis :as nemesis] @@ 
-13,7 +14,7 @@ (defn kill-node [node] - (try (c/on node + (try+ (c/on node (c/exec* "killall" "-9" "hstream-server" "&&" "killall" @@ -24,7 +25,7 @@ (defn is-hserver-on-node-dead? [node] - (try + (try+ (let [shell-out (c/on node (c/exec* "pgrep" "-x" "hstream-server" "||" "true"))] (empty? shell-out)) @@ -32,7 +33,7 @@ (defn is-hserver-on-node-alive? [node] - (try (let [shell-out + (try+ (let [shell-out (c/on node (c/exec* "pgrep" "-x" "hstream-server" "||" "true"))] (seq shell-out)) (catch Exception e @@ -41,7 +42,7 @@ (defn restart-node [node] - (try (c/on node (c/exec* "/bin/start-server")) + (try+ (c/on node (c/exec* "/bin/start-server")) (Thread/sleep 10000) ;; It may take a while for the server to join the cluster (catch Exception e (warn "error when restarting" node ":" e)))) @@ -70,7 +71,7 @@ (let [node (rand-nth alive-nodes)] (kill-node node) (assoc op - :value (str "killed" node) + :value (str "killed " node) :node node)))) :resume-node (let [dead-nodes (find-hserver-dead-nodes test)] (if (empty? dead-nodes) @@ -78,7 +79,7 @@ (let [node (rand-nth dead-nodes)] (restart-node node) (assoc op - :value (str "restarted" node) + :value (str "restarted " node) :node node)))))) (nemesis/teardown! [_ _])))) @@ -133,7 +134,7 @@ (defn nemesis+ [] - (nemesis/compose {#{:kill-node :resume-node} (hserver-killer 2), + (nemesis/compose {#{:kill-node :resume-node} (hserver-killer 1), #{:start-slow :stop-slow} (slower), #{:start-loss :stop-loss} (losser), {:isolate-zk :start, :resume-zk :stop} (zk-nemesis)})) diff --git a/src/jepsen/husky_test.clj b/src/jepsen/husky_test.clj index 0ae5a0c..dcb3212 100644 --- a/src/jepsen/husky_test.clj +++ b/src/jepsen/husky_test.clj @@ -37,8 +37,7 @@ subscription-ack-timeout), :nemesis (local-nemesis/nemesis+), :ssh {:dummy? 
(:dummy opts), - :private-key-path "/root/.ssh/id_rsa", - :strict-host-key-checking false}, + :private-key-path "/root/.ssh/id_rsa"}, :checker (checker/compose {:set (local-checker/set+), :stat (checker/stats), :latency (checker/latency-graph), diff --git a/src/jepsen/write_then_read.clj b/src/jepsen/write_then_read.clj index 81bab1a..0b4745a 100644 --- a/src/jepsen/write_then_read.clj +++ b/src/jepsen/write_then_read.clj @@ -65,8 +65,7 @@ subscription-ack-timeout), :nemesis (local-nemesis/nemesis+), :ssh {:dummy? (:dummy opts), - :private-key-path "/root/.ssh/id_rsa", - :strict-host-key-checking false}, + :private-key-path "/root/.ssh/id_rsa"}, :checker (checker/compose {:set (local-checker/set+), :stat (checker/stats), :latency (checker/latency-graph),