Skip to content

Commit

Permalink
updates: logs, sshd opts and strict-key-check (#20)
Browse files Browse the repository at this point in the history
* optimize logs: only print the control node's output and redirect the other nodes' logs to files

* use `try+` in nemesis

* minor fixes: set `strict-host-key-checking` to true & increase the sshd concurrent-connection limit (`MaxStartups`)
  • Loading branch information
Commelina authored Jul 7, 2023
1 parent aad992e commit d781c38
Show file tree
Hide file tree
Showing 12 changed files with 28 additions and 17 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,5 +56,6 @@ jobs:
name: jepsen-test-result
path: |
store/HStream
/tmp/*.log
!store/HStream/latest
retention-days: 7
1 change: 1 addition & 0 deletions .github/workflows/ci_repo_dispatch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ jobs:
name: jepsen-test-result
path: |
store/HStream
/tmp/*.log
!store/HStream/latest
- name: Post to Slack channel
Expand Down
5 changes: 3 additions & 2 deletions docker/admin/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,11 @@ CMD /usr/local/bin/init-ssh && \
--config-path zk:zookeeper:2181/logdevice.conf \
--enable-maintenance-manager \
--maintenance-log-snapshotting \
--enable-safety-check-periodic-metadata-update & \
--enable-safety-check-periodic-metadata-update \
>> /tmp/$HOSTNAME.log 2>&1 & \
/usr/local/bin/wait-hstore && \
hadmin store nodes-config bootstrap --metadata-replicate-across node:3 && \
/usr/local/bin/wait-hservers && \
hadmin server --host hserver-1 init && \
hadmin server --host hserver-1 init >> /tmp/$HOSTNAME.log 2>&1 && \
echo "Bootstraped" > /var/jepsen/shared/hserver-cluster-started && \
tail -f /dev/null
2 changes: 2 additions & 0 deletions docker/base/init-ssh.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#!/bin/bash

# Configure sshd
# We allow root login, and we increase the number of concurrent connections
sed -i "s/#PermitRootLogin prohibit-password/PermitRootLogin yes/g" /etc/ssh/sshd_config
sed -i "s/#MaxStartups 10:30:100/MaxStartups 100:30:100/g" /etc/ssh/sshd_config

# Start ssh server
mkdir -p /run/sshd
Expand Down
4 changes: 3 additions & 1 deletion docker/control/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ CMD /init-ssh.sh && \
# lein with-profile write-then-read run test \
# --nodes "ld1,ld2,ld3,n1,n2,n3,n4,n5,zk" \
# --ssh-private-key "/root/.ssh/id_rsa" \
# --strict-host-key-checking true \
# --dummy false \
# --grpc-timeout 20 \
# --rate 100 \
Expand All @@ -72,6 +73,7 @@ CMD /init-ssh.sh && \
lein with-profile husky run test \
--nodes "ld1,ld2,ld3,n1,n2,n3,n4,n5,zk" \
--ssh-private-key "/root/.ssh/id_rsa" \
--strict-host-key-checking true \
--dummy false \
--grpc-timeout 60 \
--rate 50 \
Expand All @@ -81,7 +83,7 @@ CMD /init-ssh.sh && \
--fetching-number 25 \
--max-streams 10 \
--write-timeout 60 \
--nemesis-interval 30 \
--nemesis-interval 20 \
--nemesis-on true \
--max-partitions 5 && \
exit
4 changes: 4 additions & 0 deletions docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ x-node:
volumes:
- "jepsen-shared:/var/jepsen/shared"
- "/sys/fs/cgroup:/sys/fs/cgroup:ro"
- "/tmp:/tmp:rw"
networks:
- jepsen
cap_add:
Expand All @@ -36,6 +37,7 @@ x-ld:
volumes:
- "jepsen-shared:/var/jepsen/shared"
- "/sys/fs/cgroup:/sys/fs/cgroup:ro"
- "/tmp:/tmp:rw"
networks:
- jepsen
cap_add:
Expand Down Expand Up @@ -84,6 +86,7 @@ services:
volumes:
- "jepsen-shared:/var/jepsen/shared"
- "/sys/fs/cgroup:/sys/fs/cgroup:ro"
- "/tmp:/tmp:rw"

zookeeper:
container_name: jepsen-zookeeper
Expand All @@ -106,6 +109,7 @@ services:
volumes:
- "jepsen-shared:/var/jepsen/shared"
- "/sys/fs/cgroup:/sys/fs/cgroup:ro"
- "/tmp:/tmp:rw"

control:
container_name: jepsen-control
Expand Down
3 changes: 2 additions & 1 deletion docker/ld/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@ CMD /usr/local/bin/init-ssh && \
--name $HOSTNAME \
--address $MY_IP \
--local-log-store-path /data/store \
--num-shards 1
--num-shards 1 \
>> /tmp/$HOSTNAME.log 2>&1
2 changes: 1 addition & 1 deletion docker/node/start-server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ hstream-server \
--log-level debug \
--log-with-color \
--seed-nodes hserver-1,hserver-2,hserver-3,hserver-4,hserver-5 \
>/tmp/node.log 2>&1 &
>>/tmp/$HOSTNAME.log 2>&1 &
2 changes: 1 addition & 1 deletion docker/zk/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,4 @@ RUN chmod +x /usr/local/bin/init-ssh
EXPOSE 22 2181 2888 3888

CMD /usr/local/bin/init-ssh && \
zkServer.sh start-foreground
zkServer.sh start-foreground >> /tmp/$HOSTNAME.log 2>&1
15 changes: 8 additions & 7 deletions src/jepsen/hstream/nemesis.clj
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
(ns jepsen.hstream.nemesis
(:gen-class)
(:require [clojure.tools.logging :refer :all]
[slingshot.slingshot :refer [throw+ try+]]
[jepsen [db :as db] [cli :as cli] [checker :as checker]
[client :as client] [control :as c] [generator :as gen]
[independent :as independent] [nemesis :as nemesis]
Expand All @@ -13,7 +14,7 @@

(defn kill-node
[node]
(try (c/on node
(try+ (c/on node
(c/exec* "killall"
"-9" "hstream-server"
"&&" "killall"
Expand All @@ -24,15 +25,15 @@

(defn is-hserver-on-node-dead?
[node]
(try
(try+
(let [shell-out (c/on node
(c/exec* "pgrep" "-x" "hstream-server" "||" "true"))]
(empty? shell-out))
(catch Exception e (warn "error when checking death on" node ":" e) true)))

(defn is-hserver-on-node-alive?
[node]
(try (let [shell-out
(try+ (let [shell-out
(c/on node (c/exec* "pgrep" "-x" "hstream-server" "||" "true"))]
(seq shell-out))
(catch Exception e
Expand All @@ -41,7 +42,7 @@

(defn restart-node
[node]
(try (c/on node (c/exec* "/bin/start-server"))
(try+ (c/on node (c/exec* "/bin/start-server"))
(Thread/sleep 10000) ;; It may take a while for the server to join the cluster
(catch Exception e (warn "error when restarting" node ":" e))))

Expand Down Expand Up @@ -70,15 +71,15 @@
(let [node (rand-nth alive-nodes)]
(kill-node node)
(assoc op
:value (str "killed" node)
:value (str "killed " node)
:node node))))
:resume-node (let [dead-nodes (find-hserver-dead-nodes test)]
(if (empty? dead-nodes)
(assoc op :value "restarting skipped")
(let [node (rand-nth dead-nodes)]
(restart-node node)
(assoc op
:value (str "restarted" node)
:value (str "restarted " node)
:node node))))))
(nemesis/teardown! [_ _]))))

Expand Down Expand Up @@ -133,7 +134,7 @@

(defn nemesis+
[]
(nemesis/compose {#{:kill-node :resume-node} (hserver-killer 2),
(nemesis/compose {#{:kill-node :resume-node} (hserver-killer 1),
#{:start-slow :stop-slow} (slower),
#{:start-loss :stop-loss} (losser),
{:isolate-zk :start, :resume-zk :stop} (zk-nemesis)}))
Expand Down
3 changes: 1 addition & 2 deletions src/jepsen/husky_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,7 @@
subscription-ack-timeout),
:nemesis (local-nemesis/nemesis+),
:ssh {:dummy? (:dummy opts),
:private-key-path "/root/.ssh/id_rsa",
:strict-host-key-checking false},
:private-key-path "/root/.ssh/id_rsa"},
:checker (checker/compose {:set (local-checker/set+),
:stat (checker/stats),
:latency (checker/latency-graph),
Expand Down
3 changes: 1 addition & 2 deletions src/jepsen/write_then_read.clj
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,7 @@
subscription-ack-timeout),
:nemesis (local-nemesis/nemesis+),
:ssh {:dummy? (:dummy opts),
:private-key-path "/root/.ssh/id_rsa",
:strict-host-key-checking false},
:private-key-path "/root/.ssh/id_rsa"},
:checker (checker/compose {:set (local-checker/set+),
:stat (checker/stats),
:latency (checker/latency-graph),
Expand Down

0 comments on commit d781c38

Please sign in to comment.