-
Notifications
You must be signed in to change notification settings - Fork 26
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ensure we reconnect on failure #173
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -261,13 +261,17 @@ impl Client { | |
} | ||
// wait for at least one endpoint to connect | ||
futures::future::select_all(endpoints.iter().map(|x| x.connected().boxed())).await; | ||
// Sort by health score | ||
endpoints.sort_by_key(|endpoint| std::cmp::Reverse(endpoint.health().score())); | ||
// Pick the first one | ||
endpoints[0].clone() | ||
|
||
endpoints | ||
.iter() | ||
.max_by_key(|endpoint| endpoint.health().score()) | ||
.expect("No endpoints") | ||
.clone() | ||
}; | ||
|
||
let mut selected_endpoint = healthiest_endpoint(None).await; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is important: it ensures at least one endpoint is connected. Selecting just the first one may result in an endpoint connection failure, and it never connects. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In that case we need a test. The current behaviour makes unit tests non-deterministic, as the client may connect to any of the dummy servers, so it is best to fix the wait-for-connect behaviour anyway |
||
let mut selected_endpoint = endpoints[0].clone(); | ||
|
||
selected_endpoint.connected().await; | ||
|
||
let handle_message = |message: Message, endpoint: Arc<Endpoint>, rotation_notify: Arc<Notify>| { | ||
let tx = message_tx_bg.clone(); | ||
|
@@ -422,6 +426,10 @@ impl Client { | |
_ = selected_endpoint.health().unhealthy() => { | ||
// Current selected endpoint is unhealthy, try to rotate to another one. | ||
// In case of all endpoints are unhealthy, we don't want to keep rotating but stick with the healthiest one. | ||
|
||
// The ws client maybe in a state that requires a reconnect | ||
selected_endpoint.reconnect().await; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This will execute the moment the endpoint becomes unhealthy, and when that happens it will try to reconnect. I don't think this extra reconnect will help. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is no reconnect currently; we have to drop and re-create the client to actually reconnect. Currently it will always fail if the remote drops the connection, and it will never be able to connect to it again |
||
|
||
let new_selected_endpoint = healthiest_endpoint(None).await; | ||
if new_selected_endpoint.url() != selected_endpoint.url() { | ||
tracing::warn!("Switch to endpoint: {new_url}", new_url=new_selected_endpoint.url()); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -290,3 +290,46 @@ async fn health_check_works() { | |
handle1.stop().unwrap(); | ||
handle2.stop().unwrap(); | ||
} | ||
|
||
#[tokio::test] | ||
async fn reconnect_on_disconnect() { | ||
let (addr1, handle1, mut rx1, _) = dummy_server().await; | ||
let (addr2, handle2, mut rx2, _) = dummy_server().await; | ||
|
||
let client = Client::new( | ||
[format!("ws://{addr1}"), format!("ws://{addr2}")], | ||
Some(Duration::from_millis(100)), | ||
None, | ||
Some(2), | ||
None, | ||
) | ||
.unwrap(); | ||
|
||
let h1 = tokio::spawn(async move { | ||
let _req = rx1.recv().await.unwrap(); | ||
// no response, let it timeout | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A request timeout will make the endpoint unhealthy; therefore it will try to reconnect itself |
||
tokio::time::sleep(Duration::from_millis(200)).await; | ||
}); | ||
|
||
let h2 = tokio::spawn(async move { | ||
let req = rx2.recv().await.unwrap(); | ||
req.respond(json!(1)); | ||
}); | ||
|
||
let h3 = tokio::spawn(async move { | ||
let res = client.request("mock_rpc", vec![]).await; | ||
assert_eq!(res.unwrap(), json!(1)); | ||
|
||
tokio::time::sleep(Duration::from_millis(2000)).await; | ||
|
||
assert_eq!(client.endpoints()[0].connect_counter(), 2); | ||
assert_eq!(client.endpoints()[1].connect_counter(), 1); | ||
}); | ||
|
||
h3.await.unwrap(); | ||
h1.await.unwrap(); | ||
h2.await.unwrap(); | ||
|
||
handle1.stop().unwrap(); | ||
handle2.stop().unwrap(); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
tokio::sync::Notify
may be a better option. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The reason will be displayed to describe this comment to others. Learn more. Notify is a one-off thing, but we may need to reconnect multiple times.