From b669e21205e948f79834e611825d3816b5e448ab Mon Sep 17 00:00:00 2001
From: Arseny Sher
Date: Mon, 11 Nov 2024 18:36:48 +0300
Subject: [PATCH] safekeeper: make --partial-backup-timeout respect min.

We added jitter to the --partial-backup-timeout and --eviction-min-resident
values by taking rand of them. To respect the min-value semantics of these
options, use n + rand(n) instead.
---
 safekeeper/src/bin/safekeeper.rs     |  2 ++
 safekeeper/src/timeline_eviction.rs  |  4 +---
 safekeeper/src/timeline_manager.rs   | 18 +++++++++++++-----
 safekeeper/src/wal_backup_partial.rs |  6 +++---
 4 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 1248428d3393..69f53206e30f 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -171,6 +171,7 @@ struct Args {
     #[arg(long)]
     walsenders_keep_horizon: bool,
     /// Controls how long backup will wait until uploading the partial segment.
+    /// The actual waiting time is the passed value + rand of it to add some jitter.
     #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)]
     partial_backup_timeout: Duration,
     /// Disable task to push messages to broker every second. Supposed to
@@ -195,6 +196,7 @@
     /// if it weren't for `eviction_min_resident` preventing that.
     ///
     /// Also defines interval for eviction retries.
+    /// The actual waiting time is the passed value + rand of it to add some jitter.
     #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)]
     eviction_min_resident: Duration,
 }
diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs
index 303421c83758..4273f0c7708c 100644
--- a/safekeeper/src/timeline_eviction.rs
+++ b/safekeeper/src/timeline_eviction.rs
@@ -18,7 +18,6 @@ use crate::{
     metrics::{
         EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED, NUM_EVICTED_TIMELINES,
     },
-    rate_limit::rand_duration,
     timeline_manager::{Manager, StateSnapshot},
     wal_backup,
     wal_backup_partial::{self, PartialRemoteSegment},
@@ -131,8 +130,7 @@ impl Manager {
             return;
         }
 
-        self.evict_not_before =
-            tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident);
+        self.update_evict_not_before();
 
         info!("successfully restored evicted timeline");
         NUM_EVICTED_TIMELINES.dec();
diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs
index 79200fff8d84..b760647d2be2 100644
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -315,15 +315,13 @@ pub async fn main_task(
                 mgr.set_status(Status::EvictTimeline);
                 if !mgr.evict_timeline().await {
                     // eviction failed, try again later
-                    mgr.evict_not_before =
-                        Instant::now() + rand_duration(&mgr.conf.eviction_min_resident);
+                    mgr.update_evict_not_before();
                     update_next_event(&mut next_event, mgr.evict_not_before);
                 }
             }
             None => {
                 // we can't evict timeline now, will try again later
-                mgr.evict_not_before =
-                    Instant::now() + rand_duration(&mgr.conf.eviction_min_resident);
+                mgr.update_evict_not_before();
                 update_next_event(&mut next_event, mgr.evict_not_before);
             }
         }
@@ -430,7 +428,9 @@ impl Manager {
             tli,
             global_rate_limiter,
             // to smooth out evictions spike after restart
-            evict_not_before: Instant::now() + rand_duration(&conf.eviction_min_resident),
+            evict_not_before: Instant::now()
+                + conf.eviction_min_resident
+                + rand_duration(&conf.eviction_min_resident),
             conf,
         }
     }
@@ -720,6 +720,14 @@ impl Manager {
             }
         }
     }
+
+    pub fn update_evict_not_before(&mut self) {
+        // Wait at least eviction_min_resident and add a randomized value of it to
+        // add jitter.
+        self.evict_not_before = Instant::now()
+            + self.conf.eviction_min_resident
+            + rand_duration(&self.conf.eviction_min_resident)
+    }
 }
 
 // utility functions
diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs
index bddfca50e4fb..eaeb91ddc8ae 100644
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -426,12 +426,12 @@ pub async fn main_task(
     cancel: CancellationToken,
 ) -> Option<PartialRemoteSegment> {
     debug!("started");
-    let await_duration = conf.partial_backup_timeout;
     let mut first_iteration = true;
 
     let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
     let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();
 
+    let partial_backup_timeout = conf.partial_backup_timeout;
     let mut backup = PartialBackup::new(tli, conf).await;
 
     debug!("state: {:?}", backup.state);
@@ -489,9 +489,9 @@ pub async fn main_task(
         // if this is not the first iteration, we will wait for the full await_duration
         let await_duration = if first_iteration {
             first_iteration = false;
-            rand_duration(&await_duration)
+            partial_backup_timeout + rand_duration(&partial_backup_timeout)
         } else {
-            await_duration
+            partial_backup_timeout
         };
 
         // fixing the segno and waiting some time to prevent reuploading the same segment too often
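
For illustration, a minimal standalone sketch of the old vs. new deadline
computation described above, assuming `rand_duration` returns a uniformly
random duration below its argument; the rand-crate helper and the 600s value
below are stand-ins for this sketch, not the safekeeper's actual
rate_limit::rand_duration or its defaults.

use std::time::{Duration, Instant};

use rand::Rng;

// Stand-in for the patch's `rand_duration`: uniformly random duration in [0, d).
fn rand_duration(d: &Duration) -> Duration {
    let millis = rand::thread_rng().gen_range(0..d.as_millis() as u64);
    Duration::from_millis(millis)
}

fn main() {
    let eviction_min_resident = Duration::from_secs(600);

    // Old behaviour: wait only rand(n), so the effective minimum could be near zero.
    let old_deadline = Instant::now() + rand_duration(&eviction_min_resident);

    // New behaviour: wait n + rand(n), i.e. at least the configured minimum plus jitter.
    let new_deadline = Instant::now()
        + eviction_min_resident
        + rand_duration(&eviction_min_resident);

    println!("old: {:?}, new: {:?}", old_deadline, new_deadline);
}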