safekeeper: make --partial-backup-timeout respect min.
We added jitter to the --partial-backup-timeout and --eviction-min-resident
values by taking rand of them, which could make the actual wait arbitrarily
shorter than the configured minimum. To respect the min value semantics of
these, use n + rand(n) instead.
arssher committed Nov 12, 2024
1 parent 54a1676 commit b669e21
Showing 4 changed files with 19 additions and 11 deletions.
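
For intuition, the fix replaces rand(n) with n + rand(n). Below is a minimal, self-contained Rust sketch; the rand_duration stub is an assumption standing in for safekeeper's rate_limit::rand_duration, taken here to return a uniformly random duration in [0, d].

    use rand::Rng;
    use std::time::Duration;

    // Illustrative stub only: assumed behavior of rate_limit::rand_duration.
    fn rand_duration(d: &Duration) -> Duration {
        let ms = rand::thread_rng().gen_range(0..=d.as_millis() as u64);
        Duration::from_millis(ms)
    }

    fn main() {
        let n = Duration::from_secs(600); // e.g. an eviction_min_resident of 10m
        let old_wait = rand_duration(&n); // before: uniform in [0, n], can undercut the minimum
        let new_wait = n + rand_duration(&n); // after: uniform in [n, 2n], minimum always respected
        println!("old: {old_wait:?}, new: {new_wait:?}");
    }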
2 changes: 2 additions & 0 deletions safekeeper/src/bin/safekeeper.rs
@@ -171,6 +171,7 @@ struct Args {
     #[arg(long)]
     walsenders_keep_horizon: bool,
     /// Controls how long backup will wait until uploading the partial segment.
+    /// The actual waiting time is the passed value + rand of it to add some jitter.
     #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)]
     partial_backup_timeout: Duration,
     /// Disable task to push messages to broker every second. Supposed to
@@ -195,6 +196,7 @@ struct Args {
     /// if it weren't for `eviction_min_resident` preventing that.
     ///
     /// Also defines interval for eviction retries.
+    /// The actual waiting time is the passed value + rand of it to add some jitter.
     #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)]
     eviction_min_resident: Duration,
 }
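
As the new doc comments say, the flag value now acts as a lower bound on the wait. A hedged sketch of the resulting window for a hypothetical --partial-backup-timeout=15m (the shipped default hides behind DEFAULT_PARTIAL_BACKUP_TIMEOUT and is not shown in this diff):

    use std::time::Duration;

    fn main() {
        // Hypothetical flag value; clap routes the string through humantime::parse_duration.
        let partial_backup_timeout: Duration =
            humantime::parse_duration("15m").expect("valid humantime string");
        // With n + rand(n) jitter the first upload waits somewhere in [15m, 30m];
        // later iterations wait exactly 15m (see wal_backup_partial.rs below).
        println!(
            "first wait in [{:?}, {:?}]",
            partial_backup_timeout,
            partial_backup_timeout * 2
        );
    }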
4 changes: 1 addition & 3 deletions safekeeper/src/timeline_eviction.rs
@@ -18,7 +18,6 @@ use crate::{
     metrics::{
         EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED, NUM_EVICTED_TIMELINES,
     },
-    rate_limit::rand_duration,
     timeline_manager::{Manager, StateSnapshot},
     wal_backup,
     wal_backup_partial::{self, PartialRemoteSegment},
@@ -131,8 +130,7 @@ impl Manager {
             return;
         }
 
-        self.evict_not_before =
-            tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident);
+        self.update_evict_not_before();
 
         info!("successfully restored evicted timeline");
         NUM_EVICTED_TIMELINES.dec();
18 changes: 13 additions & 5 deletions safekeeper/src/timeline_manager.rs
@@ -315,15 +315,13 @@ pub async fn main_task(
                     mgr.set_status(Status::EvictTimeline);
                     if !mgr.evict_timeline().await {
                         // eviction failed, try again later
-                        mgr.evict_not_before =
-                            Instant::now() + rand_duration(&mgr.conf.eviction_min_resident);
+                        mgr.update_evict_not_before();
                         update_next_event(&mut next_event, mgr.evict_not_before);
                     }
                 }
                 None => {
                     // we can't evict timeline now, will try again later
-                    mgr.evict_not_before =
-                        Instant::now() + rand_duration(&mgr.conf.eviction_min_resident);
+                    mgr.update_evict_not_before();
                     update_next_event(&mut next_event, mgr.evict_not_before);
                 }
             }
@@ -430,7 +428,9 @@ impl Manager {
             tli,
             global_rate_limiter,
             // to smooth out evictions spike after restart
-            evict_not_before: Instant::now() + rand_duration(&conf.eviction_min_resident),
+            evict_not_before: Instant::now()
+                + conf.eviction_min_resident
+                + rand_duration(&conf.eviction_min_resident),
             conf,
         }
     }
@@ -720,6 +720,14 @@ impl Manager {
             }
         }
     }
+
+    pub fn update_evict_not_before(&mut self) {
+        // Wait at least eviction_min_resident and add a randomized value of it
+        // to add jitter.
+        self.evict_not_before = Instant::now()
+            + self.conf.eviction_min_resident
+            + rand_duration(&self.conf.eviction_min_resident)
+    }
 }
 
 // utility functions
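
The new helper centralizes the invariant that a timeline stays resident for at least eviction_min_resident before an eviction is attempted or retried. A hedged property check of that invariant, reusing the illustrative rand_duration stub from above:

    use rand::Rng;
    use std::time::{Duration, Instant};

    // Illustrative stub, as before: uniform in [0, d].
    fn rand_duration(d: &Duration) -> Duration {
        Duration::from_millis(rand::thread_rng().gen_range(0..=d.as_millis() as u64))
    }

    fn main() {
        let eviction_min_resident = Duration::from_secs(600);
        let now = Instant::now();
        for _ in 0..1_000 {
            // Mirrors update_evict_not_before: minimum residency plus jitter.
            let evict_not_before =
                now + eviction_min_resident + rand_duration(&eviction_min_resident);
            assert!(evict_not_before >= now + eviction_min_resident);
            assert!(evict_not_before <= now + eviction_min_resident * 2);
        }
        println!("evict_not_before never undercuts eviction_min_resident");
    }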
6 changes: 3 additions & 3 deletions safekeeper/src/wal_backup_partial.rs
@@ -426,12 +426,12 @@ pub async fn main_task(
     cancel: CancellationToken,
 ) -> Option<PartialRemoteSegment> {
     debug!("started");
-    let await_duration = conf.partial_backup_timeout;
     let mut first_iteration = true;
 
     let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
     let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();
 
+    let partial_backup_timeout = conf.partial_backup_timeout;
     let mut backup = PartialBackup::new(tli, conf).await;
 
     debug!("state: {:?}", backup.state);
@@ -489,9 +489,9 @@
         // if this is not the first iteration, we will wait for the full await_duration
         let await_duration = if first_iteration {
             first_iteration = false;
-            rand_duration(&await_duration)
+            partial_backup_timeout + rand_duration(&partial_backup_timeout)
         } else {
-            await_duration
+            partial_backup_timeout
         };
 
         // fixing the segno and waiting some time to prevent reuploading the same segment too often
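
The first_iteration flag confines the jitter to the initial wait, so safekeepers desynchronize after a restart and then settle on the configured cadence. The pattern in isolation (the loop and stub are illustrative, not the real task body):

    use rand::Rng;
    use std::time::Duration;

    // Illustrative stub, as before: uniform in [0, d].
    fn rand_duration(d: &Duration) -> Duration {
        Duration::from_millis(rand::thread_rng().gen_range(0..=d.as_millis() as u64))
    }

    fn main() {
        let partial_backup_timeout = Duration::from_secs(900);
        let mut first_iteration = true;
        for i in 0..3 {
            let await_duration = if first_iteration {
                first_iteration = false;
                // first wait: jittered, uniform in [timeout, 2*timeout]
                partial_backup_timeout + rand_duration(&partial_backup_timeout)
            } else {
                // later waits: exactly the configured timeout
                partial_backup_timeout
            };
            println!("iteration {i}: wait {await_duration:?}");
        }
    }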
