From ef722e92b900c9976af5733bc135e2e88e26fef9 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 26 Aug 2024 16:44:34 +0200 Subject: [PATCH] pageserver: ensure local disk state is durable during startup refs https://github.com/neondatabase/neon/issues/6989 Problem ------- After unclean shutdown, we get restarted, read the local filesystem, and make decisions based on those reads. Some of the data might not yet have been fsynced when the unclean shutdown happened. Durability matters even though Pageservers are conceptually just a cache of state in S3. For example: - the cloud control plane is not a control loop => pageserver responses to tenant attachment, etc., need to be durable. - the storage controller does not rely on this (as much?) - we don't have layer file checksumming, so downloaded+renamed but not fsynced layer files are technically not to be trusted - https://github.com/neondatabase/neon/issues/2683 Solution -------- `syncfs` the tenants directory during startup, before we start reading from it. This is a bit overkill because we do remove some temp files (InMemoryLayer!) later during startup. Further, these temp files are particularly likely to be dirty in the kernel page cache. However, we don't want to refactor that cleanup code right now, and the amount of dirty data on pageservers is generally not that high. Lastly, with [direct IO](https://github.com/neondatabase/neon/issues/8130) we're going to have near-zero kernel page cache anyway quite soon.
--- Cargo.toml | 2 +- pageserver/src/bin/pageserver.rs | 49 ++++++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 963841e340979..e038c0b4ffc3b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -113,7 +113,7 @@ md5 = "0.7.0" measured = { version = "0.0.22", features=["lasso"] } measured-process = { version = "0.0.22" } memoffset = "0.8" -nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } +nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] } notify = "6.0.0" num_cpus = "1.15" num-traits = "0.2.15" diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index da0c11d9bf0ab..594fed5c18d81 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -126,10 +126,53 @@ fn main() -> anyhow::Result<()> { info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings"); info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access"); + // The tenants directory contains all the pageserver local disk state. + // Create it if it does not exist and make sure all the contents are durable before proceeding. + // Ensuring durability eliminates a whole bug class where we come up after an unclean shutdown. + // After an unclean shutdown, we don't know if all the filesystem content we can read via syscalls is actually durable or not. + // Examples of unclean shutdown: OOM kill, systemd killing us during shutdown, self abort due to unrecoverable IO error. 
let tenants_path = conf.tenants_path(); - if !tenants_path.exists() { - utils::crashsafe::create_dir_all(conf.tenants_path()) - .with_context(|| format!("Failed to create tenants root dir at '{tenants_path}'"))?; + { + let open = || { + nix::dir::Dir::open( + tenants_path.as_std_path(), + nix::fcntl::OFlag::O_DIRECTORY | nix::fcntl::OFlag::O_RDONLY, + nix::sys::stat::Mode::empty(), + ) + }; + let dirfd = match open() { + Ok(dirfd) => dirfd, + Err(e) => match e { + nix::errno::Errno::ENOENT => { + utils::crashsafe::create_dir_all(&tenants_path).with_context(|| { + format!("Failed to create tenants root dir at '{tenants_path}'") + })?; + open().context("open tenants dir after creating it")? + } + e => anyhow::bail!(e), + }, + }; + + let started = Instant::now(); + // Linux guarantees durability for syncfs. + // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync(). + #[cfg(target_os = "linux")] + nix::unistd::syncfs(dirfd.as_raw_fd()).context("syncfs")?; + #[cfg(target_os = "macos")] + { + // macOS is not a production platform for Neon, don't even bother. + drop(dirfd); + } + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + { + compile_error!("Unsupported OS"); + } + + let elapsed = started.elapsed(); + info!( + elapsed_ms = elapsed.as_millis(), + "made tenant directory contents durable" + ); } // Initialize up failpoints support