From b409dc462203ca23c4fcf1632996011ad2214175 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 12 Sep 2024 21:42:06 +0100 Subject: [PATCH 1/4] s/ControlPlaneClient/ControllerUpcallClient/ --- pageserver/src/bin/pageserver.rs | 4 ++-- pageserver/src/control_plane_client.rs | 13 ++++++++----- pageserver/src/tenant/mgr.rs | 6 +++--- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index d15a0e47a42a..65fc5e41976c 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -15,7 +15,7 @@ use clap::{Arg, ArgAction, Command}; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; use pageserver::config::PageserverIdentity; -use pageserver::control_plane_client::ControlPlaneClient; +use pageserver::control_plane_client::ControllerUpcallClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME}; @@ -396,7 +396,7 @@ fn start_pageserver( // Set up deletion queue let (deletion_queue, deletion_workers) = DeletionQueue::new( remote_storage.clone(), - ControlPlaneClient::new(conf, &shutdown_pageserver), + ControllerUpcallClient::new(conf, &shutdown_pageserver), conf, ); if let Some(deletion_workers) = deletion_workers { diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index f6d1c35a8ce1..3b0e877f2608 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -17,9 +17,12 @@ use utils::{backoff, failpoint_support, generation::Generation, id::NodeId}; use crate::{config::PageServerConf, virtual_file::on_fatal_io_error}; use pageserver_api::config::NodeMetadata; -/// The Pageserver's client for using the control plane API: this is a small subset -/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md) -pub struct ControlPlaneClient { +/// The Pageserver's client for using the storage controller upcall API: this is a small API +/// for dealing with generations (see docs/rfcs/025-generation-numbers.md). +/// +/// The server presenting this API may either be the storage controller or some other +/// service (such as the Neon control plane) providing a store of generation numbers. +pub struct ControllerUpcallClient { http_client: reqwest::Client, base_url: Url, node_id: NodeId, @@ -45,7 +48,7 @@ pub trait ControlPlaneGenerationsApi { ) -> impl Future, RetryForeverError>> + Send; } -impl ControlPlaneClient { +impl ControllerUpcallClient { /// A None return value indicates that the input `conf` object does not have control /// plane API enabled. pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option { @@ -114,7 +117,7 @@ impl ControlPlaneClient { } } -impl ControlPlaneGenerationsApi for ControlPlaneClient { +impl ControlPlaneGenerationsApi for ControllerUpcallClient { /// Block until we get a successful response, or error out if we are shut down async fn re_attach( &self, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 2104f415319e..e81a318e3875 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -31,7 +31,7 @@ use utils::{backoff, completion, crashsafe}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::control_plane_client::{ - ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError, + ControlPlaneGenerationsApi, ControllerUpcallClient, RetryForeverError, }; use crate::deletion_queue::DeletionQueueClient; use crate::http::routes::ACTIVE_TENANT_TIMEOUT; @@ -122,7 +122,7 @@ pub(crate) enum ShardSelector { Known(ShardIndex), } -/// A convenience for use with the re_attach ControlPlaneClient function: rather +/// A convenience for use with the re_attach ControllerUpcallClient function: rather /// than the serializable struct, we build this enum that encapsulates /// the invariant that attached tenants always have generations. /// @@ -341,7 +341,7 @@ async fn init_load_generations( "Emergency mode! Tenants will be attached unsafely using their last known generation" ); emergency_generations(tenant_confs) - } else if let Some(client) = ControlPlaneClient::new(conf, cancel) { + } else if let Some(client) = ControllerUpcallClient::new(conf, cancel) { info!("Calling control plane API to re-attach tenants"); // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. match client.re_attach(conf).await { From 4d5cab76b3ce928c3c2680a6be84c52217c3e526 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 12 Sep 2024 21:42:37 +0100 Subject: [PATCH 2/4] mv control_plane_client controller_upcall_client --- pageserver/src/bin/pageserver.rs | 2 +- ...plane_client.rs => controller_upcall_client.rs} | 0 pageserver/src/deletion_queue.rs | 8 ++++---- pageserver/src/deletion_queue/validator.rs | 14 +++++++------- pageserver/src/lib.rs | 2 +- pageserver/src/tenant/mgr.rs | 2 +- 6 files changed, 14 insertions(+), 14 deletions(-) rename pageserver/src/{control_plane_client.rs => controller_upcall_client.rs} (100%) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 65fc5e41976c..e9e52acee67c 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -15,7 +15,7 @@ use clap::{Arg, ArgAction, Command}; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; use pageserver::config::PageserverIdentity; -use pageserver::control_plane_client::ControllerUpcallClient; +use pageserver::controller_upcall_client::ControllerUpcallClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME}; diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/controller_upcall_client.rs similarity index 100% rename from pageserver/src/control_plane_client.rs rename to pageserver/src/controller_upcall_client.rs diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 22f7d5b8242d..73bdc9021309 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -6,7 +6,7 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; -use crate::control_plane_client::ControlPlaneGenerationsApi; +use crate::controller_upcall_client::ControlPlaneGenerationsApi; use crate::metrics; use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::remote_timeline_client::remote_timeline_path; @@ -622,7 +622,7 @@ impl DeletionQueue { /// If remote_storage is None, then the returned workers will also be None. pub fn new( remote_storage: GenericRemoteStorage, - control_plane_client: Option, + controller_upcall_client: Option, conf: &'static PageServerConf, ) -> (Self, Option>) where @@ -662,7 +662,7 @@ impl DeletionQueue { conf, backend_rx, executor_tx, - control_plane_client, + controller_upcall_client, lsn_table.clone(), cancel.clone(), ), @@ -704,7 +704,7 @@ mod test { use tokio::task::JoinHandle; use crate::{ - control_plane_client::RetryForeverError, + controller_upcall_client::RetryForeverError, repository::Key, tenant::{harness::TenantHarness, storage_layer::DeltaLayerName}, }; diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index d215fd2b7d2d..1d55581ebd80 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -25,8 +25,8 @@ use tracing::info; use tracing::warn; use crate::config::PageServerConf; -use crate::control_plane_client::ControlPlaneGenerationsApi; -use crate::control_plane_client::RetryForeverError; +use crate::controller_upcall_client::ControlPlaneGenerationsApi; +use crate::controller_upcall_client::RetryForeverError; use crate::metrics; use crate::virtual_file::MaybeFatalIo; @@ -61,7 +61,7 @@ where tx: tokio::sync::mpsc::Sender, // Client for calling into control plane API for validation of deletes - control_plane_client: Option, + controller_upcall_client: Option, // DeletionLists which are waiting generation validation. Not safe to // execute until [`validate`] has processed them. @@ -94,7 +94,7 @@ where conf: &'static PageServerConf, rx: tokio::sync::mpsc::Receiver, tx: tokio::sync::mpsc::Sender, - control_plane_client: Option, + controller_upcall_client: Option, lsn_table: Arc>, cancel: CancellationToken, ) -> Self { @@ -102,7 +102,7 @@ where conf, rx, tx, - control_plane_client, + controller_upcall_client, lsn_table, pending_lists: Vec::new(), validated_lists: Vec::new(), @@ -145,8 +145,8 @@ where return Ok(()); } - let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client { - match control_plane_client + let tenants_valid = if let Some(controller_upcall_client) = &self.controller_upcall_client { + match controller_upcall_client .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect()) .await { diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 7a9cf495c7dd..08abfbd64740 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -6,7 +6,7 @@ pub mod basebackup; pub mod config; pub mod consumption_metrics; pub mod context; -pub mod control_plane_client; +pub mod controller_upcall_client; pub mod deletion_queue; pub mod disk_usage_eviction_task; pub mod http; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index e81a318e3875..f6249056d830 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -30,7 +30,7 @@ use utils::{backoff, completion, crashsafe}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::control_plane_client::{ +use crate::controller_upcall_client::{ ControlPlaneGenerationsApi, ControllerUpcallClient, RetryForeverError, }; use crate::deletion_queue::DeletionQueueClient; From 753d86830002beadec40a00e1e387a1993c614d1 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 12 Sep 2024 21:54:44 +0100 Subject: [PATCH 3/4] chunk validate requests --- pageserver/src/controller_upcall_client.rs | 49 ++++++++++++---------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 3b0e877f2608..84daab7a5c26 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -219,29 +219,36 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { .join("validate") .expect("Failed to build validate path"); - let request = ValidateRequest { - tenants: tenants - .into_iter() - .map(|(id, gen)| ValidateRequestTenant { - id, - gen: gen - .into() - .expect("Generation should always be valid for a Tenant doing deletions"), - }) - .collect(), - }; + let mut result: HashMap = HashMap::with_capacity(tenants.len()); + + for tenant_chunk in (tenants).chunks(128) { + let request = ValidateRequest { + tenants: tenant_chunk + .iter() + .map(|(id, generation)| ValidateRequestTenant { + id: *id, + gen: (*generation).into().expect( + "Generation should always be valid for a Tenant doing deletions", + ), + }) + .collect(), + }; - failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel); - if self.cancel.is_cancelled() { - return Err(RetryForeverError::ShuttingDown); - } + failpoint_support::sleep_millis_async!( + "control-plane-client-validate-sleep", + &self.cancel + ); + if self.cancel.is_cancelled() { + return Err(RetryForeverError::ShuttingDown); + } - let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?; + let response: ValidateResponse = + self.retry_http_forever(&re_attach_path, request).await?; + for rt in response.tenants { + result.insert(rt.id, rt.valid); + } + } - Ok(response - .tenants - .into_iter() - .map(|rt| (rt.id, rt.valid)) - .collect()) + Ok(result.into_iter().collect()) } } From 79cf2a86ae8809d7eeb58d315f3de63a68a2ebff Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 24 Sep 2024 13:47:02 +0100 Subject: [PATCH 4/4] Update pageserver/src/controller_upcall_client.rs Co-authored-by: Christian Schwarz --- pageserver/src/controller_upcall_client.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 84daab7a5c26..917f7265ad27 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -219,8 +219,10 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { .join("validate") .expect("Failed to build validate path"); + // When sending validate requests, break them up into chunks so that we + // avoid possible edge cases of generating any HTTP requests that + // require database I/O across many thousands of tenants. let mut result: HashMap = HashMap::with_capacity(tenants.len()); - for tenant_chunk in (tenants).chunks(128) { let request = ValidateRequest { tenants: tenant_chunk