From f9cbbcf0d1eceda5a4efb2449d20439ef3d293a9 Mon Sep 17 00:00:00 2001
From: kvpetrov
Date: Thu, 26 Sep 2024 10:48:40 -0700
Subject: [PATCH] adding metrics for failover (#1856)

Co-authored-by: Kier Petrov
---
 .../HighAvailabilityPlanner.scala             | 26 +++++++++++++++++++
 .../queryplanner/SingleClusterPlanner.scala   | 10 ++++++-
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/coordinator/src/main/scala/filodb.coordinator/queryplanner/HighAvailabilityPlanner.scala b/coordinator/src/main/scala/filodb.coordinator/queryplanner/HighAvailabilityPlanner.scala
index 56dd7231f5..5cb905575d 100644
--- a/coordinator/src/main/scala/filodb.coordinator/queryplanner/HighAvailabilityPlanner.scala
+++ b/coordinator/src/main/scala/filodb.coordinator/queryplanner/HighAvailabilityPlanner.scala
@@ -7,6 +7,7 @@ import scala.jdk.CollectionConverters._
 
 import com.typesafe.scalalogging.StrictLogging
 import io.grpc.ManagedChannel
+import kamon.Kamon
 
 import filodb.coordinator.GrpcPlanDispatcher
 import filodb.coordinator.ShardMapper
@@ -17,6 +18,9 @@ import filodb.grpc.GrpcCommonUtils
 import filodb.query.{LabelNames, LabelValues, LogicalPlan, SeriesKeysByFilters}
 import filodb.query.exec._
 
+object HighAvailabilityPlanner {
+  final val FailoverCounterName = "single-cluster-plans-materialized"
+}
 /**
  * HighAvailabilityPlanner responsible for using underlying local planner and FailureProvider
  * to come up with a plan that orchestrates query execution between multiple
@@ -48,6 +52,24 @@ class HighAvailabilityPlanner(dsRef: DatasetRef,
   import QueryFailureRoutingStrategy._
   import LogicalPlan._
 
+  // legacy failover counter captures failovers when we send a PromQL to the buddy
+  // cluster
+  val legacyFailoverCounter = Kamon.counter(HighAvailabilityPlanner.FailoverCounterName)
+    .withTag("cluster", clusterName)
+    .withTag("type", "legacy")
+
+  // full failover counter captures failovers when we materialize a plan locally and
+  // send an entire plan to the buddy cluster for execution
+  val fullFailoverCounter = Kamon.counter(HighAvailabilityPlanner.FailoverCounterName)
+    .withTag("cluster", clusterName)
+    .withTag("type", "full")
+
+  // partial failover counter captures failovers when we materialize a plan locally and
+  // send some parts of it for execution to the buddy cluster
+  val partialFailoverCounter = Kamon.counter(HighAvailabilityPlanner.FailoverCounterName)
+    .withTag("cluster", clusterName)
+    .withTag("type", "partial")
+
   // HTTP endpoint is still mandatory as metadata queries still use it.
   val remoteHttpEndpoint: String = queryConfig.remoteHttpEndpoint
     .getOrElse(throw new IllegalArgumentException("remoteHttpEndpoint config needed"))
@@ -110,6 +132,7 @@ class HighAvailabilityPlanner(dsRef: DatasetRef,
             qContext)
         }
       case route: RemoteRoute =>
+        legacyFailoverCounter.increment()
         val timeRange = route.timeRange.get
         val queryParams = qContext.origQueryParams.asInstanceOf[PromQlQueryParams]
         // rootLogicalPlan can be different from queryParams.promQl
@@ -341,6 +364,7 @@ class HighAvailabilityPlanner(dsRef: DatasetRef,
     localActiveShardMapper: ActiveShardMapper,
     remoteActiveShardMapper: ActiveShardMapper
   ): ExecPlan = {
+    partialFailoverCounter.increment()
     // it makes sense to do local planning if we have at least 50% of shards running
     // as the query might overload the few shards we have while doing second level aggregation
     // Generally, it would be better to ship the entire query to the cluster that has more shards up
@@ -360,6 +384,7 @@ class HighAvailabilityPlanner(dsRef: DatasetRef,
         buddyGrpcEndpoint = Some(remoteGrpcEndpoint.get)
       )
     val context = qContext.copy(plannerParams = haPlannerParams)
+    logger.info(context.getQueryLogLine("Using shard level failover"))
     val plan = localPlanner.materialize(logicalPlan, context);
     plan
   }
@@ -419,6 +444,7 @@ class HighAvailabilityPlanner(dsRef: DatasetRef,
     qContext: QueryContext,
     localActiveShardMapper: ActiveShardMapper,
     remoteActiveShardMapper: ActiveShardMapper
   ): GenericRemoteExec = {
+    fullFailoverCounter.increment()
     val timeout: Long = queryConfig.remoteHttpTimeoutMs.getOrElse(60000)
     val plannerParams = qContext.plannerParams.copy(
       failoverMode = filodb.core.query.ShardLevelFailoverMode,
diff --git a/coordinator/src/main/scala/filodb.coordinator/queryplanner/SingleClusterPlanner.scala b/coordinator/src/main/scala/filodb.coordinator/queryplanner/SingleClusterPlanner.scala
index 75cd5e851e..d457a2710a 100644
--- a/coordinator/src/main/scala/filodb.coordinator/queryplanner/SingleClusterPlanner.scala
+++ b/coordinator/src/main/scala/filodb.coordinator/queryplanner/SingleClusterPlanner.scala
@@ -65,6 +65,12 @@ class SingleClusterPlanner(val dataset: Dataset,
   private val shardColumns = dsOptions.shardKeyColumns.sorted
   private val dsRef = dataset.ref
 
+  // The shard-unavailable failover counter captures failovers that are not possible because at
+  // least one shard is down on both the primary and DR clusters; the query is executed only when
+  // partial results are acceptable, otherwise an exception is thrown.
+  val shardUnavailableFailoverCounter = Kamon.counter(HighAvailabilityPlanner.FailoverCounterName)
+    .withTag("cluster", clusterName)
+    .withTag("type", "shardUnavailable")
   val numPlansMaterialized = Kamon.counter("single-cluster-plans-materialized")
     .withTag("cluster", clusterName)
 
@@ -192,10 +198,12 @@ class SingleClusterPlanner(val dataset: Dataset,
       if (!shardInfo.active) {
         if (queryContext.plannerParams.allowPartialResults)
           logger.debug(s"Shard: $shard is not available however query is proceeding as partial results is enabled")
-        else
+        else {
+          shardUnavailableFailoverCounter.increment()
           throw new filodb.core.query.ServiceUnavailableException(
             s"Remote Buddy Shard: $shard is not available"
          )
+        }
       }
       val dispatcher = RemoteActorPlanDispatcher(shardInfo.address, clusterName)
       dispatcher
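
Note (reviewer sketch, not part of the patch): the change above relies on Kamon's tagged-counter pattern, i.e. one shared metric name differentiated by "cluster" and "type" tags and incremented on each failover path. A minimal, self-contained Scala sketch of that pattern follows; the object name, the example clusterName value, and the onPartialFailover helper are illustrative assumptions, while the metric name and tag keys mirror the patch.

import kamon.Kamon

object FailoverMetricsSketch {
  // Same metric name the patch centralizes in HighAvailabilityPlanner.FailoverCounterName.
  final val FailoverCounterName = "single-cluster-plans-materialized"

  // Hypothetical cluster name; in the planners this comes from the clusterName field.
  val clusterName = "example-cluster"

  // Each (cluster, type) tag combination is its own time series, so dashboards can
  // break failovers down by kind or aggregate them under the one metric name.
  val partialFailoverCounter = Kamon.counter(FailoverCounterName)
    .withTag("cluster", clusterName)
    .withTag("type", "partial")

  // Called wherever a partial failover is decided, mirroring
  // partialFailoverCounter.increment() in the patch.
  def onPartialFailover(): Unit = {
    partialFailoverCounter.increment()
  }
}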