From fa815b7ecd114af2a7d42efff2d0ca6ac3dede47 Mon Sep 17 00:00:00 2001
From: Matias Bordese
Date: Mon, 7 Oct 2024 16:26:10 -0300
Subject: [PATCH 1/5] Reworked declare incident escalation step (#5130)

Reworked https://github.com/grafana/oncall/pull/5047.

The main update is the switch from an FK to a [M2M relation](https://docs.google.com/document/d/1HeulqxoFShSHtInQrZNJLL5MDlHPNT50rVGaK3zZWvw/edit?disco=AAABVLjV4W8). This doesn't change the original/intended behavior, but it avoids altering the alert group table and is a bit more flexible. The extra table shouldn't introduce issues: it is used only for tracking purposes, and the information needed in the log record is already there.

The generated migration avoids touching the alert group table:

```
--
-- Create model RelatedIncident
--
CREATE TABLE `alerts_relatedincident` (`id` bigint AUTO_INCREMENT NOT NULL PRIMARY KEY, `incident_id` varchar(50) NOT NULL, `created_at` datetime(6) NOT NULL, `is_active` bool NOT NULL, `channel_filter_id` bigint NULL, `organization_id` bigint NOT NULL);
CREATE TABLE `alerts_relatedincident_attached_alert_groups` (`id` bigint AUTO_INCREMENT NOT NULL PRIMARY KEY, `relatedincident_id` bigint NOT NULL, `alertgroup_id` bigint NOT NULL);
ALTER TABLE `alerts_relatedincident` ADD CONSTRAINT `alerts_relatedincident_organization_id_incident_id_d7fc9a4f_uniq` UNIQUE (`organization_id`, `incident_id`);
ALTER TABLE `alerts_relatedincident` ADD CONSTRAINT `alerts_relatedincide_channel_filter_id_9556c836_fk_alerts_ch` FOREIGN KEY (`channel_filter_id`) REFERENCES `alerts_channelfilter` (`id`);
ALTER TABLE `alerts_relatedincident` ADD CONSTRAINT `alerts_relatedincide_organization_id_74ed6bed_fk_user_mana` FOREIGN KEY (`organization_id`) REFERENCES `user_management_organization` (`id`);
CREATE INDEX `alerts_relatedincident_incident_id_8356a799` ON `alerts_relatedincident` (`incident_id`);
ALTER TABLE `alerts_relatedincident_attached_alert_groups` ADD CONSTRAINT `alerts_relatedincident_a_relatedincident_id_alert_3d683baa_uniq` UNIQUE (`relatedincident_id`, `alertgroup_id`);
ALTER TABLE `alerts_relatedincident_attached_alert_groups` ADD CONSTRAINT `alerts_relatedincide_relatedincident_id_3e5e7a23_fk_alerts_re` FOREIGN KEY (`relatedincident_id`) REFERENCES `alerts_relatedincident` (`id`);
ALTER TABLE `alerts_relatedincident_attached_alert_groups` ADD CONSTRAINT `alerts_relatedincide_alertgroup_id_0125deca_fk_alerts_al` FOREIGN KEY (`alertgroup_id`) REFERENCES `alerts_alertgroup` (`id`);
```
---
 .../escalation_policy_snapshot.py             |  29 ++
 .../alerts/migrations/0060_relatedincident.py |  30 ++
 engine/apps/alerts/models/__init__.py         |   1 +
 engine/apps/alerts/models/alert_group.py      |   2 +
 .../alerts/models/alert_group_log_record.py   |  55 ++-
 .../apps/alerts/models/escalation_policy.py   |   3 +
 engine/apps/alerts/models/related_incident.py |  48 +++
 engine/apps/alerts/tasks/__init__.py          |   1 +
 engine/apps/alerts/tasks/declare_incident.py  | 148 ++++++++
 engine/apps/alerts/tests/factories.py         |   6 +
 .../tests/test_escalation_policy_snapshot.py  |  49 +++
 .../alerts/tests/test_related_incident.py     | 332 ++++++++++++++++++
 engine/apps/alerts/utils.py                   |  12 +
 .../apps/api/serializers/escalation_policy.py |   9 +
 engine/apps/api/tests/test_alert_group.py     |  31 ++
 .../apps/api/tests/test_escalation_policy.py  |  95 +++++
 engine/apps/api/views/alert_group.py          |  14 +-
 engine/apps/api/views/escalation_policy.py    |  32 +-
 .../serializers/escalation_policies.py        |  14 +
 .../tests/test_escalation_policies.py         |  40 +++
 engine/conftest.py                            |   9 +
 engine/settings/base.py                       |   1 +
 engine/settings/celery_task_routes.py         |   1 +
 23 files changed, 957 insertions(+), 5 deletions(-)
 create mode 100644 engine/apps/alerts/migrations/0060_relatedincident.py
 create mode 100644 engine/apps/alerts/models/related_incident.py
 create mode 100644 engine/apps/alerts/tasks/declare_incident.py
 create mode 100644 engine/apps/alerts/tests/test_related_incident.py
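For context on how the new table is used at runtime, a minimal sketch of the M2M bookkeeping (not part of the patch; it assumes an existing `alert_group` like the ones built in the test fixtures below, and mirrors `_attach_alert_group_to_incident` in the new task):

```python
from apps.alerts.models import RelatedIncident

# one row per (organization, incident) pair, reused across alert groups
incident, _ = RelatedIncident.objects.get_or_create(
    incident_id="123",
    organization=alert_group.channel.organization,
    defaults={"channel_filter": alert_group.channel_filter},
)
# attaching only inserts a row into the join table; alerts_alertgroup is untouched
incident.attached_alert_groups.add(alert_group)

# reverse accessor added on AlertGroup via related_name="related_incidents"
assert alert_group.related_incidents.filter(incident_id="123").exists()
```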
diff --git a/engine/apps/alerts/escalation_snapshot/snapshot_classes/escalation_policy_snapshot.py b/engine/apps/alerts/escalation_snapshot/snapshot_classes/escalation_policy_snapshot.py
index e179dc92d4..b6a495934a 100644
--- a/engine/apps/alerts/escalation_snapshot/snapshot_classes/escalation_policy_snapshot.py
+++ b/engine/apps/alerts/escalation_snapshot/snapshot_classes/escalation_policy_snapshot.py
@@ -12,11 +12,13 @@
 from apps.alerts.models.escalation_policy import EscalationPolicy
 from apps.alerts.tasks import (
     custom_webhook_result,
+    declare_incident,
     notify_all_task,
     notify_group_task,
     notify_user_task,
     resolve_by_last_step_task,
 )
+from apps.alerts.utils import is_declare_incident_step_enabled
 from apps.schedules.ical_utils import list_users_to_notify_from_ical
 from apps.user_management.models import User
 
@@ -136,6 +138,7 @@ def execute(self, alert_group: "AlertGroup", reason) -> StepExecutionResultData:
             EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: self._escalation_step_notify_if_num_alerts_in_time_window,
             EscalationPolicy.STEP_NOTIFY_MULTIPLE_USERS: self._escalation_step_notify_multiple_users,
             EscalationPolicy.STEP_NOTIFY_MULTIPLE_USERS_IMPORTANT: self._escalation_step_notify_multiple_users,
+            EscalationPolicy.STEP_DECLARE_INCIDENT: self._escalation_step_declare_incident,
             None: self._escalation_step_not_configured,
         }
         result = action_map[self.step](alert_group, reason)
@@ -410,6 +413,32 @@ def _escalation_step_notify_team_members(self, alert_group: "AlertGroup", reason
 
         self._execute_tasks(tasks)
 
+    def _escalation_step_declare_incident(self, alert_group: "AlertGroup", _reason: str) -> None:
+        grafana_declare_incident_enabled = is_declare_incident_step_enabled(
+            organization=alert_group.channel.organization
+        )
+        if not grafana_declare_incident_enabled:
+            AlertGroupLogRecord(
+                type=AlertGroupLogRecord.TYPE_ESCALATION_FAILED,
+                alert_group=alert_group,
+                reason="Declare Incident step is not enabled",
+                escalation_policy=self.escalation_policy,
+                escalation_error_code=AlertGroupLogRecord.ERROR_ESCALATION_DECLARE_INCIDENT_STEP_IS_NOT_ENABLED,
+                escalation_policy_step=self.step,
+            ).save()
+            return
+        tasks = []
+        declare_incident_task = declare_incident.signature(
+            args=(alert_group.pk,),
+            kwargs={
+                "escalation_policy_pk": self.id,
+                "severity": self.severity,
+            },
+            immutable=True,
+        )
+        tasks.append(declare_incident_task)
+        self._execute_tasks(tasks)
+
     def _escalation_step_notify_if_time(self, alert_group: "AlertGroup", _reason: str) -> StepExecutionResultData:
         eta = None
 
diff --git a/engine/apps/alerts/migrations/0060_relatedincident.py b/engine/apps/alerts/migrations/0060_relatedincident.py
new file mode 100644
index 0000000000..d044cdbf64
--- /dev/null
+++ b/engine/apps/alerts/migrations/0060_relatedincident.py
@@ -0,0 +1,30 @@
+# Generated by Django 4.2.15 on 2024-10-04 16:38
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('user_management', '0022_alter_team_unique_together'),
+        ('alerts', '0059_escalationpolicy_severity_and_more'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='RelatedIncident',
+            fields=[
+                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('incident_id', models.CharField(db_index=True, max_length=50)),
+                ('created_at', models.DateTimeField(auto_now_add=True)),
+                ('is_active', models.BooleanField(default=True)),
+                ('attached_alert_groups', models.ManyToManyField(related_name='related_incidents', to='alerts.alertgroup')),
+                ('channel_filter', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='related_incidents', to='alerts.channelfilter')),
+                ('organization', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='related_incidents', to='user_management.organization')),
+            ],
+            options={
+                'unique_together': {('organization', 'incident_id')},
+            },
+        ),
+    ]
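The snapshot step above only builds an immutable Celery signature; `_execute_tasks` is not shown in this diff, so here is a minimal sketch of the assumed hand-off (`immutable=True` keeps a chained predecessor from injecting its result into the task's args):

```python
# sketch: what the snapshot step prepares, assuming _execute_tasks
# ultimately calls apply_async() on each signature it is given
sig = declare_incident.signature(
    args=(alert_group.pk,),
    kwargs={"escalation_policy_pk": policy_snapshot.id, "severity": policy_snapshot.severity},
    immutable=True,  # ignore any parent task result if used in a chain
)
sig.apply_async()
```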
diff --git a/engine/apps/alerts/models/__init__.py b/engine/apps/alerts/models/__init__.py
index 51b4415844..24ea0905fe 100644
--- a/engine/apps/alerts/models/__init__.py
+++ b/engine/apps/alerts/models/__init__.py
@@ -13,6 +13,7 @@
 from .grafana_alerting_contact_point import GrafanaAlertingContactPoint  # noqa: F401
 from .invitation import Invitation  # noqa: F401
 from .maintainable_object import MaintainableObject  # noqa: F401
+from .related_incident import RelatedIncident  # noqa: F401
 from .resolution_note import ResolutionNote, ResolutionNoteSlackMessage  # noqa: F401
 from .user_has_notification import UserHasNotification  # noqa: F401
 from .user_notification_bundle import BundledNotification, UserNotificationBundle  # noqa: F401
diff --git a/engine/apps/alerts/models/alert_group.py b/engine/apps/alerts/models/alert_group.py
index 81ac41b95e..6a9062bdd4 100644
--- a/engine/apps/alerts/models/alert_group.py
+++ b/engine/apps/alerts/models/alert_group.py
@@ -44,6 +44,7 @@
         AlertGroupLogRecord,
         AlertReceiveChannel,
         BundledNotification,
+        RelatedIncident,
         ResolutionNote,
         ResolutionNoteSlackMessage,
     )
@@ -193,6 +194,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
     acknowledged_by_user: typing.Optional["User"]
     alerts: "RelatedManager['Alert']"
     bundled_notifications: "RelatedManager['BundledNotification']"
+    related_incidents: "RelatedManager['RelatedIncident']"
     dependent_alert_groups: "RelatedManager['AlertGroup']"
     channel: "AlertReceiveChannel"
     log_records: "RelatedManager['AlertGroupLogRecord']"
diff --git a/engine/apps/alerts/models/alert_group_log_record.py b/engine/apps/alerts/models/alert_group_log_record.py
index f4b796aada..ea2b2c18fe 100644
--- a/engine/apps/alerts/models/alert_group_log_record.py
+++ b/engine/apps/alerts/models/alert_group_log_record.py
@@ -11,18 +11,24 @@
 
 from apps.alerts import tasks
 from apps.alerts.constants import ActionSource
+from apps.alerts.incident_appearance.renderers.constants import DEFAULT_BACKUP_TITLE
 from apps.alerts.utils import render_relative_timeline
 from apps.slack.slack_formatter import SlackFormatter
 from common.utils import clean_markup
 
 if typing.TYPE_CHECKING:
     from apps.alerts.models import AlertGroup, CustomButton, EscalationPolicy, Invitation
-    from apps.user_management.models import User
+    from apps.user_management.models import Organization, User
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
 
 
+class RelatedIncidentData(typing.TypedDict):
+    incident_link: typing.Optional[str]
+    incident_title: str
+
+
 class AlertGroupLogRecord(models.Model):
     alert_group: "AlertGroup"
     author: typing.Optional["User"]
@@ -161,7 +167,9 @@ class AlertGroupLogRecord(models.Model):
         ERROR_ESCALATION_TRIGGER_CUSTOM_WEBHOOK_ERROR,
         ERROR_ESCALATION_NOTIFY_TEAM_MEMBERS_STEP_IS_NOT_CONFIGURED,
         ERROR_ESCALATION_TRIGGER_WEBHOOK_IS_DISABLED,
-    ) = range(20)
+        ERROR_ESCALATION_DECLARE_INCIDENT_STEP_IS_NOT_ENABLED,
+        ERROR_ESCALATION_INCIDENT_COULD_NOT_BE_DECLARED,
+    ) = range(22)
 
     type = models.IntegerField(choices=TYPE_CHOICES)
 
@@ -225,7 +233,14 @@ class AlertGroupLogRecord(models.Model):
     escalation_policy_step = models.IntegerField(null=True, default=None)
     step_specific_info = JSONField(null=True, default=None)
 
-    STEP_SPECIFIC_INFO_KEYS = ["schedule_name", "custom_button_name", "usergroup_handle", "source_integration_name"]
+    STEP_SPECIFIC_INFO_KEYS = [
+        "schedule_name",
+        "custom_button_name",
+        "usergroup_handle",
+        "source_integration_name",
+        "incident_id",
+        "incident_title",
+    ]
 
     def _make_log_line_link(self, url, title, html=False, for_slack=False, substitute_with_tag=False):
         if html and url:
@@ -244,6 +259,7 @@ def render_log_line_json(self):
         author = self.author.short(organization) if self.author is not None else None
         escalation_chain = self.alert_group.channel_filter.escalation_chain if self.alert_group.channel_filter else None
         step_info = self.get_step_specific_info()
+        related_incident = self.render_incident_data_from_step_info(organization, step_info)
         escalation_chain_data = (
             {
                 "pk": escalation_chain.public_primary_key,
@@ -280,6 +296,7 @@ def render_log_line_json(self):
             "type": self.type,
             "created_at": created_at,
             "author": author,
+            "incident": related_incident,
             "escalation_chain": escalation_chain_data,
             "schedule": schedule,
             "webhook": webhook,
@@ -425,6 +442,14 @@ def rendered_log_line_action(self, for_slack=False, html=False, substitute_with_
                     result += f'triggered step "Notify on-call from Schedule {schedule_text}{important_text}"'
                 elif escalation_policy_step == EscalationPolicy.STEP_REPEAT_ESCALATION_N_TIMES:
                     result += "escalation started from the beginning"
+                elif escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT:
+                    organization = self.alert_group.channel.organization
+                    incident_data = self.render_incident_data_from_step_info(organization, step_specific_info)
+                    incident_link = incident_data["incident_link"]
+                    incident_title = incident_data["incident_title"]
+                    tag = "related_incident" if substitute_with_tag else False
+                    incident_text = self._make_log_line_link(incident_link, incident_title, html, for_slack, tag)
+                    result += self.reason + f": {incident_text}"
                 else:
                     result += f'triggered step "{EscalationPolicy.get_step_display_name(escalation_policy_step)}"'
             elif self.type == AlertGroupLogRecord.TYPE_SILENCE:
@@ -640,8 +665,32 @@ def rendered_log_line_action(self, for_slack=False, html=False, substitute_with_
                     result += f"failed to notify User Group{usergroup_handle_text} in Slack"
             elif self.escalation_error_code == AlertGroupLogRecord.ERROR_ESCALATION_TRIGGER_WEBHOOK_IS_DISABLED:
                 result += 'skipped escalation step "Trigger Outgoing Webhook" because it is disabled'
+            elif (
+                self.escalation_error_code == AlertGroupLogRecord.ERROR_ESCALATION_DECLARE_INCIDENT_STEP_IS_NOT_ENABLED
+            ):
+                result += 'skipped escalation step "Declare Incident": step is not enabled'
+            elif self.escalation_error_code == AlertGroupLogRecord.ERROR_ESCALATION_INCIDENT_COULD_NOT_BE_DECLARED:
+                result += "failed to declare an Incident"
+                if self.reason:
+                    result += f": {self.reason}"
         return result
 
+    def render_incident_data_from_step_info(
+        self, organization: "Organization", step_specific_info: dict
+    ) -> RelatedIncidentData | None:
+        from apps.alerts.models.related_incident import get_incident_url
+
+        if not step_specific_info or not all(key in step_specific_info for key in ["incident_title", "incident_id"]):
+            return None
+
+        incident_link = (
+            get_incident_url(organization, step_specific_info["incident_id"])
+            if step_specific_info["incident_id"]
+            else None
+        )
+        incident_title = step_specific_info["incident_title"] or DEFAULT_BACKUP_TITLE
+        return {"incident_link": incident_link, "incident_title": incident_title}
+
     def get_step_specific_info(self):
         step_specific_info = None
         # in some cases step_specific_info was saved with using json.dumps
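To make the rendering concrete, a small sketch of what `render_incident_data_from_step_info` produces for the payloads the new step stores (the host and the `grafana-incident-app` plugin id are illustrative assumptions; the real link comes from `get_incident_url`, which reads `PluginID.INCIDENT`):

```python
info = {"incident_id": "123", "incident_title": "Incident"}
# -> {"incident_link": "https://stack.grafana.net/a/grafana-incident-app/incidents/123",
#     "incident_title": "Incident"}

info = {"incident_id": None, "incident_title": ""}
# -> {"incident_link": None, "incident_title": DEFAULT_BACKUP_TITLE}

info = {}  # missing keys
# -> None
```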
diff --git a/engine/apps/alerts/models/escalation_policy.py b/engine/apps/alerts/models/escalation_policy.py
index 1f74ef6043..28ea7022ac 100644
--- a/engine/apps/alerts/models/escalation_policy.py
+++ b/engine/apps/alerts/models/escalation_policy.py
@@ -92,6 +92,7 @@ class EscalationPolicy(OrderedModel):
         STEP_NOTIFY_IF_TIME,
         STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW,
         STEP_REPEAT_ESCALATION_N_TIMES,
+        STEP_DECLARE_INCIDENT,
     ]
     # Steps can be stored in db while interacting with internal api
     # Includes important versions of default steps
@@ -218,6 +219,7 @@ class EscalationPolicy(OrderedModel):
         STEP_NOTIFY_IF_TIME,
         STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW,
         STEP_REPEAT_ESCALATION_N_TIMES,
+        STEP_DECLARE_INCIDENT,
     ]
 
     PUBLIC_STEP_CHOICES_MAP = {
@@ -239,6 +241,7 @@ class EscalationPolicy(OrderedModel):
         STEP_NOTIFY_IF_TIME: "notify_if_time_from_to",
         STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: "notify_if_num_alerts_in_window",
         STEP_REPEAT_ESCALATION_N_TIMES: "repeat_escalation",
+        STEP_DECLARE_INCIDENT: "declare_incident",
     }
 
     public_primary_key = models.CharField(
diff --git a/engine/apps/alerts/models/related_incident.py b/engine/apps/alerts/models/related_incident.py
new file mode 100644
index 0000000000..61f340cf81
--- /dev/null
+++ b/engine/apps/alerts/models/related_incident.py
@@ -0,0 +1,48 @@
+import typing
+from urllib.parse import urljoin
+
+from django.db import models
+
+from common.constants.plugin_ids import PluginID
+
+if typing.TYPE_CHECKING:
+    from django.db.models.manager import RelatedManager
+
+    from apps.alerts.models import AlertGroup, ChannelFilter
+    from apps.user_management.models import Organization
+
+
+def get_incident_url(organization, incident_id) -> str:
+    return urljoin(organization.grafana_url, f"a/{PluginID.INCIDENT}/incidents/{incident_id}")
+
+
+class RelatedIncident(models.Model):
+    attached_alert_groups: "RelatedManager['AlertGroup']"
+    channel_filter: typing.Optional["ChannelFilter"]
+    organization: "Organization"
+
+    incident_id = models.CharField(db_index=True, max_length=50)
+    organization = models.ForeignKey(
+        "user_management.Organization",
+        on_delete=models.CASCADE,
+        related_name="related_incidents",
+    )
+    channel_filter = models.ForeignKey(
+        "alerts.ChannelFilter",
+        on_delete=models.SET_NULL,
+        null=True,
+        related_name="related_incidents",
+    )
+    created_at = models.DateTimeField(auto_now_add=True)
+    is_active = models.BooleanField(default=True)
+
+    attached_alert_groups = models.ManyToManyField(
+        "alerts.AlertGroup",
+        related_name="related_incidents",
+    )
+
+    class Meta:
+        unique_together = ("organization", "incident_id")
+
+    def get_incident_link(self) -> str:
+        return get_incident_url(self.organization, self.incident_id)
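One usage note on `get_incident_url`: it relies on `urljoin`, whose path handling is easy to trip over. A quick sketch (the plugin id value is an assumption for illustration; the code reads it from `PluginID.INCIDENT`):

```python
from urllib.parse import urljoin

print(urljoin("https://stack.grafana.net", "a/grafana-incident-app/incidents/123"))
# https://stack.grafana.net/a/grafana-incident-app/incidents/123

# urljoin drops the last path segment of a base URL without a trailing slash:
print(urljoin("https://example.com/grafana", "a/grafana-incident-app/incidents/123"))
# https://example.com/a/grafana-incident-app/incidents/123
```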
diff --git a/engine/apps/alerts/tasks/__init__.py b/engine/apps/alerts/tasks/__init__.py
index 056140a3eb..e89f96cb70 100644
--- a/engine/apps/alerts/tasks/__init__.py
+++ b/engine/apps/alerts/tasks/__init__.py
@@ -5,6 +5,7 @@
 )
 from .check_escalation_finished import check_escalation_finished_task  # noqa: F401
 from .custom_webhook_result import custom_webhook_result  # noqa: F401
+from .declare_incident import declare_incident  # noqa: F401
 from .delete_alert_group import delete_alert_group  # noqa: F401
 from .delete_alert_group import finish_delete_alert_group  # noqa: F401
 from .delete_alert_group import send_alert_group_signal_for_delete  # noqa: F401
diff --git a/engine/apps/alerts/tasks/declare_incident.py b/engine/apps/alerts/tasks/declare_incident.py
new file mode 100644
index 0000000000..b0003534f1
--- /dev/null
+++ b/engine/apps/alerts/tasks/declare_incident.py
@@ -0,0 +1,148 @@
+import logging
+
+from django.conf import settings
+
+from apps.alerts.incident_appearance.renderers.constants import DEFAULT_BACKUP_TITLE
+from common.custom_celery_tasks import shared_dedicated_queue_retry_task
+from common.incident_api.client import (
+    DEFAULT_INCIDENT_SEVERITY,
+    DEFAULT_INCIDENT_STATUS,
+    IncidentAPIClient,
+    IncidentAPIException,
+)
+
+logger = logging.getLogger(__name__)
+
+ATTACHMENT_CAPTION = "OnCall Alert Group"
+ERROR_SEVERITY_NOT_FOUND = "Severity.FindOne: not found"
+MAX_RETRIES = 1 if settings.DEBUG else 10
+MAX_ATTACHED_ALERT_GROUPS_PER_INCIDENT = 5
+
+
+def _attach_alert_group_to_incident(alert_group, incident_id, incident_title, escalation_policy, attached=False):
+    from apps.alerts.models import AlertGroupLogRecord, EscalationPolicy, RelatedIncident
+
+    declared_incident, _ = RelatedIncident.objects.get_or_create(
+        incident_id=incident_id,
+        organization=alert_group.channel.organization,
+        defaults={
+            "channel_filter": alert_group.channel_filter,
+        },
+    )
+    declared_incident.attached_alert_groups.add(alert_group)
+    reason = "attached to existing incident" if attached else "incident declared"
+    AlertGroupLogRecord.objects.create(
+        type=AlertGroupLogRecord.TYPE_ESCALATION_TRIGGERED,
+        reason=reason,
+        alert_group=alert_group,
+        step_specific_info={"incident_id": incident_id, "incident_title": incident_title},
+        escalation_policy=escalation_policy,
+        escalation_policy_step=EscalationPolicy.STEP_DECLARE_INCIDENT,
+    )
+
+
+def _create_error_log_record(alert_group, escalation_policy, reason=""):
+    from apps.alerts.models import AlertGroupLogRecord, EscalationPolicy
+
+    AlertGroupLogRecord.objects.create(
+        type=AlertGroupLogRecord.TYPE_ESCALATION_FAILED,
+        escalation_error_code=AlertGroupLogRecord.ERROR_ESCALATION_INCIDENT_COULD_NOT_BE_DECLARED,
+        reason=reason,
+        alert_group=alert_group,
+        escalation_policy=escalation_policy,
+        escalation_policy_step=EscalationPolicy.STEP_DECLARE_INCIDENT,
+    )
+
+
+@shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True, max_retries=MAX_RETRIES)
+def declare_incident(alert_group_pk, escalation_policy_pk, severity=None):
+    from apps.alerts.models import AlertGroup, EscalationPolicy, RelatedIncident
+
+    alert_group = AlertGroup.objects.get(pk=alert_group_pk)
+    organization = alert_group.channel.organization
+    escalation_policy = None
+    if escalation_policy_pk:
+        escalation_policy = EscalationPolicy.objects.filter(pk=escalation_policy_pk).first()
+
+    if alert_group.channel_filter.is_default:
+        _create_error_log_record(
+            alert_group, escalation_policy, reason="Declare incident step is not enabled for default routes"
+        )
+        return
+
+    if declare_incident.request.retries == MAX_RETRIES:
+        _create_error_log_record(alert_group, escalation_policy)
+        return
+
+    incident_client = IncidentAPIClient(organization.grafana_url, organization.api_token)
+
+    # check for currently active related incident in the same route (channel_filter)
+    existing_incident = (
+        RelatedIncident.objects.filter(
+            organization=organization, channel_filter=alert_group.channel_filter, is_active=True
+        )
+        .order_by("-created_at")
+        .first()
+    )
+
+    if existing_incident:
+        incident_id = existing_incident.incident_id
+        try:
+            # get existing incident details
+            incident_data, _ = incident_client.get_incident(incident_id)
+        except IncidentAPIException as e:
+            logger.error(f"Error getting incident details: {e.msg}")
+            if e.status == 404:
+                # incident not found, mark as not opened
+                existing_incident.is_active = False
+                existing_incident.save(update_fields=["is_active"])
+            else:
+                # raise (and retry)
+                raise
+        else:
+            # incident exists, check if it is still active
+            if incident_data["status"] == DEFAULT_INCIDENT_STATUS:
+                # attach to incident context
+                incident_title = incident_data["title"]
+                num_attached = existing_incident.attached_alert_groups.count()
+                if num_attached < MAX_ATTACHED_ALERT_GROUPS_PER_INCIDENT:
+                    try:
+                        incident_data, _ = incident_client.add_activity(incident_id, alert_group.web_link)
+                    except IncidentAPIException as e:
+                        logger.error(f"Error attaching to existing incident: {e.msg}")
+                # setup association between alert group and incident (even if not attached)
+                _attach_alert_group_to_incident(
+                    alert_group, incident_id, incident_title, escalation_policy, attached=True
+                )
+            else:
+                existing_incident.is_active = False
+                existing_incident.save(update_fields=["is_active"])
+
+    if existing_incident is None or not existing_incident.is_active:
+        # create new incident
+        if severity == EscalationPolicy.SEVERITY_SET_FROM_LABEL:
+            severity_label = alert_group.labels.filter(key_name="severity").first()
+            severity = severity_label.value_name if severity_label else None
+        severity = severity or DEFAULT_INCIDENT_SEVERITY
+        try:
+            incident_data, _ = incident_client.create_incident(
+                alert_group.web_title_cache if alert_group.web_title_cache else DEFAULT_BACKUP_TITLE,
+                severity=severity,
+                attachCaption=ATTACHMENT_CAPTION,
+                attachURL=alert_group.web_link,
+            )
+        except IncidentAPIException as e:
+            logger.error(f"Error creating new incident: {e.msg}")
+            if ERROR_SEVERITY_NOT_FOUND.lower() in e.msg.lower() and severity != DEFAULT_INCIDENT_SEVERITY:
+                # invalid severity, retry with default severity
+                declare_incident.apply_async(
+                    args=(alert_group_pk, escalation_policy_pk),
+                    kwargs={"severity": DEFAULT_INCIDENT_SEVERITY},
+                )
+                return
+            # else raise (and retry)
+            raise
+        else:
+            _attach_alert_group_to_incident(
+                alert_group, incident_data["incidentID"], incident_data["title"], escalation_policy
+            )
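Because the task body is a plain callable when invoked directly, the whole flow can be exercised synchronously; a minimal sketch mirroring the test patterns further down (the `alert_group` and `declare_incident_step` fixtures are assumed, and the mocked payload matches the keys the task reads back):

```python
from unittest.mock import patch

from apps.alerts.tasks import declare_incident

# calling the task directly runs it eagerly, which is how the tests
# below exercise the flow without a celery worker
with patch("common.incident_api.client.IncidentAPIClient.create_incident") as mock_create:
    mock_create.return_value = {"incidentID": "123", "title": "Incident"}, None
    declare_incident(alert_group.pk, declare_incident_step.pk)

assert alert_group.related_incidents.get().incident_id == "123"
```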
diff --git a/engine/apps/alerts/tests/factories.py b/engine/apps/alerts/tests/factories.py
index f07ef90046..6a519ccf1e 100644
--- a/engine/apps/alerts/tests/factories.py
+++ b/engine/apps/alerts/tests/factories.py
@@ -11,6 +11,7 @@
     EscalationChain,
     EscalationPolicy,
     Invitation,
+    RelatedIncident,
     ResolutionNote,
     ResolutionNoteSlackMessage,
     UserNotificationBundle,
@@ -91,3 +92,8 @@ class Meta:
 class UserNotificationBundleFactory(factory.DjangoModelFactory):
     class Meta:
         model = UserNotificationBundle
+
+
+class RelatedIncidentFactory(factory.DjangoModelFactory):
+    class Meta:
+        model = RelatedIncident
diff --git a/engine/apps/alerts/tests/test_escalation_policy_snapshot.py b/engine/apps/alerts/tests/test_escalation_policy_snapshot.py
index 8a3eef6008..8882a37012 100644
--- a/engine/apps/alerts/tests/test_escalation_policy_snapshot.py
+++ b/engine/apps/alerts/tests/test_escalation_policy_snapshot.py
@@ -690,3 +690,52 @@ def test_notify_team_members(
         (user_2.pk, alert_group.pk), expected_kwargs, immutable=True
     )
     assert mock_execute.signature.call_count == 2
+
+
+@pytest.mark.django_db
+def test_escalation_step_declare_incident(
+    escalation_step_test_setup,
+    make_escalation_policy,
+):
+    organization, _, _, channel_filter, alert_group, reason = escalation_step_test_setup
+
+    declare_incident_step = make_escalation_policy(
+        escalation_chain=channel_filter.escalation_chain,
+        escalation_policy_step=EscalationPolicy.STEP_DECLARE_INCIDENT,
+    )
+    escalation_policy_snapshot = get_escalation_policy_snapshot_from_model(declare_incident_step)
+    expected_eta = timezone.now() + timezone.timedelta(seconds=NEXT_ESCALATION_DELAY)
+    with patch.object(EscalationPolicySnapshot, "_execute_tasks") as mocked_execute_tasks:
+        with patch(
+            "apps.alerts.escalation_snapshot.snapshot_classes.escalation_policy_snapshot.is_declare_incident_step_enabled",
+            return_value=True,
+        ):
+            result = escalation_policy_snapshot.execute(alert_group, reason)
+    expected_result = EscalationPolicySnapshot.StepExecutionResultData(
+        eta=result.eta,
+        stop_escalation=False,
+        pause_escalation=False,
+        start_from_beginning=False,
+    )
+    assert (
+        expected_eta + timezone.timedelta(seconds=15)
+        > result.eta
+        > expected_eta - timezone.timedelta(seconds=15)
+    )
+    assert result == expected_result
+    assert not alert_group.log_records.exists()
+    mocked_execute_tasks.assert_called_once()
+    with patch.object(EscalationPolicySnapshot, "_execute_tasks") as mocked_execute_tasks:
+        with patch(
+            "apps.alerts.escalation_snapshot.snapshot_classes.escalation_policy_snapshot.is_declare_incident_step_enabled",
+            return_value=False,
+        ):
+            escalation_policy_snapshot.execute(alert_group, reason)
+    mocked_execute_tasks.assert_not_called()
+    assert alert_group.log_records.exists()
+    log_record = alert_group.log_records.get()
+    assert log_record.type == AlertGroupLogRecord.TYPE_ESCALATION_FAILED
+    assert (
+        log_record.escalation_error_code
+        == AlertGroupLogRecord.ERROR_ESCALATION_DECLARE_INCIDENT_STEP_IS_NOT_ENABLED
+    )
diff --git a/engine/apps/alerts/tests/test_related_incident.py b/engine/apps/alerts/tests/test_related_incident.py
new file mode 100644
index 0000000000..a2dfd95645
--- /dev/null
+++ b/engine/apps/alerts/tests/test_related_incident.py
@@ -0,0 +1,332 @@
+from unittest.mock import patch
+
+import httpretty
+import pytest
+
+from apps.alerts.models import AlertGroupLogRecord, EscalationPolicy, RelatedIncident
+from apps.alerts.tasks.declare_incident import (
+    ATTACHMENT_CAPTION,
+    DEFAULT_BACKUP_TITLE,
+    DEFAULT_INCIDENT_SEVERITY,
+    ERROR_SEVERITY_NOT_FOUND,
+    MAX_ATTACHED_ALERT_GROUPS_PER_INCIDENT,
+    declare_incident,
+)
+from common.incident_api.client import IncidentAPIException
+
+
+@pytest.fixture
+def setup_alert_group_and_escalation_step(
+    make_organization,
+    make_alert_receive_channel,
+    make_alert_group,
+    make_channel_filter,
+    make_escalation_chain,
+    make_escalation_policy,
+):
+    def _setup_alert_group_and_escalation_step(is_default_route=False, already_declared_incident=False):
+        organization = make_organization(grafana_url="https://stack.grafana.net", api_token="token")
+        alert_receive_channel = make_alert_receive_channel(organization=organization)
+        escalation_chain = make_escalation_chain(organization)
+        declare_incident_step = make_escalation_policy(
+            escalation_chain=escalation_chain,
+            escalation_policy_step=EscalationPolicy.STEP_DECLARE_INCIDENT,
+        )
+        channel_filter = make_channel_filter(
+            alert_receive_channel,
+            escalation_chain=escalation_chain,
+            is_default=is_default_route,
+        )
+        alert_group = make_alert_group(
+            alert_receive_channel=alert_receive_channel,
+            channel_filter=channel_filter,
+        )
+        declared_incident = None
+        if already_declared_incident:
+            declared_incident = RelatedIncident.objects.create(
+                incident_id="123",
+                organization=organization,
+                channel_filter=channel_filter,
+            )
+
+        return alert_group, declare_incident_step, declared_incident
+
+    return _setup_alert_group_and_escalation_step
+
+
+@pytest.mark.django_db
+def test_declare_incident_default_route(setup_alert_group_and_escalation_step):
+    alert_group, declare_incident_step, _ = setup_alert_group_and_escalation_step(is_default_route=True)
+
+    declare_incident(alert_group.pk, declare_incident_step.pk)
+
+    alert_group.refresh_from_db()
+    # check triggered log
+    log_record = alert_group.log_records.last()
+    assert log_record.type == log_record.TYPE_ESCALATION_FAILED
+    assert log_record.escalation_policy == declare_incident_step
+    assert log_record.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT
+    assert log_record.step_specific_info is None
+    assert log_record.reason == "Declare incident step is not enabled for default routes"
+    assert log_record.escalation_error_code == AlertGroupLogRecord.ERROR_ESCALATION_INCIDENT_COULD_NOT_BE_DECLARED
+
+
+@pytest.mark.django_db
+@httpretty.activate(verbose=True, allow_net_connect=False)
+def test_declare_incident_ok(setup_alert_group_and_escalation_step):
+    alert_group, declare_incident_step, _ = setup_alert_group_and_escalation_step(already_declared_incident=False)
+
+    with patch("common.incident_api.client.IncidentAPIClient.create_incident") as mock_create_incident:
+        mock_create_incident.return_value = {"incidentID": "123", "title": "Incident"}, None
+        declare_incident(alert_group.pk, declare_incident_step.pk)
+
+    mock_create_incident.assert_called_with(
+        DEFAULT_BACKUP_TITLE,
+        severity=DEFAULT_INCIDENT_SEVERITY,
+        attachCaption=ATTACHMENT_CAPTION,
+        attachURL=alert_group.web_link,
+    )
+
+    alert_group.refresh_from_db()
+
+    # check declared incident
+    new_incident = alert_group.related_incidents.get()
+    assert new_incident.incident_id == "123"
+    assert new_incident.organization == alert_group.channel.organization
+    assert new_incident.channel_filter == alert_group.channel_filter
+    # check triggered log
+    log_record = alert_group.log_records.last()
+    assert log_record.type == log_record.TYPE_ESCALATION_TRIGGERED
+    assert log_record.escalation_policy == declare_incident_step
+    assert log_record.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT
+    assert log_record.step_specific_info == {"incident_id": "123", "incident_title": "Incident"}
+    assert log_record.reason == "incident declared"
+    assert log_record.escalation_error_code is None
+
+
+@pytest.mark.django_db
+@httpretty.activate(verbose=True, allow_net_connect=False)
+def test_declare_incident_set_severity(setup_alert_group_and_escalation_step):
+    alert_group, declare_incident_step, _ = setup_alert_group_and_escalation_step(already_declared_incident=False)
+    severity = "critical"
+
+    with patch("common.incident_api.client.IncidentAPIClient.create_incident") as mock_create_incident:
+        mock_create_incident.return_value = {"incidentID": "123", "title": "Incident"}, None
+        declare_incident(alert_group.pk, declare_incident_step.pk, severity=severity)
+
+    mock_create_incident.assert_called_with(
+        DEFAULT_BACKUP_TITLE, severity=severity, attachCaption=ATTACHMENT_CAPTION, attachURL=alert_group.web_link
+    )
+
+
+@pytest.mark.django_db
+@httpretty.activate(verbose=True, allow_net_connect=False)
+def test_declare_incident_set_severity_from_label(setup_alert_group_and_escalation_step):
+    alert_group, declare_incident_step, _ = setup_alert_group_and_escalation_step(already_declared_incident=False)
+    expected_severity = "minor"
+    # set alert group label
+    alert_group.labels.create(
+        organization=alert_group.channel.organization, key_name="severity", value_name=expected_severity
+    )
+    severity = EscalationPolicy.SEVERITY_SET_FROM_LABEL
+
+    with patch("common.incident_api.client.IncidentAPIClient.create_incident") as mock_create_incident:
+        mock_create_incident.return_value = {"incidentID": "123", "title": "Incident"}, None
+        declare_incident(alert_group.pk, declare_incident_step.pk, severity=severity)
+
+    mock_create_incident.assert_called_with(
+        DEFAULT_BACKUP_TITLE,
+        severity=expected_severity,
+        attachCaption=ATTACHMENT_CAPTION,
+        attachURL=alert_group.web_link,
+    )
+
+
+@pytest.mark.django_db
+@httpretty.activate(verbose=True, allow_net_connect=False)
+def test_declare_incident_invalid_severity_fallback(setup_alert_group_and_escalation_step):
+    alert_group, declare_incident_step, _ = setup_alert_group_and_escalation_step(already_declared_incident=False)
+    severity = "INVALID"
+
+    with patch("common.incident_api.client.IncidentAPIClient.create_incident") as mock_create_incident:
+        with patch.object(declare_incident, "apply_async") as mock_declare_incident_apply_async:
+            mock_create_incident.side_effect = IncidentAPIException(
+                status=500, url="some-url", msg=ERROR_SEVERITY_NOT_FOUND
+            )
+            declare_incident(alert_group.pk, declare_incident_step.pk, severity=severity)
+
+    # create call failing with invalid severity
+    mock_create_incident.assert_called_with(
+        DEFAULT_BACKUP_TITLE, severity=severity, attachCaption=ATTACHMENT_CAPTION, attachURL=alert_group.web_link
+    )
+    # new task is queued with default severity instead
+    mock_declare_incident_apply_async.assert_called_with(
+        args=(alert_group.pk, declare_incident_step.pk), kwargs={"severity": DEFAULT_INCIDENT_SEVERITY}
+    )
+
+
+@pytest.mark.django_db
+@httpretty.activate(verbose=True, allow_net_connect=False)
+def test_declare_incident_attach_alert_group(setup_alert_group_and_escalation_step):
+    alert_group, declare_incident_step, existing_open_incident = setup_alert_group_and_escalation_step(
+        already_declared_incident=True
+    )
+    incident_id = existing_open_incident.incident_id
+
+    with patch("common.incident_api.client.IncidentAPIClient.get_incident") as mock_get_incident:
+        with patch("common.incident_api.client.IncidentAPIClient.add_activity") as mock_add_activity:
+            mock_get_incident.return_value = {"incidentID": incident_id, "title": "Incident", "status": "active"}, None
+            mock_add_activity.return_value = {"activityItemID": "111"}, None
+            declare_incident(alert_group.pk, declare_incident_step.pk)
+
+    # check declared incident
+    assert existing_open_incident.attached_alert_groups.filter(id=alert_group.id).exists()
+    log_record = alert_group.log_records.last()
+    assert log_record.type == log_record.TYPE_ESCALATION_TRIGGERED
+    assert log_record.escalation_policy == declare_incident_step
+    assert log_record.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT
+    assert log_record.step_specific_info == {"incident_id": incident_id, "incident_title": "Incident"}
+    assert log_record.reason == "attached to existing incident"
+    assert log_record.escalation_error_code is None
+
+
+@pytest.mark.django_db
+@httpretty.activate(verbose=True, allow_net_connect=False)
+def test_declare_incident_resolved_update(setup_alert_group_and_escalation_step):
+    alert_group, declare_incident_step, existing_open_incident = setup_alert_group_and_escalation_step(
+        already_declared_incident=True
+    )
+    incident_id = existing_open_incident.incident_id
+    new_incident_id = "333"
+    assert new_incident_id != incident_id
+
+    with patch("common.incident_api.client.IncidentAPIClient.get_incident") as mock_get_incident:
+        with patch("common.incident_api.client.IncidentAPIClient.create_incident") as mock_create_incident:
+            mock_get_incident.return_value = {
+                "incidentID": incident_id,
+                "title": "Incident1",
+                "status": "resolved",
+            }, None
+            mock_create_incident.return_value = {"incidentID": new_incident_id, "title": "Incident2"}, None
+            declare_incident(alert_group.pk, declare_incident_step.pk)
+
+    existing_open_incident.refresh_from_db()
+
+    assert existing_open_incident.is_active is False
+    # check declared incident
+    assert not existing_open_incident.attached_alert_groups.filter(id=alert_group.id).exists()
+    assert alert_group.related_incidents.get().incident_id == new_incident_id
+    log_record = alert_group.log_records.last()
+    assert log_record.type == log_record.TYPE_ESCALATION_TRIGGERED
+    assert log_record.escalation_policy == declare_incident_step
+    assert log_record.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT
+    assert log_record.step_specific_info == {"incident_id": new_incident_id, "incident_title": "Incident2"}
+    assert log_record.reason == "incident declared"
+    assert log_record.escalation_error_code is None
+
+
+@pytest.mark.django_db
+@httpretty.activate(verbose=True, allow_net_connect=False)
+def test_declare_incident_attach_alert_group_skip_incident_update(
+    setup_alert_group_and_escalation_step, make_alert_group
+):
+    alert_group, declare_incident_step, existing_open_incident = setup_alert_group_and_escalation_step(
+        already_declared_incident=True
+    )
+    alert_receive_channel = alert_group.channel
+    channel_filter = alert_group.channel_filter
+    incident_id = existing_open_incident.incident_id
+
+    # attach max alert groups to incident
+    for _ in range(MAX_ATTACHED_ALERT_GROUPS_PER_INCIDENT):
+        ag = make_alert_group(alert_receive_channel=alert_receive_channel, channel_filter=channel_filter)
+        existing_open_incident.attached_alert_groups.add(ag)
+
+    with patch("common.incident_api.client.IncidentAPIClient.get_incident") as mock_get_incident:
+        with patch("common.incident_api.client.IncidentAPIClient.add_activity") as mock_add_activity:
+            mock_get_incident.return_value = {"incidentID": incident_id, "title": "Incident", "status": "active"}, None
+            declare_incident(alert_group.pk, declare_incident_step.pk)
+
+    assert not mock_add_activity.called
+
+    # check declared incident
+    assert existing_open_incident.attached_alert_groups.filter(id=alert_group.id).exists()
+    log_record = alert_group.log_records.last()
+    assert log_record.type == log_record.TYPE_ESCALATION_TRIGGERED
+    assert log_record.escalation_policy == declare_incident_step
+    assert log_record.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT
+    assert log_record.step_specific_info == {"incident_id": incident_id, "incident_title": "Incident"}
+    assert log_record.reason == "attached to existing incident"
+    assert log_record.escalation_error_code is None
+
+
+@pytest.mark.django_db
+@httpretty.activate(verbose=True, allow_net_connect=False)
+def test_get_existing_incident_error(setup_alert_group_and_escalation_step):
+    alert_group, declare_incident_step, existing_open_incident = setup_alert_group_and_escalation_step(
+        already_declared_incident=True
+    )
+
+    with patch("common.incident_api.client.IncidentAPIClient.get_incident") as mock_get_incident:
+        mock_get_incident.side_effect = IncidentAPIException(status=500, url="some-url")
+        with pytest.raises(IncidentAPIException):
+            declare_incident(alert_group.pk, declare_incident_step.pk)
+
+    # but if incident was not found, a new one should be created
+    incident_id = existing_open_incident.incident_id
+    new_incident_id = "333"
+    assert new_incident_id != incident_id
+
+    with patch("common.incident_api.client.IncidentAPIClient.get_incident") as mock_get_incident:
+        with patch("common.incident_api.client.IncidentAPIClient.create_incident") as mock_create_incident:
+            mock_get_incident.side_effect = IncidentAPIException(status=404, url="some-url")
+            mock_create_incident.return_value = {"incidentID": new_incident_id, "title": "Incident"}, None
+            declare_incident(alert_group.pk, declare_incident_step.pk)
+
+    alert_group.refresh_from_db()
+
+    # check declared incident
+    assert not existing_open_incident.attached_alert_groups.filter(id=alert_group.id).exists()
+    new_incident = alert_group.related_incidents.get()
+    assert new_incident != existing_open_incident
+    assert new_incident.incident_id == new_incident_id
+    assert new_incident.organization == alert_group.channel.organization
+    assert new_incident.channel_filter == alert_group.channel_filter
+
+
+@pytest.mark.django_db
+@httpretty.activate(verbose=True, allow_net_connect=False)
+def test_attach_alert_group_error(setup_alert_group_and_escalation_step):
+    alert_group, declare_incident_step, existing_open_incident = setup_alert_group_and_escalation_step(
+        already_declared_incident=True
+    )
+    incident_id = existing_open_incident.incident_id
+
+    with patch("common.incident_api.client.IncidentAPIClient.get_incident") as mock_get_incident:
patch("common.incident_api.client.IncidentAPIClient.add_activity") as mock_add_activity: + mock_get_incident.return_value = {"incidentID": incident_id, "title": "Incident", "status": "active"}, None + mock_add_activity.side_effect = IncidentAPIException(status=500, url="some-url") + declare_incident(alert_group.pk, declare_incident_step.pk) + + alert_group.refresh_from_db() + + # incident attachment failed, but DB is still updated + assert existing_open_incident.attached_alert_groups.filter(id=alert_group.id).exists() + log_record = alert_group.log_records.last() + assert log_record.type == log_record.TYPE_ESCALATION_TRIGGERED + assert log_record.escalation_policy == declare_incident_step + assert log_record.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT + assert log_record.step_specific_info == {"incident_id": incident_id, "incident_title": "Incident"} + assert log_record.reason == "attached to existing incident" + assert log_record.escalation_error_code is None + + +@pytest.mark.django_db +@httpretty.activate(verbose=True, allow_net_connect=False) +def test_create_incident_error(setup_alert_group_and_escalation_step): + alert_group, declare_incident_step, _ = setup_alert_group_and_escalation_step(already_declared_incident=False) + + with patch("common.incident_api.client.IncidentAPIClient.create_incident") as mock_create_incident: + mock_create_incident.side_effect = IncidentAPIException(status=500, url="some-url") + with pytest.raises(IncidentAPIException): + declare_incident(alert_group.pk, declare_incident_step.pk) diff --git a/engine/apps/alerts/utils.py b/engine/apps/alerts/utils.py index abf6b24cde..5317c22b3f 100644 --- a/engine/apps/alerts/utils.py +++ b/engine/apps/alerts/utils.py @@ -1,3 +1,11 @@ +import typing + +from django.conf import settings + +if typing.TYPE_CHECKING: + from apps.user_management.models import Organization + + def render_relative_timeline(log_created_at, alert_group_started_at): time_delta = log_created_at - alert_group_started_at seconds = int(time_delta.total_seconds()) @@ -12,3 +20,7 @@ def render_relative_timeline(log_created_at, alert_group_started_at): return "%dm%ds" % (minutes, seconds) else: return "%ds" % (seconds,) + + +def is_declare_incident_step_enabled(organization: "Organization") -> bool: + return organization.is_grafana_incident_enabled and settings.FEATURE_DECLARE_INCIDENT_STEP_ENABLED diff --git a/engine/apps/api/serializers/escalation_policy.py b/engine/apps/api/serializers/escalation_policy.py index 75f3628488..f8b0270de8 100644 --- a/engine/apps/api/serializers/escalation_policy.py +++ b/engine/apps/api/serializers/escalation_policy.py @@ -3,6 +3,7 @@ from rest_framework import serializers from apps.alerts.models import EscalationChain, EscalationPolicy +from apps.alerts.utils import is_declare_incident_step_enabled from apps.schedules.models import OnCallSchedule from apps.slack.models import SlackUserGroup from apps.user_management.models import Team, User @@ -24,6 +25,7 @@ NUM_ALERTS_IN_WINDOW = "num_alerts_in_window" NUM_MINUTES_IN_WINDOW = "num_minutes_in_window" CUSTOM_WEBHOOK_TRIGGER = "custom_webhook" +SEVERITY = "severity" STEP_TYPE_TO_RELATED_FIELD_MAP = { EscalationPolicy.STEP_WAIT: [WAIT_DELAY], @@ -35,6 +37,7 @@ EscalationPolicy.STEP_NOTIFY_IF_TIME: [FROM_TIME, TO_TIME], EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: [NUM_ALERTS_IN_WINDOW, NUM_MINUTES_IN_WINDOW], EscalationPolicy.STEP_TRIGGER_CUSTOM_WEBHOOK: [CUSTOM_WEBHOOK_TRIGGER], + EscalationPolicy.STEP_DECLARE_INCIDENT: [SEVERITY], } @@ 
@@ -81,6 +84,7 @@ class EscalationPolicySerializer(EagerLoadingMixin, serializers.ModelSerializer)
         allow_null=True,
         filter_field="organization",
     )
+    severity = serializers.CharField(required=False, allow_null=True)
 
     class Meta:
         model = EscalationPolicy
@@ -99,6 +103,7 @@ class Meta:
             "notify_schedule",
             "notify_to_group",
             "notify_to_team_members",
+            "severity",
             "important",
         ]
 
@@ -123,6 +128,7 @@ def validate(self, data):
             NUM_ALERTS_IN_WINDOW,
             NUM_MINUTES_IN_WINDOW,
             CUSTOM_WEBHOOK_TRIGGER,
+            SEVERITY,
         ]
 
         step = data.get("step")
@@ -151,6 +157,8 @@ def validate_step(self, step_type):
             raise serializers.ValidationError("Invalid step value")
         if step_type in EscalationPolicy.SLACK_INTEGRATION_REQUIRED_STEPS and organization.slack_team_identity is None:
             raise serializers.ValidationError("Invalid escalation step type: step is Slack-specific")
+        if step_type == EscalationPolicy.STEP_DECLARE_INCIDENT and not is_declare_incident_step_enabled(organization):
+            raise serializers.ValidationError("Invalid escalation step type: step is not enabled")
         return step_type
 
     def to_representation(self, instance):
@@ -214,6 +222,7 @@ def _drop_not_step_type_related_fields(step, validated_data):
             NUM_ALERTS_IN_WINDOW,
             NUM_MINUTES_IN_WINDOW,
             CUSTOM_WEBHOOK_TRIGGER,
+            SEVERITY,
         ]
 
         for f in STEP_TYPE_TO_RELATED_FIELD_MAP.get(step, []):
diff --git a/engine/apps/api/tests/test_alert_group.py b/engine/apps/api/tests/test_alert_group.py
index a015fccc3e..8ee438b6bc 100644
--- a/engine/apps/api/tests/test_alert_group.py
+++ b/engine/apps/api/tests/test_alert_group.py
@@ -975,6 +975,37 @@ def test_get_filter_labels(
     assert response.json()["results"][0]["pk"] == alert_groups[0].public_primary_key
 
 
+@pytest.mark.django_db
+def test_get_filter_by_related_incident(
+    alert_group_internal_api_setup, make_related_incident, make_alert_group, make_user_auth_headers
+):
+    user, token, alert_groups = alert_group_internal_api_setup
+
+    alert_group = alert_groups[0]
+    related_incident = make_related_incident("1", alert_group.channel.organization, alert_group.channel_filter)
+    related_incident.attached_alert_groups.add(alert_group)
+
+    client = APIClient()
+    url = reverse("api-internal:alertgroup-list")
+    response = client.get(
+        url + "?has_related_incident=true",
+        format="json",
+        **make_user_auth_headers(user, token),
+    )
+
+    assert response.status_code == status.HTTP_200_OK
+    assert len(response.data["results"]) == 1
+
+    response = client.get(
+        url + "?has_related_incident=false",
+        format="json",
+        **make_user_auth_headers(user, token),
+    )
+
+    assert response.status_code == status.HTTP_200_OK
+    assert len(response.data["results"]) == 3
+
+
 @pytest.mark.django_db
 def test_get_title_search(
     settings,
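One note on the query-string semantics exercised above: `has_related_incident` is declared with `exclude=True` over an `isnull` lookup (see the `AlertGroupFilter` change further down), so the sketch below spells out what each value does (`client`, `url`, and `headers` as in the test):

```python
# BooleanFilter(field_name="related_incidents", lookup_expr="isnull", exclude=True)
# applies the boolean through .exclude():
#   ?has_related_incident=true  -> .exclude(related_incidents__isnull=True)
#                                  i.e. only alert groups attached to an incident
#   ?has_related_incident=false -> .exclude(related_incidents__isnull=False)
#                                  i.e. only alert groups with no incident
response = client.get(url + "?has_related_incident=true", format="json", **headers)
```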
diff --git a/engine/apps/api/tests/test_escalation_policy.py b/engine/apps/api/tests/test_escalation_policy.py
index 0a5f719ec4..0c1b329970 100644
--- a/engine/apps/api/tests/test_escalation_policy.py
+++ b/engine/apps/api/tests/test_escalation_policy.py
@@ -10,6 +10,7 @@
 
 from apps.alerts.models import EscalationPolicy
 from apps.api.permissions import LegacyAccessControlRole
+from common.incident_api.client import DEFAULT_INCIDENT_SEVERITY, IncidentAPIException
 
 
 @pytest.fixture()
@@ -651,8 +652,13 @@ def test_create_escalation_policy_with_no_important_version(
     make_escalation_chain,
     step,
     make_user_auth_headers,
+    settings,
 ):
     organization, user, _, _ = make_organization_and_user_with_slack_identities()
+    # make sure declare incident step is enabled
+    settings.FEATURE_DECLARE_INCIDENT_STEP_ENABLED = True
+    organization.is_grafana_incident_enabled = True
+    organization.save()
     _, token = make_token_for_organization(organization)
     escalation_chain = make_escalation_chain(organization)
 
@@ -832,6 +838,7 @@ def test_escalation_policy_switch_importance(
         "notify_schedule": None,
         "notify_to_group": None,
         "notify_to_team_members": None,
+        "severity": None,
         "important": True,
         "wait_delay": None,
     }
@@ -889,6 +896,7 @@ def test_escalation_policy_filter_by_user(
             "notify_schedule": None,
             "notify_to_group": None,
             "notify_to_team_members": None,
+            "severity": None,
             "important": False,
         },
         {
@@ -906,6 +914,7 @@ def test_escalation_policy_filter_by_user(
             "notify_schedule": None,
             "notify_to_group": None,
             "notify_to_team_members": None,
+            "severity": None,
             "important": False,
         },
     ]
@@ -971,6 +980,7 @@ def test_escalation_policy_filter_by_slack_channel(
             "notify_schedule": None,
             "notify_to_group": None,
             "notify_to_team_members": None,
+            "severity": None,
             "important": False,
         },
     ]
@@ -1001,3 +1011,88 @@ def test_escalation_policy_escalation_options_webhooks(
     returned_options = [option["value"] for option in response.json()]
 
     assert EscalationPolicy.STEP_TRIGGER_CUSTOM_WEBHOOK in returned_options
+
+
+@pytest.mark.django_db
+def test_escalation_policy_severity_options(
+    make_organization_and_user_with_plugin_token,
+    make_user_auth_headers,
+):
+    organization, user, token = make_organization_and_user_with_plugin_token()
+    organization.is_grafana_labels_enabled = False
+    organization.save()
+
+    client = APIClient()
+    url = reverse("api-internal:escalation_policy-severity-options")
+
+    # without labels enabled
+    available_severities = [
+        {"severityID": "abc", "orgID": "1", "displayLabel": "Pending", "level": -1},
+        {"severityID": "def", "orgID": "1", "displayLabel": "Critical", "level": 1},
+    ]
+    with patch("common.incident_api.client.IncidentAPIClient.get_severities") as mock_get_severities:
+        mock_get_severities.return_value = available_severities, None
+        response = client.get(url, format="json", **make_user_auth_headers(user, token))
+
+    expected_options = [{"value": s["displayLabel"], "display_name": s["displayLabel"]} for s in available_severities]
+    assert response.json() == expected_options
+
+    # failing request does not break; fallback to default option only
+    with patch("common.incident_api.client.IncidentAPIClient.get_severities") as mock_get_severities:
+        mock_get_severities.side_effect = IncidentAPIException(status=404, url="some-url")
+        response = client.get(url, format="json", **make_user_auth_headers(user, token))
+
+    fallback_options = [{"value": DEFAULT_INCIDENT_SEVERITY, "display_name": DEFAULT_INCIDENT_SEVERITY}]
+    assert response.json() == fallback_options
+
+    # labels enabled
+    organization.is_grafana_labels_enabled = True
+    organization.save()
+
+    with patch("common.incident_api.client.IncidentAPIClient.get_severities") as mock_get_severities:
+        mock_get_severities.return_value = available_severities, None
+        response = client.get(url, format="json", **make_user_auth_headers(user, token))
+    # include set from label option
+    expected_options = [
+        {
+            "value": EscalationPolicy.SEVERITY_SET_FROM_LABEL,
+            "display_name": EscalationPolicy.SEVERITY_SET_FROM_LABEL_DISPLAY_VALUE,
+        }
+    ] + expected_options
+    assert response.json() == expected_options
+
+
+@pytest.mark.django_db
+def test_create_escalation_policy_declare_incident(
+    escalation_policy_internal_api_setup, make_user_auth_headers, settings
+):
+    token, escalation_chain, _, user, _ = escalation_policy_internal_api_setup
+    organization = escalation_chain.organization
+    client = APIClient()
+    url = reverse("api-internal:escalation_policy-list")
+
+    data = {
+        "step": EscalationPolicy.STEP_DECLARE_INCIDENT,
+        "severity": "critical",
+        "escalation_chain": escalation_chain.public_primary_key,
+    }
+
+    response = client.post(url, data, format="json", **make_user_auth_headers(user, token))
+    assert response.status_code == status.HTTP_400_BAD_REQUEST
+
+    # make sure declare incident step is enabled
+    settings.FEATURE_DECLARE_INCIDENT_STEP_ENABLED = True
+    organization.is_grafana_incident_enabled = True
+    organization.save()
+
+    response = client.post(url, data, format="json", **make_user_auth_headers(user, token))
+    assert response.status_code == status.HTTP_201_CREATED
+    escalation_policy = EscalationPolicy.objects.get(public_primary_key=response.data["id"])
+    assert escalation_policy.step == EscalationPolicy.STEP_DECLARE_INCIDENT
+    assert escalation_policy.severity == "critical"
+
+    url = reverse("api-internal:escalation_policy-detail", kwargs={"pk": escalation_policy.public_primary_key})
+    response = client.get(url, format="json", **make_user_auth_headers(user, token))
+    response_data = response.json()
+    assert response_data["step"] == EscalationPolicy.STEP_DECLARE_INCIDENT
+    assert response_data["severity"] == "critical"
diff --git a/engine/apps/api/views/alert_group.py b/engine/apps/api/views/alert_group.py
index 22d0be4a9c..c937a78488 100644
--- a/engine/apps/api/views/alert_group.py
+++ b/engine/apps/api/views/alert_group.py
@@ -17,6 +17,7 @@
 from apps.alerts.models import AlertGroup, AlertReceiveChannel, EscalationChain, ResolutionNote
 from apps.alerts.paging import unpage_user
 from apps.alerts.tasks import delete_alert_group, send_update_resolution_note_signal
+from apps.alerts.utils import is_declare_incident_step_enabled
 from apps.api.errors import AlertGroupAPIError
 from apps.api.label_filtering import parse_label_query
 from apps.api.permissions import RBACPermission
@@ -120,6 +121,7 @@ class AlertGroupFilter(DateRangeFilterMixin, ModelFieldFilterMixin, filters.Filt
     )
     with_resolution_note = filters.BooleanFilter(method="filter_with_resolution_note")
     mine = filters.BooleanFilter(method="filter_mine")
+    has_related_incident = filters.BooleanFilter(field_name="related_incidents", lookup_expr="isnull", exclude=True)
 
     def filter_status(self, queryset, name, value):
         if not value:
@@ -719,6 +721,7 @@ def filters(self, request):
         """
         Retrieve a list of valid filter options that can be used to filter alert groups
         """
+        organization = self.request.auth.organization
         api_root = "/api/internal/v1/"
         default_day_range = 30
 
@@ -804,7 +807,7 @@ def filters(self, request):
 
         filter_options = [{"name": "search", "type": "search", "description": description}] + filter_options
 
-        if is_labels_feature_enabled(self.request.auth.organization):
+        if is_labels_feature_enabled(organization):
             filter_options.append(
                 {
                     "name": "label",
@@ -813,6 +816,15 @@ def filters(self, request):
                 }
             )
 
+        if is_declare_incident_step_enabled(organization):
+            filter_options.append(
+                {
+                    "name": "has_related_incident",
+                    "type": "boolean",
+                    "default": "true",
+                }
+            )
+
         return Response(filter_options)
 
     @extend_schema(
diff --git a/engine/apps/api/views/escalation_policy.py b/engine/apps/api/views/escalation_policy.py
index 2cb288be0b..eb502b5ce6 100644
--- a/engine/apps/api/views/escalation_policy.py
+++ b/engine/apps/api/views/escalation_policy.py
@@ -1,3 +1,5 @@
+import logging
+
 from django.conf import settings
 from django.db.models import Q
 from rest_framework.decorators import action
@@ -5,6 +7,7 @@
 from rest_framework.response import Response
 
 from apps.alerts.models import EscalationPolicy
+from apps.alerts.utils import is_declare_incident_step_enabled
 from apps.api.permissions import RBACPermission
 from apps.api.serializers.escalation_policy import (
     EscalationPolicyCreateSerializer,
     EscalationPolicyUpdateSerializer,
@@ -19,9 +22,12 @@
     TeamFilteringMixin,
     UpdateSerializerMixin,
 )
+from common.incident_api.client import DEFAULT_INCIDENT_SEVERITY, IncidentAPIClient, IncidentAPIException
 from common.insight_log import EntityEvent, write_resource_insight_log
 from common.ordered_model.viewset import OrderedModelViewSet
 
+logger = logging.getLogger(__name__)
+
 
 class EscalationPolicyView(
     TeamFilteringMixin,
@@ -42,6 +48,7 @@ class EscalationPolicyView(
         "escalation_options": [RBACPermission.Permissions.ESCALATION_CHAINS_READ],
         "delay_options": [RBACPermission.Permissions.ESCALATION_CHAINS_READ],
         "num_minutes_in_window_options": [RBACPermission.Permissions.ESCALATION_CHAINS_READ],
+        "severity_options": [RBACPermission.Permissions.ESCALATION_CHAINS_READ],
         "create": [RBACPermission.Permissions.ESCALATION_CHAINS_WRITE],
         "update": [RBACPermission.Permissions.ESCALATION_CHAINS_WRITE],
         "partial_update": [RBACPermission.Permissions.ESCALATION_CHAINS_WRITE],
@@ -116,6 +123,7 @@ def perform_destroy(self, instance):
 
     @action(detail=False, methods=["get"])
     def escalation_options(self, request):
+        grafana_declare_incident_enabled = is_declare_incident_step_enabled(organization=self.request.auth.organization)
         choices = []
         for step in EscalationPolicy.INTERNAL_API_STEPS:
             verbal = EscalationPolicy.INTERNAL_API_STEPS_TO_VERBAL_MAP[step]
@@ -126,7 +134,7 @@ def escalation_options(self, request):
             if slack_integration_required and not settings.FEATURE_SLACK_INTEGRATION_ENABLED:
                 continue
 
-            if step == EscalationPolicy.STEP_DECLARE_INCIDENT:
+            if step == EscalationPolicy.STEP_DECLARE_INCIDENT and not grafana_declare_incident_enabled:
                 continue
 
             choices.append(
@@ -155,3 +163,25 @@ def num_minutes_in_window_options(self, request):
             {"value": choice[0], "display_name": choice[1]} for choice in EscalationPolicy.WEB_DURATION_CHOICES_MINUTES
         ]
         return Response(choices)
+
+    @action(detail=False, methods=["get"])
+    def severity_options(self, request):
+        organization = self.request.auth.organization
+        choices = []
+        if organization.is_grafana_labels_enabled:
+            choices = [
+                {
+                    "value": EscalationPolicy.SEVERITY_SET_FROM_LABEL,
+                    "display_name": EscalationPolicy.SEVERITY_SET_FROM_LABEL_DISPLAY_VALUE,
+                }
+            ]
+        incident_client = IncidentAPIClient(organization.grafana_url, organization.api_token)
+        try:
+            severities, _ = incident_client.get_severities()
+            choices += [
+                {"value": severity["displayLabel"], "display_name": severity["displayLabel"]} for severity in severities
+            ]
+        except IncidentAPIException as e:
+            logger.error(f"Error getting severities: {e.msg}")
+            choices += [{"value": DEFAULT_INCIDENT_SEVERITY, "display_name": DEFAULT_INCIDENT_SEVERITY}]
+        return Response(choices)
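For orientation, the shape the new `severity_options` action returns (a sketch: the endpoint path is assumed from the DRF action name, the severity labels are illustrative, and the `SEVERITY_SET_FROM_LABEL*` constants are left symbolic since their literals are not shown in this diff):

```python
# GET /api/internal/v1/escalation_policies/severity_options/
from apps.alerts.models import EscalationPolicy

severity_options = [
    # present only when organization.is_grafana_labels_enabled
    {
        "value": EscalationPolicy.SEVERITY_SET_FROM_LABEL,
        "display_name": EscalationPolicy.SEVERITY_SET_FROM_LABEL_DISPLAY_VALUE,
    },
    # remaining entries come from Grafana Incident's get_severities()
    {"value": "Pending", "display_name": "Pending"},
    {"value": "Critical", "display_name": "Critical"},
]
```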
apps.user_management.models import Team, User @@ -72,6 +73,7 @@ class EscalationPolicySerializer(EagerLoadingMixin, OrderedModelSerializer): required=False, source="custom_webhook", ) + severity = serializers.CharField(required=False) important = serializers.BooleanField(required=False) TIME_FORMAT = "%H:%M:%SZ" @@ -101,6 +103,7 @@ class Meta: "notify_if_time_to", "num_alerts_in_window", "num_minutes_in_window", + "severity", ] PREFETCH_RELATED = ["notify_to_users_queue"] @@ -120,6 +123,9 @@ def validate_type(self, step_type): if step_type == EscalationPolicy.STEP_FINAL_NOTIFYALL and organization.slack_team_identity is None: raise BadRequest(detail="Invalid escalation step type: step is Slack-specific") + if step_type == EscalationPolicy.STEP_DECLARE_INCIDENT and not is_declare_incident_step_enabled(organization): + raise BadRequest("Invalid escalation step type: step is not enabled") + return step_type def create(self, validated_data): @@ -163,6 +169,7 @@ def _get_field_to_represent(self, step, result): "notify_if_time_to", "num_alerts_in_window", "num_minutes_in_window", + "severity", ] if step == EscalationPolicy.STEP_WAIT: fields_to_remove.remove("duration") @@ -190,6 +197,8 @@ def _get_field_to_represent(self, step, result): elif step == EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: fields_to_remove.remove("num_alerts_in_window") fields_to_remove.remove("num_minutes_in_window") + elif step == EscalationPolicy.STEP_DECLARE_INCIDENT: + fields_to_remove.remove("severity") if ( step in EscalationPolicy.DEFAULT_TO_IMPORTANT_STEP_MAPPING @@ -213,6 +222,7 @@ def _correct_validated_data(self, validated_data): "to_time", "num_alerts_in_window", "num_minutes_in_window", + "severity", ] step = validated_data.get("step") important = validated_data.pop("important", None) @@ -243,6 +253,8 @@ def _correct_validated_data(self, validated_data): elif step == EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: validated_data_fields_to_remove.remove("num_alerts_in_window") validated_data_fields_to_remove.remove("num_minutes_in_window") + elif step == EscalationPolicy.STEP_DECLARE_INCIDENT: + validated_data_fields_to_remove.remove("severity") for field in validated_data_fields_to_remove: validated_data.pop(field, None) @@ -299,5 +311,7 @@ def update(self, instance, validated_data): if step != EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: instance.num_alerts_in_window = None instance.num_minutes_in_window = None + if step != EscalationPolicy.STEP_DECLARE_INCIDENT: + instance.severity = None return super().update(instance, validated_data) diff --git a/engine/apps/public_api/tests/test_escalation_policies.py b/engine/apps/public_api/tests/test_escalation_policies.py index 9cf961acc6..e1d478da89 100644 --- a/engine/apps/public_api/tests/test_escalation_policies.py +++ b/engine/apps/public_api/tests/test_escalation_policies.py @@ -463,3 +463,43 @@ def test_update_escalation_policy_using_notify_team_members( escalation_policy = EscalationPolicy.objects.get(public_primary_key=response.data["id"]) serializer = EscalationPolicySerializer(escalation_policy) assert response.data == serializer.data + + +@pytest.mark.django_db +def test_create_escalation_policy_declare_incident( + make_organization_and_user_with_token, + escalation_policies_setup, + settings, +): + organization, user, token = make_organization_and_user_with_token() + escalation_chain, _, _ = escalation_policies_setup(organization, user) + + data_for_create = { + "escalation_chain_id": 
escalation_chain.public_primary_key, + "type": "declare_incident", + "position": 0, + "severity": "critical", + } + + client = APIClient() + url = reverse("api-public:escalation_policies-list") + response = client.post(url, data=data_for_create, format="json", HTTP_AUTHORIZATION=token) + assert response.status_code == status.HTTP_400_BAD_REQUEST + + # make sure declare incident step is enabled + settings.FEATURE_DECLARE_INCIDENT_STEP_ENABLED = True + organization.is_grafana_incident_enabled = True + organization.save() + + response = client.post(url, data=data_for_create, format="json", HTTP_AUTHORIZATION=token) + assert response.status_code == status.HTTP_201_CREATED + + escalation_policy = EscalationPolicy.objects.get(public_primary_key=response.data["id"]) + assert escalation_policy.step == EscalationPolicy.STEP_DECLARE_INCIDENT + assert escalation_policy.severity == "critical" + + url = reverse("api-public:escalation_policies-detail", kwargs={"pk": escalation_policy.public_primary_key}) + response = client.get(url, format="json", HTTP_AUTHORIZATION=token) + response_data = response.json() + assert response_data["type"] == EscalationPolicy.PUBLIC_STEP_CHOICES_MAP[EscalationPolicy.STEP_DECLARE_INCIDENT] + assert response_data["severity"] == "critical" diff --git a/engine/conftest.py b/engine/conftest.py index ec655a48bb..a95383dd94 100644 --- a/engine/conftest.py +++ b/engine/conftest.py @@ -35,6 +35,7 @@ EscalationChainFactory, EscalationPolicyFactory, InvitationFactory, + RelatedIncidentFactory, ResolutionNoteFactory, ResolutionNoteSlackMessageFactory, UserNotificationBundleFactory, @@ -1112,3 +1113,11 @@ def _make_user_notification_bundle(user, notification_channel, important=False, ) return _make_user_notification_bundle + + +@pytest.fixture +def make_related_incident(): + def _make_related_incident(incident_id, organization, channel_filter): + return RelatedIncidentFactory(incident_id=incident_id, organization=organization, channel_filter=channel_filter) + + return _make_related_incident diff --git a/engine/settings/base.py b/engine/settings/base.py index e189a5b879..4f0859f0f2 100644 --- a/engine/settings/base.py +++ b/engine/settings/base.py @@ -75,6 +75,7 @@ FEATURE_ALERT_GROUP_SEARCH_ENABLED = getenv_boolean("FEATURE_ALERT_GROUP_SEARCH_ENABLED", default=True) FEATURE_ALERT_GROUP_SEARCH_CUTOFF_DAYS = getenv_integer("FEATURE_ALERT_GROUP_SEARCH_CUTOFF_DAYS", default=None) FEATURE_NOTIFICATION_BUNDLE_ENABLED = getenv_boolean("FEATURE_NOTIFICATION_BUNDLE_ENABLED", default=True) +FEATURE_DECLARE_INCIDENT_STEP_ENABLED = getenv_boolean("FEATURE_DECLARE_INCIDENT_STEP_ENABLED", default=False) TWILIO_API_KEY_SID = os.environ.get("TWILIO_API_KEY_SID") TWILIO_API_KEY_SECRET = os.environ.get("TWILIO_API_KEY_SECRET") diff --git a/engine/settings/celery_task_routes.py b/engine/settings/celery_task_routes.py index ed58be1ab5..29309a7196 100644 --- a/engine/settings/celery_task_routes.py +++ b/engine/settings/celery_task_routes.py @@ -94,6 +94,7 @@ # CRITICAL "apps.alerts.tasks.acknowledge_reminder.acknowledge_reminder_task": {"queue": "critical"}, "apps.alerts.tasks.acknowledge_reminder.unacknowledge_timeout_task": {"queue": "critical"}, + "apps.alerts.tasks.declare_incident.declare_incident": {"queue": "critical"}, "apps.alerts.tasks.distribute_alert.send_alert_create_signal": {"queue": "critical"}, "apps.alerts.tasks.escalate_alert_group.escalate_alert_group": {"queue": "critical"}, "apps.alerts.tasks.invite_user_to_join_incident.invite_user_to_join_incident": {"queue": "critical"}, From 
bcdde314ed83f4b2819b8344446878a8a132bee2 Mon Sep 17 00:00:00 2001 From: Riksus <50514051+Riksus@users.noreply.github.com> Date: Mon, 7 Oct 2024 23:08:28 +0300 Subject: [PATCH 2/5] fix jinja_template values to int (#5132) # What this PR does These values must be integers, but the OS environment provides them as strings; this PR fixes that by reading them with getenv_integer(). --- engine/settings/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/engine/settings/base.py b/engine/settings/base.py index 4f0859f0f2..88530f1b22 100644 --- a/engine/settings/base.py +++ b/engine/settings/base.py @@ -830,9 +830,9 @@ class BrokerTypes: GRAFANA_INCIDENT_STATIC_API_KEY = os.environ.get("GRAFANA_INCIDENT_STATIC_API_KEY", None) -JINJA_TEMPLATE_MAX_LENGTH = os.getenv("JINJA_TEMPLATE_MAX_LENGTH", 50000) -JINJA_RESULT_TITLE_MAX_LENGTH = os.getenv("JINJA_RESULT_TITLE_MAX_LENGTH", 500) -JINJA_RESULT_MAX_LENGTH = os.getenv("JINJA_RESULT_MAX_LENGTH", 50000) +JINJA_TEMPLATE_MAX_LENGTH = getenv_integer("JINJA_TEMPLATE_MAX_LENGTH", 50000) +JINJA_RESULT_TITLE_MAX_LENGTH = getenv_integer("JINJA_RESULT_TITLE_MAX_LENGTH", 500) +JINJA_RESULT_MAX_LENGTH = getenv_integer("JINJA_RESULT_MAX_LENGTH", 50000) # Log inbound/outbound calls as slow=1 if they exceed threshold SLOW_THRESHOLD_SECONDS = getenv_float("SLOW_THRESHOLD_SECONDS", 2.0) From 5cf382a2e3c940d88d33e0d91fb62efbae7d2967 Mon Sep 17 00:00:00 2001 From: Michael Derynck Date: Mon, 7 Oct 2024 14:08:53 -0600 Subject: [PATCH 3/5] Fix error encoding on install_v2 endpoint (#5133) # What this PR does Fixes incorrect encoding when SyncException contains one of the predefined dataclass errors in the install_v2 endpoint. ## Which issue(s) this PR closes Related to #5124 ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [x] Documentation added (or `pr:no public docs` PR label added if not required) - [x] Added the relevant release notes label (see labels prefixed w/ `release:`). These labels dictate how your PR will show up in the autogenerated release notes.
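For illustration, a minimal sketch of the encoding problem being fixed. The `SyncError` dataclass below is a hypothetical stand-in for the predefined error dataclasses (the `code`/`message` fields mirror what the new test asserts on `INVALID_SELF_HOSTED_ID`):

```python
from dataclasses import asdict, dataclass, is_dataclass


@dataclass
class SyncError:  # hypothetical stand-in for a predefined error dataclass
    code: str
    message: str


error_data = SyncError(code="invalid_self_hosted_id", message="Invalid self-hosted id")

# DRF's JSON renderer cannot encode a dataclass instance directly, so the
# view now converts it to a plain dict first and passes anything else through.
payload = asdict(error_data) if is_dataclass(error_data) else error_data
assert payload == {"code": "invalid_self_hosted_id", "message": "Invalid self-hosted id"}
```

With the dict payload, the 400 response body serializes cleanly instead of failing on the dataclass instance.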
--- .../grafana_plugin/tests/test_install_v2.py | 25 +++++++++++++++++++ .../apps/grafana_plugin/views/install_v2.py | 7 ++++-- 2 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 engine/apps/grafana_plugin/tests/test_install_v2.py diff --git a/engine/apps/grafana_plugin/tests/test_install_v2.py b/engine/apps/grafana_plugin/tests/test_install_v2.py new file mode 100644 index 0000000000..ed76d7260f --- /dev/null +++ b/engine/apps/grafana_plugin/tests/test_install_v2.py @@ -0,0 +1,25 @@ +from unittest.mock import patch + +import pytest +from django.urls import reverse +from rest_framework import status +from rest_framework.test import APIClient + +from apps.grafana_plugin.views.sync_v2 import SyncException +from common.api_helpers.errors import INVALID_SELF_HOSTED_ID + + +@pytest.mark.django_db +def test_install_v2_error_encoding(make_organization_and_user_with_plugin_token, make_user_auth_headers): + organization, user, token = make_organization_and_user_with_plugin_token() + client = APIClient() + + auth_headers = make_user_auth_headers(user, token) + + exc = SyncException(INVALID_SELF_HOSTED_ID) + + with patch("apps.grafana_plugin.views.InstallV2View.do_sync", side_effect=exc): + response = client.post(reverse("grafana-plugin:install-v2"), format="json", **auth_headers) + assert response.data["code"] == INVALID_SELF_HOSTED_ID.code + assert response.data["message"] == INVALID_SELF_HOSTED_ID.message + assert response.status_code == status.HTTP_400_BAD_REQUEST diff --git a/engine/apps/grafana_plugin/views/install_v2.py b/engine/apps/grafana_plugin/views/install_v2.py index 7223adb038..02b19ef936 100644 --- a/engine/apps/grafana_plugin/views/install_v2.py +++ b/engine/apps/grafana_plugin/views/install_v2.py @@ -1,5 +1,5 @@ import logging -from dataclasses import asdict +from dataclasses import asdict, is_dataclass from django.conf import settings from rest_framework import status @@ -23,7 +23,10 @@ def post(self, request: Request) -> Response: try: organization = self.do_sync(request) except SyncException as e: - return Response(data=e.error_data, status=status.HTTP_400_BAD_REQUEST) + return Response( + data=asdict(e.error_data) if is_dataclass(e.error_data) else e.error_data, + status=status.HTTP_400_BAD_REQUEST, + ) organization.revoke_plugin() provisioned_data = organization.provision_plugin() From de476846af63fa2b4471e63e114e94f3b8983c10 Mon Sep 17 00:00:00 2001 From: Jack Baldry Date: Tue, 8 Oct 2024 14:07:00 +0100 Subject: [PATCH 4/5] Remove "Add to docs project" workflow (#5135) There's now a centralized workflow running in Writers' Toolkit that adds issues from all repositories which removes the need to proliferate per-repository workflows. 
--------- Co-authored-by: Joey Orlando Co-authored-by: GitHub Actions Co-authored-by: Joey Orlando Co-authored-by: Matias Bordese Co-authored-by: Vadim Stepanov Co-authored-by: Dominik Broj Co-authored-by: Michael Derynck Co-authored-by: Yulya Artyukhina Co-authored-by: Innokentii Konstantinov Co-authored-by: Ildar Iskhakov Co-authored-by: grafana-irm-app[bot] <165293418+grafana-irm-app[bot]@users.noreply.github.com> --- .github/workflows/add-to-docs-project.yml | 15 --------------- helm/oncall/Chart.yaml | 4 ++-- 2 files changed, 2 insertions(+), 17 deletions(-) delete mode 100644 .github/workflows/add-to-docs-project.yml diff --git a/.github/workflows/add-to-docs-project.yml b/.github/workflows/add-to-docs-project.yml deleted file mode 100644 index 56f57ed9ba..0000000000 --- a/.github/workflows/add-to-docs-project.yml +++ /dev/null @@ -1,15 +0,0 @@ -name: Add to docs project -on: - issues: - types: [labeled] - pull_request: - types: [labeled] -jobs: - main: - if: ${{ github.event.label.name == 'type/docs' }} - permissions: - contents: read - id-token: write - runs-on: ubuntu-latest - steps: - - uses: grafana/writers-toolkit/add-to-docs-project@add-to-docs-project/v1 diff --git a/helm/oncall/Chart.yaml b/helm/oncall/Chart.yaml index 1362b78265..0dc048d434 100644 --- a/helm/oncall/Chart.yaml +++ b/helm/oncall/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: oncall description: Developer-friendly incident response with brilliant Slack integration type: application -version: 1.9.29 -appVersion: v1.9.29 +version: 1.10.2 +appVersion: v1.10.2 dependencies: - name: cert-manager version: v1.8.0 From 2545bf8336b681af2d7e162bcb34a4a87d8d0cf4 Mon Sep 17 00:00:00 2001 From: Michael Derynck Date: Tue, 8 Oct 2024 11:29:36 -0600 Subject: [PATCH 5/5] Split up organizations across metrics exporters (#5127) # What this PR does Limits organizations that a metrics exporter is responsible for. As more organizations are added it becomes more difficult for the exporter to deliver metrics within the scrape timeout. This would let us use the settings to divide up the organizations between multiple exporters. ## Which issue(s) this PR closes Related to [issue link here] ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [x] Documentation added (or `pr:no public docs` PR label added if not required) - [x] Added the relevant release notes label (see labels prefixed w/ `release:`). These labels dictate how your PR will show up in the autogenerated release notes. 
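As a sketch of the resulting partitioning (the setting values below are illustrative, not defaults), each exporter keeps only the organization ids whose modulo matches its group id:

```python
# Illustrative values for the two new settings.
METRICS_EXPORTER_TOTAL_ORGANIZATION_GROUPS = 3  # total number of exporters
METRICS_EXPORTER_ORGANIZATION_GROUP_ID = 1      # this exporter's group

organization_ids = [10, 11, 12, 13, 14, 15]

# Same filter as in get_organization_ids(): id % group_count == group_id,
# so every organization is collected by exactly one exporter.
mine = [
    i
    for i in organization_ids
    if i % METRICS_EXPORTER_TOTAL_ORGANIZATION_GROUPS == METRICS_EXPORTER_ORGANIZATION_GROUP_ID
]
assert mine == [10, 13]
```

With the defaults (one group, group id 0) the filter keeps every id, so single-exporter deployments behave exactly as before.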
--- engine/apps/metrics_exporter/helpers.py | 6 +++++- engine/settings/base.py | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/engine/apps/metrics_exporter/helpers.py b/engine/apps/metrics_exporter/helpers.py index db1164bcb7..91a0190520 100644 --- a/engine/apps/metrics_exporter/helpers.py +++ b/engine/apps/metrics_exporter/helpers.py @@ -2,6 +2,7 @@ import random import typing +from django.conf import settings from django.core.cache import cache from django.utils import timezone @@ -50,7 +51,10 @@ def get_organization_ids(): if not organizations_ids: organizations_ids = get_organization_ids_from_db() cache.set(organizations_ids, METRICS_ORGANIZATIONS_IDS, METRICS_ORGANIZATIONS_IDS_CACHE_TIMEOUT) - return organizations_ids + + group_id = settings.METRICS_EXPORTER_ORGANIZATION_GROUP_ID + group_count = settings.METRICS_EXPORTER_TOTAL_ORGANIZATION_GROUPS + return [i for i in organizations_ids if i % group_count == group_id] def is_allowed_to_start_metrics_calculation(organization_id, force=False) -> bool: diff --git a/engine/settings/base.py b/engine/settings/base.py index 88530f1b22..ae5f8716b2 100644 --- a/engine/settings/base.py +++ b/engine/settings/base.py @@ -121,6 +121,11 @@ # List of metrics to collect. Collect all available application metrics by default METRICS_TO_COLLECT = getenv_list("METRICS_TO_COLLECT", METRICS_ALL) +# Total number of exporters collecting the same set of metrics +METRICS_EXPORTER_TOTAL_ORGANIZATION_GROUPS = getenv_integer("METRICS_EXPORTER_TOTAL_ORGANIZATION_GROUPS", 1) +# ID of this exporter, used to filter which orgs to collect for +METRICS_EXPORTER_ORGANIZATION_GROUP_ID = getenv_integer("METRICS_EXPORTER_ORGANIZATION_GROUP_ID", 0) + # Database class DatabaseTypes: