fix(feedback): enforce a max message size from all sources (#79326)

Closes #76298 Closes [SENTRY-3B86](https://sentry.sentry.io/issues/5552524761/) Decided on a limit of 4096, generously below the LLM request, postgres, and kafka size limits described in the ticket. Messages that are too large will be truncated (or rejected, for crash report modal) and skip spam detection, auto-marking as spam. Includes a small refactor of `create_feedback.py`, moving stuff around and commenting a bit. ~~Renamed `auto_ignore_spam_feedback` to `set_feedback_ignored`.~~
getsentry · Oct 22, 2024 · 477e69f · 477e69f
1 parent d3db769
commit 477e69f
Show file tree

Hide file tree

Showing 11 changed files with 256 additions and 52 deletions.
diff --git a/migrations_lockfile.txt b/migrations_lockfile.txt
@@ -10,7 +10,7 @@ hybridcloud: 0016_add_control_cacheversion
 nodestore: 0002_nodestore_no_dictfield
 remote_subscriptions: 0003_drop_remote_subscription
 replays: 0004_index_together
-sentry: 0777_add_related_name_to_dashboard_permissions
+sentry: 0778_userreport_comments_max_length
 social_auth: 0002_default_auto_field
 uptime: 0017_unique_on_timeout
 workflow_engine: 0010_detector_state_unique_group
diff --git a/src/sentry/feedback/usecases/create_feedback.py b/src/sentry/feedback/usecases/create_feedback.py
@@ -7,11 +7,12 @@
 from uuid import uuid4
 
 import jsonschema
+import sentry_sdk
 
 from sentry import features, options
 from sentry.constants import DataCategory
 from sentry.eventstore.models import Event, GroupEvent
-from sentry.feedback.usecases.spam_detection import is_spam
+from sentry.feedback.usecases.spam_detection import is_spam, spam_detection_enabled
 from sentry.issues.grouptype import FeedbackGroup
 from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
 from sentry.issues.json_schemas import EVENT_PAYLOAD_SCHEMA, LEGACY_EVENT_PAYLOAD_SCHEMA
@@ -141,7 +142,6 @@ def fix_for_issue_platform(event_data):
     # If no user email was provided specify the contact-email as the user-email.
     feedback_obj = event_data.get("contexts", {}).get("feedback", {})
     contact_email = feedback_obj.get("contact_email")
-
     if not ret_event["user"].get("email", ""):
         ret_event["user"]["email"] = contact_email
 
@@ -168,6 +168,21 @@ def fix_for_issue_platform(event_data):
     return ret_event
 
 
+def validate_issue_platform_event_schema(event_data):
+    """
+    The issue platform schema validation does not run in dev atm so we have to do the validation
+    ourselves, or else our tests are not representative of what happens in prod.
+    """
+    try:
+        jsonschema.validate(event_data, EVENT_PAYLOAD_SCHEMA)
+    except jsonschema.exceptions.ValidationError:
+        try:
+            jsonschema.validate(event_data, LEGACY_EVENT_PAYLOAD_SCHEMA)
+        except jsonschema.exceptions.ValidationError:
+            metrics.incr("feedback.create_feedback_issue.invalid_schema")
+            raise
+
+
 def should_filter_feedback(event, project_id, source: FeedbackCreationSource):
     # Right now all unreal error events without a feedback
     # actually get a sent a feedback with this message
@@ -222,26 +237,53 @@ def create_feedback_issue(event, project_id: int, source: FeedbackCreationSource
     if should_filter_feedback(event, project_id, source):
         return
 
+    feedback_message = event["contexts"]["feedback"]["message"]
+    max_msg_size = options.get("feedback.message.max-size")  # Note options are cached.
     project = Project.objects.get_from_cache(id=project_id)
 
+    # Spam detection.
     is_message_spam = None
-    if features.has(
-        "organizations:user-feedback-spam-filter-ingest", project.organization
-    ) and project.get_option("sentry:feedback_ai_spam_detection"):
-        try:
-            is_message_spam = is_spam(event["contexts"]["feedback"]["message"])
-        except Exception:
-            # until we have LLM error types ironed out, just catch all exceptions
-            logger.exception("Error checking if message is spam", extra={"project_id": project_id})
-        metrics.incr(
-            "feedback.create_feedback_issue.spam_detection",
+    if spam_detection_enabled(project):
+        if len(feedback_message) <= max_msg_size:
+            try:
+                is_message_spam = is_spam(feedback_message)
+            except Exception:
+                # until we have LLM error types ironed out, just catch all exceptions
+                logger.exception(
+                    "Error checking if message is spam", extra={"project_id": project_id}
+                )
+            metrics.incr(
+                "feedback.create_feedback_issue.spam_detection",
+                tags={
+                    "is_spam": is_message_spam,
+                    "referrer": source.value,
+                    "client_source": event["contexts"]["feedback"].get("source"),
+                },
+                sample_rate=1.0,
+            )
+        else:
+            is_message_spam = True
+
+    if len(feedback_message) > max_msg_size:
+        metrics.distribution(
+            "feedback.large_message",
+            len(feedback_message),
             tags={
-                "is_spam": is_message_spam,
+                "entrypoint": "create_feedback_issue",
+                "referrer": source.value,
+            },
+        )
+        logger.info(
+            "Feedback message exceeds max size.",
+            extra={
+                "project_id": project_id,
+                "entrypoint": "create_feedback_issue",
                 "referrer": source.value,
-                "client_source": event["contexts"]["feedback"].get("source"),
             },
-            sample_rate=1.0,
         )
+        # Sentry will capture `feedback_message` in local variables (truncated).
+        sentry_sdk.capture_message("Feedback message exceeds max size.", "warning")
+        feedback_message = feedback_message[:max_msg_size]
 
     # Note that some of the fields below like title and subtitle
     # are not used by the feedback UI, but are required.
@@ -257,7 +299,7 @@ def create_feedback_issue(event, project_id: int, source: FeedbackCreationSource
         project_id=project_id,
         fingerprint=issue_fingerprint,  # random UUID for fingerprint so feedbacks are grouped individually
         issue_title="User Feedback",
-        subtitle=event["contexts"]["feedback"]["message"],
+        subtitle=feedback_message,
         resource_id=None,
         evidence_data=evidence_data,
         evidence_display=evidence_display,
@@ -279,6 +321,7 @@ def create_feedback_issue(event, project_id: int, source: FeedbackCreationSource
     # make sure event data is valid for issue platform
     validate_issue_platform_event_schema(event_fixed)
 
+    # Analytics
     if not project.flags.has_feedbacks:
         first_feedback_received.send_robust(project=project, sender=Project)
 
@@ -291,9 +334,11 @@ def create_feedback_issue(event, project_id: int, source: FeedbackCreationSource
     ):
         first_new_feedback_received.send_robust(project=project, sender=Project)
 
+    # Send to issue platform for processing.
     produce_occurrence_to_kafka(
         payload_type=PayloadType.OCCURRENCE, occurrence=occurrence, event_data=event_fixed
     )
+    # Mark as spam. We need this since IP doesn't currently support an initial status of IGNORED.
     if is_message_spam:
         auto_ignore_spam_feedbacks(project, issue_fingerprint)
     metrics.incr(
@@ -304,6 +349,7 @@ def create_feedback_issue(event, project_id: int, source: FeedbackCreationSource
         },
         sample_rate=1.0,
     )
+
     track_outcome(
         org_id=project.organization_id,
         project_id=project_id,
@@ -317,19 +363,27 @@ def create_feedback_issue(event, project_id: int, source: FeedbackCreationSource
     )
 
 
-def validate_issue_platform_event_schema(event_data):
+def auto_ignore_spam_feedbacks(project, issue_fingerprint):
     """
-    The issue platform schema validation does not run in dev atm so we have to do the validation
-    ourselves, or else our tests are not representative of what happens in prod.
+    Marks an issue as spam with a STATUS_CHANGE kafka message. The IGNORED status allows the occurrence to skip alerts
+    and be picked up by frontend spam queries.
     """
-    try:
-        jsonschema.validate(event_data, EVENT_PAYLOAD_SCHEMA)
-    except jsonschema.exceptions.ValidationError:
-        try:
-            jsonschema.validate(event_data, LEGACY_EVENT_PAYLOAD_SCHEMA)
-        except jsonschema.exceptions.ValidationError:
-            metrics.incr("feedback.create_feedback_issue.invalid_schema")
-            raise
+    if features.has("organizations:user-feedback-spam-filter-actions", project.organization):
+        metrics.incr("feedback.spam-detection-actions.set-ignored")
+        produce_occurrence_to_kafka(
+            payload_type=PayloadType.STATUS_CHANGE,
+            status_change=StatusChangeMessage(
+                fingerprint=issue_fingerprint,
+                project_id=project.id,
+                new_status=GroupStatus.IGNORED,  # we use ignored in the UI for the spam tab
+                new_substatus=GroupSubStatus.FOREVER,
+            ),
+        )
+
+
+###########
+# Shim code
+###########
 
 
 class UserReportShimDict(TypedDict):
@@ -390,19 +444,5 @@ def shim_to_feedback(
         metrics.incr("feedback.shim_to_feedback.failed", tags={"referrer": source.value})
 
 
-def auto_ignore_spam_feedbacks(project, issue_fingerprint):
-    if features.has("organizations:user-feedback-spam-filter-actions", project.organization):
-        metrics.incr("feedback.spam-detection-actions.set-ignored")
-        produce_occurrence_to_kafka(
-            payload_type=PayloadType.STATUS_CHANGE,
-            status_change=StatusChangeMessage(
-                fingerprint=issue_fingerprint,
-                project_id=project.id,
-                new_status=GroupStatus.IGNORED,  # we use ignored in the UI for the spam tab
-                new_substatus=GroupSubStatus.FOREVER,
-            ),
-        )
-
-
 def is_in_feedback_denylist(organization):
     return organization.slug in options.get("feedback.organizations.slug-denylist")
diff --git a/src/sentry/feedback/usecases/spam_detection.py b/src/sentry/feedback/usecases/spam_detection.py
@@ -1,6 +1,8 @@
 import logging
 
+from sentry import features
 from sentry.llm.usecases import LLMUseCase, complete_prompt
+from sentry.models.project import Project
 from sentry.utils import metrics
 
 logger = logging.getLogger(__name__)
@@ -68,3 +70,9 @@ def trim_response(text):
         return True, trimmed_text
     else:
         return False, trimmed_text
+
+
+def spam_detection_enabled(project: Project) -> bool:
+    return features.has(
+        "organizations:user-feedback-spam-filter-ingest", project.organization
+    ) and project.get_option("sentry:feedback_ai_spam_detection")
diff --git a/src/sentry/ingest/userreport.py b/src/sentry/ingest/userreport.py
@@ -3,6 +3,7 @@
 import logging
 from datetime import timedelta
 
+import sentry_sdk
 from django.db import IntegrityError, router
 from django.utils import timezone
 
@@ -39,6 +40,28 @@ def save_userreport(
         if should_filter_user_report(report["comments"]):
             return
 
+        max_comment_length = UserReport._meta.get_field("comments").max_length
+        if max_comment_length and len(report["comments"]) > max_comment_length:
+            metrics.distribution(
+                "feedback.large_message",
+                len(report["comments"]),
+                tags={
+                    "entrypoint": "save_userreport",
+                    "referrer": source.value,
+                },
+            )
+            logger.info(
+                "Feedback message exceeds max size.",
+                extra={
+                    "project_id": project.id,
+                    "entrypoint": "save_userreport",
+                    "referrer": source.value,
+                },
+            )
+            # Sentry will capture `feedback_message` in local variables (truncated).
+            sentry_sdk.capture_message("Feedback message exceeds max size.", "warning")
+            report["comments"] = report["comments"][:max_comment_length]
+
         if start_time is None:
             start_time = timezone.now()
 

diff --git a/src/sentry/migrations/0778_userreport_comments_max_length.py b/src/sentry/migrations/0778_userreport_comments_max_length.py
@@ -0,0 +1,33 @@
+# Generated by Django 5.1.1 on 2024-10-17 23:33
+
+from django.db import migrations, models
+
+from sentry.new_migrations.migrations import CheckedMigration
+
+
+class Migration(CheckedMigration):
+    # This flag is used to mark that a migration shouldn't be automatically run in production.
+    # This should only be used for operations where it's safe to run the migration after your
+    # code has deployed. So this should not be used for most operations that alter the schema
+    # of a table.
+    # Here are some things that make sense to mark as post deployment:
+    # - Large data migrations. Typically we want these to be run manually so that they can be
+    #   monitored and not block the deploy for a long period of time while they run.
+    # - Adding indexes to large tables. Since this can take a long time, we'd generally prefer to
+    #   run this outside deployments so that we don't block them. Note that while adding an index
+    #   is a schema change, it's completely safe to run the operation after the code has deployed.
+    # Once deployed, run these manually via: https://develop.sentry.dev/database-migrations/#migration-deployment
+
+    is_post_deployment = False
+
+    dependencies = [
+        ("sentry", "0777_add_related_name_to_dashboard_permissions"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="userreport",
+            name="comments",
+            field=models.TextField(max_length=4096),
+        ),
+    ]
diff --git a/src/sentry/models/userreport.py b/src/sentry/models/userreport.py
@@ -15,7 +15,9 @@ class UserReport(Model):
     environment_id = BoundedBigIntegerField(null=True, db_index=True)
     name = models.CharField(max_length=128)
     email = models.EmailField(max_length=75)
-    comments = models.TextField()
+    comments = models.TextField(
+        max_length=4096
+    )  # Keep max_length <= "feedback.message.max-size" sentry option.
     date_added = models.DateTimeField(default=timezone.now, db_index=True)
 
     class Meta:

diff --git a/src/sentry/options/defaults.py b/src/sentry/options/defaults.py
@@ -487,6 +487,12 @@
     default=[],
     flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE,
 )
+register(
+    "feedback.message.max-size",
+    type=Int,
+    default=4096,
+    flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE,
+)
 
 # Dev Toolbar Options
 register(

diff --git a/src/sentry/web/frontend/error_page_embed.py b/src/sentry/web/frontend/error_page_embed.py
@@ -58,14 +58,16 @@
 
 class UserReportForm(forms.ModelForm):
     name = forms.CharField(
-        max_length=128, widget=forms.TextInput(attrs={"placeholder": _("Jane Bloggs")})
+        max_length=UserReport._meta.get_field("name").max_length,
+        widget=forms.TextInput(attrs={"placeholder": _("Jane Bloggs")}),
     )
     email = forms.EmailField(
-        max_length=75,
+        max_length=UserReport._meta.get_field("email").max_length,
         widget=forms.TextInput(attrs={"placeholder": _("[email protected]"), "type": "email"}),
     )
     comments = forms.CharField(
-        widget=forms.Textarea(attrs={"placeholder": _("I clicked on 'X' and then hit 'Confirm'")})
+        max_length=UserReport._meta.get_field("comments").max_length,
+        widget=forms.Textarea(attrs={"placeholder": _("I clicked on 'X' and then hit 'Confirm'")}),
     )
 
     class Meta: