Skip to content

Commit

Permalink
fix(feedback): enforce a max message size from all sources (#79326)
Browse files Browse the repository at this point in the history
Closes #76298
Closes [SENTRY-3B86](https://sentry.sentry.io/issues/5552524761/)

Decided on a limit of 4096, generously below the LLM request, postgres,
and kafka size limits described in the ticket. Messages that are too
large will be truncated (or rejected, for crash report modal) and skip
spam detection, auto-marking as spam.

Includes a small refactor of `create_feedback.py`, moving stuff around
and commenting a bit. ~~Renamed `auto_ignore_spam_feedback` to
`set_feedback_ignored`.~~
  • Loading branch information
aliu39 authored Oct 22, 2024
1 parent d3db769 commit 477e69f
Show file tree
Hide file tree
Showing 11 changed files with 256 additions and 52 deletions.
2 changes: 1 addition & 1 deletion migrations_lockfile.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ hybridcloud: 0016_add_control_cacheversion
nodestore: 0002_nodestore_no_dictfield
remote_subscriptions: 0003_drop_remote_subscription
replays: 0004_index_together
sentry: 0777_add_related_name_to_dashboard_permissions
sentry: 0778_userreport_comments_max_length
social_auth: 0002_default_auto_field
uptime: 0017_unique_on_timeout
workflow_engine: 0010_detector_state_unique_group
122 changes: 81 additions & 41 deletions src/sentry/feedback/usecases/create_feedback.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
from uuid import uuid4

import jsonschema
import sentry_sdk

from sentry import features, options
from sentry.constants import DataCategory
from sentry.eventstore.models import Event, GroupEvent
from sentry.feedback.usecases.spam_detection import is_spam
from sentry.feedback.usecases.spam_detection import is_spam, spam_detection_enabled
from sentry.issues.grouptype import FeedbackGroup
from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
from sentry.issues.json_schemas import EVENT_PAYLOAD_SCHEMA, LEGACY_EVENT_PAYLOAD_SCHEMA
Expand Down Expand Up @@ -141,7 +142,6 @@ def fix_for_issue_platform(event_data):
# If no user email was provided specify the contact-email as the user-email.
feedback_obj = event_data.get("contexts", {}).get("feedback", {})
contact_email = feedback_obj.get("contact_email")

if not ret_event["user"].get("email", ""):
ret_event["user"]["email"] = contact_email

Expand All @@ -168,6 +168,21 @@ def fix_for_issue_platform(event_data):
return ret_event


def validate_issue_platform_event_schema(event_data):
"""
The issue platform schema validation does not run in dev atm so we have to do the validation
ourselves, or else our tests are not representative of what happens in prod.
"""
try:
jsonschema.validate(event_data, EVENT_PAYLOAD_SCHEMA)
except jsonschema.exceptions.ValidationError:
try:
jsonschema.validate(event_data, LEGACY_EVENT_PAYLOAD_SCHEMA)
except jsonschema.exceptions.ValidationError:
metrics.incr("feedback.create_feedback_issue.invalid_schema")
raise


def should_filter_feedback(event, project_id, source: FeedbackCreationSource):
# Right now all unreal error events without a feedback
# actually get a sent a feedback with this message
Expand Down Expand Up @@ -222,26 +237,53 @@ def create_feedback_issue(event, project_id: int, source: FeedbackCreationSource
if should_filter_feedback(event, project_id, source):
return

feedback_message = event["contexts"]["feedback"]["message"]
max_msg_size = options.get("feedback.message.max-size") # Note options are cached.
project = Project.objects.get_from_cache(id=project_id)

# Spam detection.
is_message_spam = None
if features.has(
"organizations:user-feedback-spam-filter-ingest", project.organization
) and project.get_option("sentry:feedback_ai_spam_detection"):
try:
is_message_spam = is_spam(event["contexts"]["feedback"]["message"])
except Exception:
# until we have LLM error types ironed out, just catch all exceptions
logger.exception("Error checking if message is spam", extra={"project_id": project_id})
metrics.incr(
"feedback.create_feedback_issue.spam_detection",
if spam_detection_enabled(project):
if len(feedback_message) <= max_msg_size:
try:
is_message_spam = is_spam(feedback_message)
except Exception:
# until we have LLM error types ironed out, just catch all exceptions
logger.exception(
"Error checking if message is spam", extra={"project_id": project_id}
)
metrics.incr(
"feedback.create_feedback_issue.spam_detection",
tags={
"is_spam": is_message_spam,
"referrer": source.value,
"client_source": event["contexts"]["feedback"].get("source"),
},
sample_rate=1.0,
)
else:
is_message_spam = True

if len(feedback_message) > max_msg_size:
metrics.distribution(
"feedback.large_message",
len(feedback_message),
tags={
"is_spam": is_message_spam,
"entrypoint": "create_feedback_issue",
"referrer": source.value,
},
)
logger.info(
"Feedback message exceeds max size.",
extra={
"project_id": project_id,
"entrypoint": "create_feedback_issue",
"referrer": source.value,
"client_source": event["contexts"]["feedback"].get("source"),
},
sample_rate=1.0,
)
# Sentry will capture `feedback_message` in local variables (truncated).
sentry_sdk.capture_message("Feedback message exceeds max size.", "warning")
feedback_message = feedback_message[:max_msg_size]

# Note that some of the fields below like title and subtitle
# are not used by the feedback UI, but are required.
Expand All @@ -257,7 +299,7 @@ def create_feedback_issue(event, project_id: int, source: FeedbackCreationSource
project_id=project_id,
fingerprint=issue_fingerprint, # random UUID for fingerprint so feedbacks are grouped individually
issue_title="User Feedback",
subtitle=event["contexts"]["feedback"]["message"],
subtitle=feedback_message,
resource_id=None,
evidence_data=evidence_data,
evidence_display=evidence_display,
Expand All @@ -279,6 +321,7 @@ def create_feedback_issue(event, project_id: int, source: FeedbackCreationSource
# make sure event data is valid for issue platform
validate_issue_platform_event_schema(event_fixed)

# Analytics
if not project.flags.has_feedbacks:
first_feedback_received.send_robust(project=project, sender=Project)

Expand All @@ -291,9 +334,11 @@ def create_feedback_issue(event, project_id: int, source: FeedbackCreationSource
):
first_new_feedback_received.send_robust(project=project, sender=Project)

# Send to issue platform for processing.
produce_occurrence_to_kafka(
payload_type=PayloadType.OCCURRENCE, occurrence=occurrence, event_data=event_fixed
)
# Mark as spam. We need this since IP doesn't currently support an initial status of IGNORED.
if is_message_spam:
auto_ignore_spam_feedbacks(project, issue_fingerprint)
metrics.incr(
Expand All @@ -304,6 +349,7 @@ def create_feedback_issue(event, project_id: int, source: FeedbackCreationSource
},
sample_rate=1.0,
)

track_outcome(
org_id=project.organization_id,
project_id=project_id,
Expand All @@ -317,19 +363,27 @@ def create_feedback_issue(event, project_id: int, source: FeedbackCreationSource
)


def validate_issue_platform_event_schema(event_data):
def auto_ignore_spam_feedbacks(project, issue_fingerprint):
"""
The issue platform schema validation does not run in dev atm so we have to do the validation
ourselves, or else our tests are not representative of what happens in prod.
Marks an issue as spam with a STATUS_CHANGE kafka message. The IGNORED status allows the occurrence to skip alerts
and be picked up by frontend spam queries.
"""
try:
jsonschema.validate(event_data, EVENT_PAYLOAD_SCHEMA)
except jsonschema.exceptions.ValidationError:
try:
jsonschema.validate(event_data, LEGACY_EVENT_PAYLOAD_SCHEMA)
except jsonschema.exceptions.ValidationError:
metrics.incr("feedback.create_feedback_issue.invalid_schema")
raise
if features.has("organizations:user-feedback-spam-filter-actions", project.organization):
metrics.incr("feedback.spam-detection-actions.set-ignored")
produce_occurrence_to_kafka(
payload_type=PayloadType.STATUS_CHANGE,
status_change=StatusChangeMessage(
fingerprint=issue_fingerprint,
project_id=project.id,
new_status=GroupStatus.IGNORED, # we use ignored in the UI for the spam tab
new_substatus=GroupSubStatus.FOREVER,
),
)


###########
# Shim code
###########


class UserReportShimDict(TypedDict):
Expand Down Expand Up @@ -390,19 +444,5 @@ def shim_to_feedback(
metrics.incr("feedback.shim_to_feedback.failed", tags={"referrer": source.value})


def auto_ignore_spam_feedbacks(project, issue_fingerprint):
if features.has("organizations:user-feedback-spam-filter-actions", project.organization):
metrics.incr("feedback.spam-detection-actions.set-ignored")
produce_occurrence_to_kafka(
payload_type=PayloadType.STATUS_CHANGE,
status_change=StatusChangeMessage(
fingerprint=issue_fingerprint,
project_id=project.id,
new_status=GroupStatus.IGNORED, # we use ignored in the UI for the spam tab
new_substatus=GroupSubStatus.FOREVER,
),
)


def is_in_feedback_denylist(organization):
return organization.slug in options.get("feedback.organizations.slug-denylist")
8 changes: 8 additions & 0 deletions src/sentry/feedback/usecases/spam_detection.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import logging

from sentry import features
from sentry.llm.usecases import LLMUseCase, complete_prompt
from sentry.models.project import Project
from sentry.utils import metrics

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -68,3 +70,9 @@ def trim_response(text):
return True, trimmed_text
else:
return False, trimmed_text


def spam_detection_enabled(project: Project) -> bool:
return features.has(
"organizations:user-feedback-spam-filter-ingest", project.organization
) and project.get_option("sentry:feedback_ai_spam_detection")
23 changes: 23 additions & 0 deletions src/sentry/ingest/userreport.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
from datetime import timedelta

import sentry_sdk
from django.db import IntegrityError, router
from django.utils import timezone

Expand Down Expand Up @@ -39,6 +40,28 @@ def save_userreport(
if should_filter_user_report(report["comments"]):
return

max_comment_length = UserReport._meta.get_field("comments").max_length
if max_comment_length and len(report["comments"]) > max_comment_length:
metrics.distribution(
"feedback.large_message",
len(report["comments"]),
tags={
"entrypoint": "save_userreport",
"referrer": source.value,
},
)
logger.info(
"Feedback message exceeds max size.",
extra={
"project_id": project.id,
"entrypoint": "save_userreport",
"referrer": source.value,
},
)
# Sentry will capture `feedback_message` in local variables (truncated).
sentry_sdk.capture_message("Feedback message exceeds max size.", "warning")
report["comments"] = report["comments"][:max_comment_length]

if start_time is None:
start_time = timezone.now()

Expand Down
33 changes: 33 additions & 0 deletions src/sentry/migrations/0778_userreport_comments_max_length.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Generated by Django 5.1.1 on 2024-10-17 23:33

from django.db import migrations, models

from sentry.new_migrations.migrations import CheckedMigration


class Migration(CheckedMigration):
# This flag is used to mark that a migration shouldn't be automatically run in production.
# This should only be used for operations where it's safe to run the migration after your
# code has deployed. So this should not be used for most operations that alter the schema
# of a table.
# Here are some things that make sense to mark as post deployment:
# - Large data migrations. Typically we want these to be run manually so that they can be
# monitored and not block the deploy for a long period of time while they run.
# - Adding indexes to large tables. Since this can take a long time, we'd generally prefer to
# run this outside deployments so that we don't block them. Note that while adding an index
# is a schema change, it's completely safe to run the operation after the code has deployed.
# Once deployed, run these manually via: https://develop.sentry.dev/database-migrations/#migration-deployment

is_post_deployment = False

dependencies = [
("sentry", "0777_add_related_name_to_dashboard_permissions"),
]

operations = [
migrations.AlterField(
model_name="userreport",
name="comments",
field=models.TextField(max_length=4096),
),
]
4 changes: 3 additions & 1 deletion src/sentry/models/userreport.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ class UserReport(Model):
environment_id = BoundedBigIntegerField(null=True, db_index=True)
name = models.CharField(max_length=128)
email = models.EmailField(max_length=75)
comments = models.TextField()
comments = models.TextField(
max_length=4096
) # Keep max_length <= "feedback.message.max-size" sentry option.
date_added = models.DateTimeField(default=timezone.now, db_index=True)

class Meta:
Expand Down
6 changes: 6 additions & 0 deletions src/sentry/options/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,6 +487,12 @@
default=[],
flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE,
)
register(
"feedback.message.max-size",
type=Int,
default=4096,
flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE,
)

# Dev Toolbar Options
register(
Expand Down
8 changes: 5 additions & 3 deletions src/sentry/web/frontend/error_page_embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,16 @@

class UserReportForm(forms.ModelForm):
name = forms.CharField(
max_length=128, widget=forms.TextInput(attrs={"placeholder": _("Jane Bloggs")})
max_length=UserReport._meta.get_field("name").max_length,
widget=forms.TextInput(attrs={"placeholder": _("Jane Bloggs")}),
)
email = forms.EmailField(
max_length=75,
max_length=UserReport._meta.get_field("email").max_length,
widget=forms.TextInput(attrs={"placeholder": _("[email protected]"), "type": "email"}),
)
comments = forms.CharField(
widget=forms.Textarea(attrs={"placeholder": _("I clicked on 'X' and then hit 'Confirm'")})
max_length=UserReport._meta.get_field("comments").max_length,
widget=forms.Textarea(attrs={"placeholder": _("I clicked on 'X' and then hit 'Confirm'")}),
)

class Meta:
Expand Down
Loading

0 comments on commit 477e69f

Please sign in to comment.