Skip to content

Commit

Permalink
Detect tracking parameters in URLs
Browse files Browse the repository at this point in the history
  • Loading branch information
Famlam committed Sep 26, 2023
1 parent 6138f0d commit 307254b
Showing 1 changed file with 44 additions and 1 deletion.
45 changes: 44 additions & 1 deletion plugins/Website.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
from modules.OsmoseTranslation import T_
from plugins.Plugin import Plugin
from modules.Stablehash import stablehash64
from urllib.parse import urlsplit, urlunsplit
from modules.downloader import urlread
import json


class Website(Plugin):
Expand All @@ -39,15 +42,28 @@ def init(self, logger):
import re
# From RFC 1738 paragraph 2.1
self.HasScheme = re.compile(r"^[a-zA-Z0-9.+-]+://")
self.strippable_queryparameters = self._load_trackingparameter_list()

self.errors[30931] = self.def_class(item = 3093, level = 2, tags = ['value', 'fix:chair'],
title = T_('The URL contains a space'))
self.errors[30932] = self.def_class(item = 3093, level = 3, tags = ['value', 'fix:chair'],
title = T_('The URL does not have a valid scheme'))
self.errors[30933] = self.def_class(item = 3093, level = 3, tags = ['value', 'fix:chair'],
title = T_('Invalid URL'))
self.errors[30934] = self.def_class(item = 3093, level = 3, tags = ['value', 'fix:chair'],
title = T_('Tracking parameter in URL'),
fix = T_('Strip the tracking parameter from the URL. Verify that the URL still works afterwards.'),
resource = 'https://github.com/duckduckgo/privacy-configuration/blob/main/features/tracking-parameters.json')

def _bad_url(self, tag, tags):
    """Build the translated "Bad URL" message for the given tag.

    tag  -- key whose value is reported
    tags -- mapping in which ``tag`` is looked up
    """
    value = tags[tag]
    return T_("Bad URL {k}={v}", k=tag, v=value)

def _load_trackingparameter_list(self):
    """Return the tracking query parameters published by DuckDuckGo.

    The JSON document is fetched with ``urlread`` and the value stored
    under ``settings`` -> ``parameters`` is returned unchanged.
    """
    # Data available via CC BY-NC-SA 4.0 license from Duck Duck Go, Inc.
    source = "https://raw.githubusercontent.com/duckduckgo/privacy-configuration/main/features/tracking-parameters.json"
    # NOTE(review): the second argument (30) is presumably a cache delay
    # in days; confirm against modules.downloader.urlread.
    raw = urlread(source, 30)
    config = json.loads(raw)
    settings = config["settings"]
    return settings["parameters"]

def check(self, tags):
err = []
for tag in self.URL_TAGS:
Expand All @@ -66,9 +82,25 @@ def check(self, tags):
stripped = True

if self.HasScheme.match(url):
if stripped:
try:
parsed_url = urlsplit(url)
except ValueError as e:
err.append({"class": 30933, "subclass": stablehash64(tag), "text": T_('Bad URL in `{0}`: {1}', tag, str(e))})
continue
queryparams = parsed_url.query.split("&") # not parse_qs/parse_qsl because we don't want to change whether i.e. + is encoded in the fix
if any(map(lambda qs: qs.split("=")[0] in self.strippable_queryparameters, queryparams)):
stripped_query = '&'.join(list(filter(lambda qs: qs.split("=")[0] not in self.strippable_queryparameters, queryparams)))
parsed_url = parsed_url._replace(query = stripped_query)
err.append({
"class": 30934, "subclass": stablehash64(tag),
"text": T_('Tracking parameter in {0}', tag),
"fix": [{"~": {tag: urlunsplit(parsed_url)}}]
})
elif stripped:
err.append({"class": 30931, "fix": {tag: url}})
continue

# Scheme is missing
elif url.startswith('://'):
url = url[3:]
elif ':' in url or '//' in url:
Expand Down Expand Up @@ -112,6 +144,17 @@ def test(self):
self.assertEqual(err[0]["fix"][0]["website"], "https://{0}".format(test_url))
self.assertEqual(err[0]["fix"][1]["website"], "http://{0}".format(test_url))

# Ensure bad URLs that raise a ValueError in urlsplit are caught
self.check_err(p.node(None, {"website": "http://1111:2222:aaaa:bbb::1111]/"}))

# Detect and strip tracker parameters
err = p.node(None, {"website": "https://osmose.osmose/osmose?osmose=osmose&fb_source=abcdefghijkl&osmose2=test+%2Btest%20test&osmose3=&osmose4"})
self.check_err(err)
self.assertEqual(err[0]["fix"][0]["~"]["website"], "https://osmose.osmose/osmose?osmose=osmose&osmose2=test+%2Btest%20test&osmose3=&osmose4")
err = p.node(None, {"website": "https://osmose.osmose/osmose/osmose/?ga_campaign=abcdefghijkl#osmose"})
self.check_err(err)
self.assertEqual(err[0]["fix"][0]["~"]["website"], "https://osmose.osmose/osmose/osmose/#osmose")

# Verify we get no error for other correct URLs
for good in ("ftp://{0}".format(test_url),
"http://{0}".format(test_url),
Expand Down

0 comments on commit 307254b

Please sign in to comment.