From 307254b63df523f1ac25b7aba64893bc78193984 Mon Sep 17 00:00:00 2001 From: Famlam Date: Fri, 25 Aug 2023 22:21:58 +0200 Subject: [PATCH] Detect tracking parameters in URLs --- plugins/Website.py | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/plugins/Website.py b/plugins/Website.py index 3e4618ba7..ae7f94598 100644 --- a/plugins/Website.py +++ b/plugins/Website.py @@ -22,6 +22,9 @@ from modules.OsmoseTranslation import T_ from plugins.Plugin import Plugin from modules.Stablehash import stablehash64 +from urllib.parse import urlsplit, urlunsplit +from modules.downloader import urlread +import json class Website(Plugin): @@ -39,15 +42,28 @@ def init(self, logger): import re # From RFC 1738 paragraph 2.1 self.HasScheme = re.compile(r"^[a-zA-Z0-9.+-]+://") + self.strippable_queryparameters = self._load_trackingparameter_list() self.errors[30931] = self.def_class(item = 3093, level = 2, tags = ['value', 'fix:chair'], title = T_('The URL contains a space')) self.errors[30932] = self.def_class(item = 3093, level = 3, tags = ['value', 'fix:chair'], title = T_('The URL does not have a valid scheme')) + self.errors[30933] = self.def_class(item = 3093, level = 3, tags = ['value', 'fix:chair'], + title = T_('Invalid URL')) + self.errors[30934] = self.def_class(item = 3093, level = 3, tags = ['value', 'fix:chair'], + title = T_('Tracking parameter in URL'), + fix = T_('Strip the tracking parameter from the URL. Verify that the URL still works afterwards.'), + resource = 'https://github.com/duckduckgo/privacy-configuration/blob/main/features/tracking-parameters.json') def _bad_url(self, tag, tags): return T_("Bad URL {k}={v}", k = tag, v = tags[tag]) + def _load_trackingparameter_list(self): + # Available via CC BY-NC-SA 4.0 license from Duck Duck Go, Inc. 
+ url = "https://raw.githubusercontent.com/duckduckgo/privacy-configuration/main/features/tracking-parameters.json" + json_full = json.loads(urlread(url, 30)) + return json_full["settings"]["parameters"] + def check(self, tags): err = [] for tag in self.URL_TAGS: @@ -66,9 +82,25 @@ def check(self, tags): stripped = True if self.HasScheme.match(url): - if stripped: + try: + parsed_url = urlsplit(url) + except ValueError as e: + err.append({"class": 30933, "subclass": stablehash64(tag), "text": T_('Bad URL in `{0}`: {1}', tag, str(e))}) + continue + queryparams = parsed_url.query.split("&") # not parse_qs/parse_qsl because we don't want to change whether e.g. + is encoded in the fix + if any(map(lambda qs: qs.split("=")[0] in self.strippable_queryparameters, queryparams)): + stripped_query = '&'.join(list(filter(lambda qs: qs.split("=")[0] not in self.strippable_queryparameters, queryparams))) + parsed_url = parsed_url._replace(query = stripped_query) + err.append({ + "class": 30934, "subclass": stablehash64(tag), + "text": T_('Tracking parameter in {0}', tag), + "fix": [{"~": {tag: urlunsplit(parsed_url)}}] + }) + elif stripped: err.append({"class": 30931, "fix": {tag: url}}) continue + + # Scheme is missing elif url.startswith('://'): url = url[3:] elif ':' in url or '//' in url: @@ -112,6 +144,17 @@ def test(self): self.assertEqual(err[0]["fix"][0]["website"], "https://{0}".format(test_url)) self.assertEqual(err[0]["fix"][1]["website"], "http://{0}".format(test_url)) + # Ensure bad URLs that raise a ValueError in urlsplit are caught + self.check_err(p.node(None, {"website": "http://1111:2222:aaaa:bbb::1111]/"})) + + # Detect and strip tracker parameters + err = p.node(None, {"website": "https://osmose.osmose/osmose?osmose=osmose&fb_source=abcdefghijkl&osmose2=test+%2Btest%20test&osmose3=&osmose4"}) + self.check_err(err) + self.assertEqual(err[0]["fix"][0]["~"]["website"], "https://osmose.osmose/osmose?osmose=osmose&osmose2=test+%2Btest%20test&osmose3=&osmose4") + 
err = p.node(None, {"website": "https://osmose.osmose/osmose/osmose/?ga_campaign=abcdefghijkl#osmose"}) + self.check_err(err) + self.assertEqual(err[0]["fix"][0]["~"]["website"], "https://osmose.osmose/osmose/osmose/#osmose") + # Verify we get no error for other correct URLs for good in ("ftp://{0}".format(test_url), "http://{0}".format(test_url),