From c46bf440a48089691f30831fb6a72692a61760c2 Mon Sep 17 00:00:00 2001 From: Primoz Godec Date: Fri, 17 Jun 2022 15:00:57 +0200 Subject: [PATCH 1/2] Embedders - temporarily fix get_proxy function --- orangecontrib/text/__init__.py | 37 +++++++++++++++++++ .../text/tests/test_documentembedder.py | 10 +++++ 2 files changed, 47 insertions(+) diff --git a/orangecontrib/text/__init__.py b/orangecontrib/text/__init__.py index a402a5283..3a5e1bf5d 100644 --- a/orangecontrib/text/__init__.py +++ b/orangecontrib/text/__init__.py @@ -1,5 +1,42 @@ # Set where NLTK data is downloaded import os + +# temporary solution - remove when Orange 3.33 is released +# it must be imported before nltk_data_dir +from typing import Optional, Dict +from Orange.misc.utils import embedder_utils + + +def _get_proxies() -> Optional[Dict[str, str]]: + """ + Return dict with proxy addresses if they exist. + Returns + ------- + proxy_dict + Dictionary with format {proxy type: proxy address} or None if + they not set. + """ + def add_scheme(url: Optional[str]) -> Optional[str]: + if url is not None and "://" not in url: + # if no scheme default to http - as other libraries do (e.g. requests) + return f"http://{url}" + else: + return url + + http_proxy = add_scheme(os.environ.get("http_proxy")) + https_proxy = add_scheme(os.environ.get("https_proxy")) + proxy_dict = {} + if http_proxy: + proxy_dict["http://"] = http_proxy + if https_proxy: + proxy_dict["https://"] = https_proxy + return proxy_dict if proxy_dict else None + + +embedder_utils.get_proxies = _get_proxies +# remove to here + + from orangecontrib.text.misc import nltk_data_dir os.environ['NLTK_DATA'] = nltk_data_dir() diff --git a/orangecontrib/text/tests/test_documentembedder.py b/orangecontrib/text/tests/test_documentembedder.py index f77527368..eff07d503 100644 --- a/orangecontrib/text/tests/test_documentembedder.py +++ b/orangecontrib/text/tests/test_documentembedder.py @@ -144,6 +144,16 @@ def test_invalid_parameters(self): with self.assertRaises(ValueError): self.embedder = DocumentEmbedder(aggregator='average') + def test_remove_temporary_proxy_solution(self): + """ + When it starts to fail: + - remove this test + - remove temporary implementation of get_proxy() function in text.__inint__ + - set minimum version of Orange on 3.33 + """ + import Orange + self.assertGreater("3.34.0", Orange.__version__) + if __name__ == "__main__": unittest.main() From b2b3e4dde15181b68d984fd639fd32f37d2b1236 Mon Sep 17 00:00:00 2001 From: Primoz Godec Date: Fri, 17 Jun 2022 15:47:22 +0200 Subject: [PATCH 2/2] NLTK proxy - add port if not present --- orangecontrib/text/misc/nltk_data_download.py | 45 +++++++++++++++---- .../text/tests/test_nltk_download.py | 36 +++++++++++++++ 2 files changed, 72 insertions(+), 9 deletions(-) create mode 100644 orangecontrib/text/tests/test_nltk_download.py diff --git a/orangecontrib/text/misc/nltk_data_download.py b/orangecontrib/text/misc/nltk_data_download.py index 033856fbf..781c5b8e9 100644 --- a/orangecontrib/text/misc/nltk_data_download.py +++ b/orangecontrib/text/misc/nltk_data_download.py @@ -4,6 +4,7 @@ import time from functools import wraps from threading import Thread +from urllib.parse import urlparse, ParseResult import nltk from Orange.misc.environ import data_dir_base @@ -38,18 +39,44 @@ def nltk_data_dir(): is_done_loading = False -def _download_nltk_data(): - global is_done_loading +# for any other potential scheme, it should be provided by user +DEFAULT_PORTS = { + "http": "80", + "https": "443", + "socks4": "1080", + "socks": "1080", + "quic": "443", +} - # set proxy if exist + +def _get_proxy_address(): + """ + Set proxy addresses for NLTK since NLTK do not use proxy addresses from + https_proxy environment variable + """ proxies = get_proxies() or {} - # use https if exists and others otherwise - for key in ("https://", "all://", "http://"): - if key in proxies: - log.debug(f"Using proxy for NLTK: {proxies[key]}") - nltk.set_proxy(proxies[key]) - break + # nltk uses https to download data + if "https://" in proxies: + proxy = urlparse(proxies['https://']) + log.debug(f"Using proxy for NLTK: {proxy}") + port = proxy.port or DEFAULT_PORTS.get(proxy.scheme) + url = ParseResult( + scheme=proxy.scheme, + netloc="{}:{}".format(proxy.hostname, port) if port else proxy.netloc, + path=proxy.path, + params=proxy.params, + query=proxy.query, + fragment=proxy.fragment + ).geturl() + return url + + +def _download_nltk_data(): + global is_done_loading + proxy_address = _get_proxy_address() + if proxy_address: + nltk.set_proxy(proxy_address) nltk.download(NLTK_DATA, download_dir=nltk_data_dir(), quiet=True) is_done_loading = True sys.stdout.flush() diff --git a/orangecontrib/text/tests/test_nltk_download.py b/orangecontrib/text/tests/test_nltk_download.py new file mode 100644 index 000000000..10ee68fe7 --- /dev/null +++ b/orangecontrib/text/tests/test_nltk_download.py @@ -0,0 +1,36 @@ +import os +import unittest +from orangecontrib.text.misc.nltk_data_download import _get_proxy_address + + +class TestNLTKDownload(unittest.TestCase): + def setUp(self) -> None: + self.previous_https = os.environ.get("https_proxy") + os.environ.pop("https_proxy", None) + + def tearDown(self) -> None: + os.environ.pop("https_proxy", None) + if self.previous_https is not None: + os.environ["https_proxy"] = self.previous_https + + def test_get_proxy_address(self): + self.assertIsNone(_get_proxy_address()) + + os.environ["https_proxy"] = "https://test.com" + self.assertEqual("https://test.com:443", _get_proxy_address()) + + os.environ["https_proxy"] = "https://test.com:12" + self.assertEqual("https://test.com:12", _get_proxy_address()) + + os.environ["https_proxy"] = "https://test.com/test" + self.assertEqual("https://test.com:443/test", _get_proxy_address()) + + os.environ["https_proxy"] = "https://test.com/test?a=2" + self.assertEqual("https://test.com:443/test?a=2", _get_proxy_address()) + + os.environ["https_proxy"] = "test.com/test?a=2" + self.assertEqual("http://test.com:80/test?a=2", _get_proxy_address()) + + +if __name__ == "__main__": + unittest.main()