diff --git a/Orange/data/io_base.py b/Orange/data/io_base.py index 71ae66295e9..52d006de545 100644 --- a/Orange/data/io_base.py +++ b/Orange/data/io_base.py @@ -19,7 +19,7 @@ from Orange.data import Table, Domain, Variable, DiscreteVariable, \ StringVariable, ContinuousVariable, TimeVariable from Orange.data.io_util import Compression, open_compressed, \ - isnastr, guess_data_type, sanitize_variable + isnastr, guess_data_type, sanitize_variable, natural_sorted from Orange.data.util import get_unique_names_duplicates from Orange.data.variable import VariableMeta from Orange.util import Registry, flatten, namegen @@ -278,7 +278,7 @@ def _disc_column(data: np.ndarray, col: int) -> \ def _disc_no_vals_column(data: np.ndarray, col: int, **_) -> \ _ColumnProperties: vals, coltype = _TableBuilder._disc_column(data, col) - return _ColumnProperties(valuemap=sorted(set(vals) - {""}), + return _ColumnProperties(valuemap=natural_sorted(set(vals) - {""}), values=vals, coltype=coltype, orig_values=vals) diff --git a/Orange/data/io_util.py b/Orange/data/io_util.py index f79cbda04e0..5fdee215f91 100644 --- a/Orange/data/io_util.py +++ b/Orange/data/io_util.py @@ -1,5 +1,7 @@ import subprocess +import re from collections import defaultdict +from typing import List import numpy as np from chardet.universaldetector import UniversalDetector @@ -111,6 +113,36 @@ def isnastr(arr, out=None): return __isnastr(arr, out=out) +def natural_sorted(values: List) -> List: + """ + Sort values with natural sort or human order - [sth1, sth2, sth10] or + [1, 2, 10] + + Parameters + ---------- + values + List with values to sort + + Returns + ------- + List with sorted values + """ + def atoi(text): + return int(text) if text.isdigit() else text + + def natural_keys(element): + """ + alist.sort(key=natural_keys) or sorted(alist, key=natural_keys) sorts + in human order + """ + if isinstance(element, (str, bytes)): + return [atoi(c) for c in re.split(r'(\d+)', element)] + else: + return element + + return sorted(values, key=natural_keys) + + def guess_data_type(orig_values, namask=None): """ Use heuristics to guess data type. @@ -121,7 +153,7 @@ def guess_data_type(orig_values, namask=None): if namask is None: namask = isnastr(orig_values) if is_discrete: - valuemap = sorted(is_discrete) + valuemap = natural_sorted(is_discrete) coltype = DiscreteVariable else: # try to parse as float diff --git a/Orange/data/tests/test_io.py b/Orange/data/tests/test_io.py index b4653f5401d..ecf0fbbe709 100644 --- a/Orange/data/tests/test_io.py +++ b/Orange/data/tests/test_io.py @@ -3,7 +3,7 @@ from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable, \ TimeVariable -from Orange.data.io_util import guess_data_type +from Orange.data.io_util import guess_data_type, natural_sorted class TestTableFilters(unittest.TestCase): @@ -42,7 +42,7 @@ def test_guess_data_type_discrete(self): in_values = list(map(lambda x: str(x) + "a", range(24))) + ["a"] * 76 valuemap, values, coltype = guess_data_type(in_values) self.assertEqual(DiscreteVariable, coltype) - self.assertEqual(sorted(set(in_values)), valuemap) + self.assertEqual(natural_sorted(set(in_values)), valuemap) np.testing.assert_array_equal(in_values, values) def test_guess_data_type_string(self): @@ -93,3 +93,54 @@ def test_guess_data_type_time(self): valuemap, _, coltype = guess_data_type(in_values) self.assertEqual(TimeVariable, coltype) self.assertIsNone(valuemap) + + def test_guess_data_type_values_order(self): + """ + Test if values are ordered naturally + """ + in_values = [ + "something1", "something12", "something2", "something1", + "something20", "something1", "something2", "something12", + "something1", "something12" + ] + res = ["something1", "something2", "something12", "something20"] + valuemap, _, coltype = guess_data_type(in_values) + self.assertEqual(DiscreteVariable, coltype) + self.assertListEqual(res, valuemap) + + +class TestUtils(unittest.TestCase): + + def test_natural_sorted(self): + data = [ + "something1", + "something20", + "something2", + "something12" + ] + res = [ + "something1", + "something2", + "something12", + "something20" + ] + self.assertListEqual(res, natural_sorted(data)) + + def test_natural_sorted_text(self): + data = ["b", "aa", "c", "dd"] + res = ["aa", "b", "c", "dd"] + self.assertListEqual(res, natural_sorted(data)) + + def test_natural_sorted_numbers_str(self): + data = ["1", "20", "2", "12"] + res = ["1", "2", "12", "20"] + self.assertListEqual(res, natural_sorted(data)) + + def test_natural_sorted_numbers(self): + data = [1, 20, 2, 12] + res = [1, 2, 12, 20] + self.assertListEqual(res, natural_sorted(data)) + + +if __name__ == "__main__": + unittest.main() diff --git a/Orange/widgets/utils/domaineditor.py b/Orange/widgets/utils/domaineditor.py index 1b7a47cb7dd..f8bbc86c444 100644 --- a/Orange/widgets/utils/domaineditor.py +++ b/Orange/widgets/utils/domaineditor.py @@ -10,6 +10,7 @@ from Orange.data import DiscreteVariable, ContinuousVariable, StringVariable, \ TimeVariable, Domain +from Orange.data.io_util import natural_sorted from Orange.data.util import get_unique_names_duplicates from Orange.statistics.util import unique from Orange.widgets import gui @@ -326,7 +327,10 @@ def numbers_are_round(var, col_data): elif tpe == type(orig_var): var = orig_var.copy(name=new_name) elif tpe == DiscreteVariable: - values = list(str(i) for i in unique(col_data) if not self._is_missing(i)) + values = natural_sorted( + list(str(i) for i in unique(col_data) + if not self._is_missing(i)) + ) round_numbers = numbers_are_round(orig_var, col_data) col_data = [np.nan if self._is_missing(x) else values.index(str(x)) for x in self._iter_vals(col_data)]