Skip to content

Commit

Permalink
Sort values naturally when reading files
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed May 19, 2020
1 parent 56a106f commit 626b4d3
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 6 deletions.
4 changes: 2 additions & 2 deletions Orange/data/io_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from Orange.data import Table, Domain, Variable, DiscreteVariable, \
StringVariable, ContinuousVariable, TimeVariable
from Orange.data.io_util import Compression, open_compressed, \
isnastr, guess_data_type, sanitize_variable
isnastr, guess_data_type, sanitize_variable, natural_sorted
from Orange.data.util import get_unique_names_duplicates
from Orange.data.variable import VariableMeta
from Orange.util import Registry, flatten, namegen
Expand Down Expand Up @@ -278,7 +278,7 @@ def _disc_column(data: np.ndarray, col: int) -> \
def _disc_no_vals_column(data: np.ndarray, col: int, **_) -> \
_ColumnProperties:
vals, coltype = _TableBuilder._disc_column(data, col)
return _ColumnProperties(valuemap=sorted(set(vals) - {""}),
return _ColumnProperties(valuemap=natural_sorted(set(vals) - {""}),
values=vals, coltype=coltype,
orig_values=vals)

Expand Down
34 changes: 33 additions & 1 deletion Orange/data/io_util.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import subprocess
import re
from collections import defaultdict
from typing import List

import numpy as np
from chardet.universaldetector import UniversalDetector
Expand Down Expand Up @@ -111,6 +113,36 @@ def isnastr(arr, out=None):
return __isnastr(arr, out=out)


def natural_sorted(values: List) -> List:
    """
    Sort values with natural sort or human order - [sth1, sth2, sth10] or
    [1, 2, 10]

    Runs of decimal digits inside ``str`` and ``bytes`` elements are compared
    by their numeric value; everything else is compared as text. Elements of
    any other type are compared as they are.

    Parameters
    ----------
    values
        List (or any other iterable, e.g. a set) with values to sort

    Returns
    -------
    List with sorted values
    """
    def natural_keys(element):
        """
        alist.sort(key=natural_keys) or sorted(alist, key=natural_keys) sorts
        in human order
        """
        # the pattern type must match the element type, otherwise re raises
        # TypeError
        if isinstance(element, bytes):
            parts = re.split(rb'(\d+)', element)
        elif isinstance(element, str):
            parts = re.split(r'(\d+)', element)
        else:
            return element
        # Splitting on a capturing group yields text, digits, text, ... so
        # the digit runs sit exactly at the odd positions. Converting by
        # position (rather than by str.isdigit) keeps text parts such as "²"
        # - which is a digit for isdigit but not for int - untouched and
        # guarantees that keys are always compared text-to-text and
        # int-to-int.
        return [int(part) if index % 2 else part
                for index, part in enumerate(parts)]

    return sorted(values, key=natural_keys)


def guess_data_type(orig_values, namask=None):
"""
Use heuristics to guess data type.
Expand All @@ -121,7 +153,7 @@ def guess_data_type(orig_values, namask=None):
if namask is None:
namask = isnastr(orig_values)
if is_discrete:
valuemap = sorted(is_discrete)
valuemap = natural_sorted(is_discrete)
coltype = DiscreteVariable
else:
# try to parse as float
Expand Down
55 changes: 53 additions & 2 deletions Orange/data/tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable, \
TimeVariable
from Orange.data.io_util import guess_data_type
from Orange.data.io_util import guess_data_type, natural_sorted


class TestTableFilters(unittest.TestCase):
Expand Down Expand Up @@ -42,7 +42,7 @@ def test_guess_data_type_discrete(self):
in_values = list(map(lambda x: str(x) + "a", range(24))) + ["a"] * 76
valuemap, values, coltype = guess_data_type(in_values)
self.assertEqual(DiscreteVariable, coltype)
self.assertEqual(sorted(set(in_values)), valuemap)
self.assertEqual(natural_sorted(set(in_values)), valuemap)
np.testing.assert_array_equal(in_values, values)

def test_guess_data_type_string(self):
Expand Down Expand Up @@ -93,3 +93,54 @@ def test_guess_data_type_time(self):
valuemap, _, coltype = guess_data_type(in_values)
self.assertEqual(TimeVariable, coltype)
self.assertIsNone(valuemap)

def test_guess_data_type_values_order(self):
    """
    Values of a guessed discrete column are returned in natural order
    """
    column = [
        "something1", "something12", "something2", "something1",
        "something20", "something1", "something2", "something12",
        "something1", "something12"
    ]
    expected_order = [
        "something1", "something2", "something12", "something20"
    ]
    valuemap, _, coltype = guess_data_type(column)
    self.assertEqual(DiscreteVariable, coltype)
    self.assertListEqual(expected_order, valuemap)


class TestUtils(unittest.TestCase):
    """Tests for the ``natural_sorted`` helper."""

    def test_natural_sorted(self):
        """Numeric suffixes of strings are ordered by their value."""
        unordered = ["something1", "something20", "something2", "something12"]
        expected = ["something1", "something2", "something12", "something20"]
        self.assertListEqual(expected, natural_sorted(unordered))

    def test_natural_sorted_text(self):
        """Strings without digits are ordered alphabetically."""
        unordered = ["b", "aa", "c", "dd"]
        expected = ["aa", "b", "c", "dd"]
        self.assertListEqual(expected, natural_sorted(unordered))

    def test_natural_sorted_numbers_str(self):
        """Strings made only of digits are ordered by numeric value."""
        unordered = ["1", "20", "2", "12"]
        expected = ["1", "2", "12", "20"]
        self.assertListEqual(expected, natural_sorted(unordered))

    def test_natural_sorted_numbers(self):
        """Plain integers are ordered numerically."""
        unordered = [1, 20, 2, 12]
        expected = [1, 2, 12, 20]
        self.assertListEqual(expected, natural_sorted(unordered))


# Run the tests of this module when the file is executed directly.
if __name__ == "__main__":
unittest.main()
6 changes: 5 additions & 1 deletion Orange/widgets/utils/domaineditor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from Orange.data import DiscreteVariable, ContinuousVariable, StringVariable, \
TimeVariable, Domain
from Orange.data.io_util import natural_sorted
from Orange.data.util import get_unique_names_duplicates
from Orange.statistics.util import unique
from Orange.widgets import gui
Expand Down Expand Up @@ -326,7 +327,10 @@ def numbers_are_round(var, col_data):
elif tpe == type(orig_var):
var = orig_var.copy(name=new_name)
elif tpe == DiscreteVariable:
values = list(str(i) for i in unique(col_data) if not self._is_missing(i))
values = natural_sorted(
list(str(i) for i in unique(col_data)
if not self._is_missing(i))
)
round_numbers = numbers_are_round(orig_var, col_data)
col_data = [np.nan if self._is_missing(x) else values.index(str(x))
for x in self._iter_vals(col_data)]
Expand Down

0 comments on commit 626b4d3

Please sign in to comment.