Skip to content

Commit

Permalink
Merge pull request #2923 from ales-erjavec/io-speed
Browse files Browse the repository at this point in the history
[FIX] Fix data reading speed
  • Loading branch information
lanzagar authored Apr 5, 2018
2 parents 6dbb819 + 1a0de98 commit 400eea9
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 73 deletions.
156 changes: 83 additions & 73 deletions Orange/data/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import warnings

from ast import literal_eval
from collections import OrderedDict, Counter
from collections import OrderedDict, Counter, defaultdict
from functools import lru_cache
from importlib import import_module
from itertools import chain, repeat
Expand All @@ -21,7 +21,6 @@
from fnmatch import fnmatch
from glob import glob

import bottleneck as bn
import numpy as np
from chardet.universaldetector import UniversalDetector

Expand Down Expand Up @@ -114,7 +113,7 @@ def guess_data_type(orig_values):
"""
Use heuristics to guess data type.
"""
valuemap, values = [], orig_values
valuemap, values = None, orig_values
is_discrete = is_discrete_values(orig_values)
if is_discrete:
valuemap = sorted(is_discrete)
Expand All @@ -137,61 +136,50 @@ def guess_data_type(orig_values):

def sanitize_variable(valuemap, values, orig_values, coltype, coltype_kwargs,
domain_vars, existing_var, new_var_name, data=None):
def reorder_values():
new_order, old_order = \
var.values, coltype_kwargs.get('values', var.values)
if new_order != old_order:
offset = len(new_order)
column = values if data.ndim > 1 else data
column += offset
for _, val in enumerate(var.values):
try:
oldval = old_order.index(val)
except ValueError:
continue
bn.replace(column, offset + oldval, new_order.index(val))
assert issubclass(coltype, Variable)

def get_number_of_decimals():
ndecimals = max((len(value) - value.find(".")
for value in orig_values
if isinstance(value, str) and "." in value),
def get_number_of_decimals(values):
len_ = len
ndecimals = max((len_(value) - value.find(".")
for value in values if "." in value),
default=1)
return ndecimals - 1

if valuemap:
# Map discrete data to ints
def valuemap_index(val):
try:
return valuemap.index(val)
except ValueError:
return np.nan

values = np.vectorize(valuemap_index, otypes=[float])(orig_values)
if issubclass(coltype, DiscreteVariable) and valuemap is not None:
coltype_kwargs.update(values=valuemap)

if coltype is StringVariable:
values = ['' if i is np.nan else i for i in orig_values]
if existing_var:
# Use existing variable if available
var = coltype.make(existing_var.strip(), **coltype_kwargs)
else:
# Never use existing for un-named variables
var = coltype(new_var_name, **coltype_kwargs)

if isinstance(var, DiscreteVariable):
# Map discrete data to 'ints' (or at least what passes as int around
# here)
mapping = defaultdict(
lambda: np.nan,
{val: i for i, val in enumerate(var.values)},
)
mapping[""] = np.nan
mapvalues_ = np.frompyfunc(mapping.__getitem__, 1, 1)

var = None
if domain_vars is not None:
if existing_var:
# Use existing variable if available
var = coltype.make(existing_var.strip(), **coltype_kwargs)
else:
# Never use existing for un-named variables
var = coltype(new_var_name, **coltype_kwargs)
def mapvalues(arr):
arr = np.asarray(arr, dtype=object)
return mapvalues_(arr, out=np.empty_like(arr, dtype=float))
values = mapvalues(orig_values)

if var.is_discrete and not var.ordered:
# Reorder discrete values to match existing variable
reorder_values()
if coltype is StringVariable:
values = orig_values

# ContinuousVariable.number_of_decimals is supposed to be handled by
# ContinuousVariable.to_val. In the interest of speed, the reader bypasses
# it, so we set the number of decimals here.
# The number of decimals is increased if not set manually (in which case
# var.adjust_decimals would be 0).
if isinstance(var, ContinuousVariable) and var.adjust_decimals:
ndecimals = get_number_of_decimals()
ndecimals = get_number_of_decimals(orig_values)
if var.adjust_decimals == 2 or ndecimals > var.number_of_decimals:
var.number_of_decimals = ndecimals
var.adjust_decimals = 1
Expand Down Expand Up @@ -550,8 +538,7 @@ def header_test3(items):
def data_table(cls, data, headers=None):
"""
Return Orange.data.Table given rows of `headers` (iterable of iterable)
and rows of `data` (iterable of iterable; if ``numpy.ndarray``, might
as well **have it sorted column-major**, e.g. ``order='F'``).
and rows of `data` (iterable of iterable).
Basically, the idea of subclasses is to produce those two iterables,
however they might.
Expand Down Expand Up @@ -593,8 +580,9 @@ def _equal_length(lst):
return lst

# Ensure all data is of equal width in a column-contiguous array
data = np.array([_equal_length(list(row)) for row in data if any(row)],
copy=False, dtype=object, order='F')
data = [_equal_length([s.strip() for s in row])
for row in data if any(row)]
data = np.array(data, dtype=object, order='F')

# Data may actually be longer than headers were
try:
Expand Down Expand Up @@ -622,6 +610,12 @@ def _equal_length(lst):
uses[name] += 1
names[i] = "{}_{}".format(name, uses[name])

# check only against str values
missing = {v for v in MISSING_VALUES if isinstance(v, str)}
# Note: The only speed benefit is from conversion to bool array
# by specifying the `out` parameter
isnastr = np.frompyfunc(missing.__contains__, 1, 1)
namask = np.empty(data.shape[0], dtype=bool)
# Iterate through the columns
for col in range(rowlen):
flag = Flags(Flags.split(flags[col]))
Expand All @@ -630,31 +624,32 @@ def _equal_length(lst):

type_flag = types and types[col].strip()
try:
orig_values = [np.nan if i in MISSING_VALUES else i
for i in (i.strip() for i in data[:, col])]
orig_values = data[:, col] # object array of str, NA are ""
except IndexError:
# No data instances leads here
orig_values = []
# In this case, coltype could be anything. It's set as-is
# only to satisfy test_table.TableTestCase.test_append
coltype = DiscreteVariable
orig_values = np.array([], dtype=object)
namask = isnastr(orig_values, out=namask)
orig_values[namask] = ""

coltype_kwargs = {}
valuemap = []
valuemap = None
values = orig_values

if type_flag in StringVariable.TYPE_HEADERS:
coltype = StringVariable
elif type_flag in ContinuousVariable.TYPE_HEADERS:
coltype = ContinuousVariable
values = np.empty(data.shape[0], dtype=float)
try:
values = [float(i) for i in orig_values]
np.copyto(values, orig_values, casting="unsafe",
where=~namask)
values[namask] = np.nan
except ValueError:
for row, num in enumerate(orig_values):
try:
float(num)
except ValueError:
break
if num != "":
try:
float(num)
except ValueError:
break
raise ValueError('Non-continuous value in (1-based) '
'line {}, column {}'.format(row + len(headers) + 1,
col + 1))
Expand All @@ -669,7 +664,7 @@ def _equal_length(lst):
valuemap = Flags.split(type_flag)
coltype_kwargs.update(ordered=True)
else:
valuemap = sorted(set(orig_values) - {np.nan})
valuemap = sorted(set(orig_values) - {""})

else:
# No known type specified, use heuristics
Expand All @@ -685,25 +680,30 @@ def _equal_length(lst):
append_to = (Xcols, attrs)

cols, domain_vars = append_to
cols.append(col)

existing_var, new_var_name = None, None
if domain_vars is not None:
existing_var = names and names[col]
if not existing_var:
new_var_name = next(NAMEGEN)

values, var = sanitize_variable(
valuemap, values, orig_values, coltype, coltype_kwargs,
domain_vars, existing_var, new_var_name, data)
if domain_vars is not None:
values, var = sanitize_variable(
valuemap, values, orig_values, coltype, coltype_kwargs,
domain_vars, existing_var, new_var_name, data)
else:
var = None
if domain_vars is not None:
var.attributes.update(flag.attributes)
domain_vars.append(var)

# Write back the changed data. This is needed to pass the
# correct, converted values into Table.from_numpy below
if isinstance(values, np.ndarray) and not values.flags.owndata:
values = values.copy() # might view `data` (string columns)
cols.append(values)

try:
data[:, col] = values
# allow gc to reclaim memory used by string values
data[:, col] = None
except IndexError:
pass

Expand All @@ -712,11 +712,21 @@ def _equal_length(lst):
if not data.size:
return Table.from_domain(domain, 0)

table = Table.from_numpy(domain,
data[:, Xcols].astype(float, order='C'),
data[:, Ycols].astype(float, order='C'),
data[:, Mcols].astype(object, order='C'),
data[:, Wcols].astype(float, order='C'))
X = Y = M = W = None
if Xcols:
X = np.c_[tuple(Xcols)]
assert X.dtype == np.float_
else:
X = np.empty((data.shape[0], 0), dtype=np.float_)
if Ycols:
Y = np.c_[tuple(Ycols)]
assert Y.dtype == np.float_
if Mcols:
M = np.c_[tuple(Mcols)].astype(object)
if Wcols:
W = np.c_[tuple(Wcols)].astype(float)

table = Table.from_numpy(domain, X, Y, M, W)
return table

@staticmethod
Expand Down
11 changes: 11 additions & 0 deletions Orange/tests/test_tab_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import unittest
import tempfile
import shutil
import time
from collections import OrderedDict

import numpy as np
Expand Down Expand Up @@ -257,3 +258,13 @@ def test_number_of_decimals(self):
self.assertEqual(data.domain["RI"].number_of_decimals, 5)
self.assertEqual(data.domain["Na"].number_of_decimals, 2)
self.assertEqual(data.domain["Fe"].number_of_decimals, 2)

def test_many_discrete(self):
    """Performance-regression guard: reading a discrete column with many
    distinct values (30000) must complete quickly.

    Before the io-speed fix, value mapping was quadratic in the number of
    distinct discrete values, so this read took far longer than 2 seconds.
    The 2-second bound is deliberately loose to avoid flakiness on slow CI.
    """
    b = io.StringIO()
    b.write("Poser\nd\n\n")
    # One distinct discrete value per line: K0, K1, ..., K29999.
    b.writelines("K" + str(i) + "\n" for i in range(30000))
    start = time.time()
    _ = TabReader(b).read()
    elapsed = time.time() - start
    # assertLessEqual gives a diagnostic message on failure, unlike the
    # bare `raise AssertionError()` it replaces.
    self.assertLessEqual(
        elapsed, 2,
        "Reading 30000 discrete values took %.2fs (limit 2s)" % elapsed)

0 comments on commit 400eea9

Please sign in to comment.