From 178d7af77f3125c64bc07f63a65c70106c42a602 Mon Sep 17 00:00:00 2001 From: pacahon Date: Tue, 12 Jul 2016 13:06:45 +0300 Subject: [PATCH 01/13] WIP TrieKeysView --- .gitignore | 3 ++ Makefile | 6 +++ src/datrie.pyx | 80 +++++++++++++++++++++++++++++---------- tests/test_dictview.py | 85 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 155 insertions(+), 19 deletions(-) create mode 100644 Makefile create mode 100644 tests/test_dictview.py diff --git a/.gitignore b/.gitignore index 6c3169f..74f1bfa 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,6 @@ src/*.html *.so build/ + +.idea/ +.cache/ \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..527d345 --- /dev/null +++ b/Makefile @@ -0,0 +1,6 @@ +.PHONY: build + +build: + ./update_c.sh + python setup.py build + python setup.py build_ext --inplace \ No newline at end of file diff --git a/src/datrie.pyx b/src/datrie.pyx index b55447c..01d933d 100644 --- a/src/datrie.pyx +++ b/src/datrie.pyx @@ -15,7 +15,7 @@ import itertools import warnings import sys import tempfile -from collections import MutableMapping +from collections import MutableMapping, Set try: import cPickle as pickle @@ -588,29 +588,15 @@ cdef class BaseTrie: cpdef keys(self, unicode prefix=None): """ - Returns a list of this trie's keys. + Returns dict view for trie's keys. If ``prefix`` is not None, returns only the keys prefixed by ``prefix``. """ - cdef bint success - cdef list res = [] + # FIXME: Move state initialization to BaseTrieKeysView? 
cdef BaseState state = BaseState(self) + cdef BaseTrieKeysView trie_keys = BaseTrieKeysView(state, prefix) - if prefix is not None: - success = state.walk(prefix) - if not success: - return res - - cdef BaseIterator iter = BaseIterator(state) - - if prefix is None: - while iter.next(): - res.append(iter.key()) - else: - while iter.next(): - res.append(prefix+iter.key()) - - return res + return trie_keys cpdef values(self, unicode prefix=None): """ @@ -980,6 +966,61 @@ cdef class Iterator(_TrieIterator): return self._root._trie._index_to_value(data) +cdef class BaseTrieKeysView: + cdef BaseState _state + cdef unicode _prefix + + def __init__(self, BaseState state, unicode prefix): + # FIXME: Create _CState and _CTrieIterator cls? + self._state = state + self._prefix = prefix + if self._prefix is not None: + self._state.walk(self._prefix) + + def __len__(self): + cdef int count = 0 + cdef BaseIterator iter = BaseIterator(self._state) + while iter.next(): + count += 1 + # Does python knows here, that it should deallocate iter objects, etc?? + return count + + def __iter__(self): + # BaseIterator additionaly implements .data() method + cdef BaseIterator _iter = BaseIterator(self._state) + while _iter.next(): + if self._prefix is None: + yield _iter.key() + else: + yield self._prefix + _iter.key() + + def __contains__(self, item): + # FIXME: get max prefix first + for key in self: + if key == item: + return True + return False + + def __richcmp__(self, other, int op): + if op == 2: # == + if other is self: + return True + elif not isinstance(other, Set): + return False + # FIXME: problems with ordering here + for key in self: + if self[key] != other[key]: + return False + + # XXX this can be written more efficiently via explicit iterators. 
+ return len(self) == len(other) + elif op == 3: # != + return not (self == other) + + raise TypeError("unorderable types: {0} and {1}".format( + self.__class__, other.__class__)) + + cdef (cdatrie.Trie* ) _load_from_file(f) except NULL: cdef int fd = f.fileno() cdef stdio.FILE* f_ptr = stdio_ext.fdopen(fd, "r") @@ -1145,3 +1186,4 @@ def new(alphabet=None, ranges=None, AlphaMap alpha_map=None): MutableMapping.register(Trie) MutableMapping.register(BaseTrie) +Set.register(BaseTrieKeysView) diff --git a/tests/test_dictview.py b/tests/test_dictview.py new file mode 100644 index 0000000..534c585 --- /dev/null +++ b/tests/test_dictview.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import, unicode_literals + +import string +import datrie + + +def test_keys_empty(): + trie = datrie.BaseTrie(string.printable) + keys = trie.keys() + assert len(keys) == 0 + + +def test_keys_iter(): + trie = datrie.BaseTrie(string.printable) + trie["1"] = 1 + trie["2"] = 2 + keys_list = list(trie.keys()) + keys_list.sort() + assert keys_list == ["1", "2"] + + +def test_keys_iter_with_prefix(): + trie = datrie.BaseTrie(string.printable) + trie["prefix1_1"] = 11 + trie["prefix1_2"] = 12 + trie["prefix2_1"] = 21 + trie["prefix2_2"] = 22 + keys = trie.keys(prefix="prefix1") + keys_list = list(keys) + keys_list.sort() + assert keys_list == ["prefix1_1", "prefix1_2"] + + +def test_keys_contains(): + trie = datrie.BaseTrie(string.printable) + trie["prefix1_1"] = 11 + trie["prefix1_2"] = 12 + trie["prefix2_1"] = 21 + trie["prefix2_2"] = 22 + keys = trie.keys() + assert "prefix1_1" in keys + assert "prefix2_1" in keys + keys = trie.keys(prefix="prefix1") + assert "prefix1_1" in keys + assert "prefix2_1" not in keys + + +def test_keys_len(): + trie = datrie.BaseTrie(string.printable) + trie["1"] = 1 + keys = trie.keys() + assert len(trie) == 1 + assert len(keys) == 1 + trie["1"] = 2 + trie["2"] = 2 + assert len(keys) == 2 + trie["prefix_3"] = 3 + keys = 
trie.keys(prefix="prefix") + assert len(keys) == 1 + + +def test_keys_prefix(): + trie = datrie.BaseTrie(string.printable) + trie["prefix1_1"] = 11 + trie["prefix1_2"] = 12 + trie["prefix2_3"] = 21 + keys = trie.keys(prefix="prefix") + assert len(keys) == 3 + keys = trie.keys(prefix="prefix1_") + assert len(keys) == 2 + keys_list = list(keys) + keys_list.sort() + assert keys_list == ["prefix1_1", "prefix1_2"] + + +def test_keys_delete(): + trie = datrie.BaseTrie(string.printable) + trie["1"] = 1 + trie["2"] = 2 + keys = trie.keys() + del trie["1"] + assert len(trie) == 1 + assert len(keys) == 1 From 9a27adf101ffd95c0089c70d4b6fdd78e1cbe90a Mon Sep 17 00:00:00 2001 From: pacahon Date: Thu, 14 Jul 2016 00:46:08 +0300 Subject: [PATCH 02/13] WIP rewrite BaseTrieKeysView --- src/datrie.pyx | 76 ++++++++++++++++++++++++------------------ tests/test_dictview.py | 23 +++++++++++-- 2 files changed, 63 insertions(+), 36 deletions(-) diff --git a/src/datrie.pyx b/src/datrie.pyx index 01d933d..c13e20e 100644 --- a/src/datrie.pyx +++ b/src/datrie.pyx @@ -971,54 +971,64 @@ cdef class BaseTrieKeysView: cdef unicode _prefix def __init__(self, BaseState state, unicode prefix): - # FIXME: Create _CState and _CTrieIterator cls? self._state = state self._prefix = prefix - if self._prefix is not None: - self._state.walk(self._prefix) + + # FIXME: Not clear understanding when I should use cpdef/def/cdef + cpdef _rewind_state(self, unicode new_state): + """ + Reset state to root. Then if `new_state` is not None, try to walk + to new state. + """ + self._state.rewind() + if new_state is not None: + if not self._state.walk(new_state): + return False + return True def __len__(self): cdef int count = 0 - cdef BaseIterator iter = BaseIterator(self._state) - while iter.next(): - count += 1 - # Does python knows here, that it should deallocate iter objects, etc?? 
+ cdef _TrieIterator it + if self._rewind_state(self._prefix): + it = _TrieIterator(self._state) + while it.next(): + count += 1 return count def __iter__(self): - # BaseIterator additionaly implements .data() method - cdef BaseIterator _iter = BaseIterator(self._state) - while _iter.next(): + if not self._rewind_state(self._prefix): + raise StopIteration + cdef _TrieIterator it = _TrieIterator(self._state) + while it.next(): if self._prefix is None: - yield _iter.key() + yield it.key() else: - yield self._prefix + _iter.key() + yield self._prefix + it.key() def __contains__(self, item): - # FIXME: get max prefix first - for key in self: - if key == item: - return True + # Should I use in cython more explicit condition check like `is not None`? + if self._prefix is not None and not item.startswith(self._prefix): + return False + if self._rewind_state(item) and self._state.is_terminal(): + return True return False def __richcmp__(self, other, int op): - if op == 2: # == - if other is self: - return True - elif not isinstance(other, Set): - return False - # FIXME: problems with ordering here - for key in self: - if self[key] != other[key]: - return False - - # XXX this can be written more efficiently via explicit iterators. - return len(self) == len(other) - elif op == 3: # != - return not (self == other) - - raise TypeError("unorderable types: {0} and {1}".format( - self.__class__, other.__class__)) + return NotImplemented + # if op == 2: # == + # if other is self: + # return True + # elif not isinstance(other, Set): + # return False + # # FIXME: problems with ordering here + # for key in self: + # if self[key] != other[key]: + # return False + # + # # XXX this can be written more efficiently via explicit iterators. 
+ # return len(self) == len(other) + # elif op == 3: # != + # return not (self == other) cdef (cdatrie.Trie* ) _load_from_file(f) except NULL: diff --git a/tests/test_dictview.py b/tests/test_dictview.py index 534c585..3d6dabe 100644 --- a/tests/test_dictview.py +++ b/tests/test_dictview.py @@ -12,25 +12,34 @@ def test_keys_empty(): assert len(keys) == 0 +# TODO: Can I use py.test fixtures here? def test_keys_iter(): trie = datrie.BaseTrie(string.printable) trie["1"] = 1 trie["2"] = 2 - keys_list = list(trie.keys()) + keys = trie.keys() + keys_list = list(keys) keys_list.sort() assert keys_list == ["1", "2"] + del trie["2"] + assert list(keys) == ["1"] def test_keys_iter_with_prefix(): trie = datrie.BaseTrie(string.printable) + keys = trie.keys(prefix="prefix1") + keys_list = list(keys) + assert keys_list == [] trie["prefix1_1"] = 11 trie["prefix1_2"] = 12 trie["prefix2_1"] = 21 trie["prefix2_2"] = 22 - keys = trie.keys(prefix="prefix1") keys_list = list(keys) keys_list.sort() assert keys_list == ["prefix1_1", "prefix1_2"] + del trie["prefix1_1"] + del trie["prefix1_2"] + assert list(keys) == [] def test_keys_contains(): @@ -55,10 +64,15 @@ def test_keys_len(): assert len(keys) == 1 trie["1"] = 2 trie["2"] = 2 - assert len(keys) == 2 trie["prefix_3"] = 3 + assert len(keys) == 3 keys = trie.keys(prefix="prefix") assert len(keys) == 1 + del trie["1"] + del trie["2"] + assert len(keys) == 1 + del trie["prefix_3"] + assert len(keys) == 0 def test_keys_prefix(): @@ -73,6 +87,9 @@ def test_keys_prefix(): keys_list = list(keys) keys_list.sort() assert keys_list == ["prefix1_1", "prefix1_2"] + del trie["prefix1_1"] + del trie["prefix2_3"] + assert list(keys) == ["prefix1_2"] def test_keys_delete(): From 1a06787bac744f39a78adb6597c82049e62d75fc Mon Sep 17 00:00:00 2001 From: pacahon Date: Thu, 21 Jul 2016 01:22:23 +0300 Subject: [PATCH 03/13] dtrie .keys() Set support WIP --- src/datrie.pyx | 59 +++++++++++++++++++++++++++++++----------- tests/test_dictview.py | 57 
++++++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 15 deletions(-) diff --git a/src/datrie.pyx b/src/datrie.pyx index c13e20e..fd46be9 100644 --- a/src/datrie.pyx +++ b/src/datrie.pyx @@ -1014,21 +1014,50 @@ cdef class BaseTrieKeysView: return False def __richcmp__(self, other, int op): - return NotImplemented - # if op == 2: # == - # if other is self: - # return True - # elif not isinstance(other, Set): - # return False - # # FIXME: problems with ordering here - # for key in self: - # if self[key] != other[key]: - # return False - # - # # XXX this can be written more efficiently via explicit iterators. - # return len(self) == len(other) - # elif op == 3: # != - # return not (self == other) + if op == 0: # < + # FIXME: looks like not necessary to implement + return NotImplemented + elif op == 1: # <= + # s.issubset(t) - test whether every element in s is in t + if other is self: + return True + try: + for key in self: + if key not in other: + return False + return True + except TypeError: + return False + elif op == 2: # == + if other is self: + return True + elif not isinstance(other, Set): + return False + # Should iterate over self due to `prefix` argument in .keys() + # even if Set.__contains__ more efficient (not sure at all why it should be) + count = 0 + for key in self: + count += 1 + if key not in other: + return False + + return count == len(other) + elif op == 3: # != + return not (self == other) + elif op == 4: # > + # FIXME: looks like not necessary to implement + return NotImplemented + elif op == 5: # >= + # s.issuperset(t) - test whether every element in t is in s + if other is self: + return True + try: + for key in other: + if key not in self: + return False + return True + except TypeError: + return False cdef (cdatrie.Trie* ) _load_from_file(f) except NULL: diff --git a/tests/test_dictview.py b/tests/test_dictview.py index 3d6dabe..f3ffc21 100644 --- a/tests/test_dictview.py +++ b/tests/test_dictview.py @@ -100,3 
+100,60 @@ def test_keys_delete(): del trie["1"] assert len(trie) == 1 assert len(keys) == 1 + + +def test_keys_eq(): + trie = datrie.BaseTrie(string.printable) + trie["1"] = 1 + trie["2"] = 2 + keys = trie.keys() + assert keys == {"1", "2"} + assert keys == {"2", "1"} + trie["3"] = 3 + assert keys != {"2", "1"} + del trie["1"] + assert keys == {"2", "3"} + trie["prefix_4"] = 4 + keys = trie.keys(prefix="prefix") + assert keys == {"prefix_4"} + assert keys != {"1", "2", "3"} + + +def test_keys_issuperset(): + trie = datrie.BaseTrie(string.printable) + trie["1"] = 1 + trie["2"] = 2 + keys = trie.keys() + assert keys >= {"1"} + assert not keys >= {1} + assert keys >= {"2"} + assert keys >= {"1", "2"} + assert not keys >= {"1", "2", "3"} + assert not keys >= {"3"} + assert not keys >= 4 # not iterable + trie["prefix_3"] = 3 + keys = trie.keys(prefix="prefix") + assert keys >= {"prefix_3"} + assert not keys >= {"prefix_3", "1"} + del trie["prefix_3"] + assert keys >= set() + + +def test_keys_issubset(): + trie = datrie.BaseTrie(string.printable) + trie["1"] = 1 + trie["2"] = 2 + keys = trie.keys() + assert not keys <= {"1"} + assert not keys <= 1 # not iterable + assert keys <= {"1", "2"} + assert keys <= ["1", "2"] + assert keys <= {"1", "2", "3"} + trie["prefix_3"] = 3 + keys = trie.keys(prefix="prefix") + assert keys <= {"prefix_3"} + assert keys <= {"prefix_3", "1"} + assert not keys <= {"1", "2"} + del trie["prefix_3"] + assert keys <= {"prefix_3"} + assert keys <= set() From 795a954fa653137a7bc51b3f9ffab82856e2f4fb Mon Sep 17 00:00:00 2001 From: pacahon Date: Fri, 19 Aug 2016 01:06:04 +0300 Subject: [PATCH 04/13] WIP Trie KeysView --- src/datrie.pyx | 101 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 34 deletions(-) diff --git a/src/datrie.pyx b/src/datrie.pyx index fd46be9..548eea5 100644 --- a/src/datrie.pyx +++ b/src/datrie.pyx @@ -15,7 +15,7 @@ import itertools import warnings import sys import tempfile -from collections 
import MutableMapping, Set +from collections import MutableMapping, Set, KeysView try: import cPickle as pickle @@ -592,7 +592,6 @@ cdef class BaseTrie: If ``prefix`` is not None, returns only the keys prefixed by ``prefix``. """ - # FIXME: Move state initialization to BaseTrieKeysView? cdef BaseState state = BaseState(self) cdef BaseTrieKeysView trie_keys = BaseTrieKeysView(state, prefix) @@ -974,8 +973,7 @@ cdef class BaseTrieKeysView: self._state = state self._prefix = prefix - # FIXME: Not clear understanding when I should use cpdef/def/cdef - cpdef _rewind_state(self, unicode new_state): + cdef int _rewind_state(self, unicode new_state): """ Reset state to root. Then if `new_state` is not None, try to walk to new state. @@ -1006,35 +1004,32 @@ cdef class BaseTrieKeysView: yield self._prefix + it.key() def __contains__(self, item): - # Should I use in cython more explicit condition check like `is not None`? - if self._prefix is not None and not item.startswith(self._prefix): + if self._prefix and not item.startswith(self._prefix): return False if self._rewind_state(item) and self._state.is_terminal(): return True return False def __richcmp__(self, other, int op): - if op == 0: # < - # FIXME: looks like not necessary to implement - return NotImplemented - elif op == 1: # <= + if op == 0: # < or __lt__ + # Test whether the set is a proper subset of other, that is, + # `set <= other and set != other`. + # FIXME: iterate over `self` and then check len(self) != len(other) + # FIXME: remove issubset method, not really needs it + return self.issubset(other) and self != other + elif op == 1: # <= or __le__ # s.issubset(t) - test whether every element in s is in t - if other is self: - return True - try: - for key in self: - if key not in other: - return False - return True - except TypeError: - return False + # FIXME: duplicate this exception? really? 
+ if not isinstance(other, Set): + raise TypeError("unorderable types: dict_keys() <= {}()".format( + type(other))) + return self.issubset(other) elif op == 2: # == if other is self: return True elif not isinstance(other, Set): return False # Should iterate over self due to `prefix` argument in .keys() - # even if Set.__contains__ more efficient (not sure at all why it should be) count = 0 for key in self: count += 1 @@ -1044,20 +1039,58 @@ cdef class BaseTrieKeysView: return count == len(other) elif op == 3: # != return not (self == other) - elif op == 4: # > - # FIXME: looks like not necessary to implement - return NotImplemented - elif op == 5: # >= + elif op == 4: # > or __gt__ + # set > other - test whether the `set` is a proper superset + # of `other`, that is, set >= other and set != other. + return self.issuperset(other) and other != self + elif op == 5: # >= or __ge__ in cython # s.issuperset(t) - test whether every element in t is in s - if other is self: - return True - try: - for key in other: - if key not in self: - return False - return True - except TypeError: - return False + if not isinstance(other, Set): + raise TypeError("unorderable types: dict_keys() >= {}()".format( + type(other))) + return self.issuperset(other) + + cdef int issubset(self, other): + if other is self: + return True + try: + for key in self: + if key not in other: + return False + return True + except TypeError: + return False + + cdef int issuperset(self, other): + if other is self: + return True + try: + for key in other: + if key not in self: + return False + return True + except TypeError: + return False + + def __and__(self, other): + # intersection + return NotImplemented + + def __or__(self, other): + # union + return NotImplemented + + def __sub__(self, other): + # difference + return NotImplemented + + def __xor__(self, other): + # symmetric_difference, set ^ other + # Return a new set with elements in either the set or other but not both. 
+ return NotImplemented + + def isdisjoint(self, other): + return NotImplemented cdef (cdatrie.Trie* ) _load_from_file(f) except NULL: @@ -1225,4 +1258,4 @@ def new(alphabet=None, ranges=None, AlphaMap alpha_map=None): MutableMapping.register(Trie) MutableMapping.register(BaseTrie) -Set.register(BaseTrieKeysView) +KeysView.register(BaseTrieKeysView) From 0ca211337107dc22e16bb040c51a3edc7fdc093d Mon Sep 17 00:00:00 2001 From: pacahon Date: Tue, 23 Aug 2016 00:07:23 +0300 Subject: [PATCH 05/13] WIP KeysView: more set methods --- src/datrie.pyx | 101 +++++++++++++++++++++++++---------------- tests/test_dictview.py | 28 ++++++++++-- 2 files changed, 86 insertions(+), 43 deletions(-) diff --git a/src/datrie.pyx b/src/datrie.pyx index 548eea5..71bbbac 100644 --- a/src/datrie.pyx +++ b/src/datrie.pyx @@ -964,7 +964,7 @@ cdef class Iterator(_TrieIterator): cdef cdatrie.TrieData data = cdatrie.trie_iterator_get_data(self._iter) return self._root._trie._index_to_value(data) - +# TODO: Register or inherit from KeysView?! cdef class BaseTrieKeysView: cdef BaseState _state cdef unicode _prefix @@ -1014,70 +1014,85 @@ cdef class BaseTrieKeysView: if op == 0: # < or __lt__ # Test whether the set is a proper subset of other, that is, # `set <= other and set != other`. - # FIXME: iterate over `self` and then check len(self) != len(other) - # FIXME: remove issubset method, not really needs it - return self.issubset(other) and self != other + if not isinstance(other, Set): + raise TypeError("unorderable types: dict_keys() < %s()" % type(other)) + if other is self: + return False + # FIXME: cdef int count at the beginning of func or override `op` maybe? + count = 0 + for key in self: + count += 1 + if key not in other: + return False + return count != len(other) elif op == 1: # <= or __le__ # s.issubset(t) - test whether every element in s is in t - # FIXME: duplicate this exception? really? 
if not isinstance(other, Set): - raise TypeError("unorderable types: dict_keys() <= {}()".format( - type(other))) - return self.issubset(other) + raise TypeError("unorderable types: dict_keys() <= %s()" % type(other)) + if other is self: + return True + for key in self: + if key not in other: + return False + return True elif op == 2: # == if other is self: return True elif not isinstance(other, Set): + # No TypeError for equality return False - # Should iterate over self due to `prefix` argument in .keys() count = 0 for key in self: count += 1 if key not in other: return False - return count == len(other) elif op == 3: # != return not (self == other) elif op == 4: # > or __gt__ # set > other - test whether the `set` is a proper superset # of `other`, that is, set >= other and set != other. - return self.issuperset(other) and other != self - elif op == 5: # >= or __ge__ in cython + if not isinstance(other, Set): + raise TypeError("unorderable types: dict_keys() > %s()" % type(other)) + if other is self: + return False + try: + for key in other: + if key not in self: + return False + except TypeError: + return False + # FIXME: len(self) is O(n) + return len(other) != len(self) + elif op == 5: # >= or __ge__ # s.issuperset(t) - test whether every element in t is in s if not isinstance(other, Set): - raise TypeError("unorderable types: dict_keys() >= {}()".format( - type(other))) - return self.issuperset(other) - - cdef int issubset(self, other): - if other is self: - return True - try: - for key in self: - if key not in other: - return False + raise TypeError("unorderable types: dict_keys() >= %s()" % type(other)) + if other is self: + return True + try: + for key in other: + if key not in self: + return False + except TypeError: + return False return True - except TypeError: - return False - cdef int issuperset(self, other): + def __and__(self, other): # intersection + """Return a new set with elements common to dict_view and `other`.""" if other is self: - return 
True + return set(self) + # Looks like operator's version of intersection accepts any iterable + # for dict_view try: - for key in other: - if key not in self: - return False - return True + return {key for key in self if key in other} except TypeError: - return False - - def __and__(self, other): - # intersection - return NotImplemented + raise TypeError("'%s' object is not iterable" % type(other)) - def __or__(self, other): - # union + def __or__(self, other): # union + if other is self: + return set(self) + # TODO: maybe convert self to set and use native method? return NotImplemented def __sub__(self, other): @@ -1090,7 +1105,15 @@ cdef class BaseTrieKeysView: return NotImplemented def isdisjoint(self, other): - return NotImplemented + """ + Return True if the set has no elements in common with `other`. + Sets are disjoint if and only if their intersection is the empty set. + """ + if other is self: + return False + if any(True for key in self if key in other): + return False + return True cdef (cdatrie.Trie* ) _load_from_file(f) except NULL: diff --git a/tests/test_dictview.py b/tests/test_dictview.py index f3ffc21..1d44c76 100644 --- a/tests/test_dictview.py +++ b/tests/test_dictview.py @@ -2,7 +2,9 @@ from __future__ import absolute_import, unicode_literals +import pytest import string + import datrie @@ -125,12 +127,14 @@ def test_keys_issuperset(): trie["2"] = 2 keys = trie.keys() assert keys >= {"1"} - assert not keys >= {1} + with pytest.raises(TypeError): + _ = keys >= 1 # not iterable assert keys >= {"2"} assert keys >= {"1", "2"} assert not keys >= {"1", "2", "3"} assert not keys >= {"3"} - assert not keys >= 4 # not iterable + # Wrong type inside set + assert not keys >= {1, 2} trie["prefix_3"] = 3 keys = trie.keys(prefix="prefix") assert keys >= {"prefix_3"} @@ -145,9 +149,10 @@ def test_keys_issubset(): trie["2"] = 2 keys = trie.keys() assert not keys <= {"1"} - assert not keys <= 1 # not iterable + with pytest.raises(TypeError): + assert not keys 
<= 1 # not iterable + assert keys <= ["1", "2"] # wrong type assert keys <= {"1", "2"} - assert keys <= ["1", "2"] assert keys <= {"1", "2", "3"} trie["prefix_3"] = 3 keys = trie.keys(prefix="prefix") @@ -157,3 +162,18 @@ def test_keys_issubset(): del trie["prefix_3"] assert keys <= {"prefix_3"} assert keys <= set() + + +def test_keys_intersection(): + trie = datrie.BaseTrie(string.printable) + trie["1"] = 1 + trie["2"] = 2 + keys = trie.keys() + assert (keys & keys) == set("12") + assert (keys & keys) != set() + assert (keys & keys) != set("1") + assert (keys & keys) != set("2") + assert (keys & '1') == set("1") + with pytest.raises(TypeError): + assert (keys & 1) == set("1") # not iterable + assert (keys & 'ab') == set() From ca83b2769522cbd920e44762d27510c41893ca97 Mon Sep 17 00:00:00 2001 From: pacahon Date: Mon, 29 Aug 2016 00:24:34 +0300 Subject: [PATCH 06/13] WIP add BaseTrieItemsView and BaseTrieValuesView; more tests on BaseTrieKeysView --- src/datrie.pyx | 256 ++++++++++++++++++++++++----------------- tests/test_dictview.py | 111 +++++++++++++++++- tests/test_trie.py | 18 +-- 3 files changed, 266 insertions(+), 119 deletions(-) diff --git a/src/datrie.pyx b/src/datrie.pyx index 71bbbac..ff38233 100644 --- a/src/datrie.pyx +++ b/src/datrie.pyx @@ -3,6 +3,7 @@ Cython wrapper for libdatrie. 
""" +from cpython cimport bool from cpython.version cimport PY_MAJOR_VERSION from cython.operator import dereference as deref from libc.stdlib cimport malloc, free @@ -15,7 +16,8 @@ import itertools import warnings import sys import tempfile -from collections import MutableMapping, Set, KeysView +from collections import MutableMapping, Set, Iterable, KeysView, \ + ValuesView, ItemsView try: import cPickle as pickle @@ -554,69 +556,36 @@ cdef class BaseTrie: finally: cdatrie.trie_state_free(state) + def __iter__(self): + cdef BaseIterator iter = BaseIterator(BaseState(self)) + while iter.next(): + yield iter.key() + cpdef items(self, unicode prefix=None): """ - Returns a list of this trie's items (``(key,value)`` tuples). + D.items() -> a set-like object providing a view on D's items. If ``prefix`` is not None, returns only the items associated with keys prefixed by ``prefix``. """ - cdef bint success - cdef list res = [] - cdef BaseState state = BaseState(self) - - if prefix is not None: - success = state.walk(prefix) - if not success: - return res - - cdef BaseIterator iter = BaseIterator(state) - - if prefix is None: - while iter.next(): - res.append((iter.key(), iter.data())) - else: - while iter.next(): - res.append((prefix+iter.key(), iter.data())) - - return res - - def __iter__(self): - cdef BaseIterator iter = BaseIterator(BaseState(self)) - while iter.next(): - yield iter.key() + return BaseTrieItemsView(self, prefix) cpdef keys(self, unicode prefix=None): """ - Returns dict view for trie's keys. + D.keys() -> a set-like object providing a view on D's keys. If ``prefix`` is not None, returns only the keys prefixed by ``prefix``. """ - cdef BaseState state = BaseState(self) - cdef BaseTrieKeysView trie_keys = BaseTrieKeysView(state, prefix) - - return trie_keys + return BaseTrieKeysView(self, prefix) cpdef values(self, unicode prefix=None): """ - Returns a list of this trie's values. 
+ D.values() -> an object providing a view on D's values If ``prefix`` is not None, returns only the values associated with keys prefixed by ``prefix``. """ - cdef bint success - cdef list res = [] - cdef BaseState state = BaseState(self) - - if prefix is not None: - success = state.walk(prefix) - if not success: - return res - - cdef BaseIterator iter = BaseIterator(state) - while iter.next(): - res.append(iter.data()) - return res + return BaseTrieValuesView(self, prefix) cdef _index_to_value(self, cdatrie.TrieData index): return index @@ -964,12 +933,13 @@ cdef class Iterator(_TrieIterator): cdef cdatrie.TrieData data = cdatrie.trie_iterator_get_data(self._iter) return self._root._trie._index_to_value(data) -# TODO: Register or inherit from KeysView?! + cdef class BaseTrieKeysView: cdef BaseState _state cdef unicode _prefix - def __init__(self, BaseState state, unicode prefix): + def __init__(self, BaseTrie base_trie, unicode prefix): + cdef BaseState state = BaseState(base_trie) self._state = state self._prefix = prefix @@ -985,6 +955,7 @@ cdef class BaseTrieKeysView: return True def __len__(self): + """O(n) in current implementation""" cdef int count = 0 cdef _TrieIterator it if self._rewind_state(self._prefix): @@ -1015,26 +986,13 @@ cdef class BaseTrieKeysView: # Test whether the set is a proper subset of other, that is, # `set <= other and set != other`. if not isinstance(other, Set): - raise TypeError("unorderable types: dict_keys() < %s()" % type(other)) - if other is self: - return False - # FIXME: cdef int count at the beginning of func or override `op` maybe? 
- count = 0 - for key in self: - count += 1 - if key not in other: - return False - return count != len(other) + return NotImplemented + return len(self) < len(other) and self._issubset(other) elif op == 1: # <= or __le__ # s.issubset(t) - test whether every element in s is in t if not isinstance(other, Set): - raise TypeError("unorderable types: dict_keys() <= %s()" % type(other)) - if other is self: - return True - for key in self: - if key not in other: - return False - return True + return NotImplemented + return self._issubset(other) elif op == 2: # == if other is self: return True @@ -1042,79 +1000,165 @@ cdef class BaseTrieKeysView: # No TypeError for equality return False count = 0 - for key in self: + for elem in self: count += 1 - if key not in other: + if elem not in other: return False return count == len(other) + # return len(self) == len(other) and self._issubset(other) elif op == 3: # != return not (self == other) elif op == 4: # > or __gt__ # set > other - test whether the `set` is a proper superset # of `other`, that is, set >= other and set != other. 
if not isinstance(other, Set): - raise TypeError("unorderable types: dict_keys() > %s()" % type(other)) - if other is self: - return False - try: - for key in other: - if key not in self: - return False - except TypeError: - return False - # FIXME: len(self) is O(n) - return len(other) != len(self) + return NotImplemented + return len(self) > len(other) and self._issuperset(other) elif op == 5: # >= or __ge__ # s.issuperset(t) - test whether every element in t is in s if not isinstance(other, Set): - raise TypeError("unorderable types: dict_keys() >= %s()" % type(other)) - if other is self: - return True - try: - for key in other: - if key not in self: - return False - except TypeError: + return NotImplemented + return self._issuperset(other) + + cpdef bool _issubset(self, other): + """s._issubset(t) - test whether every element in s is in t""" + if len(self) > len(other): + return False + for elem in self: + if elem not in other: return False - return True + return True + + cpdef bool _issuperset(self, other): + """s._issuperset(t) - test whether every element in t is in s""" + if len(self) < len(other): + return False + try: + for elem in other: + if elem not in self: + return False + except TypeError: + return False + return True - def __and__(self, other): # intersection + # Note: For KeysView explicitly used set in `_from_iterable` method + def __and__(self, other): # intersection, & """Return a new set with elements common to dict_view and `other`.""" if other is self: return set(self) - # Looks like operator's version of intersection accepts any iterable - # for dict_view - try: - return {key for key in self if key in other} - except TypeError: - raise TypeError("'%s' object is not iterable" % type(other)) + elif not isinstance(other, Iterable): + return NotImplemented + return {key for key in self if key in other} - def __or__(self, other): # union + def __or__(self, other): # union, | if other is self: return set(self) - # TODO: maybe convert self to set 
and use native method? - return NotImplemented + elif not isinstance(other, Iterable): + return NotImplemented + return {e for e in itertools.chain(self, other)} - def __sub__(self, other): - # difference - return NotImplemented + def __sub__(self, other): # difference, - + if not isinstance(other, Set): + if not isinstance(other, Iterable): + return NotImplemented + other = set(other) + return set(value for value in self if value not in other) def __xor__(self, other): # symmetric_difference, set ^ other # Return a new set with elements in either the set or other but not both. - return NotImplemented + if not isinstance(other, Set): + if not isinstance(other, Iterable): + return NotImplemented + other = set(other) + return (self - other) | (other - self) def isdisjoint(self, other): - """ - Return True if the set has no elements in common with `other`. - Sets are disjoint if and only if their intersection is the empty set. - """ + """Return True if keys view and `other` have a null intersection.""" if other is self: return False - if any(True for key in self if key in other): + for value in other: + if value in self: + return False + return True + + +class BaseTrieItemsView(BaseTrieKeysView): + + __slots__ = () + + def __iter__(self): + if not self._rewind_state(self._prefix): + raise StopIteration + cdef BaseIterator it = BaseIterator(self._state) + while it.next(): + if self._prefix is None: + yield (it.key(), it.data()) + else: + yield (self._prefix + it.key(), it.data()) + + def __contains__(self, item): + key, value = item + if self._prefix and not key.startswith(self._prefix): return False + if self._rewind_state(key) and self._state.is_terminal(): + v = self._state.data() + return v is value or v == value + return False + + +# FIXME: copy paste from BaseTrieKeysView in most?! 
+cdef class BaseTrieValuesView: + cdef BaseState _state + cdef unicode _prefix + + def __init__(self, BaseTrie base_trie, unicode prefix): + cdef BaseState state = BaseState(base_trie) + self._state = state + self._prefix = prefix + + cdef int _rewind_state(self, unicode new_state): + """ + Reset state to root. Then if `new_state` is not None, try to walk + to new state. + """ + self._state.rewind() + if new_state is not None: + if not self._state.walk(new_state): + return False return True + def __len__(self): + """O(n) in current implementation""" + cdef int count = 0 + cdef _TrieIterator it + if self._rewind_state(self._prefix): + it = _TrieIterator(self._state) + while it.next(): + count += 1 + return count + + def __iter__(self): + if not self._rewind_state(self._prefix): + raise StopIteration + cdef BaseIterator it = BaseIterator(self._state) + while it.next(): + yield it.data() + + def __contains__(self, value): + if self._prefix and not value.startswith(self._prefix): + return False + for v in self: + if v is value or v == value: + return True + return False + +# FIXME: or add self.trie to BaseTrieKeysView? 
+class TrieKeysView(BaseTrieKeysView): + def __init__(self, BaseTrie base_trie, unicode prefix): + self.trie = base_trie + super(TrieKeysView, self).__init__(base_trie, prefix) + cdef (cdatrie.Trie* ) _load_from_file(f) except NULL: cdef int fd = f.fileno() @@ -1282,3 +1326,5 @@ def new(alphabet=None, ranges=None, AlphaMap alpha_map=None): MutableMapping.register(Trie) MutableMapping.register(BaseTrie) KeysView.register(BaseTrieKeysView) +ItemsView.register(BaseTrieItemsView) +ValuesView.register(BaseTrieValuesView) diff --git a/tests/test_dictview.py b/tests/test_dictview.py index 1d44c76..8e30c59 100644 --- a/tests/test_dictview.py +++ b/tests/test_dictview.py @@ -105,6 +105,7 @@ def test_keys_delete(): def test_keys_eq(): + """Test trie.keys() == and != operations""" trie = datrie.BaseTrie(string.printable) trie["1"] = 1 trie["2"] = 2 @@ -118,10 +119,12 @@ def test_keys_eq(): trie["prefix_4"] = 4 keys = trie.keys(prefix="prefix") assert keys == {"prefix_4"} + assert not keys != {"prefix_4"} assert keys != {"1", "2", "3"} def test_keys_issuperset(): + """Test trie.keys() >= and > operations""" trie = datrie.BaseTrie(string.printable) trie["1"] = 1 trie["2"] = 2 @@ -133,6 +136,10 @@ def test_keys_issuperset(): assert keys >= {"1", "2"} assert not keys >= {"1", "2", "3"} assert not keys >= {"3"} + # Proper superset + assert keys > {"2"} + assert not keys > {"1", "2"} + assert not keys > {"3"} # Wrong type inside set assert not keys >= {1, 2} trie["prefix_3"] = 3 @@ -144,6 +151,7 @@ def test_keys_issuperset(): def test_keys_issubset(): + """Test trie.keys() <= and < operations""" trie = datrie.BaseTrie(string.printable) trie["1"] = 1 trie["2"] = 2 @@ -155,6 +163,10 @@ def test_keys_issubset(): assert keys <= {"1", "2"} assert keys <= {"1", "2", "3"} trie["prefix_3"] = 3 + # Proper subset + assert not keys < {"1", "2"} + assert not keys < {"1", "2", "prefix_3"} + assert keys < {"1", "2", "prefix_3", "3"} keys = trie.keys(prefix="prefix") assert keys <= {"prefix_3"} 
assert keys <= {"prefix_3", "1"} @@ -162,6 +174,7 @@ def test_keys_issubset(): del trie["prefix_3"] assert keys <= {"prefix_3"} assert keys <= set() + assert keys < {"1", "2", "3"} def test_keys_intersection(): @@ -169,11 +182,99 @@ def test_keys_intersection(): trie["1"] = 1 trie["2"] = 2 keys = trie.keys() - assert (keys & keys) == set("12") + assert (keys & keys) == {"1", "2"} assert (keys & keys) != set() - assert (keys & keys) != set("1") - assert (keys & keys) != set("2") - assert (keys & '1') == set("1") + assert (keys & keys) != {"1"} + assert (keys & keys) != {"2"} + assert (keys & '1') == {"1"} with pytest.raises(TypeError): - assert (keys & 1) == set("1") # not iterable + assert (keys & 1) == {"1"} # not iterable assert (keys & 'ab') == set() + assert (keys & "12") == {"1", "2"} + assert (keys & "1") == {"1"} + trie["prefix_3"] = 3 + keys = trie.keys(prefix="prefix_") + assert (keys & keys) == {"prefix_3"} + assert (keys & keys) == keys + assert (keys & "12") == set() + assert (keys & "") == set() + + +def test_keys_union(): + trie = datrie.BaseTrie(string.printable) + trie["1"] = 1 + trie["2"] = 2 + trie["333"] = 2 + keys = trie.keys() + assert (keys | keys) == keys + assert (keys | set()) == set(keys) + del trie["333"] + assert (keys | {"1"}) == {"1", "2"} + del trie["1"] + assert (keys | {"1"}) == {"1", "2"} + assert (keys | {"2"}) == {"2"} + assert (keys | {"3"}) == {"2", "3"} + keys = trie.keys(prefix="") + assert (keys | {"3"}) == {"2", "3"} + keys = trie.keys(prefix="prefix") + assert (keys | {"3"}) == {"3"} + trie["prefix_3"] = 3 + assert (keys | {"3"}) == {"3", "prefix_3"} + + +def test_keys_difference(): + trie = datrie.BaseTrie(string.printable) + trie["1"] = 1 + trie["2"] = 2 + trie["3"] = 2 + keys = trie.keys() + assert (keys - set()) == set(keys) + assert (keys - {"3"}) == {"1", "2"} + assert (keys - {"2", "3"}) == {"1"} + assert (keys - {"1", "2", "3"}) == set() + assert (keys - {"1", "2", "3", "4"}) == set() + assert (keys - {"4"}) == 
{"1", "2", "3"} + keys = trie.keys(prefix="prefix") + assert (keys - set()) == set() + assert (keys - {"1"}) == set() + trie["prefix_1"] = 3 + assert (keys - set()) == {"prefix_1"} + assert (keys - {"prefix_1"}) == set() + assert (keys - {"prefix_2"}) == {"prefix_1"} + + +def test_keys_symmetric_difference(): + trie = datrie.BaseTrie(string.printable) + trie["1"] = 1 + trie["2"] = 2 + keys = trie.keys() + assert (keys ^ set()) == {"1", "2"} + assert (keys ^ {"1"}) == {"2"} + assert (keys ^ {"1", "2"}) == set() + assert (keys ^ {"1", "2", "3"}) == {"3"} + del trie["1"] + assert (keys ^ {"1"}) == {"1", "2"} + keys = trie.keys(prefix="prefix") + assert (keys ^ {"1"}) == {"1"} + trie["prefix_1"] = 3 + assert (keys ^ {"1"}) == {"prefix_1", "1"} + + +def test_keys_isdisjoint(): + # Return True if null intersection + trie = datrie.BaseTrie(string.printable) + trie["1"] = 1 + trie["2"] = 2 + keys = trie.keys() + assert keys.isdisjoint(set()) + assert not keys.isdisjoint({"1"}) + assert keys.isdisjoint({"3"}) + del trie["1"] + assert keys.isdisjoint({"1"}) + keys = trie.keys(prefix="prefix") + assert keys.isdisjoint({"1"}) + assert keys.isdisjoint({"2"}) + trie["prefix_1"] = 3 + assert keys.isdisjoint({"2"}) + assert not keys.isdisjoint({"prefix_1"}) + assert keys.isdisjoint({"prefix_2"}) diff --git a/tests/test_trie.py b/tests/test_trie.py index 0b3f039..ec553ce 100644 --- a/tests/test_trie.py +++ b/tests/test_trie.py @@ -141,9 +141,9 @@ def test_trie_items(): trie['foo'] = 10 trie['bar'] = 'foo' trie['foobar'] = 30 - assert trie.values() == ['foo', 10, 30] - assert trie.items() == [('bar', 'foo'), ('foo', 10), ('foobar', 30)] - assert trie.keys() == ['bar', 'foo', 'foobar'] + assert list(trie.values()) == ['foo', 10, 30] + assert list(trie.items()) == [('bar', 'foo'), ('foo', 10), ('foobar', 30)] + assert list(trie.keys()) == ['bar', 'foo', 'foobar'] def test_trie_iter(): @@ -241,14 +241,14 @@ def _trie(self): def test_trie_keys_prefix(self): trie = self._trie() - assert 
trie.keys('foobarz') == ['foobarzartic'] - assert trie.keys('foobarzart') == ['foobarzartic'] - assert trie.keys('foo') == ['foo', 'foobar', 'foobarzartic', 'foovar'] - assert trie.keys('foobar') == ['foobar', 'foobarzartic'] - assert trie.keys('') == [ + assert list(trie.keys('foobarz')) == ['foobarzartic'] + assert list(trie.keys('foobarzart')) == ['foobarzartic'] + assert list(trie.keys('foo')) == ['foo', 'foobar', 'foobarzartic', 'foovar'] + assert list(trie.keys('foobar')) == ['foobar', 'foobarzartic'] + assert list(trie.keys('')) == [ 'bar', 'foo', 'foobar', 'foobarzartic', 'foovar' ] - assert trie.keys('x') == [] + assert list(trie.keys('x')) == [] def test_trie_items_prefix(self): trie = self._trie() From e171708adf0f84065c120c451097af5fe034cf86 Mon Sep 17 00:00:00 2001 From: pacahon Date: Sun, 4 Sep 2016 22:06:37 +0300 Subject: [PATCH 07/13] trie views based on collections.abc --- src/datrie.pyx | 317 +++++++------------ tests/test_trie.py | 25 +- tests/{test_dictview.py => test_trieview.py} | 0 3 files changed, 130 insertions(+), 212 deletions(-) rename tests/{test_dictview.py => test_trieview.py} (100%) diff --git a/src/datrie.pyx b/src/datrie.pyx index ff38233..d8090aa 100644 --- a/src/datrie.pyx +++ b/src/datrie.pyx @@ -16,14 +16,18 @@ import itertools import warnings import sys import tempfile -from collections import MutableMapping, Set, Iterable, KeysView, \ - ValuesView, ItemsView +from collections import MutableMapping, Set, Sized try: import cPickle as pickle except ImportError: import pickle +try: + base_str = basestring +except NameError: + base_str = str + class DatrieError(Exception): pass @@ -682,67 +686,24 @@ cdef class Trie(BaseTrie): cpdef items(self, unicode prefix=None): """ - Returns a list of this trie's items (``(key,value)`` tuples). + D.items() -> a set-like object providing a view on D's items. If ``prefix`` is not None, returns only the items associated with keys prefixed by ``prefix``. 
""" - # the following code is - # - # [(k, self._values[v]) for (k,v) in BaseTrie.items(self, prefix)] - # - # but inlined for speed. - - cdef bint success - cdef list res = [] - cdef BaseState state = BaseState(self) - - if prefix is not None: - success = state.walk(prefix) - if not success: - return res - - cdef BaseIterator iter = BaseIterator(state) - - if prefix is None: - while iter.next(): - res.append((iter.key(), self._values[iter.data()])) - else: - while iter.next(): - res.append((prefix+iter.key(), self._values[iter.data()])) - - return res + return TrieItemsView(self, prefix) cpdef values(self, unicode prefix=None): """ - Returns a list of this trie's values. + D.values() -> an object providing a view on D's values If ``prefix`` is not None, returns only the values associated with keys prefixed by ``prefix``. """ - # the following code is - # - # [self._values[v] for v in BaseTrie.values(self, prefix)] - # - # but inlined for speed. + return TrieValuesView(self, prefix) - cdef list res = [] - cdef BaseState state = BaseState(self) - cdef bint success - - if prefix is not None: - success = state.walk(prefix) - if not success: - return res - - cdef BaseIterator iter = BaseIterator(state) - - while iter.next(): - res.append(self._values[iter.data()]) - - return res def longest_prefix_item(self, unicode key, default=RAISE_KEY_ERROR): """ @@ -934,22 +895,22 @@ cdef class Iterator(_TrieIterator): return self._root._trie._index_to_value(data) -cdef class BaseTrieKeysView: - cdef BaseState _state - cdef unicode _prefix +class BaseTrieMappingView(Sized): + + __slots__ = ('_state', '_prefix') - def __init__(self, BaseTrie base_trie, unicode prefix): + def __init__(self, base_trie, prefix=None): cdef BaseState state = BaseState(base_trie) self._state = state self._prefix = prefix - cdef int _rewind_state(self, unicode new_state): + def _rewind_state(self, new_state): """ - Reset state to root. Then if `new_state` is not None, try to walk - to new state. 
+ Reset state to root. Next try to walk to new state, if `new_state` + is not None. """ self._state.rewind() - if new_state is not None: + if new_state is not None and isinstance(new_state, base_str): if not self._state.walk(new_state): return False return True @@ -964,6 +925,22 @@ cdef class BaseTrieKeysView: count += 1 return count + +class BaseTrieKeysView(BaseTrieMappingView, Set): + + __slots__ = () + + @classmethod + def _from_iterable(cls, it): + return set(it) + + def __contains__(self, item): + if self._prefix and not item.startswith(self._prefix): + return False + if self._rewind_state(item) and self._state.is_terminal(): + return True + return False + def __iter__(self): if not self._rewind_state(self._prefix): raise StopIteration @@ -974,118 +951,37 @@ cdef class BaseTrieKeysView: else: yield self._prefix + it.key() - def __contains__(self, item): - if self._prefix and not item.startswith(self._prefix): - return False - if self._rewind_state(item) and self._state.is_terminal(): + def __eq__(self, other): + # Works faster than Set implementation due to one linear passing + if other is self: return True - return False - - def __richcmp__(self, other, int op): - if op == 0: # < or __lt__ - # Test whether the set is a proper subset of other, that is, - # `set <= other and set != other`. 
- if not isinstance(other, Set): - return NotImplemented - return len(self) < len(other) and self._issubset(other) - elif op == 1: # <= or __le__ - # s.issubset(t) - test whether every element in s is in t - if not isinstance(other, Set): - return NotImplemented - return self._issubset(other) - elif op == 2: # == - if other is self: - return True - elif not isinstance(other, Set): - # No TypeError for equality - return False - count = 0 - for elem in self: - count += 1 - if elem not in other: - return False - return count == len(other) - # return len(self) == len(other) and self._issubset(other) - elif op == 3: # != - return not (self == other) - elif op == 4: # > or __gt__ - # set > other - test whether the `set` is a proper superset - # of `other`, that is, set >= other and set != other. - if not isinstance(other, Set): - return NotImplemented - return len(self) > len(other) and self._issuperset(other) - elif op == 5: # >= or __ge__ - # s.issuperset(t) - test whether every element in t is in s - if not isinstance(other, Set): - return NotImplemented - return self._issuperset(other) - - cpdef bool _issubset(self, other): - """s._issubset(t) - test whether every element in s is in t""" - if len(self) > len(other): + elif not isinstance(other, Set): + # No TypeError for equality return False + count = 0 for elem in self: + count += 1 if elem not in other: return False - return True - - cpdef bool _issuperset(self, other): - """s._issuperset(t) - test whether every element in t is in s""" - if len(self) < len(other): - return False - try: - for elem in other: - if elem not in self: - return False - except TypeError: - return False - return True + return count == len(other) - # Note: For KeysView explicitly used set in `_from_iterable` method - def __and__(self, other): # intersection, & - """Return a new set with elements common to dict_view and `other`.""" - if other is self: - return set(self) - elif not isinstance(other, Iterable): - return NotImplemented - return 
{key for key in self if key in other} - def __or__(self, other): # union, | - if other is self: - return set(self) - elif not isinstance(other, Iterable): - return NotImplemented - return {e for e in itertools.chain(self, other)} - - def __sub__(self, other): # difference, - - if not isinstance(other, Set): - if not isinstance(other, Iterable): - return NotImplemented - other = set(other) - return set(value for value in self if value not in other) - - def __xor__(self, other): - # symmetric_difference, set ^ other - # Return a new set with elements in either the set or other but not both. - if not isinstance(other, Set): - if not isinstance(other, Iterable): - return NotImplemented - other = set(other) - return (self - other) | (other - self) - - def isdisjoint(self, other): - """Return True if keys view and `other` have a null intersection.""" - if other is self: - return False - for value in other: - if value in self: - return False - return True +class BaseTrieItemsView(BaseTrieMappingView, Set): + __slots__ = () -class BaseTrieItemsView(BaseTrieKeysView): + @classmethod + def _from_iterable(cls, it): + return set(it) - __slots__ = () + def __contains__(self, item): + key, value = item + if self._prefix and not key.startswith(self._prefix): + return False + if self._rewind_state(key) and self._state.is_terminal(): + v = self._state.data() + return v is value or v == value + return False def __iter__(self): if not self._rewind_state(self._prefix): @@ -1097,53 +993,76 @@ class BaseTrieItemsView(BaseTrieKeysView): else: yield (self._prefix + it.key(), it.data()) - def __contains__(self, item): - key, value = item - if self._prefix and not key.startswith(self._prefix): + +class BaseTrieValuesView(BaseTrieMappingView): + + __slots__ = () + + def __contains__(self, value): + if self._prefix and not value.startswith(self._prefix): return False - if self._rewind_state(key) and self._state.is_terminal(): - v = self._state.data() - return v is value or v == value + for v 
in self: + if v is value or v == value: + return True return False + def __iter__(self): + if not self._rewind_state(self._prefix): + raise StopIteration + cdef BaseIterator it = BaseIterator(self._state) + while it.next(): + yield it.data() -# FIXME: copy paste from BaseTrieKeysView in most?! -cdef class BaseTrieValuesView: - cdef BaseState _state - cdef unicode _prefix - def __init__(self, BaseTrie base_trie, unicode prefix): - cdef BaseState state = BaseState(base_trie) +class TrieMappingView(BaseTrieMappingView): + + __slots__ = () + + def __init__(self, base_trie, prefix=None): + cdef State state = State(base_trie) self._state = state self._prefix = prefix - cdef int _rewind_state(self, unicode new_state): - """ - Reset state to root. Then if `new_state` is not None, try to walk - to new state. - """ - self._state.rewind() - if new_state is not None: - if not self._state.walk(new_state): - return False - return True - def __len__(self): - """O(n) in current implementation""" - cdef int count = 0 - cdef _TrieIterator it - if self._rewind_state(self._prefix): - it = _TrieIterator(self._state) - while it.next(): - count += 1 - return count +class TrieKeysView(BaseTrieKeysView): + pass + + +class TrieItemsView(TrieMappingView, Set): + + __slots__ = () + + @classmethod + def _from_iterable(cls, it): + return set(it) + + def __contains__(self, item): + key, value = item + if self._prefix and not key.startswith(self._prefix): + return False + if self._rewind_state(key) and self._state.is_terminal(): + v = self._state.data() + return v is value or v == value + return False def __iter__(self): if not self._rewind_state(self._prefix): raise StopIteration - cdef BaseIterator it = BaseIterator(self._state) + cdef Iterator it = Iterator(self._state) while it.next(): - yield it.data() + if self._prefix is None: + yield (it.key(), it.data()) + else: + yield (self._prefix + it.key(), it.data()) + + +class TrieValuesView(TrieMappingView): + + __slots__ = () + + @classmethod + 
def _from_iterable(cls, it): + return set(it) def __contains__(self, value): if self._prefix and not value.startswith(self._prefix): @@ -1153,11 +1072,12 @@ cdef class BaseTrieValuesView: return True return False -# FIXME: or add self.trie to BaseTrieKeysView? -class TrieKeysView(BaseTrieKeysView): - def __init__(self, BaseTrie base_trie, unicode prefix): - self.trie = base_trie - super(TrieKeysView, self).__init__(base_trie, prefix) + def __iter__(self): + if not self._rewind_state(self._prefix): + raise StopIteration + cdef Iterator it = Iterator(self._state) + while it.next(): + yield it.data() cdef (cdatrie.Trie* ) _load_from_file(f) except NULL: @@ -1325,6 +1245,3 @@ def new(alphabet=None, ranges=None, AlphaMap alpha_map=None): MutableMapping.register(Trie) MutableMapping.register(BaseTrie) -KeysView.register(BaseTrieKeysView) -ItemsView.register(BaseTrieItemsView) -ValuesView.register(BaseTrieValuesView) diff --git a/tests/test_trie.py b/tests/test_trie.py index ec553ce..f6ec663 100644 --- a/tests/test_trie.py +++ b/tests/test_trie.py @@ -252,26 +252,27 @@ def test_trie_keys_prefix(self): def test_trie_items_prefix(self): trie = self._trie() - assert trie.items('foobarz') == [('foobarzartic', None)] - assert trie.items('foobarzart') == [('foobarzartic', None)] - assert trie.items('foo') == [ + assert list(trie.items('foobarz')) == [('foobarzartic', None)] + assert list(trie.items('foobarzart')) == [('foobarzartic', None)] + assert list(trie.items('foo')) == [ ('foo', 10), ('foobar', 30), ('foobarzartic', None), ('foovar', 40) ] - assert trie.items('foobar') == [('foobar', 30), ('foobarzartic', None)] - assert trie.items('') == [ + assert list(trie.items('foobar')) == [ + ('foobar', 30), ('foobarzartic', None)] + assert list(trie.items('')) == [ ('bar', 20), ('foo', 10), ('foobar', 30), ('foobarzartic', None), ('foovar', 40) ] - assert trie.items('x') == [] + assert list(trie.items('x')) == [] def test_trie_values_prefix(self): trie = self._trie() - assert 
trie.values('foobarz') == [None] - assert trie.values('foobarzart') == [None] - assert trie.values('foo') == [10, 30, None, 40] - assert trie.values('foobar') == [30, None] - assert trie.values('') == [20, 10, 30, None, 40] - assert trie.values('x') == [] + assert list(trie.values('foobarz')) == [None] + assert list(trie.values('foobarzart')) == [None] + assert list(trie.values('foo')) == [10, 30, None, 40] + assert list(trie.values('foobar')) == [30, None] + assert list(trie.values('')) == [20, 10, 30, None, 40] + assert list(trie.values('x')) == [] class TestPrefixSearch(object): diff --git a/tests/test_dictview.py b/tests/test_trieview.py similarity index 100% rename from tests/test_dictview.py rename to tests/test_trieview.py From bf4881da38b4e4b50a7b66273356c5b7d43fc68e Mon Sep 17 00:00:00 2001 From: pacahon Date: Sun, 4 Sep 2016 22:18:34 +0300 Subject: [PATCH 08/13] fix _rewind_state method (WIP: add test) --- src/datrie.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/datrie.pyx b/src/datrie.pyx index d8090aa..0799978 100644 --- a/src/datrie.pyx +++ b/src/datrie.pyx @@ -904,14 +904,16 @@ class BaseTrieMappingView(Sized): self._state = state self._prefix = prefix + # TODO: Add test to check cases where `new_state` is not str def _rewind_state(self, new_state): """ Reset state to root. Next try to walk to new state, if `new_state` is not None.
""" self._state.rewind() - if new_state is not None and isinstance(new_state, base_str): - if not self._state.walk(new_state): + if new_state is not None: + if (not isinstance(new_state, base_str) or + not self._state.walk(new_state)): return False return True From 6a1dc4b3a2b934a06dade85b2a0eceb356c71c65 Mon Sep 17 00:00:00 2001 From: pacahon Date: Thu, 8 Sep 2016 01:13:53 +0300 Subject: [PATCH 09/13] WIP: trie size support --- libdatrie/datrie/libdatrie.def | 1 + libdatrie/datrie/trie.c | 35 +++++++++++++++++++++ libdatrie/datrie/trie.h | 6 ++++ libdatrie/tests/test_store-retrieve.c | 44 +++++++++++++++++++++++++++ src/datrie.pyx | 14 ++------- 5 files changed, 88 insertions(+), 12 deletions(-) diff --git a/libdatrie/datrie/libdatrie.def b/libdatrie/datrie/libdatrie.def index 185bda5..a0f0566 100644 --- a/libdatrie/datrie/libdatrie.def +++ b/libdatrie/datrie/libdatrie.def @@ -10,6 +10,7 @@ trie_fread trie_free trie_save trie_fwrite +trie_size trie_is_dirty trie_retrieve trie_store diff --git a/libdatrie/datrie/trie.c b/libdatrie/datrie/trie.c index 37e95a6..05250e1 100644 --- a/libdatrie/datrie/trie.c +++ b/libdatrie/datrie/trie.c @@ -44,6 +44,7 @@ struct _Trie { DArray *da; Tail *tail; + uint32 size; Bool is_dirty; }; @@ -133,6 +134,7 @@ trie_new (const AlphaMap *alpha_map) if (UNLIKELY (!trie->tail)) goto exit_da_created; + trie->size = 0; trie->is_dirty = TRUE; return trie; @@ -203,6 +205,11 @@ trie_fread (FILE *file) if (NULL == (trie->tail = tail_fread (file))) goto exit_da_created; + uint32 counter = 0; + if (!trie_enumerate (trie, len_enumerator, &counter)) { + goto exit_trie_created; + } + trie->size = counter; trie->is_dirty = FALSE; return trie; @@ -290,6 +297,19 @@ trie_fwrite (Trie *trie, FILE *file) return 0; } +/** + * @brief Get trie size + * + * @param trie : the trie object + * + * @return total count of trie keys + */ +uint32 +trie_size (const Trie *trie) +{ + return trie->size; +} + /** * @brief Check pending changes * @@ -431,6 +451,9
@@ trie_store_conditionally (Trie *trie, res = trie_branch_in_branch (trie, s, key_str, data); free (key_str); + if (res) { + trie->size++; + } return res; } if (0 == *p) @@ -455,6 +478,9 @@ trie_store_conditionally (Trie *trie, res = trie_branch_in_tail (trie, s, tail_str, data); free (tail_str); + if (res) { + trie->size++; + } return res; } if (0 == *p) @@ -580,6 +606,7 @@ trie_delete (Trie *trie, const AlphaChar *key) da_set_base (trie->da, s, TRIE_INDEX_ERROR); da_prune (trie->da, s); + trie->size--; trie->is_dirty = TRUE; return TRUE; } @@ -630,6 +657,14 @@ trie_enumerate (const Trie *trie, TrieEnumFunc enum_func, void *user_data) } +Bool +len_enumerator (const AlphaChar *key, TrieData key_data, uint32 *counter_ptr) +{ + (*counter_ptr)++; + return TRUE; +} + + /*-------------------------------* * STEPWISE QUERY OPERATIONS * *-------------------------------*/ diff --git a/libdatrie/datrie/trie.h b/libdatrie/datrie/trie.h index da16483..91398f8 100644 --- a/libdatrie/datrie/trie.h +++ b/libdatrie/datrie/trie.h @@ -129,6 +129,8 @@ int trie_save (Trie *trie, const char *path); int trie_fwrite (Trie *trie, FILE *file); +uint32 trie_size (const Trie *trie); + Bool trie_is_dirty (const Trie *trie); @@ -150,6 +152,10 @@ Bool trie_enumerate (const Trie *trie, TrieEnumFunc enum_func, void *user_data); +Bool len_enumerator (const AlphaChar *key, + TrieData key_data, + uint32 *counter_ptr); + /*-------------------------------* * STEPWISE QUERY OPERATIONS * diff --git a/libdatrie/tests/test_store-retrieve.c b/libdatrie/tests/test_store-retrieve.c index fed7213..13ef34d 100644 --- a/libdatrie/tests/test_store-retrieve.c +++ b/libdatrie/tests/test_store-retrieve.c @@ -48,6 +48,19 @@ main () goto err_trie_not_created; } + msg_step ("Check initial trie size"); + if (trie_size(test_trie) != 0) { + printf ("Wrong trie size; expected 0, got %d.\n", trie_size(test_trie)); + goto err_trie_size; + } + + msg_step ("Delete non-existent key from trie and check size"); + trie_delete 
(test_trie, (AlphaChar *)L"a"); + if (trie_size(test_trie) != 0) { + printf ("Wrong trie size; expected 0, got %d.\n", trie_size(test_trie)); + goto err_trie_size; + } + /* store */ msg_step ("Adding data to trie"); for (dict_p = dict_src; dict_p->key; dict_p++) { @@ -58,6 +71,27 @@ main () } } + msg_step ("Check trie size"); + if (trie_size(test_trie) != dict_src_n_entries()) { + printf ("Wrong trie size; expected %d, got %d.\n", + dict_src_n_entries(), trie_size(test_trie)); + goto err_trie_size; + } + + msg_step ("Update existing trie element and check trie size"); + if (!trie_store (test_trie, dict_src[1].key, dict_src[1].data)) { + printf ("Failed to add key '%ls', data %d.\n", + dict_src[1].key, dict_src[1].data); + goto err_trie_created; + } + if (trie_size(test_trie) != dict_src_n_entries()) { + printf ("Wrong trie size; expected %d, got %d.\n", + dict_src_n_entries(), trie_size(test_trie)); + goto err_trie_size; + } + + // TODO: add key with wrong alphabet and check size? + /* retrieve */ msg_step ("Retrieving data from trie"); is_failed = FALSE; @@ -99,6 +133,14 @@ main () goto err_trie_created; } + msg_step ("Check trie size after deleting some entries."); + if (trie_size(test_trie) != (n_entries - (n_entries/3 + 1))) { + printf ("Wrong trie size; expected %d, got %d.\n", + (n_entries - (n_entries/3 + 1)), trie_size(test_trie)); + goto err_trie_size; + } + + /* retrieve */ msg_step ("Retrieving data from trie again after deletions"); for (dict_p = dict_src; dict_p->key; dict_p++) { @@ -192,6 +234,8 @@ main () trie_state_free (trie_root_state); err_trie_created: trie_free (test_trie); +err_trie_size: + trie_free (test_trie); err_trie_not_created: return 1; } diff --git a/src/datrie.pyx b/src/datrie.pyx index 0799978..ccb161f 100644 --- a/src/datrie.pyx +++ b/src/datrie.pyx @@ -230,18 +230,8 @@ cdef class BaseTrie: if not found: raise KeyError(key) - @staticmethod - cdef int len_enumerator(cdatrie.AlphaChar *key, cdatrie.TrieData key_data, - void 
*counter_ptr): - (counter_ptr)[0] += 1 - return True - def __len__(self): - cdef int counter = 0 - cdatrie.trie_enumerate(self._c_trie, - (self.len_enumerator), - &counter) - return counter + return cdatrie.trie_size(self._c_trie) def __richcmp__(self, other, int op): if op == 2: # == @@ -954,7 +944,7 @@ class BaseTrieKeysView(BaseTrieMappingView, Set): yield self._prefix + it.key() def __eq__(self, other): - # Works faster than Set implementation due to one linear passing + # Fail-fast version if other is self: return True elif not isinstance(other, Set): From 9b4daa9e8a3a84e6e14211536e315817cd971aa5 Mon Sep 17 00:00:00 2001 From: pacahon Date: Tue, 27 Sep 2016 02:39:32 +0300 Subject: [PATCH 10/13] trie_size test with trie_store_if_absent --- libdatrie/tests/test_store-retrieve.c | 39 +++++++++++++++++++++++++++ src/datrie.pyx | 6 +++-- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/libdatrie/tests/test_store-retrieve.c b/libdatrie/tests/test_store-retrieve.c index 13ef34d..f967a4d 100644 --- a/libdatrie/tests/test_store-retrieve.c +++ b/libdatrie/tests/test_store-retrieve.c @@ -60,6 +60,21 @@ main () printf ("Wrong trie size; expected 0, got %d.\n", trie_size(test_trie)); goto err_trie_size; } + msg_step ("Add non-existent key with trie_store_if_absent and check size"); + if (!trie_store_if_absent (test_trie, (AlphaChar *)L"a", TRIE_DATA_UNREAD)) { + printf ("Failed to add non-existing key '%ls'.\n", (AlphaChar *)L"a"); + goto err_trie_created; + } + if (trie_size(test_trie) != 1) { + printf ("Wrong trie size; expected 1, got %d.\n", trie_size(test_trie)); + goto err_trie_size; + } + msg_step ("Delete existing key from trie and check size"); + trie_delete (test_trie, (AlphaChar *)L"a"); + if (trie_size(test_trie) != 0) { + printf ("Wrong trie size; expected 0, got %d.\n", trie_size(test_trie)); + goto err_trie_size; + } /* store */ msg_step ("Adding data to trie"); @@ -90,6 +105,30 @@ main () goto err_trie_size; } + msg_step ("Update existing trie
element with trie_store_if_absent and check trie size"); + if (trie_store_if_absent (test_trie, dict_src[1].key, dict_src[1].data)) { + printf ("Value for existing key '%ls' was updated with trie_store_if_absent.\n", + dict_src[1].key); + goto err_trie_created; + } + if (trie_size(test_trie) != dict_src_n_entries()) { + printf ("Wrong trie size; expected %d, got %d.\n", + dict_src_n_entries(), trie_size(test_trie)); + goto err_trie_size; + } + + msg_step ("Add trie element with wrong alphabet and check trie size"); + if (trie_store (test_trie, (AlphaChar *)L"я", TRIE_DATA_UNREAD)) { + printf ("Key '%ls' with wrong alphabet was added.\n", + (AlphaChar *)L"я"); + goto err_trie_created; + } + if (trie_size(test_trie) != dict_src_n_entries()) { + printf ("Wrong trie size; expected %d, got %d.\n", + dict_src_n_entries(), trie_size(test_trie)); + goto err_trie_size; + } + // TODO: add key with wrong alphabet and check size? /* retrieve */ diff --git a/src/datrie.pyx b/src/datrie.pyx index ccb161f..f48ab43 100644 --- a/src/datrie.pyx +++ b/src/datrie.pyx @@ -894,7 +894,7 @@ class BaseTrieMappingView(Sized): self._state = state self._prefix = prefix - # TODO: Add test to check cases where `new_state` is not str + # TODO: Add test to check cases where `new_state` is not str? def _rewind_state(self, new_state): """ Reset state to root. 
Next try to walk to new state, if `new_state` @@ -908,7 +908,9 @@ class BaseTrieMappingView(Sized): return True def __len__(self): - """O(n) in current implementation""" + """O(n) if prefix is defined""" + if self._prefix is None: + return len(self._state._trie) cdef int count = 0 cdef _TrieIterator it if self._rewind_state(self._prefix): From 6f96c0a27a6698b9ac394f4f26f529dcae5aa08e Mon Sep 17 00:00:00 2001 From: pacahon Date: Sun, 2 Oct 2016 22:12:57 +0300 Subject: [PATCH 11/13] add get_tree method to state --- src/datrie.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/datrie.pyx b/src/datrie.pyx index f48ab43..0fb8959 100644 --- a/src/datrie.pyx +++ b/src/datrie.pyx @@ -770,6 +770,9 @@ cdef class _TrieState: if self._state is not NULL: cdatrie.trie_state_free(self._state) + cpdef get_tree(self): + return self._trie + cpdef walk(self, unicode to): cdef bint res for ch in to: @@ -910,7 +913,7 @@ class BaseTrieMappingView(Sized): def __len__(self): """O(n) if prefix is defined""" if self._prefix is None: - return len(self._state._trie) + return len(self._state.get_tree()) cdef int count = 0 cdef _TrieIterator it if self._rewind_state(self._prefix): From 10b0ab5e0196f9f065f9ab3c56cbab741a6467f5 Mon Sep 17 00:00:00 2001 From: pacahon Date: Sun, 2 Oct 2016 22:46:02 +0300 Subject: [PATCH 12/13] cosmetics --- tests/test_trieview.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/test_trieview.py b/tests/test_trieview.py index 8e30c59..201479b 100644 --- a/tests/test_trieview.py +++ b/tests/test_trieview.py @@ -14,7 +14,6 @@ def test_keys_empty(): assert len(keys) == 0 -# TODO: Can I use py.test fixtures here? 
def test_keys_iter(): trie = datrie.BaseTrie(string.printable) trie["1"] = 1 @@ -56,6 +55,14 @@ def test_keys_contains(): keys = trie.keys(prefix="prefix1") assert "prefix1_1" in keys assert "prefix2_1" not in keys + trie["1"] = 1 + keys = trie.keys() + assert "1" in keys + assert 1 not in keys + assert [1] not in keys + items = trie.items() + assert ("1", 1) in items + assert (1, 1) not in items def test_keys_len(): From 8f49e6e8f10b5bb86e602c324925dd376e6d0b57 Mon Sep 17 00:00:00 2001 From: pacahon Date: Sun, 2 Oct 2016 22:49:55 +0300 Subject: [PATCH 13/13] remove stale TODO --- src/datrie.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/src/datrie.pyx b/src/datrie.pyx index 0fb8959..42bae13 100644 --- a/src/datrie.pyx +++ b/src/datrie.pyx @@ -897,7 +897,6 @@ class BaseTrieMappingView(Sized): self._state = state self._prefix = prefix - # TODO: Add test to check cases where `new_state` is not str? def _rewind_state(self, new_state): """ Reset state to root. Next try to walk to new state, if `new_state`