From fdf5d1928d9fc1cb33930b3657a3530298cfca2f Mon Sep 17 00:00:00 2001
From: cachho
Date: Wed, 9 Aug 2023 05:42:30 +0200
Subject: [PATCH] test: added chunker unit tests (#325)

---
 tests/chunkers/test_text.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/tests/chunkers/test_text.py b/tests/chunkers/test_text.py
index 79810c6f93..74f18b90ad 100644
--- a/tests/chunkers/test_text.py
+++ b/tests/chunkers/test_text.py
@@ -24,6 +24,35 @@ def test_chunks(self):

         # Additional test cases can be added to cover different scenarios

+    def test_big_chunksize(self):
+        """
+        Test that a chunk size far larger than the text yields exactly one chunk.
+        """
+        chunker_config = ChunkerConfig(chunk_size=9999999999, chunk_overlap=0, length_function=len)
+        chunker = TextChunker(config=chunker_config)
+        text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
+
+        result = chunker.create_chunks(MockLoader(), text)
+
+        documents = result["documents"]
+
+        self.assertEqual(len(documents), 1)
+
+    def test_small_chunksize(self):
+        """
+        Test that a chunk size of one yields one chunk per character.
+        """
+        chunker_config = ChunkerConfig(chunk_size=1, chunk_overlap=0, length_function=len)
+        chunker = TextChunker(config=chunker_config)
+        # Lorem ipsum is unsuitable here: chunks are deduplicated, so its recurring characters would be dropped.
+        text = """0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c"""
+
+        result = chunker.create_chunks(MockLoader(), text)
+
+        documents = result["documents"]
+
+        self.assertEqual(len(documents), len(text))
+

 class MockLoader:
     def load_data(self, src):