From ea79f61f6a81f11e588ee87e8b7beb6cee64b0b2 Mon Sep 17 00:00:00 2001 From: Changaco Date: Sun, 16 Jul 2017 23:09:34 +0200 Subject: [PATCH] work around bug in lxml's incremental parser lxml is applying basic UTF-8 decoding to each chunk, which fails when a chunk ends in the middle of a UTF-8 sequence ``` Traceback (most recent call last): File "legi/tar2sqlite.py", line 510, in <module> main() File "legi/tar2sqlite.py", line 486, in main process_archive(db, args.directory + '/' + archive_name) File "legi/tar2sqlite.py", line 262, in process_archive xml.feed(block) File "src/lxml/parser.pxi", line 1217, in lxml.etree._FeedParser.feed (src/lxml/lxml.etree.c:114563) File "src/lxml/parser.pxi", line 1339, in lxml.etree._FeedParser.feed (src/lxml/lxml.etree.c:114436) File "src/lxml/parser.pxi", line 586, in lxml.etree._ParserContext._handleParseResult (src/lxml/lxml.etree.c:105777) File "src/lxml/parser.pxi", line 595, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:105896) File "src/lxml/parser.pxi", line 706, in lxml.etree._handleParseResult (src/lxml/lxml.etree.c:107604) File "src/lxml/parser.pxi", line 635, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:106458) File "<string>", line 227 lxml.etree.XMLSyntaxError: Input is not proper UTF-8, indicate encoding ! Bytes: 0xC3 EOF, line 227, column 289 ``` --- legi/tar2sqlite.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/legi/tar2sqlite.py b/legi/tar2sqlite.py index 1e2a6b7..6426949 100755 --- a/legi/tar2sqlite.py +++ b/legi/tar2sqlite.py @@ -258,8 +258,7 @@ def count_one(k): skipped += 1 continue - for block in entry.get_blocks(): - xml.feed(block) + xml.feed(b''.join(entry.get_blocks())) root = xml.close() tag = root.tag meta = root.find('META')