From 5f798b44900b85c41e241a94165f36068eee89f7 Mon Sep 17 00:00:00 2001 From: Changaco Date: Sun, 16 Jul 2017 23:07:57 +0200 Subject: [PATCH 1/2] don't fail when a duplicate file is updated --- legi/tar2sqlite.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/legi/tar2sqlite.py b/legi/tar2sqlite.py index e7484b6..1e2a6b7 100755 --- a/legi/tar2sqlite.py +++ b/legi/tar2sqlite.py @@ -252,8 +252,8 @@ def count_one(k): 'other_cid': text_cid, 'other_dossier': dossier, 'other_mtime': mtime, - }) - count_one('insert into duplicate_files') + }, replace=True) + count_one('upsert into duplicate_files') elif prev_mtime == mtime: skipped += 1 continue @@ -381,8 +381,8 @@ def count_one(k): 'other_cid': prev_cid, 'other_dossier': prev_dossier, 'other_mtime': prev_mtime, - }) - count_one('insert into duplicate_files') + }, replace=True) + count_one('upsert into duplicate_files') continue attrs['dossier'] = dossier From ea79f61f6a81f11e588ee87e8b7beb6cee64b0b2 Mon Sep 17 00:00:00 2001 From: Changaco Date: Sun, 16 Jul 2017 23:09:34 +0200 Subject: [PATCH 2/2] work around bug in lxml's incremental parser lxml is applying basic UTF-8 decoding to each chunk, which fails when a chunk ends in the middle of a UTF-8 sequence ``` Traceback (most recent call last): File "legi/tar2sqlite.py", line 510, in <module> main() File "legi/tar2sqlite.py", line 486, in main process_archive(db, args.directory + '/' + archive_name) File "legi/tar2sqlite.py", line 262, in process_archive xml.feed(block) File "src/lxml/parser.pxi", line 1217, in lxml.etree._FeedParser.feed (src/lxml/lxml.etree.c:114563) File "src/lxml/parser.pxi", line 1339, in lxml.etree._FeedParser.feed (src/lxml/lxml.etree.c:114436) File "src/lxml/parser.pxi", line 586, in lxml.etree._ParserContext._handleParseResult (src/lxml/lxml.etree.c:105777) File "src/lxml/parser.pxi", line 595, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:105896) File "src/lxml/parser.pxi", line 706, in
lxml.etree._handleParseResult (src/lxml/lxml.etree.c:107604) File "src/lxml/parser.pxi", line 635, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:106458) File "<string>", line 227 lxml.etree.XMLSyntaxError: Input is not proper UTF-8, indicate encoding ! Bytes: 0xC3 EOF, line 227, column 289 ``` --- legi/tar2sqlite.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/legi/tar2sqlite.py b/legi/tar2sqlite.py index 1e2a6b7..6426949 100755 --- a/legi/tar2sqlite.py +++ b/legi/tar2sqlite.py @@ -258,8 +258,7 @@ def count_one(k): skipped += 1 continue - for block in entry.get_blocks(): - xml.feed(block) + xml.feed(b''.join(entry.get_blocks())) root = xml.close() tag = root.tag meta = root.find('META')