From 6c7083b72172f32646843958eedf19cad38eb4b1 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Fri, 15 Sep 2023 16:07:43 +0900 Subject: [PATCH] Leniently parse ARC records containing corrupt dates Reported-by: @thomasegense --- src/org/netpreserve/jwarc/MessageParser.java | 12 + src/org/netpreserve/jwarc/WarcParser.java | 245 ++++++++++-------- src/org/netpreserve/jwarc/WarcParser.rl | 20 +- src/org/netpreserve/jwarc/WarcReader.java | 1 + src/org/netpreserve/jwarc/cdx/CdxWriter.java | 6 + .../org/netpreserve/jwarc/WarcParserTest.java | 10 + 6 files changed, 182 insertions(+), 112 deletions(-) diff --git a/src/org/netpreserve/jwarc/MessageParser.java b/src/org/netpreserve/jwarc/MessageParser.java index 278511a..a8c4daa 100644 --- a/src/org/netpreserve/jwarc/MessageParser.java +++ b/src/org/netpreserve/jwarc/MessageParser.java @@ -6,8 +6,20 @@ package org.netpreserve.jwarc; import java.nio.ByteBuffer; +import java.util.function.Consumer; public class MessageParser { + private Consumer warningHandler; + + protected void emitWarning(String message) { + if (warningHandler != null) { + warningHandler.accept(message); + } + } + + void onWarning(Consumer warningHandler) { + this.warningHandler = warningHandler; + } protected static String getErrorContext(String input, int position, int length) { StringBuilder context = new StringBuilder(); diff --git a/src/org/netpreserve/jwarc/WarcParser.java b/src/org/netpreserve/jwarc/WarcParser.java index 753d66d..ed2c167 100644 --- a/src/org/netpreserve/jwarc/WarcParser.java +++ b/src/org/netpreserve/jwarc/WarcParser.java @@ -13,6 +13,7 @@ import java.time.LocalDateTime; import java.time.ZoneOffset; import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeParseException; import java.util.*; import static java.nio.charset.StandardCharsets.ISO_8859_1; @@ -20,7 +21,7 @@ import static java.nio.charset.StandardCharsets.US_ASCII; -// line 142 "WarcParser.rl" +// line 156 "WarcParser.rl" /** @@ -83,7 +84,7 @@ public void parse(ByteBuffer data) { int pe = data.limit(); -// line 87 "WarcParser.java" +// line 88 "WarcParser.java" { int _klen; int _trans = 0; @@ -164,23 +165,23 @@ else if ( ( (data.get(p) & 0xff)) > _warc_trans_keys[_mid+1] ) switch ( _warc_actions[_acts++] ) { case 0: -// line 26 "WarcParser.rl" +// line 27 "WarcParser.rl" { push(data.get(p)); } break; case 1: -// line 27 "WarcParser.rl" +// line 28 "WarcParser.rl" { major = major * 10 + data.get(p) - '0'; } break; case 2: -// line 28 "WarcParser.rl" +// line 29 "WarcParser.rl" { minor = minor * 10 + data.get(p) - '0'; } break; case 3: -// line 29 "WarcParser.rl" +// line 30 "WarcParser.rl" { endOfText = bufPos; } break; case 4: -// line 31 "WarcParser.rl" +// line 32 "WarcParser.rl" { if (bufPos > 0) { bufPos = endOfText; @@ -189,14 +190,14 @@ else if ( ( (data.get(p) & 0xff)) > _warc_trans_keys[_mid+1] ) } break; case 5: -// line 38 "WarcParser.rl" +// line 39 "WarcParser.rl" { name = new String(buf, 0, bufPos, US_ASCII); bufPos = 0; } break; case 6: -// line 43 "WarcParser.rl" +// line 44 "WarcParser.rl" { String value = new String(buf, 0, endOfText, UTF_8); headerMap.computeIfAbsent(name, n -> new ArrayList<>()).add(value); @@ -205,7 +206,7 @@ else if ( ( (data.get(p) & 0xff)) > _warc_trans_keys[_mid+1] ) } break; case 7: -// line 50 "WarcParser.rl" +// line 51 "WarcParser.rl" { String url = new String(buf, 0, bufPos, ISO_8859_1); if (url.startsWith("filedesc://")) { @@ -225,30 +226,42 @@ else if ( ( (data.get(p) & 0xff)) > _warc_trans_keys[_mid+1] ) } break; case 8: -// line 68 "WarcParser.rl" +// line 69 "WarcParser.rl" { setHeader("WARC-IP-Address", new String(buf, 0, bufPos, US_ASCII)); bufPos = 0; } break; case 9: -// line 73 "WarcParser.rl" +// line 74 "WarcParser.rl" { String arcDate = new String(buf, 0, bufPos, US_ASCII); - Instant instant = LocalDateTime.parse(arcDate, arcTimeFormat).toInstant(ZoneOffset.UTC); - setHeader("WARC-Date", instant.toString()); + // Some WARC files have been seen in the wild with truncated dates + if (arcDate.length() < 14) { + emitWarning("ARC date too short (" + arcDate.length() + " digits)"); + arcDate = arcDate + "00000000000000".substring(arcDate.length()); + } else if (arcDate.length() > 14) { + emitWarning("ARC date too long (" + arcDate.length() + " digits)"); + arcDate = arcDate.substring(0, 14); + } + try { + Instant instant = LocalDateTime.parse(arcDate, arcTimeFormat).toInstant(ZoneOffset.UTC); + setHeader("WARC-Date", instant.toString()); + } catch (DateTimeParseException e) { + emitWarning("ARC date not parsable"); + } bufPos = 0; } break; case 10: -// line 80 "WarcParser.rl" +// line 93 "WarcParser.rl" { setHeader("Content-Length", new String(buf, 0, bufPos, US_ASCII)); bufPos = 0; } break; case 11: -// line 85 "WarcParser.rl" +// line 98 "WarcParser.rl" { protocol = "ARC"; major = 1; @@ -256,10 +269,10 @@ else if ( ( (data.get(p) & 0xff)) > _warc_trans_keys[_mid+1] ) } break; case 12: -// line 140 "WarcParser.rl" +// line 154 "WarcParser.rl" { { p += 1; _goto_targ = 5; if (true) continue _goto;} } break; -// line 263 "WarcParser.java" +// line 276 "WarcParser.java" } } } @@ -279,7 +292,7 @@ else if ( ( (data.get(p) & 0xff)) > _warc_trans_keys[_mid+1] ) break; } } -// line 204 "WarcParser.rl" +// line 218 "WarcParser.rl" position += p - data.position(); data.position(p); @@ -333,7 +346,7 @@ private void setHeader(String name, String value) { } -// line 337 "WarcParser.java" +// line 350 "WarcParser.java" private static byte[] init__warc_actions_0() { return new byte [] { @@ -352,11 +365,12 @@ private static short[] init__warc_key_offsets_0() 0, 0, 3, 4, 5, 6, 7, 9, 12, 14, 17, 18, 34, 35, 51, 57, 58, 76, 82, 88, 94, 97, 99, 101, 104, 106, 109, 111, 114, 116, 119, 121, 123, 125, 127, 129, - 131, 133, 135, 137, 139, 141, 143, 145, 147, 148, 165, 167, - 169, 172, 188, 205, 224, 228, 233, 236, 253, 269, 284, 302, - 309, 312, 316, 334, 351, 368, 386, 403, 412, 423, 435, 441, - 444, 445, 448, 449, 452, 453, 456, 457, 473, 474, 490, 496, - 497, 515, 521, 527, 533, 533 + 131, 133, 135, 138, 155, 157, 159, 162, 178, 195, 214, 218, + 223, 226, 243, 259, 274, 292, 299, 302, 306, 324, 341, 358, + 376, 393, 402, 413, 425, 431, 434, 437, 440, 443, 446, 449, + 452, 455, 458, 461, 464, 467, 470, 473, 476, 479, 482, 485, + 488, 489, 492, 493, 496, 497, 500, 501, 504, 505, 521, 522, + 538, 544, 545, 563, 569, 575, 581, 581 }; } @@ -377,32 +391,36 @@ private static char[] init__warc_trans_keys_0() 122, 10, 32, 48, 57, 46, 48, 57, 48, 57, 46, 48, 57, 48, 57, 46, 48, 57, 48, 57, 32, 48, 57, 48, 57, 48, 57, 48, 57, 48, 57, 48, 57, 48, 57, 48, - 57, 48, 57, 48, 57, 48, 57, 48, 57, 48, 57, 48, - 57, 48, 57, 32, 10, 32, 33, 124, 126, 35, 39, 42, - 43, 45, 46, 48, 57, 65, 90, 94, 122, 10, 32, 48, - 57, 10, 48, 57, 10, 32, 33, 47, 124, 126, 35, 39, - 42, 43, 45, 57, 65, 90, 94, 122, 10, 32, 33, 124, - 126, 35, 39, 42, 43, 45, 46, 48, 57, 65, 90, 94, - 122, 9, 10, 32, 33, 59, 124, 126, 35, 39, 42, 43, - 45, 46, 48, 57, 65, 90, 94, 122, 9, 10, 32, 59, - 9, 32, 59, 48, 57, 9, 32, 59, 9, 32, 33, 124, - 126, 35, 39, 42, 43, 45, 46, 48, 57, 65, 90, 94, - 122, 33, 61, 124, 126, 35, 39, 42, 43, 45, 46, 48, - 57, 65, 90, 94, 122, 34, 124, 126, 33, 39, 42, 43, - 45, 46, 48, 57, 65, 90, 94, 122, 9, 32, 33, 59, - 124, 126, 35, 39, 42, 43, 45, 46, 48, 57, 65, 90, - 94, 122, 9, 34, 92, 32, 126, 128, 255, 9, 32, 59, - 0, 191, 194, 244, 9, 10, 32, 33, 124, 126, 35, 39, - 42, 43, 45, 46, 48, 57, 65, 90, 94, 122, 9, 32, + 57, 48, 57, 32, 48, 57, 10, 32, 33, 124, 126, 35, + 39, 42, 43, 45, 46, 48, 57, 65, 90, 94, 122, 10, + 32, 48, 57, 10, 48, 57, 10, 32, 33, 47, 124, 126, + 35, 39, 42, 43, 45, 57, 65, 90, 94, 122, 10, 32, 33, 124, 126, 35, 39, 42, 43, 45, 46, 48, 57, 65, - 90, 94, 122, 10, 33, 61, 124, 126, 35, 39, 42, 43, - 45, 46, 48, 57, 65, 90, 94, 122, 10, 32, 33, 61, - 124, 126, 35, 39, 42, 43, 45, 46, 48, 57, 65, 90, - 94, 122, 10, 32, 34, 124, 126, 33, 39, 42, 43, 45, - 46, 48, 57, 65, 90, 94, 122, 9, 10, 32, 34, 92, - 33, 126, 128, 255, 9, 34, 92, 32, 47, 48, 57, 58, - 126, 128, 255, 9, 10, 34, 92, 32, 47, 48, 57, 58, - 126, 128, 255, 10, 32, 0, 191, 194, 244, 32, 48, 57, + 90, 94, 122, 9, 10, 32, 33, 59, 124, 126, 35, 39, + 42, 43, 45, 46, 48, 57, 65, 90, 94, 122, 9, 10, + 32, 59, 9, 32, 59, 48, 57, 9, 32, 59, 9, 32, + 33, 124, 126, 35, 39, 42, 43, 45, 46, 48, 57, 65, + 90, 94, 122, 33, 61, 124, 126, 35, 39, 42, 43, 45, + 46, 48, 57, 65, 90, 94, 122, 34, 124, 126, 33, 39, + 42, 43, 45, 46, 48, 57, 65, 90, 94, 122, 9, 32, + 33, 59, 124, 126, 35, 39, 42, 43, 45, 46, 48, 57, + 65, 90, 94, 122, 9, 34, 92, 32, 126, 128, 255, 9, + 32, 59, 0, 191, 194, 244, 9, 10, 32, 33, 124, 126, + 35, 39, 42, 43, 45, 46, 48, 57, 65, 90, 94, 122, + 9, 32, 33, 124, 126, 35, 39, 42, 43, 45, 46, 48, + 57, 65, 90, 94, 122, 10, 33, 61, 124, 126, 35, 39, + 42, 43, 45, 46, 48, 57, 65, 90, 94, 122, 10, 32, + 33, 61, 124, 126, 35, 39, 42, 43, 45, 46, 48, 57, + 65, 90, 94, 122, 10, 32, 34, 124, 126, 33, 39, 42, + 43, 45, 46, 48, 57, 65, 90, 94, 122, 9, 10, 32, + 34, 92, 33, 126, 128, 255, 9, 34, 92, 32, 47, 48, + 57, 58, 126, 128, 255, 9, 10, 34, 92, 32, 47, 48, + 57, 58, 126, 128, 255, 10, 32, 0, 191, 194, 244, 32, + 48, 57, 32, 48, 57, 32, 48, 57, 32, 48, 57, 32, + 48, 57, 32, 48, 57, 32, 48, 57, 32, 48, 57, 32, + 48, 57, 32, 48, 57, 32, 48, 57, 32, 48, 57, 32, + 48, 57, 32, 48, 57, 32, 48, 57, 32, 48, 57, 32, + 48, 57, 32, 48, 57, 32, 48, 57, 32, 32, 48, 57, 32, 46, 48, 57, 46, 46, 48, 57, 46, 46, 48, 57, 46, 13, 33, 124, 126, 35, 39, 42, 43, 45, 46, 48, 57, 65, 90, 94, 122, 10, 33, 58, 124, 126, 35, 39, @@ -423,11 +441,12 @@ private static byte[] init__warc_single_lengths_0() 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 4, 1, 4, 4, 1, 6, 4, 4, 4, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 2, 0, - 1, 6, 5, 7, 4, 3, 3, 5, 4, 3, 6, 3, - 3, 0, 6, 5, 5, 6, 5, 5, 3, 4, 2, 1, - 1, 1, 1, 1, 1, 1, 1, 4, 1, 4, 4, 1, - 6, 4, 4, 4, 0, 0 + 0, 0, 1, 5, 2, 0, 1, 6, 5, 7, 4, 3, + 3, 5, 4, 3, 6, 3, 3, 0, 6, 5, 5, 6, + 5, 5, 3, 4, 2, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 4, + 4, 1, 6, 4, 4, 4, 0, 0 }; } @@ -440,11 +459,12 @@ private static byte[] init__warc_range_lengths_0() 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 6, 0, 6, 1, 0, 6, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 0, 6, 0, 1, - 1, 5, 6, 6, 0, 1, 0, 6, 6, 6, 6, 2, - 0, 2, 6, 6, 6, 6, 6, 2, 4, 4, 2, 1, - 0, 1, 0, 1, 0, 1, 0, 6, 0, 6, 1, 0, - 6, 1, 1, 1, 0, 0 + 1, 1, 1, 6, 0, 1, 1, 5, 6, 6, 0, 1, + 0, 6, 6, 6, 6, 2, 0, 2, 6, 6, 6, 6, + 6, 2, 4, 4, 2, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 6, 0, 6, + 1, 0, 6, 1, 1, 1, 0, 0 }; } @@ -457,11 +477,12 @@ private static short[] init__warc_index_offsets_0() 0, 0, 3, 5, 7, 9, 11, 13, 16, 18, 21, 23, 34, 36, 47, 53, 55, 68, 74, 80, 86, 89, 92, 94, 97, 99, 102, 104, 107, 109, 112, 114, 116, 118, 120, 122, - 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 154, 157, - 159, 162, 174, 186, 200, 205, 210, 214, 226, 237, 247, 260, - 266, 270, 273, 286, 298, 310, 323, 335, 343, 351, 360, 365, - 368, 370, 373, 375, 378, 380, 383, 385, 396, 398, 409, 415, - 417, 430, 436, 442, 448, 449 + 124, 126, 128, 131, 143, 146, 148, 151, 163, 175, 189, 194, + 199, 203, 215, 226, 236, 249, 255, 259, 262, 275, 287, 299, + 312, 324, 332, 340, 349, 354, 357, 360, 363, 366, 369, 372, + 375, 378, 381, 384, 387, 390, 393, 396, 399, 402, 405, 408, + 411, 413, 416, 418, 421, 423, 426, 428, 431, 433, 444, 446, + 457, 463, 465, 478, 484, 490, 496, 497 }; } @@ -481,34 +502,38 @@ private static byte[] init__warc_indicies_0() 1, 18, 28, 2, 1, 1, 29, 28, 30, 1, 31, 32, 1, 33, 1, 34, 35, 1, 36, 1, 37, 38, 1, 39, 1, 40, 41, 1, 42, 1, 43, 1, 44, 1, 45, 1, - 46, 1, 47, 1, 48, 1, 49, 1, 50, 1, 51, 1, - 52, 1, 53, 1, 54, 1, 55, 1, 56, 1, 1, 58, - 59, 59, 59, 59, 59, 59, 59, 59, 59, 57, 1, 58, - 57, 60, 1, 61, 60, 1, 1, 58, 59, 62, 59, 59, - 59, 59, 59, 59, 59, 57, 1, 58, 63, 63, 63, 63, - 63, 63, 63, 63, 63, 57, 64, 1, 65, 63, 66, 63, - 63, 63, 63, 63, 63, 63, 63, 57, 64, 1, 65, 66, - 57, 67, 67, 68, 60, 1, 67, 67, 68, 1, 68, 68, - 69, 69, 69, 69, 69, 69, 69, 69, 69, 1, 69, 70, - 69, 69, 69, 69, 69, 69, 69, 69, 1, 72, 71, 71, - 71, 71, 71, 71, 71, 71, 1, 67, 65, 71, 68, 71, - 71, 71, 71, 71, 71, 71, 71, 1, 72, 73, 74, 72, - 72, 1, 67, 65, 68, 1, 72, 72, 1, 66, 1, 75, - 76, 76, 76, 76, 76, 76, 76, 76, 76, 57, 68, 68, - 69, 69, 69, 69, 69, 69, 77, 69, 69, 1, 61, 69, - 70, 69, 69, 69, 69, 69, 77, 69, 69, 1, 1, 58, - 76, 78, 76, 76, 76, 76, 76, 76, 76, 76, 57, 1, - 58, 79, 63, 63, 63, 63, 63, 63, 63, 63, 57, 79, - 1, 80, 64, 81, 79, 79, 57, 72, 73, 74, 72, 82, - 72, 72, 1, 72, 61, 73, 74, 72, 82, 72, 72, 1, - 72, 80, 79, 79, 57, 40, 83, 1, 40, 1, 37, 84, - 1, 37, 1, 34, 85, 1, 34, 1, 31, 86, 1, 31, - 1, 87, 88, 88, 88, 88, 88, 88, 88, 88, 88, 1, - 89, 1, 88, 90, 88, 88, 88, 88, 88, 88, 88, 88, - 1, 91, 92, 91, 1, 1, 93, 94, 1, 95, 96, 95, - 97, 97, 97, 97, 97, 97, 97, 97, 97, 1, 95, 98, - 95, 1, 1, 99, 100, 101, 100, 1, 1, 93, 102, 92, - 102, 1, 1, 93, 1, 1, 0 + 46, 1, 47, 1, 48, 1, 49, 1, 50, 51, 1, 1, + 53, 54, 54, 54, 54, 54, 54, 54, 54, 54, 52, 1, + 53, 52, 55, 1, 56, 55, 1, 1, 53, 54, 57, 54, + 54, 54, 54, 54, 54, 54, 52, 1, 53, 58, 58, 58, + 58, 58, 58, 58, 58, 58, 52, 59, 1, 60, 58, 61, + 58, 58, 58, 58, 58, 58, 58, 58, 52, 59, 1, 60, + 61, 52, 62, 62, 63, 55, 1, 62, 62, 63, 1, 63, + 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 1, 64, + 65, 64, 64, 64, 64, 64, 64, 64, 64, 1, 67, 66, + 66, 66, 66, 66, 66, 66, 66, 1, 62, 60, 66, 63, + 66, 66, 66, 66, 66, 66, 66, 66, 1, 67, 68, 69, + 67, 67, 1, 62, 60, 63, 1, 67, 67, 1, 61, 1, + 70, 71, 71, 71, 71, 71, 71, 71, 71, 71, 52, 63, + 63, 64, 64, 64, 64, 64, 64, 72, 64, 64, 1, 56, + 64, 65, 64, 64, 64, 64, 64, 72, 64, 64, 1, 1, + 53, 71, 73, 71, 71, 71, 71, 71, 71, 71, 71, 52, + 1, 53, 74, 58, 58, 58, 58, 58, 58, 58, 58, 52, + 74, 1, 75, 59, 76, 74, 74, 52, 67, 68, 69, 67, + 77, 67, 67, 1, 67, 56, 68, 69, 67, 77, 67, 67, + 1, 67, 75, 74, 74, 52, 50, 78, 1, 50, 79, 1, + 50, 80, 1, 50, 81, 1, 50, 82, 1, 50, 83, 1, + 50, 84, 1, 50, 85, 1, 50, 86, 1, 50, 87, 1, + 50, 88, 1, 50, 89, 1, 50, 90, 1, 50, 91, 1, + 50, 92, 1, 50, 93, 1, 50, 94, 1, 50, 95, 1, + 50, 96, 1, 50, 1, 40, 97, 1, 40, 1, 37, 98, + 1, 37, 1, 34, 99, 1, 34, 1, 31, 100, 1, 31, + 1, 101, 102, 102, 102, 102, 102, 102, 102, 102, 102, 1, + 103, 1, 102, 104, 102, 102, 102, 102, 102, 102, 102, 102, + 1, 105, 106, 105, 1, 1, 107, 108, 1, 109, 110, 109, + 111, 111, 111, 111, 111, 111, 111, 111, 111, 1, 109, 112, + 109, 1, 1, 113, 114, 115, 114, 1, 1, 107, 116, 106, + 116, 1, 1, 107, 1, 1, 0 }; } @@ -519,14 +544,15 @@ private static byte[] init__warc_trans_targs_0() { return new byte [] { 2, 0, 20, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 88, 14, 14, 15, 18, 16, 17, 12, 13, 15, - 18, 19, 15, 19, 21, 22, 23, 24, 77, 25, 26, 75, - 27, 28, 73, 29, 30, 71, 31, 32, 33, 34, 35, 36, - 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 49, - 48, 88, 50, 51, 52, 53, 62, 54, 55, 56, 57, 58, - 59, 60, 61, 63, 65, 64, 66, 67, 68, 70, 69, 72, - 74, 76, 78, 80, 81, 89, 82, 82, 83, 86, 84, 85, - 80, 81, 83, 86, 87, 83, 87 + 12, 13, 102, 14, 14, 15, 18, 16, 17, 12, 13, 15, + 18, 19, 15, 19, 21, 22, 23, 24, 91, 25, 26, 89, + 27, 28, 87, 29, 30, 85, 31, 32, 33, 34, 35, 36, + 37, 38, 39, 65, 40, 41, 43, 42, 102, 44, 45, 46, + 47, 56, 48, 49, 50, 51, 52, 53, 54, 55, 57, 59, + 58, 60, 61, 62, 64, 63, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, + 84, 86, 88, 90, 92, 94, 95, 103, 96, 96, 97, 100, + 98, 99, 94, 95, 97, 100, 101, 97, 101 }; } @@ -540,11 +566,12 @@ private static byte[] init__warc_trans_actions_0() 0, 1, 21, 11, 0, 0, 1, 0, 0, 13, 29, 9, 26, 23, 7, 1, 1, 15, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 17, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 19, 0, 0, 0, - 1, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, - 1, 1, 1, 0, 1, 0, 11, 0, 0, 1, 0, 0, - 13, 29, 9, 26, 23, 7, 1 + 1, 1, 19, 1, 0, 0, 0, 1, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 1, 0, 11, 0, 0, 1, + 0, 0, 13, 29, 9, 26, 23, 7, 1 }; } @@ -552,12 +579,12 @@ private static byte[] init__warc_trans_actions_0() static final int warc_start = 1; -static final int warc_first_final = 88; +static final int warc_first_final = 102; static final int warc_error = 0; -static final int warc_en_warc_fields = 79; +static final int warc_en_warc_fields = 93; static final int warc_en_any_header = 1; -// line 257 "WarcParser.rl" +// line 271 "WarcParser.rl" } \ No newline at end of file diff --git a/src/org/netpreserve/jwarc/WarcParser.rl b/src/org/netpreserve/jwarc/WarcParser.rl index 4f9efa4..60d2b7e 100644 --- a/src/org/netpreserve/jwarc/WarcParser.rl +++ b/src/org/netpreserve/jwarc/WarcParser.rl @@ -11,6 +11,7 @@ import java.time.Instant; import java.time.LocalDateTime; import java.time.ZoneOffset; import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeParseException; import java.util.*; import static java.nio.charset.StandardCharsets.ISO_8859_1; @@ -72,8 +73,20 @@ action handle_arc_ip { action handle_arc_date { String arcDate = new String(buf, 0, bufPos, US_ASCII); - Instant instant = LocalDateTime.parse(arcDate, arcTimeFormat).toInstant(ZoneOffset.UTC); - setHeader("WARC-Date", instant.toString()); + // Some WARC files have been seen in the wild with truncated dates + if (arcDate.length() < 14) { + emitWarning("ARC date too short (" + arcDate.length() + " digits)"); + arcDate = arcDate + "00000000000000".substring(arcDate.length()); + } else if (arcDate.length() > 14) { + emitWarning("ARC date too long (" + arcDate.length() + " digits)"); + arcDate = arcDate.substring(0, 14); + } + try { + Instant instant = LocalDateTime.parse(arcDate, arcTimeFormat).toInstant(ZoneOffset.UTC); + setHeader("WARC-Date", instant.toString()); + } catch (DateTimeParseException e) { + emitWarning("ARC date not parsable"); + } bufPos = 0; } @@ -131,10 +144,11 @@ arc_url_byte = any - "\n" - " "; arc_url = (lower+ ":" arc_url_byte*) $push %handle_arc_url; arc_ip = (digit{1,3} "." digit{1,3} "." digit{1,3} "." digit{1,3}) $push %handle_arc_ip; arc_date = digit{14} $push %handle_arc_date; +arc_date_lenient = digit{8,28} $push %handle_arc_date; arc_mime = (token ("/" token ( OWS ";" OWS parameter )*)?)?; arc_mime_lenient = arc_mime | (any - " " - "\n")*; arc_length = digit+ $push %handle_arc_length %handle_arc; -arc_header = arc_url " " arc_ip " " arc_date " " arc_mime_lenient " " arc_length "\n"; +arc_header = arc_url " " arc_ip " " arc_date_lenient " " arc_mime_lenient " " arc_length "\n"; warc_fields := named_fields; any_header := (arc_header | warc_header) @{ fbreak; }; diff --git a/src/org/netpreserve/jwarc/WarcReader.java b/src/org/netpreserve/jwarc/WarcReader.java index 2b884b9..60254ef 100644 --- a/src/org/netpreserve/jwarc/WarcReader.java +++ b/src/org/netpreserve/jwarc/WarcReader.java @@ -363,6 +363,7 @@ private void emitWarning(String message) { */ public void onWarning(Consumer warningHandler) { this.warningHandler = warningHandler; + parser.onWarning(warningHandler); } /** diff --git a/src/org/netpreserve/jwarc/cdx/CdxWriter.java b/src/org/netpreserve/jwarc/cdx/CdxWriter.java index 6e319c3..741894f 100644 --- a/src/org/netpreserve/jwarc/cdx/CdxWriter.java +++ b/src/org/netpreserve/jwarc/cdx/CdxWriter.java @@ -100,6 +100,12 @@ public void process(WarcReader reader, String filename) throws IOException { record = reader.next().orElse(null); long length = reader.position() - position; + // skip records without a date, this often occurs in old ARC files with a corrupt date field + if (!capture.headers().first("WARC-Date").isPresent()) { + emitWarning(filename, position, "Skipping record due to missing or invalid date"); + continue; + } + String encodedRequest = null; if (postAppend) { // check for a corresponding request record diff --git a/test/org/netpreserve/jwarc/WarcParserTest.java b/test/org/netpreserve/jwarc/WarcParserTest.java index 4216591..dec7381 100644 --- a/test/org/netpreserve/jwarc/WarcParserTest.java +++ b/test/org/netpreserve/jwarc/WarcParserTest.java @@ -21,6 +21,16 @@ public void testParsingArcWithBogusMime() { assertEquals(Optional.of("494"), parser.headers().sole("Content-Length")); } + @Test + public void testParsingArcWithCorruptDates() { + WarcParser parser = parse("http://example.com/ 1.2.3.4 200012120739 text/html 42\n"); + assertEquals(Optional.of("2000-12-12T07:39:00Z"), parser.headers().first("WARC-Date")); + parser = parse("http://example.com/ 1.2.3.4 2000121207394211 text/html 1942\n"); + assertEquals(Optional.of("2000-12-12T07:39:42Z"), parser.headers().first("WARC-Date")); + parser = parse("http://example.com/ 1.2.3.4 99999999999999 text/html 1942\n"); + assertEquals(Optional.empty(), parser.headers().first("WARC-Date")); + } + private static WarcParser parse(String input) { WarcParser parser = new WarcParser(); parser.parse(ByteBuffer.wrap(input.getBytes(StandardCharsets.ISO_8859_1)));