-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathWarcParser.rl
325 lines (275 loc) · 9.81 KB
/
WarcParser.rl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
// recompile: ragel -J WarcParser.rl -o WarcParser.java
// diagram: ragel -Vp WarcParser.rl | dot -Tpng | feh -
package org.netpreserve.jwarc;
import java.io.EOFException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.ReadableByteChannel;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.*;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.nio.charset.StandardCharsets.US_ASCII;
%%{
# Ragel specification of the state machine that parses WARC and ARC record headers.
# The action bodies are Java and operate on the fields of the WarcParser class below.
machine warc;
# Read input bytes from the ByteBuffer as unsigned values (0..255).
getkey (data.get(p) & 0xff);
# Append the current input byte to the scratch buffer.
action push { push(data.get(p)); }
# Accumulate the decimal digits of the major and minor version numbers.
action add_major { major = major * 10 + data.get(p) - '0'; }
action add_minor { minor = minor * 10 + data.get(p) - '0'; }
# Record the buffer position at the end of a TEXT run. handle_value and fold cut
# the buffer at this mark, so bytes pushed after the text are not kept.
action end_of_text { endOfText = bufPos; }
# Start of a folded continuation line: rewind to the end of the previous text and
# join with a single space. Nothing is inserted while the buffer is still empty.
action fold {
if (bufPos > 0) {
bufPos = endOfText;
push((byte)' ');
}
}
# Field name complete: decode the buffered bytes as US-ASCII and clear the buffer.
action handle_name {
name = new String(buf, 0, bufPos, US_ASCII);
bufPos = 0;
}
# Field value complete: decode the bytes up to the end-of-text mark as UTF-8 and
# append the value to the list kept for the current field name.
action handle_value {
String value = new String(buf, 0, endOfText, UTF_8);
headerMap.computeIfAbsent(name, n -> new ArrayList<>()).add(value);
bufPos = 0;
endOfText = 0;
}
# ARC URL field complete: synthesise WARC headers based on the URL prefix.
action handle_arc_url {
String url = new String(buf, 0, bufPos, ISO_8859_1);
if (url.startsWith("filedesc://")) {
// filedesc: record is mapped to a warcinfo record named after the file
setHeader("WARC-Type", "warcinfo");
setHeader("WARC-Filename", url.substring("filedesc://".length()));
setHeader("Content-Type", "text/plain");
} else if (url.startsWith("dns:")) {
// DNS lookup result
setHeader("WARC-Type", "response");
setHeader("Content-Type", "text/dns");
setHeader("WARC-Target-URI", url);
} else {
// anything else is treated as an HTTP response
setHeader("WARC-Type", "response");
setHeader("Content-Type", "application/http;msgtype=response");
setHeader("WARC-Target-URI", url);
}
bufPos = 0;
}
# ARC IP address field complete: store it as WARC-IP-Address.
action handle_arc_ip {
setHeader("WARC-IP-Address", new String(buf, 0, bufPos, US_ASCII));
bufPos = 0;
}
# ARC date field complete: normalise it to exactly 14 digits, then store it as
# WARC-Date using the string form of the corresponding UTC Instant.
action handle_arc_date {
String arcDate = new String(buf, 0, bufPos, US_ASCII);
// Some ARC files have been seen in the wild with truncated dates
if (arcDate.length() < 14) {
emitWarning("ARC date too short (" + arcDate.length() + " digits)");
// pad on the right with zeros up to 14 digits
arcDate = arcDate + "00000000000000".substring(arcDate.length());
} else if (arcDate.length() > 14) {
emitWarning("ARC date too long (" + arcDate.length() + " digits)");
// keep only the first 14 digits
arcDate = arcDate.substring(0, 14);
}
try {
Instant instant = LocalDateTime.parse(arcDate, arcTimeFormat).toInstant(ZoneOffset.UTC);
setHeader("WARC-Date", instant.toString());
} catch (DateTimeParseException e) {
// an unparsable date only produces a warning; no WARC-Date header is set
emitWarning("ARC date not parsable");
}
bufPos = 0;
}
# ARC length field complete: store it as Content-Length.
action handle_arc_length {
setHeader("Content-Length", new String(buf, 0, bufPos, US_ASCII));
bufPos = 0;
}
# ARC v2 status code complete: the buffered digits are discarded.
action handle_arc_status {
bufPos = 0;
}
# Report a parsed ARC header as protocol ARC with version 1.1.
action handle_arc {
protocol = "ARC";
major = 1;
minor = 1;
}
# ---- Strict WARC header grammar ----
CRLF = "\r\n";
version_major = digit+ $add_major;
version_minor = digit+ $add_minor;
version = "WARC/" version_major "." version_minor CRLF ;
# Byte values 0xc0, 0xc1 and 0xf5..0xff are excluded from CHAR.
CHAR = 0..0x7f | 0x80..0xbf | 0xc2..0xf4;
CTL = cntrl | 127;
WS = " " | "\t";
RWS = WS+;
OWS = WS*;
# A line break followed by whitespace continues (folds) the previous field value.
LWS = CRLF RWS;
WORD = (any - CTL - WS)+;
# Words separated by whitespace; end_of_text fires when the run is left.
TEXT = WORD (RWS WORD)* %end_of_text;
separators = "(" | ")" | "<" | ">" | "@"
| "," | ";" | ":" | "\\" | '"'
| "/" | "[" | "]" | "?" | "="
| "{" | "}" | " " | "\t";
# NOTE(review): url_byte does not appear to be referenced by any other rule in this spec.
url_byte = alpha | digit | "!" | "$" | "&" | "'" | "(" | ")"
| "*" | "+" | "," | "-" | "." | "/" | ":" | ";"
| "=" | "?" | "@" | "_" | "~" | "%" | 0x80..0xff;
field_name = ((ascii - CTL - separators)+) $push %handle_name;
field_value_first = OWS (TEXT OWS)? $push;
field_value_folded = LWS (TEXT OWS)? >fold $push;
field_value = field_value_first (field_value_folded)*;
named_field = field_name ":" field_value CRLF %handle_value;
# Zero or more fields terminated by an empty line.
named_fields = named_field* CRLF;
warc_header = version named_fields;
# ---- Lenient WARC header grammar ----
# Line endings may be a bare LF, optionally preceded by any number of CRs.
CRLF_lenient = "\r"* "\n";
LWS_lenient = CRLF_lenient RWS;
# Any bytes other than LF; must not start with whitespace or end with whitespace or CR.
TEXT_lenient = ((any - '\n' - WS) (any - '\n')*)? (any - '\n' - WS - '\r') %end_of_text;
version_lenient = "WARC/" version_major "." version_minor+ CRLF_lenient ;
# Any bytes other than CR, LF and colon; must not start with a space or tab.
field_name_lenient = ((any - '\r' - '\n' - ' ' - '\t' - ':') (any - '\r' - '\n' - ':')*) $push %handle_name;
field_value_first_lenient = OWS (TEXT_lenient OWS)? $push;
field_value_folded_lenient = LWS_lenient (TEXT_lenient OWS)? >fold $push;
field_value_lenient = field_value_first_lenient (field_value_folded_lenient)*;
named_field_lenient = field_name_lenient ":" field_value_lenient CRLF_lenient %handle_value;
named_fields_lenient = named_field_lenient* CRLF_lenient;
warc_header_lenient = version_lenient named_fields_lenient;
# ---- Media type syntax used by the ARC mime field ----
token = (ascii - CTL - separators)+;
obs_text = 0x80..0xff;
qdtext = "\t" | " " | 0x21 | 0x23..0x5b | 0x5d..0x7e | obs_text;
quoted_pair = "\\" CHAR;
quoted_string = '"' (qdtext | quoted_pair)* '"';
parameter = token "=" (token | quoted_string );
# ---- ARC header grammar ----
arc_url_byte = any - "\n" - " ";
arc_url = (lower+ ":" arc_url_byte*) $push %handle_arc_url;
arc_ip = ("0" | digit{1,3} "." digit{1,3} "." digit{1,3} "." digit{1,3}) $push %handle_arc_ip;
# arc_header below uses the lenient date form (8 to 28 digits); handle_arc_date
# pads or truncates the value to 14 digits.
arc_date = digit{14} $push %handle_arc_date;
arc_date_lenient = digit{8,28} $push %handle_arc_date;
arc_mime = (token ("/" token ( OWS ";" OWS parameter )*)?)?;
arc_mime_lenient = arc_mime | (any - " " - "\n")*;
arc_length = digit+ $push %handle_arc_length %handle_arc;
arc_v2_status = digit{3} $push %handle_arc_status;
arc_v2_checksum = arc_url_byte+;
arc_v2_location = arc_url_byte+;
arc_v2_offset = digit+;
arc_v2_filename = arc_url_byte+;
arc_v2_fields = arc_v2_status " " arc_v2_checksum " " arc_v2_location " " arc_v2_offset " " arc_v2_filename;
# Up to three leading newlines, then space-separated fields on a single line
# terminated by a newline. The v2 fields are optional.
arc_header = "\n"{0,3} arc_url " " arc_ip " " arc_date_lenient " " arc_mime_lenient
" " (arc_v2_fields " ")? arc_length "\n";
# ---- Entry points ----
# The fields-only machines parse named fields without a version line.
warc_fields_lenient := named_fields_lenient;
warc_fields := named_fields;
# The header machines accept either an ARC or a WARC header and break out of the
# exec loop on the transition that completes the header.
any_header_lenient := (arc_header | warc_header_lenient) @{ fbreak; };
any_header := (arc_header | warc_header) @{ fbreak; };
}%%
/**
 * Low-level WARC record parser.
 * <p>
 * Unless you're doing something advanced (like non-blocking IO) you should use the higher-level {@link WarcReader}
 * class instead.
 * <p>
 * The parser is driven by a Ragel-generated state machine. Bytes are fed in through {@link #parse(ByteBuffer)};
 * once {@link #isFinished()} returns true the result can be read with {@link #headers()} and {@link #version()}.
 * Call {@link #reset()} before reusing the instance for another record.
 */
public class WarcParser extends MessageParser {
    /** State the machine is restarted from on {@link #reset()}; selects the grammar variant in use. */
    private int entryState;
    /** Current state of the Ragel machine. */
    private int cs;
    /** Number of input bytes consumed since the last reset. */
    private long position;
    /** Scratch buffer holding the bytes of the token currently being parsed; grown on demand by push(). */
    private byte[] buf = new byte[256];
    /** Number of valid bytes in {@link #buf}. */
    private int bufPos;
    /** Value of {@link #bufPos} at the end of the most recent text run of a field value. */
    private int endOfText;
    private int major;
    private int minor;
    /** Name of the header field whose value is currently being parsed. */
    private String name;
    /** Protocol reported by {@link #version()}; the ARC header action changes it to "ARC". */
    private String protocol = "WARC";
    /** Parsed header fields, keyed case-insensitively by field name. */
    private Map<String,List<String>> headerMap;
    private static final DateTimeFormatter arcTimeFormat = DateTimeFormatter.ofPattern("yyyyMMddHHmmss");

    /**
     * Creates a parser that reads a bare block of named fields (no version line) in strict mode.
     */
    public static WarcParser newWarcFieldsParser() {
        return new WarcParser(warc_en_warc_fields);
    }

    /**
     * Creates a parser that starts from the state machine's default start state.
     */
    public WarcParser() {
        this(warc_start);
    }

    private WarcParser(int entryState) {
        this.entryState = entryState;
        reset();
    }

    /**
     * Returns the parser to its initial state so that it can parse another record.
     * <p>
     * All per-record state is cleared: the machine state, the consumed byte count, the scratch buffer,
     * the version numbers, the protocol name and the header map.
     */
    public void reset() {
        cs = entryState;
        position = 0;
        bufPos = 0;
        endOfText = 0;
        major = 0;
        minor = 0;
        name = null;
        // Parsing an ARC header sets protocol to "ARC". Restore the default here, otherwise a
        // WARC record parsed afterwards with the same instance would still be reported as ARC.
        protocol = "WARC";
        headerMap = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
        // Do not hold on to a large scratch buffer that a single oversized field made us allocate.
        if (buf.length > 4096) {
            buf = new byte[4096];
        }
    }

    /**
     * Sets the lenient mode for the WarcParser.
     * <p>
     * When enabled, this causes the parser to follow the specification less strictly,
     * allowing reading of non-compliant records by:
     * <ul>
     *   <li>permitting ASCII control characters in header field names and values
     *   <li>allowing lines to end with LF instead of CRLF
     *   <li>permitting multi-digit WARC minor versions like "0.18"
     * </ul>
     * Calling this method also resets the state of the parser.
     */
    public void setLenient(boolean lenient) {
        // Keep the kind of parser (fields-only vs. full header) and only switch strictness.
        if (warcFieldsMode()) {
            entryState = lenient ? warc_en_warc_fields_lenient : warc_en_warc_fields;
        } else {
            entryState = lenient ? warc_en_any_header_lenient : warc_start;
        }
        reset();
    }

    /** Returns true if this parser was set up to parse a bare block of named fields. */
    private boolean warcFieldsMode() {
        return entryState == warc_en_warc_fields || entryState == warc_en_warc_fields_lenient;
    }

    /**
     * Returns true once the state machine has reached a final state.
     */
    public boolean isFinished() {
        return cs >= warc_first_final;
    }

    /**
     * Returns true if the state machine has entered its error state.
     */
    public boolean isError() {
        return cs == warc_error;
    }

    /**
     * Runs the state machine over the bytes between the buffer's position and its limit.
     * <p>
     * On return the buffer's position has been moved to the point where the machine stopped and
     * {@link #position()} has been increased by the number of bytes consumed.
     *
     * @param data buffer to read from
     */
    public void parse(ByteBuffer data) {
        // p and pe are the cursor and end-of-input variables used by the generated code.
        int p = data.position();
        int pe = data.limit();

        %% write exec;

        position += p - data.position();
        data.position(p);
    }

    /**
     * Parses a header from the given buffer, refilling it from the channel as needed.
     * <p>
     * The buffer must be in read mode on entry. Any bytes left in it are parsed first.
     *
     * @param channel channel to read more data from when the buffer is exhausted
     * @param buffer  buffer holding already-read data, also used for reading from the channel
     * @return true if a complete header was parsed, false if the channel reached end of stream before
     *         any bytes were consumed
     * @throws ParsingException if the input does not match the grammar
     * @throws EOFException     if the channel reached end of stream after part of a header was consumed
     * @throws IOException      if reading from the channel fails
     */
    public boolean parse(ReadableByteChannel channel, ByteBuffer buffer) throws IOException {
        while (true) {
            parse(buffer);
            if (isFinished()) {
                return true;
            }
            if (isError()) {
                throw new ParsingException("invalid WARC record at position " + position + ": "
                        + getErrorContext(buffer, buffer.position(), 40));
            }
            // Switch to write mode, read more input, then switch back to read mode.
            buffer.compact();
            int n = channel.read(buffer);
            buffer.flip();
            if (n < 0) {
                // End of stream in the middle of a header is an error; at a record boundary it is not.
                if (position > 0) {
                    throw new EOFException();
                }
                return false;
            }
        }
    }

    /** Appends a byte to the scratch buffer, doubling the buffer's capacity when it is full. */
    private void push(byte b) {
        if (bufPos >= buf.length) {
            buf = Arrays.copyOf(buf, buf.length * 2);
        }
        buf[bufPos++] = b;
    }

    /**
     * Returns the header fields parsed so far, wrapped in a {@link MessageHeaders}.
     */
    public MessageHeaders headers() {
        return new MessageHeaders(headerMap);
    }

    /**
     * Returns the protocol name and version numbers parsed so far.
     */
    public MessageVersion version() {
        return new MessageVersion(protocol, major, minor);
    }

    /**
     * Returns the number of bytes consumed since the parser was last reset.
     */
    public long position() {
        return position;
    }

    /** Sets a header to a single value, replacing any values previously stored under that name. */
    private void setHeader(String name, String value) {
        List<String> list = new ArrayList<>();
        list.add(value);
        headerMap.put(name, list);
    }

    %% write data;
}