-
-
Notifications
You must be signed in to change notification settings - Fork 34.5k
Expand file tree
/
Copy pathtokenizer.c
More file actions
2132 lines (1976 loc) · 61.5 KB
/
tokenizer.c
File metadata and controls
2132 lines (1976 loc) · 61.5 KB
Edit and raw actions
OlderNewer
1
2
/* Tokenizer implementation */
3
4
#define PY_SSIZE_T_CLEAN
5
#include "Python.h"
6
#include "pycore_call.h" // _PyObject_CallNoArgs()
7
8
#include <ctype.h>
9
#include <assert.h>
10
11
#include "tokenizer.h"
12
#include "errcode.h"
13
14
#include "unicodeobject.h"
15
#include "bytesobject.h"
16
#include "fileobject.h"
17
#include "abstract.h"
18
19
/* Alternate tab spacing */
20
#define ALTTABSIZE 1
21
22
/* True if C may begin an identifier: an ASCII letter, an underscore, or
   any value >= 128.
   The argument is parenthesized so that expression arguments (for example
   a conditional expression) are grouped correctly.  C is still evaluated
   more than once, so do not pass an expression with side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
27
28
/* True if C may appear inside an identifier: an ASCII letter or digit, an
   underscore, or any value >= 128.
   The argument is parenthesized so that expression arguments are grouped
   correctly.  C is still evaluated more than once, so do not pass an
   expression with side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
34
35
36
/* Don't ever change this -- it would break the portability of Python code */
37
#define TABSIZE 8
38
39
/* Forward */
40
static struct tok_state *tok_new(void);
41
static int tok_nextc(struct tok_state *tok);
42
static void tok_backup(struct tok_state *tok, int c);
43
44
45
/* Spaces in this constant are treated as "zero or more spaces or tabs" when
46
tokenizing. */
47
static const char* type_comment_prefix = "# type: ";
48
49
/* Create and initialize a new tok_state structure */
50
51
static struct tok_state *
52
tok_new(void)
53
{
54
struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
55
sizeof(struct tok_state));
56
if (tok == NULL)
57
return NULL;
58
tok->buf = tok->cur = tok->inp = NULL;
59
tok->fp_interactive = 0;
60
tok->interactive_src_start = NULL;
61
tok->interactive_src_end = NULL;
62
tok->start = NULL;
63
tok->end = NULL;
64
tok->done = E_OK;
65
tok->fp = NULL;
66
tok->input = NULL;
67
tok->tabsize = TABSIZE;
68
tok->indent = 0;
69
tok->indstack[0] = 0;
70
tok->atbol = 1;
71
tok->pendin = 0;
72
tok->prompt = tok->nextprompt = NULL;
73
tok->lineno = 0;
74
tok->level = 0;
75
tok->altindstack[0] = 0;
76
tok->decoding_state = STATE_INIT;
77
tok->decoding_erred = 0;
78
tok->enc = NULL;
79
tok->encoding = NULL;
80
tok->cont_line = 0;
81
tok->filename = NULL;
82
tok->decoding_readline = NULL;
83
tok->decoding_buffer = NULL;
84
tok->type_comments = 0;
85
tok->async_hacks = 0;
86
tok->async_def = 0;
87
tok->async_def_indent = 0;
88
tok->async_def_nl = 0;
89
tok->interactive_underflow = IUNDERFLOW_NORMAL;
90
91
return tok;
92
}
93
94
static char *
95
new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
96
{
97
char* result = (char *)PyMem_Malloc(len + 1);
98
if (!result) {
99
tok->done = E_NOMEM;
100
return NULL;
101
}
102
memcpy(result, s, len);
103
result[len] = '\0';
104
return result;
105
}
106
107
static char *
108
error_ret(struct tok_state *tok) /* XXX */
109
{
110
tok->decoding_erred = 1;
111
if (tok->fp != NULL && tok->buf != NULL) /* see _PyTokenizer_Free */
112
PyMem_Free(tok->buf);
113
tok->buf = tok->cur = tok->inp = NULL;
114
tok->start = NULL;
115
tok->end = NULL;
116
tok->done = E_DECODE;
117
return NULL; /* as if it were EOF */
118
}
119
120
121
/* Normalize the common spellings of UTF-8 and Latin-1 encoding names.

   Only the first 12 characters of S are examined; they are lower-cased
   and '_' is treated as '-'.  Returns the string literal "utf-8" or
   "iso-8859-1" when S names one of those encodings (optionally followed
   by a "-suffix"); otherwise returns S itself, unchanged. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing tolower() a negative value
           (a non-ASCII byte where plain char is signed) is undefined. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0') {
            break;
        }
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0) {
        return "utf-8";
    }
    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0) {
        return "iso-8859-1";
    }
    return s;
}
149
150
/* Return the coding spec in S, or NULL if none is found. */
151
152
static int
153
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
154
{
155
Py_ssize_t i;
156
*spec = NULL;
157
/* Coding spec must be in a comment, and that comment must be
158
* the only statement on the source code line. */
159
for (i = 0; i < size - 6; i++) {
160
if (s[i] == '#')
161
break;
162
if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
163
return 1;
164
}
165
for (; i < size - 6; i++) { /* XXX inefficient search */
166
const char* t = s + i;
167
if (memcmp(t, "coding", 6) == 0) {
168
const char* begin = NULL;
169
t += 6;
170
if (t[0] != ':' && t[0] != '=')
171
continue;
172
do {
173
t++;
174
} while (t[0] == ' ' || t[0] == '\t');
175
176
begin = t;
177
while (Py_ISALNUM(t[0]) ||
178
t[0] == '-' || t[0] == '_' || t[0] == '.')
179
t++;
180
181
if (begin < t) {
182
char* r = new_string(begin, t - begin, tok);
183
const char* q;
184
if (!r)
185
return 0;
186
q = get_normal_name(r);
187
if (r != q) {
188
PyMem_Free(r);
189
r = new_string(q, strlen(q), tok);
190
if (!r)
191
return 0;
192
}
193
*spec = r;
194
break;
195
}
196
}
197
}
198
return 1;
199
}
200
201
/* Check whether the line contains a coding spec. If it does,
202
invoke the set_readline function for the new encoding.
203
This function receives the tok_state and the new encoding.
204
Return 1 on success, 0 on failure. */
205
206
static int
207
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
208
int set_readline(struct tok_state *, const char *))
209
{
210
char *cs;
211
if (tok->cont_line) {
212
/* It's a continuation line, so it can't be a coding spec. */
213
tok->decoding_state = STATE_NORMAL;
214
return 1;
215
}
216
if (!get_coding_spec(line, &cs, size, tok)) {
217
return 0;
218
}
219
if (!cs) {
220
Py_ssize_t i;
221
for (i = 0; i < size; i++) {
222
if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
223
break;
224
if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
225
/* Stop checking coding spec after a line containing
226
* anything except a comment. */
227
tok->decoding_state = STATE_NORMAL;
228
break;
229
}
230
}
231
return 1;
232
}
233
tok->decoding_state = STATE_NORMAL;
234
if (tok->encoding == NULL) {
235
assert(tok->decoding_readline == NULL);
236
if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
237
error_ret(tok);
238
PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
239
PyMem_Free(cs);
240
return 0;
241
}
242
tok->encoding = cs;
243
} else { /* then, compare cs with BOM */
244
if (strcmp(tok->encoding, cs) != 0) {
245
error_ret(tok);
246
PyErr_Format(PyExc_SyntaxError,
247
"encoding problem: %s with BOM", cs);
248
PyMem_Free(cs);
249
return 0;
250
}
251
PyMem_Free(cs);
252
}
253
return 1;
254
}
255
256
/* See whether the file starts with a BOM. If it does,
257
invoke the set_readline function with the new encoding.
258
Return 1 on success, 0 on failure. */
259
260
static int
261
check_bom(int get_char(struct tok_state *),
262
void unget_char(int, struct tok_state *),
263
int set_readline(struct tok_state *, const char *),
264
struct tok_state *tok)
265
{
266
int ch1, ch2, ch3;
267
ch1 = get_char(tok);
268
tok->decoding_state = STATE_SEEK_CODING;
269
if (ch1 == EOF) {
270
return 1;
271
} else if (ch1 == 0xEF) {
272
ch2 = get_char(tok);
273
if (ch2 != 0xBB) {
274
unget_char(ch2, tok);
275
unget_char(ch1, tok);
276
return 1;
277
}
278
ch3 = get_char(tok);
279
if (ch3 != 0xBF) {
280
unget_char(ch3, tok);
281
unget_char(ch2, tok);
282
unget_char(ch1, tok);
283
return 1;
284
}
285
#if 0
286
/* Disable support for UTF-16 BOMs until a decision
287
is made whether this needs to be supported. */
288
} else if (ch1 == 0xFE) {
289
ch2 = get_char(tok);
290
if (ch2 != 0xFF) {
291
unget_char(ch2, tok);
292
unget_char(ch1, tok);
293
return 1;
294
}
295
if (!set_readline(tok, "utf-16-be"))
296
return 0;
297
tok->decoding_state = STATE_NORMAL;
298
} else if (ch1 == 0xFF) {
299
ch2 = get_char(tok);
300
if (ch2 != 0xFE) {
301
unget_char(ch2, tok);
302
unget_char(ch1, tok);
303
return 1;
304
}
305
if (!set_readline(tok, "utf-16-le"))
306
return 0;
307
tok->decoding_state = STATE_NORMAL;
308
#endif
309
} else {
310
unget_char(ch1, tok);
311
return 1;
312
}
313
if (tok->encoding != NULL)
314
PyMem_Free(tok->encoding);
315
tok->encoding = new_string("utf-8", 5, tok);
316
if (!tok->encoding)
317
return 0;
318
/* No need to set_readline: input is already utf-8 */
319
return 1;
320
}
321
322
static int
323
tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
324
assert(tok->fp_interactive);
325
326
if (!line) {
327
return 0;
328
}
329
330
Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
331
Py_ssize_t line_size = strlen(line);
332
char* new_str = tok->interactive_src_start;
333
334
new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
335
if (!new_str) {
336
if (tok->interactive_src_start) {
337
PyMem_Free(tok->interactive_src_start);
338
}
339
tok->interactive_src_start = NULL;
340
tok->interactive_src_end = NULL;
341
tok->done = E_NOMEM;
342
return -1;
343
}
344
strcpy(new_str + current_size, line);
345
346
tok->interactive_src_start = new_str;
347
tok->interactive_src_end = new_str + current_size + line_size;
348
return 0;
349
}
350
351
352
/* Read a line of text from TOK into S, using the stream in TOK.
353
Return NULL on failure, else S.
354
355
On entry, tok->decoding_buffer will be one of:
356
1) NULL: need to call tok->decoding_readline to get a new line
357
2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
358
stored the result in tok->decoding_buffer
359
3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
360
(in the s buffer) to copy entire contents of the line read
361
by tok->decoding_readline. tok->decoding_buffer has the overflow.
362
In this case, tok_readline_recode is called in a loop (with an expanded buffer)
363
until the buffer ends with a '\n' (or until the end of the file is
364
reached): see tok_nextc and its calls to tok_reserve_buf.
365
*/
366
367
static int
368
tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
369
{
370
Py_ssize_t cur = tok->cur - tok->buf;
371
Py_ssize_t oldsize = tok->inp - tok->buf;
372
Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
373
if (newsize > tok->end - tok->buf) {
374
char *newbuf = tok->buf;
375
Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
376
Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
377
Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
378
newbuf = (char *)PyMem_Realloc(newbuf, newsize);
379
if (newbuf == NULL) {
380
tok->done = E_NOMEM;
381
return 0;
382
}
383
tok->buf = newbuf;
384
tok->cur = tok->buf + cur;
385
tok->inp = tok->buf + oldsize;
386
tok->end = tok->buf + newsize;
387
tok->start = start < 0 ? NULL : tok->buf + start;
388
tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
389
tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
390
}
391
return 1;
392
}
393
394
static int
395
tok_readline_recode(struct tok_state *tok) {
396
PyObject *line;
397
const char *buf;
398
Py_ssize_t buflen;
399
line = tok->decoding_buffer;
400
if (line == NULL) {
401
line = PyObject_CallNoArgs(tok->decoding_readline);
402
if (line == NULL) {
403
error_ret(tok);
404
goto error;
405
}
406
}
407
else {
408
tok->decoding_buffer = NULL;
409
}
410
buf = PyUnicode_AsUTF8AndSize(line, &buflen);
411
if (buf == NULL) {
412
error_ret(tok);
413
goto error;
414
}
415
if (!tok_reserve_buf(tok, buflen + 1)) {
416
goto error;
417
}
418
memcpy(tok->inp, buf, buflen);
419
tok->inp += buflen;
420
*tok->inp = '\0';
421
if (tok->fp_interactive &&
422
tok_concatenate_interactive_new_line(tok, buf) == -1) {
423
goto error;
424
}
425
Py_DECREF(line);
426
return 1;
427
error:
428
Py_XDECREF(line);
429
return 0;
430
}
431
432
/* Set the readline function for TOK to a StreamReader's
433
readline function. The StreamReader is named ENC.
434
435
This function is called from check_bom and check_coding_spec.
436
437
ENC is usually identical to the future value of tok->encoding,
438
except for the (currently unsupported) case of UTF-16.
439
440
Return 1 on success, 0 on failure. */
441
442
static int
443
fp_setreadl(struct tok_state *tok, const char* enc)
444
{
445
PyObject *readline, *io, *stream;
446
_Py_IDENTIFIER(open);
447
_Py_IDENTIFIER(readline);
448
int fd;
449
long pos;
450
451
fd = fileno(tok->fp);
452
/* Due to buffering the file offset for fd can be different from the file
453
* position of tok->fp. If tok->fp was opened in text mode on Windows,
454
* its file position counts CRLF as one char and can't be directly mapped
455
* to the file offset for fd. Instead we step back one byte and read to
456
* the end of line.*/
457
pos = ftell(tok->fp);
458
if (pos == -1 ||
459
lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
460
PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
461
return 0;
462
}
463
464
io = PyImport_ImportModuleNoBlock("io");
465
if (io == NULL)
466
return 0;
467
468
stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
469
fd, "r", -1, enc, Py_None, Py_None, Py_False);
470
Py_DECREF(io);
471
if (stream == NULL)
472
return 0;
473
474
readline = _PyObject_GetAttrId(stream, &PyId_readline);
475
Py_DECREF(stream);
476
if (readline == NULL)
477
return 0;
478
Py_XSETREF(tok->decoding_readline, readline);
479
480
if (pos > 0) {
481
PyObject *bufobj = _PyObject_CallNoArgs(readline);
482
if (bufobj == NULL)
483
return 0;
484
Py_DECREF(bufobj);
485
}
486
487
return 1;
488
}
489
490
/* Fetch the next byte from TOK. */
491
492
static int fp_getc(struct tok_state *tok) {
493
return getc(tok->fp);
494
}
495
496
/* Unfetch the last byte back into TOK. */
497
498
static void fp_ungetc(int c, struct tok_state *tok) {
499
ungetc(c, tok->fp);
500
}
501
502
/* Check whether the bytes at S start a valid UTF-8 sequence.
   Return the number of bytes forming the sequence if yes, 0 if not.

   S must point into a NUL-terminated buffer.  Continuation bytes are
   examined front to back, so a sequence cut short by the terminator is
   rejected without reading past it.

   Besides the lead/continuation byte structure, this rejects:
     - C0 and C1 lead bytes, and E0 80..9F / F0 80..8F (overlong forms);
     - ED A0..BF (UTF-16 surrogates D800-DFFF);
     - F4 90.. and lead bytes F5 and above (beyond U+10FFFF). */
static int
valid_utf8(const unsigned char* s)
{
    int expected;
    int i;

    if (*s < 0x80) {
        /* single-byte code */
        return 1;
    }
    if (*s < 0xE0) {
        /* C2 80 .. DF BF encode U+0080 .. U+07FF */
        if (*s < 0xC2) {
            /* 80..BF is a continuation byte, C0..C1 is overlong */
            return 0;
        }
        expected = 1;
    }
    else if (*s < 0xF0) {
        /* E0 A0 80 .. EF BF BF encode U+0800 .. U+FFFF */
        if (*s == 0xE0 && s[1] < 0xA0) {
            /* overlong encoding of a code point below U+0800 */
            return 0;
        }
        if (*s == 0xED && s[1] >= 0xA0) {
            /* would decode to a surrogate */
            return 0;
        }
        expected = 2;
    }
    else if (*s < 0xF5) {
        /* F0 90 80 80 .. F4 8F BF BF encode U+10000 .. U+10FFFF */
        if (*s == 0xF0 && s[1] < 0x90) {
            /* overlong encoding of a code point below U+10000 */
            return 0;
        }
        if (*s == 0xF4 && s[1] >= 0x90) {
            /* above U+10FFFF */
            return 0;
        }
        expected = 3;
    }
    else {
        /* invalid start byte */
        return 0;
    }

    /* Walk forward so that the NUL terminator (which is < 0x80) stops
       the scan before any byte beyond it is touched. */
    for (i = 1; i <= expected; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0) {
            return 0;
        }
    }
    return expected + 1;
}
529
530
static int
531
ensure_utf8(char *line, struct tok_state *tok)
532
{
533
int badchar = 0;
534
unsigned char *c;
535
int length;
536
for (c = (unsigned char *)line; *c; c += length) {
537
if (!(length = valid_utf8(c))) {
538
badchar = *c;
539
break;
540
}
541
}
542
if (badchar) {
543
/* Need to add 1 to the line number, since this line
544
has not been counted, yet. */
545
PyErr_Format(PyExc_SyntaxError,
546
"Non-UTF-8 code starting with '\\x%.2x' "
547
"in file %U on line %i, "
548
"but no encoding declared; "
549
"see https://python.org/dev/peps/pep-0263/ for details",
550
badchar, tok->filename, tok->lineno + 1);
551
return 0;
552
}
553
return 1;
554
}
555
556
/* Fetch a byte from TOK, using the string buffer. */
557
558
static int
559
buf_getc(struct tok_state *tok) {
560
return Py_CHARMASK(*tok->str++);
561
}
562
563
/* Unfetch a byte from TOK, using the string buffer. */
564
565
static void
566
buf_ungetc(int c, struct tok_state *tok) {
567
tok->str--;
568
assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
569
}
570
571
/* Set the readline function for TOK to ENC. For the string-based
572
tokenizer, this means to just record the encoding. */
573
574
static int
575
buf_setreadl(struct tok_state *tok, const char* enc) {
576
tok->enc = enc;
577
return 1;
578
}
579
580
/* Return a UTF-8 encoding Python string object from the
581
C byte string STR, which is encoded with ENC. */
582
583
static PyObject *
584
translate_into_utf8(const char* str, const char* enc) {
585
PyObject *utf8;
586
PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
587
if (buf == NULL)
588
return NULL;
589
utf8 = PyUnicode_AsUTF8String(buf);
590
Py_DECREF(buf);
591
return utf8;
592
}
593
594
595
static char *
596
translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
597
int skip_next_lf = 0;
598
size_t needed_length = strlen(s) + 2, final_length;
599
char *buf, *current;
600
char c = '\0';
601
buf = PyMem_Malloc(needed_length);
602
if (buf == NULL) {
603
tok->done = E_NOMEM;
604
return NULL;
605
}
606
for (current = buf; *s; s++, current++) {
607
c = *s;
608
if (skip_next_lf) {
609
skip_next_lf = 0;
610
if (c == '\n') {
611
c = *++s;
612
if (!c)
613
break;
614
}
615
}
616
if (c == '\r') {
617
skip_next_lf = 1;
618
c = '\n';
619
}
620
*current = c;
621
}
622
/* If this is exec input, add a newline to the end of the string if
623
there isn't one already. */
624
if (exec_input && c != '\n') {
625
*current = '\n';
626
current++;
627
}
628
*current = '\0';
629
final_length = current - buf + 1;
630
if (final_length < needed_length && final_length) {
631
/* should never fail */
632
char* result = PyMem_Realloc(buf, final_length);
633
if (result == NULL) {
634
PyMem_Free(buf);
635
}
636
buf = result;
637
}
638
return buf;
639
}
640
641
/* Decode a byte string STR for use as the buffer of TOK.
642
Look for encoding declarations inside STR, and record them
643
inside TOK. */
644
645
static char *
646
decode_str(const char *input, int single, struct tok_state *tok)
647
{
648
PyObject* utf8 = NULL;
649
char *str;
650
const char *s;
651
const char *newl[2] = {NULL, NULL};
652
int lineno = 0;
653
tok->input = str = translate_newlines(input, single, tok);
654
if (str == NULL)
655
return NULL;
656
tok->enc = NULL;
657
tok->str = str;
658
if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
659
return error_ret(tok);
660
str = tok->str; /* string after BOM if any */
661
assert(str);
662
if (tok->enc != NULL) {
663
utf8 = translate_into_utf8(str, tok->enc);
664
if (utf8 == NULL)
665
return error_ret(tok);
666
str = PyBytes_AsString(utf8);
667
}
668
for (s = str;; s++) {
669
if (*s == '\0') break;
670
else if (*s == '\n') {
671
assert(lineno < 2);
672
newl[lineno] = s;
673
lineno++;
674
if (lineno == 2) break;
675
}
676
}
677
tok->enc = NULL;
678
/* need to check line 1 and 2 separately since check_coding_spec
679
assumes a single line as input */
680
if (newl[0]) {
681
if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
682
return NULL;
683
}
684
if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
685
if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
686
tok, buf_setreadl))
687
return NULL;
688
}
689
}
690
if (tok->enc != NULL) {
691
assert(utf8 == NULL);
692
utf8 = translate_into_utf8(str, tok->enc);
693
if (utf8 == NULL)
694
return error_ret(tok);
695
str = PyBytes_AS_STRING(utf8);
696
}
697
assert(tok->decoding_buffer == NULL);
698
tok->decoding_buffer = utf8; /* CAUTION */
699
return str;
700
}
701
702
/* Set up tokenizer for string */
703
704
struct tok_state *
705
_PyTokenizer_FromString(const char *str, int exec_input)
706
{
707
struct tok_state *tok = tok_new();
708
char *decoded;
709
710
if (tok == NULL)
711
return NULL;
712
decoded = decode_str(str, exec_input, tok);
713
if (decoded == NULL) {
714
_PyTokenizer_Free(tok);
715
return NULL;
716
}
717
718
tok->buf = tok->cur = tok->inp = decoded;
719
tok->end = decoded;
720
return tok;
721
}
722
723
/* Set up tokenizer for UTF-8 string */
724
725
struct tok_state *
726
_PyTokenizer_FromUTF8(const char *str, int exec_input)
727
{
728
struct tok_state *tok = tok_new();
729
char *translated;
730
if (tok == NULL)
731
return NULL;
732
tok->input = translated = translate_newlines(str, exec_input, tok);
733
if (translated == NULL) {
734
_PyTokenizer_Free(tok);
735
return NULL;
736
}
737
tok->decoding_state = STATE_NORMAL;
738
tok->enc = NULL;
739
tok->str = translated;
740
tok->encoding = new_string("utf-8", 5, tok);
741
if (!tok->encoding) {
742
_PyTokenizer_Free(tok);
743
return NULL;
744
}
745
746
tok->buf = tok->cur = tok->inp = translated;
747
tok->end = translated;
748
return tok;
749
}
750
751
/* Set up tokenizer for file */
752
753
struct tok_state *
754
_PyTokenizer_FromFile(FILE *fp, const char* enc,
755
const char *ps1, const char *ps2)
756
{
757
struct tok_state *tok = tok_new();
758
if (tok == NULL)
759
return NULL;
760
if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
761
_PyTokenizer_Free(tok);
762
return NULL;
763
}
764
tok->cur = tok->inp = tok->buf;
765
tok->end = tok->buf + BUFSIZ;
766
tok->fp = fp;
767
tok->prompt = ps1;
768
tok->nextprompt = ps2;
769
if (enc != NULL) {
770
/* Must copy encoding declaration since it
771
gets copied into the parse tree. */
772
tok->encoding = new_string(enc, strlen(enc), tok);
773
if (!tok->encoding) {
774
_PyTokenizer_Free(tok);
775
return NULL;
776
}
777
tok->decoding_state = STATE_NORMAL;
778
}
779
return tok;
780
}
781
782
/* Free a tok_state structure */
783
784
void
785
_PyTokenizer_Free(struct tok_state *tok)
786
{
787
if (tok->encoding != NULL) {
788
PyMem_Free(tok->encoding);
789
}
790
Py_XDECREF(tok->decoding_readline);
791
Py_XDECREF(tok->decoding_buffer);
792
Py_XDECREF(tok->filename);
793
if (tok->fp != NULL && tok->buf != NULL) {
794
PyMem_Free(tok->buf);
795
}
796
if (tok->input) {
797
PyMem_Free(tok->input);
798
}
799
if (tok->interactive_src_start != NULL) {
800
PyMem_Free(tok->interactive_src_start);
801
}
802
PyMem_Free(tok);
803
}
804
805
static int
806
tok_readline_raw(struct tok_state *tok)
807
{
808
do {
809
if (!tok_reserve_buf(tok, BUFSIZ)) {
810
return 0;
811
}
812
char *line = Py_UniversalNewlineFgets(tok->inp,
813
(int)(tok->end - tok->inp),
814
tok->fp, NULL);
815
if (line == NULL) {
816
return 1;
817
}
818
if (tok->fp_interactive &&
819
tok_concatenate_interactive_new_line(tok, line) == -1) {
820
return 0;
821
}
822
if (*tok->inp == '\0') {
823
return 0;
824
}
825
tok->inp = strchr(tok->inp, '\0');
826
} while (tok->inp[-1] != '\n');
827
return 1;
828
}
829
830
static int
831
tok_underflow_string(struct tok_state *tok) {
832
char *end = strchr(tok->inp, '\n');
833
if (end != NULL) {
834
end++;
835
}
836
else {
837
end = strchr(tok->inp, '\0');
838
if (end == tok->inp) {
839
tok->done = E_EOF;
840
return 0;
841
}
842
}
843
if (tok->start == NULL) {
844
tok->buf = tok->cur;
845
}
846
tok->line_start = tok->cur;
847
tok->lineno++;
848
tok->inp = end;
849
return 1;
850
}
851
852
static int
853
tok_underflow_interactive(struct tok_state *tok) {
854
if (tok->interactive_underflow == IUNDERFLOW_STOP) {
855
tok->done = E_INTERACT_STOP;
856
return 1;
857
}
858
char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
859
if (newtok != NULL) {
860
char *translated = translate_newlines(newtok, 0, tok);
861
PyMem_Free(newtok);
862
if (translated == NULL) {
863
return 0;
864
}
865
newtok = translated;
866
}
867
if (tok->encoding && newtok && *newtok) {
868
/* Recode to UTF-8 */
869
Py_ssize_t buflen;
870
const char* buf;
871
PyObject *u = translate_into_utf8(newtok, tok->encoding);
872
PyMem_Free(newtok);
873
if (u == NULL) {
874
tok->done = E_DECODE;
875
return 0;
876
}
877
buflen = PyBytes_GET_SIZE(u);
878
buf = PyBytes_AS_STRING(u);
879
newtok = PyMem_Malloc(buflen+1);
880
if (newtok == NULL) {
881
Py_DECREF(u);
882
tok->done = E_NOMEM;
883
return 0;
884
}
885
strcpy(newtok, buf);
886
Py_DECREF(u);
887
}
888
if (tok->fp_interactive &&
889
tok_concatenate_interactive_new_line(tok, newtok) == -1) {
890
PyMem_Free(newtok);
891
return 0;
892
}
893
if (tok->nextprompt != NULL) {
894
tok->prompt = tok->nextprompt;
895
}
896
if (newtok == NULL) {
897
tok->done = E_INTR;
898
}
899
else if (*newtok == '\0') {
900
PyMem_Free(newtok);
901
tok->done = E_EOF;
902
}
903
else if (tok->start != NULL) {
904
Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
905
size_t size = strlen(newtok);
906
tok->lineno++;
907
if (!tok_reserve_buf(tok, size + 1)) {
908
PyMem_Free(tok->buf);
909
tok->buf = NULL;
910
PyMem_Free(newtok);
911
return 0;
912
}
913
memcpy(tok->cur, newtok, size + 1);
914
PyMem_Free(newtok);
915
tok->inp += size;
916
tok->multi_line_start = tok->buf + cur_multi_line_start;
917
}
918
else {
919
tok->lineno++;
920
PyMem_Free(tok->buf);
921
tok->buf = newtok;
922
tok->cur = tok->buf;
923
tok->line_start = tok->buf;
924
tok->inp = strchr(tok->buf, '\0');
925
tok->end = tok->inp + 1;
926
}
927
if (tok->done != E_OK) {
928
if (tok->prompt != NULL) {
929
PySys_WriteStderr("\n");
930
}
931
return 0;
932
}
933
return 1;
934
}
935
936
static int
937
tok_underflow_file(struct tok_state *tok) {
938
if (tok->start == NULL) {
939
tok->cur = tok->inp = tok->buf;
940
}
941
if (tok->decoding_state == STATE_INIT) {
942
/* We have not yet determined the encoding.
943
If an encoding is found, use the file-pointer
944
reader functions from now on. */
945
if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
946
error_ret(tok);
947
return 0;
948
}
949
assert(tok->decoding_state != STATE_INIT);
950
}
951
/* Read until '\n' or EOF */
952
if (tok->decoding_readline != NULL) {
953
/* We already have a codec associated with this input. */
954
if (!tok_readline_recode(tok)) {
955
return 0;
956
}
957
}
958
else {
959
/* We want a 'raw' read. */
960
if (!tok_readline_raw(tok)) {
961
return 0;
962
}
963
}
964
if (tok->inp == tok->cur) {
965
tok->done = E_EOF;
966
return 0;
967
}
968
if (tok->inp[-1] != '\n') {
969
/* Last line does not end in \n, fake one */
970
*tok->inp++ = '\n';
971
*tok->inp = '\0';
972
}
973
974
tok->lineno++;
975
if (tok->decoding_state != STATE_NORMAL) {
976
if (tok->lineno > 2) {
977
tok->decoding_state = STATE_NORMAL;
978
}
979
else if (!check_coding_spec(tok->cur, strlen(tok->cur),
980
tok, fp_setreadl))
981
{
982
return 0;
983
}
984
}
985
/* The default encoding is UTF-8, so make sure we don't have any
986
non-UTF-8 sequences in it. */
987
if (!tok->encoding
988
&& (tok->decoding_state != STATE_NORMAL || tok->lineno >= 2)) {
989
if (!ensure_utf8(tok->cur, tok)) {
990
error_ret(tok);
991
return 0;
992
}
993
}
994
assert(tok->done == E_OK);
995
return tok->done == E_OK;
996
}
997
998
static void
999
print_escape(FILE *f, const char *s, Py_ssize_t size)
1000
{