Skip to content

Latest commit

 

History

History
2132 lines (1976 loc) · 61.5 KB

File metadata and controls

2132 lines (1976 loc) · 61.5 KB
 
Feb 19, 1991
Feb 19, 1991
1
Oct 14, 1990
Oct 14, 1990
2
/* Tokenizer implementation */
3
Feb 12, 2020
Feb 12, 2020
4
#define PY_SSIZE_T_CLEAN
Apr 14, 2002
Apr 14, 2002
5
#include "Python.h"
Oct 12, 2021
Oct 12, 2021
6
#include "pycore_call.h" // _PyObject_CallNoArgs()
Oct 14, 1990
Oct 14, 1990
7
8
#include <ctype.h>
Aug 4, 2002
Aug 4, 2002
9
#include <assert.h>
Oct 14, 1990
Oct 14, 1990
10
11
#include "tokenizer.h"
12
#include "errcode.h"
13
Aug 4, 2002
Aug 4, 2002
14
#include "unicodeobject.h"
May 26, 2008
May 26, 2008
15
#include "bytesobject.h"
Aug 4, 2002
Aug 4, 2002
16
#include "fileobject.h"
17
#include "abstract.h"
18
Nov 17, 2017
Nov 17, 2017
19
/* Alternate tab spacing */
20
#define ALTTABSIZE 1
21
Jun 10, 2007
Jun 10, 2007
22
/* True if C may begin an identifier: an ASCII letter, '_', or any value
   >= 128 (no further validation of such values is done here).
   Every use of the argument is parenthesized so that compound expressions
   (e.g. a conditional expression) are grouped correctly.  The argument is
   still evaluated several times: do not pass expressions with side
   effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Jun 10, 2007
Jun 10, 2007
27
28
/* True if C may appear inside an identifier: an ASCII letter or digit, '_',
   or any value >= 128 (no further validation of such values is done here).
   Every use of the argument is parenthesized so that compound expressions
   are grouped correctly.  The argument is still evaluated several times:
   do not pass expressions with side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Jun 10, 2007
Jun 10, 2007
34
Aug 29, 1994
Aug 29, 1994
35
Feb 26, 1992
Feb 26, 1992
36
/* Don't ever change this -- it would break the portability of Python code */
Oct 14, 1990
Oct 14, 1990
37
#define TABSIZE 8
38
Dec 20, 1990
Dec 20, 1990
39
/* Forward */
Jul 9, 2000
Jul 9, 2000
40
static struct tok_state *tok_new(void);
41
static int tok_nextc(struct tok_state *tok);
42
static void tok_backup(struct tok_state *tok, int c);
Dec 20, 1990
Dec 20, 1990
43
Oct 20, 2007
Oct 20, 2007
44
Jan 31, 2019
Jan 31, 2019
45
/* Spaces in this constant are treated as "zero or more spaces or tabs" when
46
tokenizing. */
47
static const char* type_comment_prefix = "# type: ";
48
Oct 14, 1990
Oct 14, 1990
49
/* Create and initialize a new tok_state structure */
50
51
static struct tok_state *
Jul 22, 2000
Jul 22, 2000
52
tok_new(void)
Oct 14, 1990
Oct 14, 1990
53
{
Dec 1, 2020
Dec 1, 2020
54
struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
May 9, 2010
May 9, 2010
55
sizeof(struct tok_state));
56
if (tok == NULL)
57
return NULL;
Feb 28, 2020
Feb 28, 2020
58
tok->buf = tok->cur = tok->inp = NULL;
Mar 14, 2021
Mar 14, 2021
59
tok->fp_interactive = 0;
60
tok->interactive_src_start = NULL;
61
tok->interactive_src_end = NULL;
Feb 28, 2020
Feb 28, 2020
62
tok->start = NULL;
63
tok->end = NULL;
May 9, 2010
May 9, 2010
64
tok->done = E_OK;
65
tok->fp = NULL;
66
tok->input = NULL;
67
tok->tabsize = TABSIZE;
68
tok->indent = 0;
69
tok->indstack[0] = 0;
70
tok->atbol = 1;
71
tok->pendin = 0;
72
tok->prompt = tok->nextprompt = NULL;
73
tok->lineno = 0;
74
tok->level = 0;
75
tok->altindstack[0] = 0;
76
tok->decoding_state = STATE_INIT;
77
tok->decoding_erred = 0;
78
tok->enc = NULL;
79
tok->encoding = NULL;
80
tok->cont_line = 0;
Apr 4, 2011
Apr 4, 2011
81
tok->filename = NULL;
May 9, 2010
May 9, 2010
82
tok->decoding_readline = NULL;
83
tok->decoding_buffer = NULL;
Jan 31, 2019
Jan 31, 2019
84
tok->type_comments = 0;
Mar 7, 2019
Mar 7, 2019
85
tok->async_hacks = 0;
86
tok->async_def = 0;
87
tok->async_def_indent = 0;
88
tok->async_def_nl = 0;
May 22, 2021
May 22, 2021
89
tok->interactive_underflow = IUNDERFLOW_NORMAL;
Mar 7, 2019
Mar 7, 2019
90
May 9, 2010
May 9, 2010
91
return tok;
Oct 14, 1990
Oct 14, 1990
92
}
93
Nov 13, 2009
Nov 13, 2009
94
static char *
Jul 16, 2013
Jul 16, 2013
95
new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Nov 13, 2009
Nov 13, 2009
96
{
Dec 1, 2020
Dec 1, 2020
97
char* result = (char *)PyMem_Malloc(len + 1);
Jul 16, 2013
Jul 16, 2013
98
if (!result) {
99
tok->done = E_NOMEM;
100
return NULL;
May 9, 2010
May 9, 2010
101
}
Jul 16, 2013
Jul 16, 2013
102
memcpy(result, s, len);
103
result[len] = '\0';
May 9, 2010
May 9, 2010
104
return result;
Nov 13, 2009
Nov 13, 2009
105
}
106
Aug 4, 2002
Aug 4, 2002
107
static char *
108
error_ret(struct tok_state *tok) /* XXX */
109
{
May 9, 2010
May 9, 2010
110
tok->decoding_erred = 1;
Oct 13, 2021
Oct 13, 2021
111
if (tok->fp != NULL && tok->buf != NULL) /* see _PyTokenizer_Free */
Dec 1, 2020
Dec 1, 2020
112
PyMem_Free(tok->buf);
Feb 28, 2020
Feb 28, 2020
113
tok->buf = tok->cur = tok->inp = NULL;
114
tok->start = NULL;
115
tok->end = NULL;
Nov 14, 2015
Nov 14, 2015
116
tok->done = E_DECODE;
May 9, 2010
May 9, 2010
117
return NULL; /* as if it were EOF */
Aug 4, 2002
Aug 4, 2002
118
}
119
120
Dec 25, 2015
Dec 25, 2015
121
/* Map the common spellings of utf-8 and latin-1 to a canonical name.

   Only the first 12 characters of S are examined: they are lower-cased and
   '_' is replaced by '-'.  If the result is "utf-8" or starts with
   "utf-8-", the static string "utf-8" is returned.  If it is one of
   "latin-1", "iso-8859-1", "iso-latin-1", or starts with one of those
   followed by '-', the static string "iso-8859-1" is returned.  Otherwise
   S itself is returned, so callers can detect "no change" by pointer
   comparison. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative value (other than
           EOF) to tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
149
150
/* Return the coding spec in S, or NULL if none is found. */
151
Jul 16, 2013
Jul 16, 2013
152
static int
153
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Aug 4, 2002
Aug 4, 2002
154
{
May 9, 2010
May 9, 2010
155
Py_ssize_t i;
Jul 16, 2013
Jul 16, 2013
156
*spec = NULL;
May 9, 2010
May 9, 2010
157
/* Coding spec must be in a comment, and that comment must be
158
* the only statement on the source code line. */
159
for (i = 0; i < size - 6; i++) {
160
if (s[i] == '#')
161
break;
162
if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Jul 16, 2013
Jul 16, 2013
163
return 1;
May 9, 2010
May 9, 2010
164
}
165
for (; i < size - 6; i++) { /* XXX inefficient search */
166
const char* t = s + i;
Mar 28, 2021
Mar 28, 2021
167
if (memcmp(t, "coding", 6) == 0) {
May 9, 2010
May 9, 2010
168
const char* begin = NULL;
169
t += 6;
170
if (t[0] != ':' && t[0] != '=')
171
continue;
172
do {
173
t++;
Mar 28, 2021
Mar 28, 2021
174
} while (t[0] == ' ' || t[0] == '\t');
May 9, 2010
May 9, 2010
175
176
begin = t;
177
while (Py_ISALNUM(t[0]) ||
178
t[0] == '-' || t[0] == '_' || t[0] == '.')
179
t++;
180
181
if (begin < t) {
Jul 16, 2013
Jul 16, 2013
182
char* r = new_string(begin, t - begin, tok);
Dec 25, 2015
Dec 25, 2015
183
const char* q;
Jul 16, 2013
Jul 16, 2013
184
if (!r)
185
return 0;
Jul 16, 2013
Jul 16, 2013
186
q = get_normal_name(r);
May 9, 2010
May 9, 2010
187
if (r != q) {
Dec 1, 2020
Dec 1, 2020
188
PyMem_Free(r);
Jul 16, 2013
Jul 16, 2013
189
r = new_string(q, strlen(q), tok);
190
if (!r)
191
return 0;
May 9, 2010
May 9, 2010
192
}
Jul 16, 2013
Jul 16, 2013
193
*spec = r;
Mar 20, 2016
Mar 20, 2016
194
break;
May 9, 2010
May 9, 2010
195
}
196
}
197
}
Jul 16, 2013
Jul 16, 2013
198
return 1;
Aug 4, 2002
Aug 4, 2002
199
}
200
201
/* Check whether the line contains a coding spec. If it does,
202
invoke the set_readline function for the new encoding.
203
This function receives the tok_state and the new encoding.
204
Return 1 on success, 0 on failure. */
205
206
static int
Feb 15, 2006
Feb 15, 2006
207
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
May 9, 2010
May 9, 2010
208
int set_readline(struct tok_state *, const char *))
Aug 4, 2002
Aug 4, 2002
209
{
Jul 16, 2013
Jul 16, 2013
210
char *cs;
Jan 9, 2014
Jan 9, 2014
211
if (tok->cont_line) {
May 9, 2010
May 9, 2010
212
/* It's a continuation line, so it can't be a coding spec. */
Mar 28, 2021
Mar 28, 2021
213
tok->decoding_state = STATE_NORMAL;
May 9, 2010
May 9, 2010
214
return 1;
Jan 9, 2014
Jan 9, 2014
215
}
Mar 28, 2021
Mar 28, 2021
216
if (!get_coding_spec(line, &cs, size, tok)) {
Jul 16, 2013
Jul 16, 2013
217
return 0;
Mar 28, 2021
Mar 28, 2021
218
}
Jan 9, 2014
Jan 9, 2014
219
if (!cs) {
220
Py_ssize_t i;
221
for (i = 0; i < size; i++) {
222
if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
223
break;
224
if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
225
/* Stop checking coding spec after a line containing
226
* anything except a comment. */
Mar 28, 2021
Mar 28, 2021
227
tok->decoding_state = STATE_NORMAL;
Jan 9, 2014
Jan 9, 2014
228
break;
229
}
230
}
Jul 16, 2013
Jul 16, 2013
231
return 1;
Jan 9, 2014
Jan 9, 2014
232
}
Mar 28, 2021
Mar 28, 2021
233
tok->decoding_state = STATE_NORMAL;
Jul 16, 2013
Jul 16, 2013
234
if (tok->encoding == NULL) {
Mar 28, 2021
Mar 28, 2021
235
assert(tok->decoding_readline == NULL);
236
if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
237
error_ret(tok);
238
PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
239
PyMem_Free(cs);
240
return 0;
May 9, 2010
May 9, 2010
241
}
Mar 28, 2021
Mar 28, 2021
242
tok->encoding = cs;
Jul 16, 2013
Jul 16, 2013
243
} else { /* then, compare cs with BOM */
Mar 28, 2021
Mar 28, 2021
244
if (strcmp(tok->encoding, cs) != 0) {
245
error_ret(tok);
Jul 16, 2013
Jul 16, 2013
246
PyErr_Format(PyExc_SyntaxError,
247
"encoding problem: %s with BOM", cs);
Mar 28, 2021
Mar 28, 2021
248
PyMem_Free(cs);
249
return 0;
250
}
Dec 1, 2020
Dec 1, 2020
251
PyMem_Free(cs);
May 9, 2010
May 9, 2010
252
}
Mar 28, 2021
Mar 28, 2021
253
return 1;
Aug 4, 2002
Aug 4, 2002
254
}
255
256
/* See whether the file starts with a BOM. If it does,
257
invoke the set_readline function with the new encoding.
258
Return 1 on success, 0 on failure. */
259
260
static int
261
check_bom(int get_char(struct tok_state *),
May 9, 2010
May 9, 2010
262
void unget_char(int, struct tok_state *),
263
int set_readline(struct tok_state *, const char *),
264
struct tok_state *tok)
Aug 4, 2002
Aug 4, 2002
265
{
May 9, 2010
May 9, 2010
266
int ch1, ch2, ch3;
267
ch1 = get_char(tok);
Mar 28, 2021
Mar 28, 2021
268
tok->decoding_state = STATE_SEEK_CODING;
May 9, 2010
May 9, 2010
269
if (ch1 == EOF) {
270
return 1;
271
} else if (ch1 == 0xEF) {
272
ch2 = get_char(tok);
273
if (ch2 != 0xBB) {
274
unget_char(ch2, tok);
275
unget_char(ch1, tok);
276
return 1;
277
}
278
ch3 = get_char(tok);
279
if (ch3 != 0xBF) {
280
unget_char(ch3, tok);
281
unget_char(ch2, tok);
282
unget_char(ch1, tok);
283
return 1;
284
}
Aug 4, 2002
Aug 4, 2002
285
#if 0
May 9, 2010
May 9, 2010
286
/* Disable support for UTF-16 BOMs until a decision
287
is made whether this needs to be supported. */
288
} else if (ch1 == 0xFE) {
289
ch2 = get_char(tok);
290
if (ch2 != 0xFF) {
291
unget_char(ch2, tok);
292
unget_char(ch1, tok);
293
return 1;
294
}
295
if (!set_readline(tok, "utf-16-be"))
296
return 0;
297
tok->decoding_state = STATE_NORMAL;
298
} else if (ch1 == 0xFF) {
299
ch2 = get_char(tok);
300
if (ch2 != 0xFE) {
301
unget_char(ch2, tok);
302
unget_char(ch1, tok);
303
return 1;
304
}
305
if (!set_readline(tok, "utf-16-le"))
306
return 0;
307
tok->decoding_state = STATE_NORMAL;
Aug 4, 2002
Aug 4, 2002
308
#endif
May 9, 2010
May 9, 2010
309
} else {
310
unget_char(ch1, tok);
311
return 1;
312
}
313
if (tok->encoding != NULL)
Dec 1, 2020
Dec 1, 2020
314
PyMem_Free(tok->encoding);
Jul 16, 2013
Jul 16, 2013
315
tok->encoding = new_string("utf-8", 5, tok);
316
if (!tok->encoding)
317
return 0;
May 9, 2010
May 9, 2010
318
/* No need to set_readline: input is already utf-8 */
319
return 1;
Aug 4, 2002
Aug 4, 2002
320
}
321
Mar 28, 2021
Mar 28, 2021
322
static int
323
tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
Mar 14, 2021
Mar 14, 2021
324
assert(tok->fp_interactive);
325
326
if (!line) {
327
return 0;
328
}
329
330
Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
331
Py_ssize_t line_size = strlen(line);
332
char* new_str = tok->interactive_src_start;
333
334
new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
335
if (!new_str) {
336
if (tok->interactive_src_start) {
337
PyMem_Free(tok->interactive_src_start);
338
}
339
tok->interactive_src_start = NULL;
340
tok->interactive_src_end = NULL;
341
tok->done = E_NOMEM;
342
return -1;
343
}
344
strcpy(new_str + current_size, line);
345
346
tok->interactive_src_start = new_str;
347
tok->interactive_src_end = new_str + current_size + line_size;
348
return 0;
349
}
350
351
Aug 4, 2002
Aug 4, 2002
352
/* Read a line of text from TOK into S, using the stream in TOK.
Jul 12, 2005
Jul 12, 2005
353
Return NULL on failure, else S.
Apr 21, 2006
Apr 21, 2006
354
Jul 12, 2005
Jul 12, 2005
355
On entry, tok->decoding_buffer will be one of:
356
1) NULL: need to call tok->decoding_readline to get a new line
357
2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
May 9, 2010
May 9, 2010
358
stored the result in tok->decoding_buffer
Mar 28, 2021
Mar 28, 2021
359
3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
May 9, 2010
May 9, 2010
360
(in the s buffer) to copy entire contents of the line read
361
by tok->decoding_readline. tok->decoding_buffer has the overflow.
Mar 28, 2021
Mar 28, 2021
362
In this case, tok_readline_recode is called in a loop (with an expanded buffer)
May 9, 2010
May 9, 2010
363
until the buffer ends with a '\n' (or until the end of the file is
Mar 28, 2021
Mar 28, 2021
364
reached): see tok_nextc and its calls to tok_reserve_buf.
Jul 12, 2005
Jul 12, 2005
365
*/
Aug 4, 2002
Aug 4, 2002
366
Mar 28, 2021
Mar 28, 2021
367
static int
368
tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
Aug 4, 2002
Aug 4, 2002
369
{
Mar 28, 2021
Mar 28, 2021
370
Py_ssize_t cur = tok->cur - tok->buf;
371
Py_ssize_t oldsize = tok->inp - tok->buf;
372
Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
373
if (newsize > tok->end - tok->buf) {
374
char *newbuf = tok->buf;
375
Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
Jun 12, 2021
Jun 12, 2021
376
Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
377
Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
Mar 28, 2021
Mar 28, 2021
378
newbuf = (char *)PyMem_Realloc(newbuf, newsize);
379
if (newbuf == NULL) {
380
tok->done = E_NOMEM;
381
return 0;
May 9, 2010
May 9, 2010
382
}
Mar 28, 2021
Mar 28, 2021
383
tok->buf = newbuf;
384
tok->cur = tok->buf + cur;
385
tok->inp = tok->buf + oldsize;
386
tok->end = tok->buf + newsize;
387
tok->start = start < 0 ? NULL : tok->buf + start;
Jun 12, 2021
Jun 12, 2021
388
tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
389
tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
May 9, 2010
May 9, 2010
390
}
Mar 28, 2021
Mar 28, 2021
391
return 1;
392
}
May 9, 2010
May 9, 2010
393
Mar 28, 2021
Mar 28, 2021
394
static int
395
tok_readline_recode(struct tok_state *tok) {
396
PyObject *line;
397
const char *buf;
398
Py_ssize_t buflen;
399
line = tok->decoding_buffer;
400
if (line == NULL) {
401
line = PyObject_CallNoArgs(tok->decoding_readline);
402
if (line == NULL) {
403
error_ret(tok);
May 9, 2010
May 9, 2010
404
goto error;
Mar 28, 2021
Mar 28, 2021
405
}
May 9, 2010
May 9, 2010
406
}
Mar 28, 2021
Mar 28, 2021
407
else {
May 9, 2010
May 9, 2010
408
tok->decoding_buffer = NULL;
Mar 28, 2021
Mar 28, 2021
409
}
410
buf = PyUnicode_AsUTF8AndSize(line, &buflen);
411
if (buf == NULL) {
412
error_ret(tok);
413
goto error;
414
}
415
if (!tok_reserve_buf(tok, buflen + 1)) {
416
goto error;
417
}
418
memcpy(tok->inp, buf, buflen);
419
tok->inp += buflen;
420
*tok->inp = '\0';
421
if (tok->fp_interactive &&
422
tok_concatenate_interactive_new_line(tok, buf) == -1) {
423
goto error;
424
}
425
Py_DECREF(line);
426
return 1;
Aug 12, 2007
Aug 12, 2007
427
error:
Mar 28, 2021
Mar 28, 2021
428
Py_XDECREF(line);
429
return 0;
Aug 4, 2002
Aug 4, 2002
430
}
431
432
/* Set the readline function for TOK to a StreamReader's
433
readline function. The StreamReader is named ENC.
434
435
This function is called from check_bom and check_coding_spec.
436
437
ENC is usually identical to the future value of tok->encoding,
438
except for the (currently unsupported) case of UTF-16.
439
440
Return 1 on success, 0 on failure. */
441
442
static int
443
fp_setreadl(struct tok_state *tok, const char* enc)
444
{
Sep 13, 2016
Sep 13, 2016
445
PyObject *readline, *io, *stream;
Oct 14, 2011
Oct 14, 2011
446
_Py_IDENTIFIER(open);
447
_Py_IDENTIFIER(readline);
Oct 14, 2010
Oct 14, 2010
448
int fd;
Jan 9, 2014
Jan 9, 2014
449
long pos;
May 9, 2010
May 9, 2010
450
Oct 14, 2010
Oct 14, 2010
451
fd = fileno(tok->fp);
Jan 9, 2014
Jan 9, 2014
452
/* Due to buffering the file offset for fd can be different from the file
Feb 28, 2014
Feb 28, 2014
453
* position of tok->fp. If tok->fp was opened in text mode on Windows,
454
* its file position counts CRLF as one char and can't be directly mapped
455
* to the file offset for fd. Instead we step back one byte and read to
456
* the end of line.*/
Jan 9, 2014
Jan 9, 2014
457
pos = ftell(tok->fp);
Feb 28, 2014
Feb 28, 2014
458
if (pos == -1 ||
459
lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Oct 14, 2010
Oct 14, 2010
460
PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
Sep 13, 2016
Sep 13, 2016
461
return 0;
Oct 14, 2010
Oct 14, 2010
462
}
463
Sep 13, 2016
Sep 13, 2016
464
io = PyImport_ImportModuleNoBlock("io");
465
if (io == NULL)
466
return 0;
467
Oct 9, 2011
Oct 9, 2011
468
stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Oct 14, 2010
Oct 14, 2010
469
fd, "r", -1, enc, Py_None, Py_None, Py_False);
Sep 13, 2016
Sep 13, 2016
470
Py_DECREF(io);
May 9, 2010
May 9, 2010
471
if (stream == NULL)
Sep 13, 2016
Sep 13, 2016
472
return 0;
May 9, 2010
May 9, 2010
473
Oct 10, 2011
Oct 10, 2011
474
readline = _PyObject_GetAttrId(stream, &PyId_readline);
Sep 13, 2016
Sep 13, 2016
475
Py_DECREF(stream);
476
if (readline == NULL)
477
return 0;
Apr 6, 2016
Apr 6, 2016
478
Py_XSETREF(tok->decoding_readline, readline);
Sep 13, 2016
Sep 13, 2016
479
Feb 28, 2014
Feb 28, 2014
480
if (pos > 0) {
Oct 11, 2021
Oct 11, 2021
481
PyObject *bufobj = _PyObject_CallNoArgs(readline);
Sep 13, 2016
Sep 13, 2016
482
if (bufobj == NULL)
483
return 0;
484
Py_DECREF(bufobj);
Feb 28, 2014
Feb 28, 2014
485
}
May 9, 2010
May 9, 2010
486
Sep 13, 2016
Sep 13, 2016
487
return 1;
Aug 4, 2002
Aug 4, 2002
488
}
489
490
/* Fetch the next byte from TOK. */
491
492
static int fp_getc(struct tok_state *tok) {
May 9, 2010
May 9, 2010
493
return getc(tok->fp);
Aug 4, 2002
Aug 4, 2002
494
}
495
496
/* Unfetch the last byte back into TOK. */
497
498
static void fp_ungetc(int c, struct tok_state *tok) {
May 9, 2010
May 9, 2010
499
ungetc(c, tok->fp);
Aug 4, 2002
Aug 4, 2002
500
}
501
Jul 29, 2007
Jul 29, 2007
502
/* Check whether the bytes at s start a structurally valid UTF-8 sequence
   (a lead byte followed by the right number of continuation bytes).
   Return the number of bytes forming the sequence if yes, 0 if not.

   Continuation bytes are examined in order, so for a sequence truncated by
   the terminating NUL the scan stops at the NUL and never looks at memory
   beyond the end of the string. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++) {
        /* A continuation byte must be in the range 0x80..0xBF. */
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    }
    return expected + 1;
}
529
Mar 28, 2021
Mar 28, 2021
530
static int
531
ensure_utf8(char *line, struct tok_state *tok)
Aug 4, 2002
Aug 4, 2002
532
{
May 9, 2010
May 9, 2010
533
int badchar = 0;
Mar 28, 2021
Mar 28, 2021
534
unsigned char *c;
535
int length;
536
for (c = (unsigned char *)line; *c; c += length) {
537
if (!(length = valid_utf8(c))) {
538
badchar = *c;
May 9, 2010
May 9, 2010
539
break;
540
}
541
}
542
if (badchar) {
543
/* Need to add 1 to the line number, since this line
Mar 28, 2021
Mar 28, 2021
544
has not been counted, yet. */
Apr 25, 2011
Apr 25, 2011
545
PyErr_Format(PyExc_SyntaxError,
Mar 28, 2021
Mar 28, 2021
546
"Non-UTF-8 code starting with '\\x%.2x' "
547
"in file %U on line %i, "
548
"but no encoding declared; "
Jul 30, 2021
Jul 30, 2021
549
"see https://python.org/dev/peps/pep-0263/ for details",
Mar 28, 2021
Mar 28, 2021
550
badchar, tok->filename, tok->lineno + 1);
551
return 0;
May 9, 2010
May 9, 2010
552
}
Mar 28, 2021
Mar 28, 2021
553
return 1;
Aug 4, 2002
Aug 4, 2002
554
}
555
556
/* Fetch a byte from TOK, using the string buffer. */
557
Apr 21, 2006
Apr 21, 2006
558
static int
559
buf_getc(struct tok_state *tok) {
May 9, 2010
May 9, 2010
560
return Py_CHARMASK(*tok->str++);
Aug 4, 2002
Aug 4, 2002
561
}
562
563
/* Unfetch a byte from TOK, using the string buffer. */
564
Apr 21, 2006
Apr 21, 2006
565
static void
566
buf_ungetc(int c, struct tok_state *tok) {
May 9, 2010
May 9, 2010
567
tok->str--;
568
assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Aug 4, 2002
Aug 4, 2002
569
}
570
571
/* Set the readline function for TOK to ENC. For the string-based
572
tokenizer, this means to just record the encoding. */
573
Apr 21, 2006
Apr 21, 2006
574
static int
575
buf_setreadl(struct tok_state *tok, const char* enc) {
May 9, 2010
May 9, 2010
576
tok->enc = enc;
577
return 1;
Aug 4, 2002
Aug 4, 2002
578
}
579
580
/* Return a UTF-8 encoding Python string object from the
581
C byte string STR, which is encoded with ENC. */
582
583
static PyObject *
584
translate_into_utf8(const char* str, const char* enc) {
May 9, 2010
May 9, 2010
585
PyObject *utf8;
586
PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
587
if (buf == NULL)
588
return NULL;
589
utf8 = PyUnicode_AsUTF8String(buf);
590
Py_DECREF(buf);
591
return utf8;
Aug 4, 2002
Aug 4, 2002
592
}
593
Nov 13, 2009
Nov 13, 2009
594
595
static char *
596
translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Jun 4, 2013
Jun 4, 2013
597
int skip_next_lf = 0;
598
size_t needed_length = strlen(s) + 2, final_length;
May 9, 2010
May 9, 2010
599
char *buf, *current;
600
char c = '\0';
Dec 1, 2020
Dec 1, 2020
601
buf = PyMem_Malloc(needed_length);
May 9, 2010
May 9, 2010
602
if (buf == NULL) {
603
tok->done = E_NOMEM;
604
return NULL;
605
}
606
for (current = buf; *s; s++, current++) {
607
c = *s;
608
if (skip_next_lf) {
609
skip_next_lf = 0;
610
if (c == '\n') {
611
c = *++s;
612
if (!c)
613
break;
614
}
615
}
616
if (c == '\r') {
617
skip_next_lf = 1;
618
c = '\n';
619
}
620
*current = c;
621
}
622
/* If this is exec input, add a newline to the end of the string if
623
there isn't one already. */
624
if (exec_input && c != '\n') {
625
*current = '\n';
626
current++;
627
}
628
*current = '\0';
629
final_length = current - buf + 1;
Mar 19, 2019
Mar 19, 2019
630
if (final_length < needed_length && final_length) {
May 9, 2010
May 9, 2010
631
/* should never fail */
Dec 1, 2020
Dec 1, 2020
632
char* result = PyMem_Realloc(buf, final_length);
Mar 19, 2019
Mar 19, 2019
633
if (result == NULL) {
Dec 1, 2020
Dec 1, 2020
634
PyMem_Free(buf);
Mar 19, 2019
Mar 19, 2019
635
}
636
buf = result;
637
}
May 9, 2010
May 9, 2010
638
return buf;
Nov 13, 2009
Nov 13, 2009
639
}
640
Aug 4, 2002
Aug 4, 2002
641
/* Decode a byte string STR for use as the buffer of TOK.
642
Look for encoding declarations inside STR, and record them
643
inside TOK. */
644
Feb 28, 2020
Feb 28, 2020
645
static char *
Nov 13, 2009
Nov 13, 2009
646
decode_str(const char *input, int single, struct tok_state *tok)
Aug 4, 2002
Aug 4, 2002
647
{
May 9, 2010
May 9, 2010
648
PyObject* utf8 = NULL;
Feb 28, 2020
Feb 28, 2020
649
char *str;
May 9, 2010
May 9, 2010
650
const char *s;
651
const char *newl[2] = {NULL, NULL};
652
int lineno = 0;
653
tok->input = str = translate_newlines(input, single, tok);
654
if (str == NULL)
655
return NULL;
656
tok->enc = NULL;
657
tok->str = str;
658
if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
659
return error_ret(tok);
660
str = tok->str; /* string after BOM if any */
661
assert(str);
662
if (tok->enc != NULL) {
663
utf8 = translate_into_utf8(str, tok->enc);
664
if (utf8 == NULL)
665
return error_ret(tok);
666
str = PyBytes_AsString(utf8);
667
}
668
for (s = str;; s++) {
669
if (*s == '\0') break;
670
else if (*s == '\n') {
671
assert(lineno < 2);
672
newl[lineno] = s;
673
lineno++;
674
if (lineno == 2) break;
675
}
676
}
677
tok->enc = NULL;
678
/* need to check line 1 and 2 separately since check_coding_spec
679
assumes a single line as input */
680
if (newl[0]) {
Mar 28, 2021
Mar 28, 2021
681
if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
682
return NULL;
683
}
684
if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
May 9, 2010
May 9, 2010
685
if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
686
tok, buf_setreadl))
Mar 28, 2021
Mar 28, 2021
687
return NULL;
May 9, 2010
May 9, 2010
688
}
689
}
690
if (tok->enc != NULL) {
691
assert(utf8 == NULL);
692
utf8 = translate_into_utf8(str, tok->enc);
693
if (utf8 == NULL)
694
return error_ret(tok);
695
str = PyBytes_AS_STRING(utf8);
696
}
697
assert(tok->decoding_buffer == NULL);
698
tok->decoding_buffer = utf8; /* CAUTION */
699
return str;
Aug 4, 2002
Aug 4, 2002
700
}
701
Oct 14, 1990
Oct 14, 1990
702
/* Set up tokenizer for string */
703
704
struct tok_state *
Oct 13, 2021
Oct 13, 2021
705
_PyTokenizer_FromString(const char *str, int exec_input)
Oct 14, 1990
Oct 14, 1990
706
{
May 9, 2010
May 9, 2010
707
struct tok_state *tok = tok_new();
Feb 28, 2020
Feb 28, 2020
708
char *decoded;
709
May 9, 2010
May 9, 2010
710
if (tok == NULL)
711
return NULL;
Feb 28, 2020
Feb 28, 2020
712
decoded = decode_str(str, exec_input, tok);
713
if (decoded == NULL) {
Oct 13, 2021
Oct 13, 2021
714
_PyTokenizer_Free(tok);
May 9, 2010
May 9, 2010
715
return NULL;
716
}
717
Feb 28, 2020
Feb 28, 2020
718
tok->buf = tok->cur = tok->inp = decoded;
719
tok->end = decoded;
May 9, 2010
May 9, 2010
720
return tok;
Oct 14, 1990
Oct 14, 1990
721
}
722
Mar 28, 2021
Mar 28, 2021
723
/* Set up tokenizer for UTF-8 string */
724
Mar 2, 2009
Mar 2, 2009
725
struct tok_state *
Oct 13, 2021
Oct 13, 2021
726
_PyTokenizer_FromUTF8(const char *str, int exec_input)
Mar 2, 2009
Mar 2, 2009
727
{
May 9, 2010
May 9, 2010
728
struct tok_state *tok = tok_new();
Feb 28, 2020
Feb 28, 2020
729
char *translated;
May 9, 2010
May 9, 2010
730
if (tok == NULL)
731
return NULL;
Feb 28, 2020
Feb 28, 2020
732
tok->input = translated = translate_newlines(str, exec_input, tok);
733
if (translated == NULL) {
Oct 13, 2021
Oct 13, 2021
734
_PyTokenizer_Free(tok);
May 9, 2010
May 9, 2010
735
return NULL;
736
}
Mar 28, 2021
Mar 28, 2021
737
tok->decoding_state = STATE_NORMAL;
May 9, 2010
May 9, 2010
738
tok->enc = NULL;
Feb 28, 2020
Feb 28, 2020
739
tok->str = translated;
Mar 28, 2021
Mar 28, 2021
740
tok->encoding = new_string("utf-8", 5, tok);
May 9, 2010
May 9, 2010
741
if (!tok->encoding) {
Oct 13, 2021
Oct 13, 2021
742
_PyTokenizer_Free(tok);
May 9, 2010
May 9, 2010
743
return NULL;
744
}
745
Feb 28, 2020
Feb 28, 2020
746
tok->buf = tok->cur = tok->inp = translated;
747
tok->end = translated;
May 9, 2010
May 9, 2010
748
return tok;
Mar 2, 2009
Mar 2, 2009
749
}
750
Jul 27, 1991
Jul 27, 1991
751
/* Set up tokenizer for file */
Oct 14, 1990
Oct 14, 1990
752
753
struct tok_state *
Oct 13, 2021
Oct 13, 2021
754
_PyTokenizer_FromFile(FILE *fp, const char* enc,
755
const char *ps1, const char *ps2)
Oct 14, 1990
Oct 14, 1990
756
{
May 9, 2010
May 9, 2010
757
struct tok_state *tok = tok_new();
758
if (tok == NULL)
759
return NULL;
Dec 1, 2020
Dec 1, 2020
760
if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
Oct 13, 2021
Oct 13, 2021
761
_PyTokenizer_Free(tok);
May 9, 2010
May 9, 2010
762
return NULL;
763
}
764
tok->cur = tok->inp = tok->buf;
765
tok->end = tok->buf + BUFSIZ;
766
tok->fp = fp;
767
tok->prompt = ps1;
768
tok->nextprompt = ps2;
769
if (enc != NULL) {
770
/* Must copy encoding declaration since it
771
gets copied into the parse tree. */
Mar 28, 2021
Mar 28, 2021
772
tok->encoding = new_string(enc, strlen(enc), tok);
May 9, 2010
May 9, 2010
773
if (!tok->encoding) {
Oct 13, 2021
Oct 13, 2021
774
_PyTokenizer_Free(tok);
May 9, 2010
May 9, 2010
775
return NULL;
776
}
777
tok->decoding_state = STATE_NORMAL;
778
}
779
return tok;
Oct 14, 1990
Oct 14, 1990
780
}
781
782
/* Free a tok_state structure */
783
784
void
Oct 13, 2021
Oct 13, 2021
785
_PyTokenizer_Free(struct tok_state *tok)
Oct 14, 1990
Oct 14, 1990
786
{
Mar 14, 2021
Mar 14, 2021
787
if (tok->encoding != NULL) {
Dec 1, 2020
Dec 1, 2020
788
PyMem_Free(tok->encoding);
Mar 14, 2021
Mar 14, 2021
789
}
May 9, 2010
May 9, 2010
790
Py_XDECREF(tok->decoding_readline);
791
Py_XDECREF(tok->decoding_buffer);
Apr 4, 2011
Apr 4, 2011
792
Py_XDECREF(tok->filename);
Mar 14, 2021
Mar 14, 2021
793
if (tok->fp != NULL && tok->buf != NULL) {
Dec 1, 2020
Dec 1, 2020
794
PyMem_Free(tok->buf);
Mar 14, 2021
Mar 14, 2021
795
}
796
if (tok->input) {
Dec 1, 2020
Dec 1, 2020
797
PyMem_Free(tok->input);
Mar 14, 2021
Mar 14, 2021
798
}
799
if (tok->interactive_src_start != NULL) {
800
PyMem_Free(tok->interactive_src_start);
801
}
Dec 1, 2020
Dec 1, 2020
802
PyMem_Free(tok);
Oct 14, 1990
Oct 14, 1990
803
}
804
Mar 28, 2021
Mar 28, 2021
805
static int
806
tok_readline_raw(struct tok_state *tok)
807
{
808
do {
809
if (!tok_reserve_buf(tok, BUFSIZ)) {
810
return 0;
811
}
812
char *line = Py_UniversalNewlineFgets(tok->inp,
813
(int)(tok->end - tok->inp),
814
tok->fp, NULL);
815
if (line == NULL) {
816
return 1;
817
}
818
if (tok->fp_interactive &&
819
tok_concatenate_interactive_new_line(tok, line) == -1) {
820
return 0;
821
}
Mar 29, 2021
Mar 29, 2021
822
if (*tok->inp == '\0') {
823
return 0;
824
}
Mar 28, 2021
Mar 28, 2021
825
tok->inp = strchr(tok->inp, '\0');
826
} while (tok->inp[-1] != '\n');
827
return 1;
828
}
829
830
static int
831
tok_underflow_string(struct tok_state *tok) {
832
char *end = strchr(tok->inp, '\n');
833
if (end != NULL) {
834
end++;
835
}
836
else {
837
end = strchr(tok->inp, '\0');
838
if (end == tok->inp) {
839
tok->done = E_EOF;
840
return 0;
841
}
842
}
843
if (tok->start == NULL) {
844
tok->buf = tok->cur;
845
}
846
tok->line_start = tok->cur;
847
tok->lineno++;
848
tok->inp = end;
849
return 1;
850
}
851
852
static int
853
tok_underflow_interactive(struct tok_state *tok) {
May 22, 2021
May 22, 2021
854
if (tok->interactive_underflow == IUNDERFLOW_STOP) {
855
tok->done = E_INTERACT_STOP;
856
return 1;
857
}
Mar 28, 2021
Mar 28, 2021
858
char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
859
if (newtok != NULL) {
860
char *translated = translate_newlines(newtok, 0, tok);
861
PyMem_Free(newtok);
862
if (translated == NULL) {
863
return 0;
864
}
865
newtok = translated;
866
}
867
if (tok->encoding && newtok && *newtok) {
868
/* Recode to UTF-8 */
869
Py_ssize_t buflen;
870
const char* buf;
871
PyObject *u = translate_into_utf8(newtok, tok->encoding);
872
PyMem_Free(newtok);
873
if (u == NULL) {
874
tok->done = E_DECODE;
875
return 0;
876
}
877
buflen = PyBytes_GET_SIZE(u);
878
buf = PyBytes_AS_STRING(u);
879
newtok = PyMem_Malloc(buflen+1);
880
if (newtok == NULL) {
881
Py_DECREF(u);
882
tok->done = E_NOMEM;
883
return 0;
884
}
885
strcpy(newtok, buf);
886
Py_DECREF(u);
887
}
888
if (tok->fp_interactive &&
889
tok_concatenate_interactive_new_line(tok, newtok) == -1) {
890
PyMem_Free(newtok);
891
return 0;
892
}
893
if (tok->nextprompt != NULL) {
894
tok->prompt = tok->nextprompt;
895
}
896
if (newtok == NULL) {
897
tok->done = E_INTR;
898
}
899
else if (*newtok == '\0') {
900
PyMem_Free(newtok);
901
tok->done = E_EOF;
902
}
903
else if (tok->start != NULL) {
904
Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
905
size_t size = strlen(newtok);
906
tok->lineno++;
907
if (!tok_reserve_buf(tok, size + 1)) {
908
PyMem_Free(tok->buf);
909
tok->buf = NULL;
910
PyMem_Free(newtok);
911
return 0;
912
}
913
memcpy(tok->cur, newtok, size + 1);
914
PyMem_Free(newtok);
915
tok->inp += size;
916
tok->multi_line_start = tok->buf + cur_multi_line_start;
917
}
918
else {
919
tok->lineno++;
920
PyMem_Free(tok->buf);
921
tok->buf = newtok;
922
tok->cur = tok->buf;
923
tok->line_start = tok->buf;
924
tok->inp = strchr(tok->buf, '\0');
925
tok->end = tok->inp + 1;
926
}
927
if (tok->done != E_OK) {
928
if (tok->prompt != NULL) {
929
PySys_WriteStderr("\n");
930
}
931
return 0;
932
}
933
return 1;
934
}
935
936
static int
937
tok_underflow_file(struct tok_state *tok) {
938
if (tok->start == NULL) {
939
tok->cur = tok->inp = tok->buf;
940
}
941
if (tok->decoding_state == STATE_INIT) {
942
/* We have not yet determined the encoding.
943
If an encoding is found, use the file-pointer
944
reader functions from now on. */
945
if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
946
error_ret(tok);
947
return 0;
948
}
949
assert(tok->decoding_state != STATE_INIT);
950
}
951
/* Read until '\n' or EOF */
952
if (tok->decoding_readline != NULL) {
953
/* We already have a codec associated with this input. */
954
if (!tok_readline_recode(tok)) {
955
return 0;
956
}
957
}
958
else {
959
/* We want a 'raw' read. */
960
if (!tok_readline_raw(tok)) {
961
return 0;
962
}
963
}
964
if (tok->inp == tok->cur) {
965
tok->done = E_EOF;
966
return 0;
967
}
968
if (tok->inp[-1] != '\n') {
969
/* Last line does not end in \n, fake one */
970
*tok->inp++ = '\n';
971
*tok->inp = '\0';
972
}
973
974
tok->lineno++;
975
if (tok->decoding_state != STATE_NORMAL) {
976
if (tok->lineno > 2) {
977
tok->decoding_state = STATE_NORMAL;
978
}
Mar 29, 2021
Mar 29, 2021
979
else if (!check_coding_spec(tok->cur, strlen(tok->cur),
Mar 28, 2021
Mar 28, 2021
980
tok, fp_setreadl))
981
{
982
return 0;
983
}
984
}
985
/* The default encoding is UTF-8, so make sure we don't have any
986
non-UTF-8 sequences in it. */
987
if (!tok->encoding
988
&& (tok->decoding_state != STATE_NORMAL || tok->lineno >= 2)) {
989
if (!ensure_utf8(tok->cur, tok)) {
990
error_ret(tok);
991
return 0;
992
}
993
}
994
assert(tok->done == E_OK);
995
return tok->done == E_OK;
996
}
997
998
static void
999
print_escape(FILE *f, const char *s, Py_ssize_t size)
1000
{