Skip to content

Commit 9686545

Browse files
author
Victor Stinner
committed
Issue #10829: Refactor PyUnicode_FromFormat()
* Use the same function to parse the format string in the 3 steps * Fix crashes on invalid format strings
1 parent 096f1a8 commit 9686545

3 files changed

Lines changed: 104 additions & 69 deletions

File tree

Lib/test/test_unicode.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1455,9 +1455,28 @@ def PyUnicode_FromFormat(format, *args):
14551455
'string, got a non-ASCII byte: 0xe9$',
14561456
PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
14571457

1458+
# test "%c"
14581459
self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0xabcd)), '\uabcd')
14591460
self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0x10ffff)), '\U0010ffff')
14601461

1462+
# test "%"
1463+
self.assertEqual(PyUnicode_FromFormat(b'%'), '%')
1464+
self.assertEqual(PyUnicode_FromFormat(b'%%'), '%')
1465+
self.assertEqual(PyUnicode_FromFormat(b'%%s'), '%s')
1466+
self.assertEqual(PyUnicode_FromFormat(b'[%%]'), '[%]')
1467+
self.assertEqual(PyUnicode_FromFormat(b'%%%s', b'abc'), '%abc')
1468+
1469+
# test "%i"
1470+
self.assertEqual(PyUnicode_FromFormat(b'%03i', c_int(10)), '010')
1471+
self.assertEqual(PyUnicode_FromFormat(b'%0.4i', c_int(10)), '0010')
1472+
1473+
# not supported: copy the raw format string. these tests are just here
1474+
# to check for crashes and should not be considered as specifications
1475+
self.assertEqual(PyUnicode_FromFormat(b'%1%s', b'abc'), '%s')
1476+
self.assertEqual(PyUnicode_FromFormat(b'%1abc'), '%1abc')
1477+
self.assertEqual(PyUnicode_FromFormat(b'%+i', c_int(10)), '%+i')
1478+
self.assertEqual(PyUnicode_FromFormat(b'%.%s', b'abc'), '%.%s')
1479+
14611480
# other tests
14621481
text = PyUnicode_FromFormat(b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
14631482
self.assertEqual(text, r"%A:'abc\xe9\uabcd\U0010ffff'")

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ What's New in Python 3.3 Alpha 1?
1010
Core and Builtins
1111
-----------------
1212

13+
- Issue #10829: Refactor PyUnicode_FromFormat(), use the same function to parse
14+
the format string in the 3 steps, fix crashes on invalid format strings.
15+
1316
- Issue #11246: Fix PyUnicode_FromFormat("%V") to decode the byte string from
1417
UTF-8 (with replace error handler) instead of ISO-8859-1 (in strict mode).
1518
Patch written by Ray Allen.

Objects/unicodeobject.c

Lines changed: 82 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -714,6 +714,70 @@ makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
714714
*fmt = '\0';
715715
}
716716

717+
/* Helper for PyUnicode_FromFormatV(): parse the optional "width.precision"
   part and the length modifier ("l", "ll" or "z") of one format directive.

   f must point to the '%' character that starts the directive.

   Each p_* argument may be NULL; when it is not NULL the corresponding
   parsed value is stored through it:
     *p_width        decimal width (0 if absent)
     *p_precision    decimal precision after '.' (0 if absent)
     *p_longflag     1 for "%ld" / "%lu", otherwise 0
     *p_longlongflag 1 for "%lld" / "%llu" (only when HAVE_LONG_LONG is
                     defined), otherwise 0
     *p_size_tflag   1 for "%zd" / "%zu", otherwise 0

   Return a pointer to the character the caller should dispatch on: the
   conversion character for a well-formed directive.  For the bogus
   formats handled below, the pointer is moved back by one character so
   that the caller never steps past the end of the string or over a
   following '%'.

   Width and precision are accumulated in an int without any overflow
   check. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    /* Every output has a defined default: all of them are copied to the
       caller unconditionally at the end of each parsing stage, so none may
       be left uninitialized (size_tflag previously was). */
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;    /* skip the leading '%' */
    while (Py_ISDIGIT((unsigned)*f))
        width = (width*10) + *f++ - '0';
    if (*f == '.') {
        f++;
        while (Py_ISDIGIT((unsigned)*f))
            precision = (precision*10) + *f++ - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %lld and %llu.  The modifier is only consumed when
       it is followed by 'd' or 'u'; otherwise f is left on the 'l'. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
        size_tflag = 1;
        ++f;
    }
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
780+
717781
/* Copy the characters of the NUL-terminated C string `string` through the
   pointer `s`, advancing `s` by one for each character copied; the
   terminating NUL itself is not copied.  The expansion is a braced block
   that relies on variables named `copy` and `s` being declared in the scope
   where the macro is used. */
#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
718782

719783
/* size of fixed-size buffer for formatting single arguments */
@@ -757,15 +821,9 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
757821
* result in an array) */
758822
for (f = format; *f; f++) {
759823
if (*f == '%') {
760-
if (*(f+1)=='%')
761-
continue;
762-
if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A' || *(f+1) == 'V')
763-
++callcount;
764-
while (Py_ISDIGIT((unsigned)*f))
765-
width = (width*10) + *f++ - '0';
766-
while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
767-
;
768-
if (*f == 's')
824+
/* skip width or width.precision (eg. "1.2" of "%1.2f") */
825+
f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
826+
if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
769827
++callcount;
770828
}
771829
else if (128 <= (unsigned char)*f) {
@@ -790,33 +848,13 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
790848
for (f = format; *f; f++) {
791849
if (*f == '%') {
792850
#ifdef HAVE_LONG_LONG
793-
int longlongflag = 0;
851+
int longlongflag;
794852
#endif
795-
const char* p = f;
796-
width = 0;
797-
while (Py_ISDIGIT((unsigned)*f))
798-
width = (width*10) + *f++ - '0';
799-
while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
800-
;
801-
802-
/* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
803-
* they don't affect the amount of space we reserve.
804-
*/
805-
if (*f == 'l') {
806-
if (f[1] == 'd' || f[1] == 'u') {
807-
++f;
808-
}
809-
#ifdef HAVE_LONG_LONG
810-
else if (f[1] == 'l' &&
811-
(f[2] == 'd' || f[2] == 'u')) {
812-
longlongflag = 1;
813-
f += 2;
814-
}
815-
#endif
816-
}
817-
else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
818-
++f;
819-
}
853+
const char* p;
854+
855+
p = f;
856+
f = parse_format_flags(f, &width, NULL,
857+
NULL, &longlongflag, NULL);
820858

821859
switch (*f) {
822860
case 'c':
@@ -981,40 +1019,15 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
9811019

9821020
for (f = format; *f; f++) {
9831021
if (*f == '%') {
984-
const char* p = f++;
985-
int longflag = 0;
986-
int longlongflag = 0;
987-
int size_tflag = 0;
988-
zeropad = (*f == '0');
989-
/* parse the width.precision part */
990-
width = 0;
991-
while (Py_ISDIGIT((unsigned)*f))
992-
width = (width*10) + *f++ - '0';
993-
precision = 0;
994-
if (*f == '.') {
995-
f++;
996-
while (Py_ISDIGIT((unsigned)*f))
997-
precision = (precision*10) + *f++ - '0';
998-
}
999-
/* Handle %ld, %lu, %lld and %llu. */
1000-
if (*f == 'l') {
1001-
if (f[1] == 'd' || f[1] == 'u') {
1002-
longflag = 1;
1003-
++f;
1004-
}
1005-
#ifdef HAVE_LONG_LONG
1006-
else if (f[1] == 'l' &&
1007-
(f[2] == 'd' || f[2] == 'u')) {
1008-
longlongflag = 1;
1009-
f += 2;
1010-
}
1011-
#endif
1012-
}
1013-
/* handle the size_t flag. */
1014-
if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1015-
size_tflag = 1;
1016-
++f;
1017-
}
1022+
const char* p;
1023+
int longflag;
1024+
int longlongflag;
1025+
int size_tflag;
1026+
1027+
p = f;
1028+
zeropad = (f[1] == '0');
1029+
f = parse_format_flags(f, &width, &precision,
1030+
&longflag, &longlongflag, &size_tflag);
10181031

10191032
switch (*f) {
10201033
case 'c':

0 commit comments

Comments
 (0)