Skip to content

Commit bab1d7a

Browse files
gh-74902: Add Unicode Grapheme Cluster Break algorithm (GH-143076)
Add the unicodedata.iter_graphemes() function to iterate over grapheme clusters according to rules defined in Unicode Standard Annex #29. Add unicodedata.grapheme_cluster_break(), unicodedata.indic_conjunct_break() and unicodedata.extended_pictographic() functions to get the properties of the character which are related to the above algorithm. Co-authored-by: Guillaume "Vermeille" Sanchez <[email protected]>
1 parent 0e0d51c commit bab1d7a

File tree

9 files changed

+4359
-3048
lines changed

9 files changed

+4359
-3048
lines changed

Doc/library/unicodedata.rst

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,28 @@ following functions:
184184
'0041 0303'
185185

186186

187+
.. function:: grapheme_cluster_break(chr, /)
188+
189+
Returns the Grapheme_Cluster_Break property assigned to the character *chr* as a string, such as ``'Other'``, ``'Control'`` or ``'Extend'``.
190+
191+
.. versionadded:: next
192+
193+
194+
.. function:: indic_conjunct_break(chr, /)
195+
196+
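Returns the Indic_Conjunct_Break property assigned to the character *chr* as a string, one of ``'Linker'``, ``'Consonant'``, ``'Extend'`` or ``'None'`` (the string ``'None'``, not the ``None`` object).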
197+
198+
.. versionadded:: next
199+
200+
201+
.. function:: extended_pictographic(chr, /)
202+
203+
Returns ``True`` if the character has the Extended_Pictographic property,
204+
``False`` otherwise.
205+
206+
.. versionadded:: next
207+
208+
187209
.. function:: normalize(form, unistr, /)
188210

189211
Return the normal form *form* for the Unicode string *unistr*. Valid values for
@@ -225,6 +247,24 @@ following functions:
225247
.. versionadded:: 3.8
226248

227249

250+
.. function:: iter_graphemes(unistr, start=0, end=sys.maxsize, /)
251+
252+
Returns an iterator over the grapheme clusters of the string *unistr*.
253+
With optional *start*, iteration begins at that position.
254+
With optional *end*, iteration stops at that position. Negative and out-of-range values of *start* and *end* are interpreted as in slice notation.
255+
256+
Converting an emitted item to string returns a substring corresponding to
257+
the grapheme cluster.
258+
Its ``start`` and ``end`` attributes are the indices in *unistr* where
259+
the grapheme cluster starts and ends (``end`` is exclusive, as in a slice).
260+
261+
It uses extended grapheme cluster rules defined by Unicode
262+
Standard Annex #29, `"Unicode Text Segmentation"
263+
<https://www.unicode.org/reports/tr29/>`_.
264+
265+
.. versionadded:: next
266+
267+
228268
In addition, the module exposes the following constant:
229269

230270
.. data:: unidata_version
@@ -234,7 +274,7 @@ In addition, the module exposes the following constant:
234274

235275
.. data:: ucd_3_2_0
236276

237-
This is an object that has the same methods as the entire module, but uses the
277+
This is an object that has most of the methods of the entire module, but uses the
238278
Unicode database version 3.2 instead, for applications that require this
239279
specific version of the Unicode database (such as IDNA).
240280

Doc/whatsnew/3.15.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -811,6 +811,16 @@ unicodedata
811811
`Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_ identifier.
812812
(Contributed by Stan Ulbrych in :gh:`129117`.)
813813

814+
* Add the :func:`~unicodedata.iter_graphemes`
815+
function to iterate over grapheme clusters according to rules defined in
816+
`Unicode Standard Annex #29, "Unicode Text Segmentation"
817+
<https://www.unicode.org/reports/tr29/>`_.
818+
Add :func:`~unicodedata.grapheme_cluster_break`,
819+
:func:`~unicodedata.indic_conjunct_break` and
820+
:func:`~unicodedata.extended_pictographic` functions to get the character
821+
properties that are used by this algorithm.
822+
(Contributed by Serhiy Storchaka and Guillaume Sanchez in :gh:`74902`.)
823+
814824

815825
unittest
816826
--------

Lib/test/test_unicodedata.py

Lines changed: 287 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,221 @@ def test_isxidcontinue(self):
616616
self.assertRaises(TypeError, self.db.isxidcontinue)
617617
self.assertRaises(TypeError, self.db.isxidcontinue, 'xx')
618618

619+
def test_grapheme_cluster_break(self):
620+
gcb = self.db.grapheme_cluster_break
621+
self.assertEqual(gcb(' '), 'Other')
622+
self.assertEqual(gcb('x'), 'Other')
623+
self.assertEqual(gcb('\U0010FFFF'), 'Other')
624+
self.assertEqual(gcb('\r'), 'CR')
625+
self.assertEqual(gcb('\n'), 'LF')
626+
self.assertEqual(gcb('\0'), 'Control')
627+
self.assertEqual(gcb('\t'), 'Control')
628+
self.assertEqual(gcb('\x1F'), 'Control')
629+
self.assertEqual(gcb('\x7F'), 'Control')
630+
self.assertEqual(gcb('\x9F'), 'Control')
631+
self.assertEqual(gcb('\U000E0001'), 'Control')
632+
self.assertEqual(gcb('\u0300'), 'Extend')
633+
self.assertEqual(gcb('\u200C'), 'Extend')
634+
self.assertEqual(gcb('\U000E01EF'), 'Extend')
635+
self.assertEqual(gcb('\u1159'), 'L')
636+
self.assertEqual(gcb('\u11F9'), 'T')
637+
self.assertEqual(gcb('\uD788'), 'LV')
638+
self.assertEqual(gcb('\uD7A3'), 'LVT')
639+
# New in 5.0.0
640+
self.assertEqual(gcb('\u05BA'), 'Extend')
641+
self.assertEqual(gcb('\u20EF'), 'Extend')
642+
# New in 5.1.0
643+
self.assertEqual(gcb('\u2064'), 'Control')
644+
self.assertEqual(gcb('\uAA4D'), 'SpacingMark')
645+
# New in 5.2.0
646+
self.assertEqual(gcb('\u0816'), 'Extend')
647+
self.assertEqual(gcb('\uA97C'), 'L')
648+
self.assertEqual(gcb('\uD7C6'), 'V')
649+
self.assertEqual(gcb('\uD7FB'), 'T')
650+
# New in 6.0.0
651+
self.assertEqual(gcb('\u093A'), 'Extend')
652+
self.assertEqual(gcb('\U00011002'), 'SpacingMark')
653+
# New in 6.1.0
654+
self.assertEqual(gcb('\U000E0FFF'), 'Control')
655+
self.assertEqual(gcb('\U00016F7E'), 'SpacingMark')
656+
# New in 6.2.0
657+
self.assertEqual(gcb('\U0001F1E6'), 'Regional_Indicator')
658+
self.assertEqual(gcb('\U0001F1FF'), 'Regional_Indicator')
659+
# New in 6.3.0
660+
self.assertEqual(gcb('\u180E'), 'Control')
661+
self.assertEqual(gcb('\u1A1B'), 'Extend')
662+
# New in 7.0.0
663+
self.assertEqual(gcb('\u0E33'), 'SpacingMark')
664+
self.assertEqual(gcb('\u0EB3'), 'SpacingMark')
665+
self.assertEqual(gcb('\U0001BCA3'), 'Control')
666+
self.assertEqual(gcb('\U0001E8D6'), 'Extend')
667+
self.assertEqual(gcb('\U0001163E'), 'SpacingMark')
668+
# New in 8.0.0
669+
self.assertEqual(gcb('\u08E3'), 'Extend')
670+
self.assertEqual(gcb('\U00011726'), 'SpacingMark')
671+
# New in 9.0.0
672+
self.assertEqual(gcb('\u0600'), 'Prepend')
673+
self.assertEqual(gcb('\U000E007F'), 'Extend')
674+
self.assertEqual(gcb('\U00011CB4'), 'SpacingMark')
675+
self.assertEqual(gcb('\u200D'), 'ZWJ')
676+
# New in 10.0.0
677+
self.assertEqual(gcb('\U00011D46'), 'Prepend')
678+
self.assertEqual(gcb('\U00011D47'), 'Extend')
679+
self.assertEqual(gcb('\U00011A97'), 'SpacingMark')
680+
# New in 11.0.0
681+
self.assertEqual(gcb('\U000110CD'), 'Prepend')
682+
self.assertEqual(gcb('\u07FD'), 'Extend')
683+
self.assertEqual(gcb('\U00011EF6'), 'SpacingMark')
684+
# New in 12.0.0
685+
self.assertEqual(gcb('\U00011A84'), 'Prepend')
686+
self.assertEqual(gcb('\U00013438'), 'Control')
687+
self.assertEqual(gcb('\U0001E2EF'), 'Extend')
688+
self.assertEqual(gcb('\U00016F87'), 'SpacingMark')
689+
# New in 13.0.0
690+
self.assertEqual(gcb('\U00011941'), 'Prepend')
691+
self.assertEqual(gcb('\U00016FE4'), 'Extend')
692+
self.assertEqual(gcb('\U00011942'), 'SpacingMark')
693+
# New in 14.0.0
694+
self.assertEqual(gcb('\u0891'), 'Prepend')
695+
self.assertEqual(gcb('\U0001E2AE'), 'Extend')
696+
# New in 15.0.0
697+
self.assertEqual(gcb('\U00011F02'), 'Prepend')
698+
self.assertEqual(gcb('\U0001343F'), 'Control')
699+
self.assertEqual(gcb('\U0001E4EF'), 'Extend')
700+
self.assertEqual(gcb('\U00011F3F'), 'SpacingMark')
701+
# New in 16.0.0
702+
self.assertEqual(gcb('\U000113D1'), 'Prepend')
703+
self.assertEqual(gcb('\U0001E5EF'), 'Extend')
704+
self.assertEqual(gcb('\U0001612C'), 'SpacingMark')
705+
self.assertEqual(gcb('\U00016D63'), 'V')
706+
# New in 17.0.0
707+
self.assertEqual(gcb('\u1AEB'), 'Extend')
708+
self.assertEqual(gcb('\U00011B67'), 'SpacingMark')
709+
710+
self.assertRaises(TypeError, gcb)
711+
self.assertRaises(TypeError, gcb, b'x')
712+
self.assertRaises(TypeError, gcb, 120)
713+
self.assertRaises(TypeError, gcb, '')
714+
self.assertRaises(TypeError, gcb, 'xx')
715+
716+
def test_indic_conjunct_break(self):
717+
incb = self.db.indic_conjunct_break
718+
self.assertEqual(incb(' '), 'None')
719+
self.assertEqual(incb('x'), 'None')
720+
self.assertEqual(incb('\U0010FFFF'), 'None')
721+
# New in 15.1.0
722+
self.assertEqual(incb('\u094D'), 'Linker')
723+
self.assertEqual(incb('\u0D4D'), 'Linker')
724+
self.assertEqual(incb('\u0915'), 'Consonant')
725+
self.assertEqual(incb('\u0D3A'), 'Consonant')
726+
self.assertEqual(incb('\u0300'), 'Extend')
727+
self.assertEqual(incb('\U0001E94A'), 'Extend')
728+
# New in 16.0.0
729+
self.assertEqual(incb('\u034F'), 'Extend')
730+
self.assertEqual(incb('\U000E01EF'), 'Extend')
731+
# New in 17.0.0
732+
self.assertEqual(incb('\u1039'), 'Linker')
733+
self.assertEqual(incb('\U00011F42'), 'Linker')
734+
self.assertEqual(incb('\u1000'), 'Consonant')
735+
self.assertEqual(incb('\U00011F33'), 'Consonant')
736+
self.assertEqual(incb('\U0001E6F5'), 'Extend')
737+
738+
self.assertRaises(TypeError, incb)
739+
self.assertRaises(TypeError, incb, b'x')
740+
self.assertRaises(TypeError, incb, 120)
741+
self.assertRaises(TypeError, incb, '')
742+
self.assertRaises(TypeError, incb, 'xx')
743+
744+
def test_extended_pictographic(self):
745+
ext_pict = self.db.extended_pictographic
746+
self.assertIs(ext_pict(' '), False)
747+
self.assertIs(ext_pict('x'), False)
748+
self.assertIs(ext_pict('\U0010FFFF'), False)
749+
# New in 13.0.0
750+
self.assertIs(ext_pict('\xA9'), True)
751+
self.assertIs(ext_pict('\u203C'), True)
752+
self.assertIs(ext_pict('\U0001FAD6'), True)
753+
self.assertIs(ext_pict('\U0001FFFD'), True)
754+
# New in 17.0.0
755+
self.assertIs(ext_pict('\u2388'), False)
756+
self.assertIs(ext_pict('\U0001FA6D'), False)
757+
758+
self.assertRaises(TypeError, ext_pict)
759+
self.assertRaises(TypeError, ext_pict, b'x')
760+
self.assertRaises(TypeError, ext_pict, 120)
761+
self.assertRaises(TypeError, ext_pict, '')
762+
self.assertRaises(TypeError, ext_pict, 'xx')
763+
764+
def test_grapheme_break(self):
765+
def graphemes(*args):
766+
return list(map(str, self.db.iter_graphemes(*args)))
767+
768+
self.assertRaises(TypeError, self.db.iter_graphemes)
769+
self.assertRaises(TypeError, self.db.iter_graphemes, b'x')
770+
self.assertRaises(TypeError, self.db.iter_graphemes, 'x', 0, 0, 0)
771+
772+
self.assertEqual(graphemes(''), [])
773+
self.assertEqual(graphemes('abcd'), ['a', 'b', 'c', 'd'])
774+
self.assertEqual(graphemes('abcd', 1), ['b', 'c', 'd'])
775+
self.assertEqual(graphemes('abcd', 1, 3), ['b', 'c'])
776+
self.assertEqual(graphemes('abcd', -3), ['b', 'c', 'd'])
777+
self.assertEqual(graphemes('abcd', 1, -1), ['b', 'c'])
778+
self.assertEqual(graphemes('abcd', 3, 1), [])
779+
self.assertEqual(graphemes('abcd', 5), [])
780+
self.assertEqual(graphemes('abcd', 0, 5), ['a', 'b', 'c', 'd'])
781+
self.assertEqual(graphemes('abcd', -5), ['a', 'b', 'c', 'd'])
782+
self.assertEqual(graphemes('abcd', 0, -5), [])
783+
# GB3
784+
self.assertEqual(graphemes('\r\n'), ['\r\n'])
785+
# GB4
786+
self.assertEqual(graphemes('\r\u0308'), ['\r', '\u0308'])
787+
self.assertEqual(graphemes('\n\u0308'), ['\n', '\u0308'])
788+
self.assertEqual(graphemes('\0\u0308'), ['\0', '\u0308'])
789+
# GB5
790+
self.assertEqual(graphemes('\u06dd\r'), ['\u06dd', '\r'])
791+
self.assertEqual(graphemes('\u06dd\n'), ['\u06dd', '\n'])
792+
self.assertEqual(graphemes('\u06dd\0'), ['\u06dd', '\0'])
793+
# GB6
794+
self.assertEqual(graphemes('\u1100\u1160'), ['\u1100\u1160'])
795+
self.assertEqual(graphemes('\u1100\uAC00'), ['\u1100\uAC00'])
796+
self.assertEqual(graphemes('\u1100\uAC01'), ['\u1100\uAC01'])
797+
# GB7
798+
self.assertEqual(graphemes('\uAC00\u1160'), ['\uAC00\u1160'])
799+
self.assertEqual(graphemes('\uAC00\u11A8'), ['\uAC00\u11A8'])
800+
self.assertEqual(graphemes('\u1160\u1160'), ['\u1160\u1160'])
801+
self.assertEqual(graphemes('\u1160\u11A8'), ['\u1160\u11A8'])
802+
# GB8
803+
self.assertEqual(graphemes('\uAC01\u11A8'), ['\uAC01\u11A8'])
804+
self.assertEqual(graphemes('\u11A8\u11A8'), ['\u11A8\u11A8'])
805+
# GB9
806+
self.assertEqual(graphemes('a\u0300'), ['a\u0300'])
807+
self.assertEqual(graphemes('a\u200D'), ['a\u200D'])
808+
# GB9a
809+
self.assertEqual(graphemes('\u0905\u0903'), ['\u0905\u0903'])
810+
# GB9b
811+
self.assertEqual(graphemes('\u06dd\u0661'), ['\u06dd\u0661'])
812+
# GB9c
813+
self.assertEqual(graphemes('\u0915\u094d\u0924'),
814+
['\u0915\u094d\u0924'])
815+
self.assertEqual(graphemes('\u0915\u094D\u094D\u0924'),
816+
['\u0915\u094D\u094D\u0924'])
817+
self.assertEqual(graphemes('\u0915\u094D\u0924\u094D\u092F'),
818+
['\u0915\u094D\u0924\u094D\u092F'])
819+
# GB11
820+
self.assertEqual(graphemes(
821+
'\U0001F9D1\U0001F3FE\u200D\u2764\uFE0F'
822+
'\u200D\U0001F48B\u200D\U0001F9D1\U0001F3FC'),
823+
['\U0001F9D1\U0001F3FE\u200D\u2764\uFE0F'
824+
'\u200D\U0001F48B\u200D\U0001F9D1\U0001F3FC'])
825+
# GB12
826+
self.assertEqual(graphemes(
827+
'\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3'),
828+
['\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3'])
829+
# GB13
830+
self.assertEqual(graphemes(
831+
'a\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3'),
832+
['a', '\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3'])
833+
619834

620835
class Unicode_3_2_0_FunctionsTest(UnicodeFunctionsTest):
621836
db = unicodedata.ucd_3_2_0
@@ -624,6 +839,11 @@ class Unicode_3_2_0_FunctionsTest(UnicodeFunctionsTest):
624839
if quicktest else
625840
'f217b8688d7bdff31db4207e078a96702f091597')
626841

842+
test_grapheme_cluster_break = None
843+
test_indic_conjunct_break = None
844+
test_extended_pictographic = None
845+
test_grapheme_break = None
846+
627847

628848
class UnicodeMiscTest(unittest.TestCase):
629849
db = unicodedata
@@ -726,6 +946,17 @@ def test_linebreak_7643(self):
726946
self.assertEqual(len(lines), 1,
727947
r"%a should not be a linebreak" % c)
728948

949+
def test_segment_object(self):
950+
segments = list(unicodedata.iter_graphemes('spa\u0300m'))
951+
self.assertEqual(len(segments), 4, segments)
952+
segment = segments[2]
953+
self.assertEqual(segment.start, 2)
954+
self.assertEqual(segment.end, 4)
955+
self.assertEqual(str(segment), 'a\u0300')
956+
self.assertEqual(repr(segment), '<Segment 2:4>')
957+
self.assertRaises(TypeError, iter, segment)
958+
self.assertRaises(TypeError, len, segment)
959+
729960

730961
class NormalizationTest(unittest.TestCase):
731962
@staticmethod
@@ -848,5 +1079,61 @@ class MyStr(str):
8481079
self.assertIs(type(normalize(form, MyStr(input_str))), str)
8491080

8501081

1082+
class GraphemeBreakTest(unittest.TestCase):
1083+
@staticmethod
1084+
def check_version(testfile):
1085+
hdr = testfile.readline()
1086+
return unicodedata.unidata_version in hdr
1087+
1088+
@requires_resource('network')
1089+
def test_grapheme_break(self):
1090+
TESTDATAFILE = "auxiliary/GraphemeBreakTest.txt"
1091+
TESTDATAURL = f"https://www.unicode.org/Public/{unicodedata.unidata_version}/ucd/{TESTDATAFILE}"
1092+
1093+
# Hit the exception early
1094+
try:
1095+
testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
1096+
check=self.check_version)
1097+
except PermissionError:
1098+
self.skipTest(f"Permission error when downloading {TESTDATAURL} "
1099+
f"into the test data directory")
1100+
except (OSError, HTTPException) as exc:
1101+
self.skipTest(f"Failed to download {TESTDATAURL}: {exc}")
1102+
1103+
with testdata:
1104+
self.run_grapheme_break_tests(testdata)
1105+
1106+
def run_grapheme_break_tests(self, testdata):
1107+
for line in testdata:
1108+
line, _, comment = line.partition('#')
1109+
line = line.strip()
1110+
if not line:
1111+
continue
1112+
comment = comment.strip()
1113+
1114+
chunks = []
1115+
breaks = []
1116+
pos = 0
1117+
for field in line.replace('×', ' ').split():
1118+
if field == '÷':
1119+
chunks.append('')
1120+
breaks.append(pos)
1121+
else:
1122+
chunks[-1] += chr(int(field, 16))
1123+
pos += 1
1124+
self.assertEqual(chunks.pop(), '', line)
1125+
input = ''.join(chunks)
1126+
with self.subTest(line):
1127+
result = list(unicodedata.iter_graphemes(input))
1128+
self.assertEqual(list(map(str, result)), chunks, comment)
1129+
self.assertEqual([x.start for x in result], breaks[:-1], comment)
1130+
self.assertEqual([x.end for x in result], breaks[1:], comment)
1131+
for i in range(1, len(breaks) - 1):
1132+
result = list(unicodedata.iter_graphemes(input, breaks[i]))
1133+
self.assertEqual(list(map(str, result)), chunks[i:], comment)
1134+
self.assertEqual([x.start for x in result], breaks[i:-1], comment)
1135+
self.assertEqual([x.end for x in result], breaks[i+1:], comment)
1136+
1137+
8511138
if __name__ == "__main__":
8521139
unittest.main()

0 commit comments

Comments
 (0)