44import pprint
55import unittest
66
7+ from unittest .mock import patch
8+
79
810class EventCollector (html .parser .HTMLParser ):
911
@@ -315,6 +317,16 @@ def get_events(self):
315317 ("endtag" , element_lower )],
316318 collector = Collector (convert_charrefs = False ))
317319
def test_EOF_in_cdata(self):
    # A <script> element that is never closed: the expected events are the
    # start tag followed by one "data" event carrying the rest of the input
    # verbatim, including the comment look-alike, the entity look-alike
    # and the stray end tags.
    content = """<!-- not a comment --> &not-an-entity-ref;
              <a href="" /> </p><p> <span></span></style>
              '</script' + '>'"""
    # Nothing may follow the placeholder: the expected data event is
    # exactly `content`.
    s = f'<script>{content}'
    self._run_check(s, [
        ("starttag", 'script', []),
        ("data", content),
    ])
329+
318330 def test_comments (self ):
319331 html = ("<!-- I'm a valid comment -->"
320332 '<!--me too!-->'
@@ -346,18 +358,16 @@ def test_convert_charrefs(self):
346358 collector = lambda : EventCollectorCharrefs ()
347359 self .assertTrue (collector ().convert_charrefs )
348360 charrefs = ['"' , '"' , '"' , '"' , '"' , '"' ]
349- # check charrefs in the middle of the text/attributes
350- expected = [('starttag' , 'a' , [('href' , 'foo"zar' )]),
351- ('data' , 'a"z' ), ('endtag' , 'a' )]
361+ # check charrefs in the middle of the text
362+ expected = [('starttag' , 'a' , []), ('data' , 'a"z' ), ('endtag' , 'a' )]
352363 for charref in charrefs :
353- self ._run_check ('<a href="foo{0}zar" >a{0}z</a>' .format (charref ),
364+ self ._run_check ('<a>a{0}z</a>' .format (charref ),
354365 expected , collector = collector ())
355- # check charrefs at the beginning/end of the text/attributes
356- expected = [('data' , '"' ),
357- ('starttag' , 'a' , [('x' , '"' ), ('y' , '"X' ), ('z' , 'X"' )]),
366+ # check charrefs at the beginning/end of the text
367+ expected = [('data' , '"' ), ('starttag' , 'a' , []),
358368 ('data' , '"' ), ('endtag' , 'a' ), ('data' , '"' )]
359369 for charref in charrefs :
360- self ._run_check ('{0}<a x="{0}" y="{0}X" z="X{0}" >'
370+ self ._run_check ('{0}<a>'
361371 '{0}</a>{0}' .format (charref ),
362372 expected , collector = collector ())
363373 # check charrefs in <script>/<style> elements
@@ -380,6 +390,35 @@ def test_convert_charrefs(self):
380390 self ._run_check ('no charrefs here' , [('data' , 'no charrefs here' )],
381391 collector = collector ())
382392
def test_convert_charrefs_in_attribute_values(self):
    # default value for convert_charrefs is now True
    collector = lambda: EventCollectorCharrefs()
    self.assertTrue(collector().convert_charrefs)

    # One start tag that places the reference alone, at the end, at the
    # start and in the middle of a value, then followed by a space and
    # by an equals sign.
    markup = ('<a x="{0}" x="z{0}" x="{0}z" '
              ' x="z{0}z" x="{0} z" x="{0}=z"></a>')

    # always unescape terminated entity refs, numeric and hex char refs:
    # - regardless whether they are at start, middle, end of attribute
    # - or followed by alphanumeric, non-alphanumeric, or equals char
    charrefs = ['&cent;', '&#162;', '&#xa2;', '&#162', '&#xa2']
    expected = [('starttag', 'a',
                 [('x', '¢'), ('x', 'z¢'), ('x', '¢z'),
                  ('x', 'z¢z'), ('x', '¢ z'), ('x', '¢=z')]),
                ('endtag', 'a')]
    for charref in charrefs:
        self._run_check(markup.format(charref), expected,
                        collector=collector())

    # only unescape unterminated entity matches if they are not followed by
    # an alphanumeric or an equals sign
    charref = '&cent'
    expected = [('starttag', 'a',
                 [('x', '¢'), ('x', 'z¢'), ('x', '&centz'),
                  ('x', 'z&centz'), ('x', '¢ z'), ('x', '&cent=z')]),
                ('endtag', 'a')]
    self._run_check(markup.format(charref), expected, collector=collector())
421+
383422 # the remaining tests were for the "tolerant" parser (which is now
384423 # the default), and check various kind of broken markup
385424 def test_tolerant_parsing (self ):
@@ -537,52 +576,99 @@ def test_EOF_in_charref(self):
537576 for html , expected in data :
538577 self ._run_check (html , expected )
539578
540- def test_broken_comments (self ):
def test_EOF_in_comments_or_decls(self):
    # Each fragment opens a comment, CDATA section or declaration and then
    # hits EOF; the expected result is a single "data" event that echoes
    # the fragment unchanged.
    truncated = (
        '<!',
        '<!-',
        '<!--',
        '<![',
        '<![CDATA[',
        '<![CDATA[x',
        '<!DOCTYPE',
        '<!DOCTYPE HTML',
    )
    for fragment in truncated:
        self._run_check(fragment, [('data', fragment)])
def test_bogus_comments(self):
    # (markup, expected comment text) pairs; the markup pieces are joined
    # into one document and every piece must yield one "comment" event.
    cases = [
        ('<! not really a comment >', ' not really a comment '),
        ('<! not a comment either -->', ' not a comment either --'),
        ('<! -- close enough -->', ' -- close enough --'),
        ('<!>', ''),
        ('<!<-- this was an empty comment>',
         '<-- this was an empty comment'),
        ('<!!! another bogus comment !!!>', '!! another bogus comment !!!'),
        # see #32876
        ('<![with square brackets]!>', '[with square brackets]!'),
        ('<![\n multiline\n bogusness\n ]!>',
         '[\n multiline\n bogusness\n ]!'),
        ('<![more brackets]-[and a hyphen]!>',
         '[more brackets]-[and a hyphen]!'),
        ('<![cdata[should be uppercase]]>', '[cdata[should be uppercase]]'),
        ('<![CDATA [whitespaces are not ignored]]>',
         '[CDATA [whitespaces are not ignored]]'),
        # required '[' after CDATA
        ('<![CDATA]]>', '[CDATA]]'),
    ]
    html = ''.join([markup for markup, _ in cases])
    expected = [('comment', text) for _, text in cases]
    self._run_check(html, expected)
555621
def test_broken_condcoms(self):
    # these condcoms are missing the '--' after '<!' and before the '>'
    # and they are considered bogus comments according to
    # "8.2.4.42. Markup declaration open state"
    html = ('<![if !(IE)]>broken condcom<![endif]>'
            '<![if ! IE]><link href="favicon.tiff"/><![endif]>'
            '<![if !IE 6]><img src="firefox.png" /><![endif]>'
            '<![if !ie 6]><b>foo</b><![endif]>'
            '<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
    # Each '<![...]>' is expected as a "comment" event whose text is the
    # bracketed part exactly as written in the input (brackets included,
    # no padding).
    expected = [
        ('comment', '[if !(IE)]'),
        ('data', 'broken condcom'),
        ('comment', '[endif]'),
        ('comment', '[if ! IE]'),
        ('startendtag', 'link', [('href', 'favicon.tiff')]),
        ('comment', '[endif]'),
        ('comment', '[if !IE 6]'),
        ('startendtag', 'img', [('src', 'firefox.png')]),
        ('comment', '[endif]'),
        ('comment', '[if !ie 6]'),
        ('starttag', 'b', []),
        ('data', 'foo'),
        ('endtag', 'b'),
        ('comment', '[endif]'),
        ('comment', '[if (!IE)|(lt IE 9)]'),
        ('startendtag', 'img', [('src', 'mammoth.bmp')]),
        ('comment', '[endif]')
    ]
    self._run_check(html, expected)
651+
def test_cdata_declarations(self):
    # More tests should be added. See also "8.2.4.42. Markup
    # declaration open state", "8.2.4.69. CDATA section state",
    # and issue 32876
    cdata_markup = '<![CDATA[just some plain text]]>'
    # The section is expected as one "unknown decl" event: the text after
    # '<![' without the closing ']]>'.
    self._run_check(cdata_markup,
                    [('unknown decl', 'CDATA[just some plain text')])
659+
def test_cdata_declarations_multiline(self):
    # A CDATA section inside <code> whose payload contains '<', '>', '&&'
    # and tag-like text; the whole payload is expected in a single
    # "unknown decl" event between the start and end tag events.
    payload = (' if (a < b && a > b) {'
               ' printf("[<marquee>How?</marquee>]");'
               ' }')
    document = '<code><![CDATA[' + payload + ']]></code>'
    events = [
        ('starttag', 'code', []),
        ('unknown decl', 'CDATA[' + payload),
        ('endtag', 'code'),
    ]
    self._run_check(document, events)
588674
@@ -787,5 +873,17 @@ def test_weird_chars_in_unquoted_attribute_values(self):
787873 ('starttag' , 'form' ,
788874 [('action' , 'bogus|&#()value' )])])
789875
876+
class TestInheritance(unittest.TestCase):

    @patch("_markupbase.ParserBase.__init__")
    @patch("_markupbase.ParserBase.reset")
    def test_base_class_methods_called(self, super_reset_method, super_init_method):
        # Decorators are applied bottom-up, so the `reset` mock is passed
        # first and the `__init__` mock second.
        #
        # NOTE(review): the ParserBase name in _markupbase is also replaced
        # while the collector is built; presumably so that only calls that
        # reach the real base class through inheritance hit the two method
        # mocks above -- confirm against html.parser.
        with patch('_markupbase.ParserBase'):
            EventCollector()
            super_init_method.assert_called_once()
            super_reset_method.assert_called_once()
886+
887+
if __name__ == "__main__":
    unittest.main()