Skip to content

Commit 32b72a6

Browse files
authored
Fix reading comments with UTF chars (fixes #238) (#240)
* Fix reading comments with UTF chars (fixes #238)
* Fix printable methods to account for UTF chars
1 parent d3d137c commit 32b72a6

File tree

2 files changed

+57
-8
lines changed

2 files changed

+57
-8
lines changed

src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java

+31-8
Original file line numberDiff line numberDiff line change
@@ -2981,8 +2981,8 @@ private void parseComment()
29812981
// implements XML 1.0 Section 2.5 Comments
29822982

29832983
// ASSUMPTION: seen <!-
2984-
char ch = more();
2985-
if ( ch != '-' )
2984+
char cch = more();
2985+
if ( cch != '-' )
29862986
throw new XmlPullParserException( "expected <!-- for comment start", this, null );
29872987
if ( tokenize )
29882988
posStart = pos;
@@ -2999,7 +2999,19 @@ private void parseComment()
29992999
while ( true )
30003000
{
30013001
// scan until it hits -->
3002-
ch = more();
3002+
cch = more();
3003+
int ch;
3004+
char cch2;
3005+
if ( Character.isHighSurrogate( cch ) )
3006+
{
3007+
cch2 = more();
3008+
ch = Character.toCodePoint( cch, cch2 );
3009+
}
3010+
else
3011+
{
3012+
cch2 = 0;
3013+
ch = cch;
3014+
}
30033015
if ( seenDashDash && ch != '>' )
30043016
{
30053017
throw new XmlPullParserException( "in comment after two dashes (--) next character must be >"
@@ -3074,7 +3086,11 @@ else if ( ch == '\n' )
30743086
{
30753087
if ( pcEnd >= pc.length )
30763088
ensurePC( pcEnd );
3077-
pc[pcEnd++] = ch;
3089+
pc[pcEnd++] = cch;
3090+
if ( cch2 != 0 )
3091+
{
3092+
pc[pcEnd++] = cch2;
3093+
}
30783094
}
30793095
normalizedCR = false;
30803096
}
@@ -4153,7 +4169,7 @@ private static boolean isS( char ch )
41534169
// ch != '\u0000' ch < '\uFFFE'
41544170

41554171
// private char printable(char ch) { return ch; }
4156-
private static String printable( char ch )
4172+
private static String printable( int ch )
41574173
{
41584174
if ( ch == '\n' )
41594175
{
@@ -4175,18 +4191,25 @@ else if ( ch == '\'' )
41754191
{
41764192
return "\\u" + Integer.toHexString( ch );
41774193
}
4178-
return "" + ch;
4194+
if ( Character.isBmpCodePoint( ch ) )
4195+
{
4196+
return Character.toString( ( char ) ch );
4197+
}
4198+
else
4199+
{
4200+
return new String( new char[] { Character.highSurrogate( ch ), Character.lowSurrogate( ch ) } );
4201+
}
41794202
}
41804203

41814204
private static String printable( String s )
41824205
{
41834206
if ( s == null )
41844207
return null;
4185-
final int sLen = s.length();
4208+
final int sLen = s.codePointCount(0, s.length());
41864209
StringBuilder buf = new StringBuilder( sLen + 10 );
41874210
for ( int i = 0; i < sLen; ++i )
41884211
{
4189-
buf.append( printable( s.charAt( i ) ) );
4212+
buf.append( printable( s.codePointAt( i ) ) );
41904213
}
41914214
s = buf.toString();
41924215
return s;

src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java

+26
Original file line numberDiff line numberDiff line change
@@ -1511,4 +1511,30 @@ public void testReplacementInPCArrayWithShorterCharArray()
15111511
fail( "should not raise exception: " + e );
15121512
}
15131513
}
1514+
1515+
/**
1516+
* Ensures emoji can be parsed correctly
1517+
*/
1518+
@Test
1519+
public void testUnicode() throws IOException {
1520+
String input = "<project><!--ALL TEH BOMS! \uD83D\uDCA3 --></project>";
1521+
1522+
try
1523+
{
1524+
MXParser parser = new MXParser();
1525+
parser.setInput( new StringReader( input ) );
1526+
1527+
assertEquals( XmlPullParser.START_TAG, parser.nextToken() );
1528+
assertEquals( "project", parser.getName() );
1529+
assertEquals( XmlPullParser.COMMENT, parser.nextToken() );
1530+
assertEquals( "ALL TEH BOMS! \uD83D\uDCA3 ", parser.getText() );
1531+
assertEquals( XmlPullParser.END_TAG, parser.nextToken() );
1532+
assertEquals( "project", parser.getName() );
1533+
}
1534+
catch ( XmlPullParserException e )
1535+
{
1536+
e.printStackTrace();
1537+
fail( "should not raise exception: " + e );
1538+
}
1539+
}
15141540
}

0 commit comments

Comments
 (0)