@Test public void testEscapeXmlAllCharacters() { // http://www.w3.org/TR/xml/#charsets says: // Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, // excluding the surrogate blocks, FFFE, and FFFF. */ final CharSequenceTranslator escapeXml = StringEscapeUtils.ESCAPE_XML .with(NumericEntityEscaper.below(9), NumericEntityEscaper.between(0xB, 0xC), NumericEntityEscaper.between(0xE, 0x19), NumericEntityEscaper.between(0xD800, 0xDFFF), NumericEntityEscaper.between(0xFFFE, 0xFFFF), NumericEntityEscaper.above(0x110000)); assertEquals("�", escapeXml.translate("\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008")); assertEquals("\t", escapeXml.translate("\t")); // 0x9 assertEquals("\n", escapeXml.translate("\n")); // 0xA assertEquals("", escapeXml.translate("\u000B\u000C")); assertEquals("\r", escapeXml.translate("\r")); // 0xD assertEquals("Hello World! Ain't this great?", escapeXml.translate("Hello World! Ain't this great?")); assertEquals("", escapeXml.translate("\u000E\u000F\u0018\u0019")); }
/** * Converts special characters to their HTML values. <br> * Example : "�" is converted to "&eacute;" * <p> * * @param string * A String to convert from original to HTML * <p> * @return A String of char converted to HTML equivalent. * */ public static String toHtml(String string) { if (DO_NOTHING) return string; string = StringEscapeUtils.ESCAPE_HTML4.with(NumericEntityEscaper.between(0x7f, Integer.MAX_VALUE) ).translate(string); if (string != null) { string = string.replaceAll("&", "&"); // To keep same result if // multi-call } return string; }
/** * Converts special characters to their HTML values. <br> * Example : "�" is converted to "&eacute;" * <p> * * @param string * A String to convert from original to HTML * <p> * @return A String of char converted to HTML equivalent. * */ public static String toHtml(String string) { if (DO_NOTHING) return string; string = org.apache.commons.lang3.StringEscapeUtils.ESCAPE_HTML4.with(NumericEntityEscaper.between(0x7f, Integer.MAX_VALUE) ).translate(string); if (string != null) { string = string.replaceAll("&", "&"); // To keep same result if // multi-call } return string; }
/**
 * Tests Supplementary characters.
 * <p>
 * From http://www.w3.org/International/questions/qa-escapes
 * </p>
 * <blockquote>
 * Supplementary characters are those Unicode characters that have code points higher than the characters in
 * the Basic Multilingual Plane (BMP). In UTF-16 a supplementary character is encoded using two 16-bit surrogate
 * code points from the BMP. Because of this, some people think that supplementary characters need to be
 * represented using two escapes, but this is incorrect - you must use the single, code point value for that
 * character. For example, use {@code &#x233B4;} rather than {@code &#xD84C;&#xDFB4;}.
 * </blockquote>
 *
 * @see <a href="http://www.w3.org/International/questions/qa-escapes">Using character escapes in markup and CSS</a>
 * @see <a href="https://issues.apache.org/jira/browse/LANG-728">LANG-728</a>
 */
@Test
public void testEscapeXmlSupplementaryCharacters() {
    final CharSequenceTranslator escapeXml = StringEscapeUtils.ESCAPE_XML.with(
            NumericEntityEscaper.between(0x7f, Integer.MAX_VALUE));

    // The surrogate pair D84C/DFB4 encodes code point U+233B4, which is 144308 in decimal;
    // the escaper must emit that one entity, not one entity per surrogate.
    assertEquals("Supplementary character must be represented using a single escape",
            "&#144308;", escapeXml.translate("\uD84C\uDFB4"));
}
/**
 * Tests Supplementary characters.
 * <p>
 * From http://www.w3.org/International/questions/qa-escapes
 * </p>
 * <blockquote>
 * Supplementary characters are those Unicode characters that have code points higher than the characters in
 * the Basic Multilingual Plane (BMP). In UTF-16 a supplementary character is encoded using two 16-bit surrogate
 * code points from the BMP. Because of this, some people think that supplementary characters need to be
 * represented using two escapes, but this is incorrect - you must use the single, code point value for that
 * character. For example, use {@code &#x233B4;} rather than {@code &#xD84C;&#xDFB4;}.
 * </blockquote>
 *
 * @see <a href="http://www.w3.org/International/questions/qa-escapes">Using character escapes in markup and CSS</a>
 * @see <a href="https://issues.apache.org/jira/browse/LANG-728">LANG-728</a>
 */
@Test
public void testEscapeXmlSupplementaryCharacters() {
    final CharSequenceTranslator escapeXml = StringEscapeUtils.ESCAPE_XML.with(
            NumericEntityEscaper.between(0x7f, Integer.MAX_VALUE));

    // The surrogate pair D84C/DFB4 encodes code point U+233B4, which is 144308 in decimal;
    // the escaper must emit that one entity, not one entity per surrogate.
    assertEquals("Supplementary character must be represented using a single escape",
            "&#144308;", escapeXml.translate("\uD84C\uDFB4"));
}
/**
 * Tests Supplementary characters.
 * <p>
 * From http://www.w3.org/International/questions/qa-escapes
 * </p>
 * <blockquote>
 * Supplementary characters are those Unicode characters that have code points higher than the characters in
 * the Basic Multilingual Plane (BMP). In UTF-16 a supplementary character is encoded using two 16-bit surrogate
 * code points from the BMP. Because of this, some people think that supplementary characters need to be
 * represented using two escapes, but this is incorrect - you must use the single, code point value for that
 * character. For example, use {@code &#x233B4;} rather than {@code &#xD84C;&#xDFB4;}.
 * </blockquote>
 *
 * @see <a href="http://www.w3.org/International/questions/qa-escapes">Using character escapes in markup and CSS</a>
 * @see <a href="https://issues.apache.org/jira/browse/LANG-728">LANG-728</a>
 */
@Test
public void testEscapeXmlSupplementaryCharacters() {
    final CharSequenceTranslator escapeXml = StringEscapeUtils.ESCAPE_XML.with(
            NumericEntityEscaper.between(0x7f, Integer.MAX_VALUE));

    // The surrogate pair D84C/DFB4 encodes code point U+233B4, which is 144308 in decimal;
    // the escaper must emit that one entity, not one entity per surrogate.
    assertEquals("Supplementary character must be represented using a single escape",
            "&#144308;", escapeXml.translate("\uD84C\uDFB4"));

    // Characters below 0x7F that are not XML specials pass through untouched.
    assertEquals("Supplementary characters mixed with basic characters should be encoded correctly",
            "a b c &#144308;", escapeXml.translate("a b c \uD84C\uDFB4"));
}