-module(html_entity_refs).
-vsn('0.1').
-author('dcaoyuan@gmail.com').

-export([decode_for_xml/1]).
-export([get_xmerl_rules/0]).
-export([decode_for_xml_test_/0, xmerl_rules_test_/0]).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% @spec decode_for_xml(EncodedHtml::string()) -> string()
%% @doc Decode HTML 4 named character entity references found in
%% EncodedHtml, except the four XML special entity references
%% (quot, amp, lt and gt), which are left untouched so that the result
%% can still be embedded in XML.
decode_for_xml(EncodedHtml) -> decode_for_xml(EncodedHtml, []).

%% Worker for decode_for_xml/1.
%%
%% Each clause matches one entity reference at the head of the input and hands
%% its Unicode code point to concat_as_utf8/2 together with the accumulator.
%% concat_as_utf8/2 is defined elsewhere in this module; presumably it pushes
%% the UTF-8 encoding of the code point onto Decoded -- confirm there.
%% The accumulator is built in reverse and flipped once the input is exhausted.
%%
%% The patterns are spelled as ASCII entity references ("&nbsp;", ...), never as
%% the decoded characters themselves, so matching does not depend on the source
%% file encoding.
decode_for_xml([], Decoded) -> lists:reverse(Decoded);

%% ISO 8859-1 (Latin-1) characters
%% no-break space = non-breaking space, U+00A0 ISOnum
decode_for_xml("&nbsp;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(160, Decoded));
%% inverted exclamation mark, U+00A1 ISOnum
decode_for_xml("&iexcl;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(161, Decoded));
%% cent sign, U+00A2 ISOnum
decode_for_xml("&cent;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(162, Decoded));
%% pound sign, U+00A3 ISOnum
decode_for_xml("&pound;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(163, Decoded));
%% currency sign, U+00A4 ISOnum
decode_for_xml("&curren;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(164, Decoded));
%% yen sign = yuan sign, U+00A5 ISOnum
decode_for_xml("&yen;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(165, Decoded));
%% broken bar = broken vertical bar, U+00A6
decode_for_xml("&brvbar;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(166, Decoded));
%% section sign, U+00A7 ISOnum
decode_for_xml("&sect;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(167, Decoded));
%% diaeresis = spacing diaeresis, U+00A8
decode_for_xml("&uml;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(168, Decoded));
%% copyright sign, U+00A9 ISOnum
decode_for_xml("&copy;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(169, Decoded));
%% feminine ordinal indicator, U+00AA ISOnum
decode_for_xml("&ordf;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(170, Decoded));
%% left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum
decode_for_xml("&laquo;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(171, Decoded));
%% not sign, U+00AC ISOnum
decode_for_xml("&not;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(172, Decoded));
%% soft hyphen = discretionary hyphen, U+00AD ISOnum
decode_for_xml("&shy;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(173, Decoded));
%% registered sign = registered trade mark sign, U+00AE ISOnum
decode_for_xml("&reg;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(174, Decoded));
%% macron = spacing macron = overline = APL overbar, U+00AF ISOdia
decode_for_xml("&macr;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(175, Decoded));
%% degree sign, U+00B0 ISOnum
decode_for_xml("&deg;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(176, Decoded));
%% plus-minus sign = plus-or-minus sign, U+00B1 ISOnum
decode_for_xml("&plusmn;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(177, Decoded));
%% superscript two = superscript digit two = squared, U+00B2 ISOnum
decode_for_xml("&sup2;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(178, Decoded));
%% superscript three = superscript digit three = cubed, U+00B3 ISOnum
decode_for_xml("&sup3;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(179, Decoded));
%% acute accent = spacing acute, U+00B4 ISOdia
decode_for_xml("&acute;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(180, Decoded));
%% micro sign, U+00B5 ISOnum
decode_for_xml("&micro;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(181, Decoded));
%% pilcrow sign = paragraph sign, U+00B6 ISOnum
decode_for_xml("&para;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(182, Decoded));
%% middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum
decode_for_xml("&middot;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(183, Decoded));
%% cedilla = spacing cedilla, U+00B8 ISOdia
decode_for_xml("&cedil;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(184, Decoded));
%% superscript one = superscript digit one, U+00B9 ISOnum
decode_for_xml("&sup1;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(185, Decoded));
%% masculine ordinal indicator, U+00BA ISOnum
decode_for_xml("&ordm;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(186, Decoded));
%% right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum
decode_for_xml("&raquo;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(187, Decoded));
%% vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum
decode_for_xml("&frac14;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(188, Decoded));
%% vulgar fraction one half = fraction one half, U+00BD ISOnum
decode_for_xml("&frac12;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(189, Decoded));
%% vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum
decode_for_xml("&frac34;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(190, Decoded));
%% inverted question mark = turned question mark, U+00BF ISOnum
decode_for_xml("&iquest;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(191, Decoded));
%% latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1
decode_for_xml("&Agrave;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(192, Decoded));
%% latin capital letter A with acute, U+00C1 ISOlat1
decode_for_xml("&Aacute;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(193, Decoded));
%% latin capital letter A with circumflex, U+00C2 ISOlat1
decode_for_xml("&Acirc;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(194, Decoded));
%% latin capital letter A with tilde, U+00C3 ISOlat1
decode_for_xml("&Atilde;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(195, Decoded));
%% latin capital letter A with diaeresis, U+00C4 ISOlat1
decode_for_xml("&Auml;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(196, Decoded));
%% latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1
decode_for_xml("&Aring;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(197, Decoded));
%% latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
decode_for_xml("&AElig;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(198, Decoded));
%% latin capital letter C with cedilla, U+00C7 ISOlat1
decode_for_xml("&Ccedil;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(199, Decoded));
%% latin capital letter E with grave, U+00C8 ISOlat1
decode_for_xml("&Egrave;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(200, Decoded));
%% latin capital letter E with acute, U+00C9 ISOlat1
decode_for_xml("&Eacute;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(201, Decoded));
%% latin capital letter E with circumflex, U+00CA ISOlat1
decode_for_xml("&Ecirc;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(202, Decoded));
%% latin capital letter E with diaeresis, U+00CB ISOlat1
decode_for_xml("&Euml;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(203, Decoded));
%% latin capital letter I with grave, U+00CC ISOlat1
decode_for_xml("&Igrave;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(204, Decoded));
%% latin capital letter I with acute, U+00CD ISOlat1
decode_for_xml("&Iacute;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(205, Decoded));
%% latin capital letter I with circumflex, U+00CE ISOlat1
decode_for_xml("&Icirc;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(206, Decoded));
%% latin capital letter I with diaeresis, U+00CF ISOlat1
decode_for_xml("&Iuml;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(207, Decoded));
%% latin capital letter ETH, U+00D0 ISOlat1
decode_for_xml("&ETH;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(208, Decoded));
%% latin capital letter N with tilde, U+00D1 ISOlat1
decode_for_xml("&Ntilde;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(209, Decoded));
%% latin capital letter O with grave, U+00D2 ISOlat1
decode_for_xml("&Ograve;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(210, Decoded));
%% latin capital letter O with acute, U+00D3 ISOlat1
decode_for_xml("&Oacute;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(211, Decoded));
%% latin capital letter O with circumflex, U+00D4 ISOlat1
decode_for_xml("&Ocirc;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(212, Decoded));
%% latin capital letter O with tilde, U+00D5 ISOlat1
decode_for_xml("&Otilde;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(213, Decoded));
%% latin capital letter O with diaeresis, U+00D6 ISOlat1
decode_for_xml("&Ouml;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(214, Decoded));
%% multiplication sign, U+00D7 ISOnum
decode_for_xml("&times;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(215, Decoded));
%% latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1
decode_for_xml("&Oslash;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(216, Decoded));
%% latin capital letter U with grave, U+00D9 ISOlat1
decode_for_xml("&Ugrave;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(217, Decoded));
%% latin capital letter U with acute, U+00DA ISOlat1
decode_for_xml("&Uacute;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(218, Decoded));
%% latin capital letter U with circumflex, U+00DB ISOlat1
decode_for_xml("&Ucirc;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(219, Decoded));
%% latin capital letter U with diaeresis, U+00DC ISOlat1
decode_for_xml("&Uuml;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(220, Decoded));
%% latin capital letter Y with acute, U+00DD ISOlat1
decode_for_xml("&Yacute;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(221, Decoded));
%% latin capital letter THORN, U+00DE ISOlat1
decode_for_xml("&THORN;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(222, Decoded));
%% latin small letter sharp s = ess-zed, U+00DF ISOlat1
decode_for_xml("&szlig;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(223, Decoded));
%% latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1
decode_for_xml("&agrave;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(224, Decoded));
%% latin small letter a with acute, U+00E1 ISOlat1
decode_for_xml("&aacute;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(225, Decoded));
%% latin small letter a with circumflex, U+00E2 ISOlat1
decode_for_xml("&acirc;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(226, Decoded));
%% latin small letter a with tilde, U+00E3 ISOlat1
decode_for_xml("&atilde;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(227, Decoded));
%% latin small letter a with diaeresis, U+00E4 ISOlat1
decode_for_xml("&auml;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(228, Decoded));
%% latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1
decode_for_xml("&aring;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(229, Decoded));
%% latin small letter ae = latin small ligature ae, U+00E6 ISOlat1
decode_for_xml("&aelig;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(230, Decoded));
%% latin small letter c with cedilla, U+00E7 ISOlat1
decode_for_xml("&ccedil;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(231, Decoded));
%% latin small letter e with grave, U+00E8 ISOlat1
decode_for_xml("&egrave;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(232, Decoded));
%% latin small letter e with acute, U+00E9 ISOlat1
decode_for_xml("&eacute;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(233, Decoded));
%% latin small letter e with circumflex, U+00EA ISOlat1
decode_for_xml("&ecirc;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(234, Decoded));
%% latin small letter e with diaeresis, U+00EB ISOlat1
decode_for_xml("&euml;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(235, Decoded));
%% latin small letter i with grave, U+00EC ISOlat1
decode_for_xml("&igrave;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(236, Decoded));
%% latin small letter i with acute, U+00ED ISOlat1
decode_for_xml("&iacute;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(237, Decoded));
%% latin small letter i with circumflex, U+00EE ISOlat1
decode_for_xml("&icirc;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(238, Decoded));
%% latin small letter i with diaeresis, U+00EF ISOlat1
decode_for_xml("&iuml;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(239, Decoded));
%% latin small letter eth, U+00F0 ISOlat1
decode_for_xml("&eth;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(240, Decoded));
%% latin small letter n with tilde, U+00F1 ISOlat1
decode_for_xml("&ntilde;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(241, Decoded));
%% latin small letter o with grave, U+00F2 ISOlat1
decode_for_xml("&ograve;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(242, Decoded));
%% latin small letter o with acute, U+00F3 ISOlat1
decode_for_xml("&oacute;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(243, Decoded));
%% latin small letter o with circumflex, U+00F4 ISOlat1
decode_for_xml("&ocirc;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(244, Decoded));
%% latin small letter o with tilde, U+00F5 ISOlat1
decode_for_xml("&otilde;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(245, Decoded));
%% latin small letter o with diaeresis, U+00F6 ISOlat1
decode_for_xml("&ouml;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(246, Decoded));
%% division sign, U+00F7 ISOnum
decode_for_xml("&divide;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(247, Decoded));
%% latin small letter o with stroke = latin small letter o slash, U+00F8 ISOlat1
decode_for_xml("&oslash;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(248, Decoded));
%% latin small letter u with grave, U+00F9 ISOlat1
decode_for_xml("&ugrave;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(249, Decoded));
%% latin small letter u with acute, U+00FA ISOlat1
decode_for_xml("&uacute;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(250, Decoded));
%% latin small letter u with circumflex, U+00FB ISOlat1
decode_for_xml("&ucirc;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(251, Decoded));
%% latin small letter u with diaeresis, U+00FC ISOlat1
decode_for_xml("&uuml;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(252, Decoded));
%% latin small letter y with acute, U+00FD ISOlat1
decode_for_xml("&yacute;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(253, Decoded));
%% latin small letter thorn, U+00FE ISOlat1
decode_for_xml("&thorn;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(254, Decoded));
%% latin small letter y with diaeresis, U+00FF ISOlat1
decode_for_xml("&yuml;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(255, Decoded));

%% Special characters for HTML
%% C0 Controls and Basic Latin
%% NOTE: &quot; (U+0022), &amp; (U+0026), &lt; (U+003C) and &gt; (U+003E) are
%% deliberately NOT decoded here: XML requires them to stay encoded, so they
%% are passed through as they are.

%% Latin Extended-A
%% latin capital ligature OE, U+0152 ISOlat2
decode_for_xml("&OElig;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(338, Decoded));
%% latin small ligature oe, U+0153 ISOlat2
%% ligature is a misnomer, this is a separate character in some languages
decode_for_xml("&oelig;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(339, Decoded));
%% latin capital letter S with caron, U+0160 ISOlat2
decode_for_xml("&Scaron;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(352, Decoded));
%% latin small letter s with caron, U+0161 ISOlat2
decode_for_xml("&scaron;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(353, Decoded));
%% latin capital letter Y with diaeresis, U+0178 ISOlat2
decode_for_xml("&Yuml;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(376, Decoded));

%% Spacing Modifier Letters
%% modifier letter circumflex accent, U+02C6 ISOpub
decode_for_xml("&circ;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(710, Decoded));
%% small tilde, U+02DC ISOdia
decode_for_xml("&tilde;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(732, Decoded));

%% General Punctuation
%% en space, U+2002 ISOpub
decode_for_xml("&ensp;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8194, Decoded));
%% em space, U+2003 ISOpub
decode_for_xml("&emsp;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8195, Decoded));
%% thin space, U+2009 ISOpub
decode_for_xml("&thinsp;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8201, Decoded));
%% zero width non-joiner, U+200C NEW RFC 2070
decode_for_xml("&zwnj;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8204, Decoded));
%% zero width joiner, U+200D NEW RFC 2070
decode_for_xml("&zwj;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8205, Decoded));
%% left-to-right mark, U+200E NEW RFC 2070
decode_for_xml("&lrm;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8206, Decoded));
%% right-to-left mark, U+200F NEW RFC 2070
decode_for_xml("&rlm;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8207, Decoded));
%% en dash, U+2013 ISOpub
decode_for_xml("&ndash;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8211, Decoded));
%% em dash, U+2014 ISOpub
decode_for_xml("&mdash;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8212, Decoded));
%% left single quotation mark, U+2018 ISOnum
decode_for_xml("&lsquo;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8216, Decoded));
%% right single quotation mark, U+2019 ISOnum
decode_for_xml("&rsquo;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8217, Decoded));
%% single low-9 quotation mark, U+201A NEW
decode_for_xml("&sbquo;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8218, Decoded));
%% left double quotation mark, U+201C ISOnum
decode_for_xml("&ldquo;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8220, Decoded));
%% right double quotation mark, U+201D ISOnum
decode_for_xml("&rdquo;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8221, Decoded));
%% double low-9 quotation mark, U+201E NEW
decode_for_xml("&bdquo;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8222, Decoded));
%% dagger, U+2020 ISOpub
decode_for_xml("&dagger;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8224, Decoded));
%% double dagger, U+2021 ISOpub
decode_for_xml("&Dagger;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8225, Decoded));
%% per mille sign, U+2030 ISOtech
decode_for_xml("&permil;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8240, Decoded));
%% single left-pointing angle quotation mark, U+2039 ISO proposed
%% lsaquo is proposed but not yet ISO standardized
decode_for_xml("&lsaquo;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8249, Decoded));
%% single right-pointing angle quotation mark, U+203A ISO proposed
%% rsaquo is proposed but not yet ISO standardized
decode_for_xml("&rsaquo;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8250, Decoded));
%% euro sign, U+20AC NEW
decode_for_xml("&euro;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8364, Decoded));

%% Mathematical, Greek and Symbolic characters for HTML
%% Latin Extended-B
%% latin small f with hook = function = florin, U+0192 ISOtech
decode_for_xml("&fnof;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(402, Decoded));

%% Greek
%% greek capital letter alpha, U+0391
decode_for_xml("&Alpha;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(913, Decoded));
%% greek capital letter beta, U+0392
decode_for_xml("&Beta;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(914, Decoded));
%% greek capital letter gamma, U+0393 ISOgrk3
decode_for_xml("&Gamma;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(915, Decoded));
%% greek capital letter delta, U+0394 ISOgrk3
decode_for_xml("&Delta;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(916, Decoded));
%% greek capital letter epsilon, U+0395
decode_for_xml("&Epsilon;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(917, Decoded));
%% greek capital letter zeta, U+0396
decode_for_xml("&Zeta;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(918, Decoded));
%% greek capital letter eta, U+0397
decode_for_xml("&Eta;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(919, Decoded));
%% greek capital letter theta, U+0398 ISOgrk3
decode_for_xml("&Theta;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(920, Decoded));
%% greek capital letter iota, U+0399
decode_for_xml("&Iota;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(921, Decoded));
%% greek capital letter kappa, U+039A
decode_for_xml("&Kappa;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(922, Decoded));
%% greek capital letter lambda, U+039B ISOgrk3
decode_for_xml("&Lambda;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(923, Decoded));
%% greek capital letter mu, U+039C
decode_for_xml("&Mu;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(924, Decoded));
%% greek capital letter nu, U+039D
decode_for_xml("&Nu;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(925, Decoded));
%% greek capital letter xi, U+039E ISOgrk3
decode_for_xml("&Xi;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(926, Decoded));
%% greek capital letter omicron, U+039F
decode_for_xml("&Omicron;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(927, Decoded));
%% greek capital letter pi, U+03A0 ISOgrk3
decode_for_xml("&Pi;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(928, Decoded));
%% greek capital letter rho, U+03A1
%% there is no Sigmaf, and no U+03A2 character either
decode_for_xml("&Rho;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(929, Decoded));
%% greek capital letter sigma, U+03A3 ISOgrk3
decode_for_xml("&Sigma;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(931, Decoded));
%% greek capital letter tau, U+03A4
decode_for_xml("&Tau;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(932, Decoded));
%% greek capital letter upsilon, U+03A5 ISOgrk3
decode_for_xml("&Upsilon;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(933, Decoded));
%% greek capital letter phi, U+03A6 ISOgrk3
decode_for_xml("&Phi;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(934, Decoded));
%% greek capital letter chi, U+03A7
decode_for_xml("&Chi;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(935, Decoded));
%% greek capital letter psi, U+03A8 ISOgrk3
decode_for_xml("&Psi;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(936, Decoded));
%% greek capital letter omega, U+03A9 ISOgrk3
decode_for_xml("&Omega;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(937, Decoded));
%% greek small letter alpha, U+03B1 ISOgrk3
decode_for_xml("&alpha;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(945, Decoded));
%% greek small letter beta, U+03B2 ISOgrk3
decode_for_xml("&beta;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(946, Decoded));
%% greek small letter gamma, U+03B3 ISOgrk3
decode_for_xml("&gamma;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(947, Decoded));
%% greek small letter delta, U+03B4 ISOgrk3
decode_for_xml("&delta;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(948, Decoded));
%% greek small letter epsilon, U+03B5 ISOgrk3
decode_for_xml("&epsilon;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(949, Decoded));
%% greek small letter zeta, U+03B6 ISOgrk3
decode_for_xml("&zeta;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(950, Decoded));
%% greek small letter eta, U+03B7 ISOgrk3
decode_for_xml("&eta;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(951, Decoded));
%% greek small letter theta, U+03B8 ISOgrk3
decode_for_xml("&theta;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(952, Decoded));
%% greek small letter iota, U+03B9 ISOgrk3
decode_for_xml("&iota;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(953, Decoded));
%% greek small letter kappa, U+03BA ISOgrk3
decode_for_xml("&kappa;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(954, Decoded));
%% greek small letter lambda, U+03BB ISOgrk3
decode_for_xml("&lambda;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(955, Decoded));
%% greek small letter mu, U+03BC ISOgrk3
decode_for_xml("&mu;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(956, Decoded));
%% greek small letter nu, U+03BD ISOgrk3
decode_for_xml("&nu;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(957, Decoded));
%% greek small letter xi, U+03BE ISOgrk3
decode_for_xml("&xi;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(958, Decoded));
%% greek small letter omicron, U+03BF NEW
decode_for_xml("&omicron;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(959, Decoded));
%% greek small letter pi, U+03C0 ISOgrk3
decode_for_xml("&pi;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(960, Decoded));
%% greek small letter rho, U+03C1 ISOgrk3
decode_for_xml("&rho;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(961, Decoded));
%% greek small letter final sigma, U+03C2 ISOgrk3
decode_for_xml("&sigmaf;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(962, Decoded));
%% greek small letter sigma, U+03C3 ISOgrk3
decode_for_xml("&sigma;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(963, Decoded));
%% greek small letter tau, U+03C4 ISOgrk3
decode_for_xml("&tau;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(964, Decoded));
%% greek small letter upsilon, U+03C5 ISOgrk3
decode_for_xml("&upsilon;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(965, Decoded));
%% greek small letter phi, U+03C6 ISOgrk3
decode_for_xml("&phi;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(966, Decoded));
%% greek small letter chi, U+03C7 ISOgrk3
decode_for_xml("&chi;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(967, Decoded));
%% greek small letter psi, U+03C8 ISOgrk3
decode_for_xml("&psi;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(968, Decoded));
%% greek small letter omega, U+03C9 ISOgrk3
decode_for_xml("&omega;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(969, Decoded));
%% greek small letter theta symbol, U+03D1 NEW
decode_for_xml("&thetasym;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(977, Decoded));
%% greek upsilon with hook symbol, U+03D2 NEW
decode_for_xml("&upsih;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(978, Decoded));
%% greek pi symbol, U+03D6 ISOgrk3
decode_for_xml("&piv;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(982, Decoded));

%% General Punctuation
%% bullet = black small circle, U+2022 ISOpub
%% bullet is NOT the same as bullet operator, U+2219
decode_for_xml("&bull;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8226, Decoded));
%% horizontal ellipsis = three dot leader, U+2026 ISOpub
decode_for_xml("&hellip;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8230, Decoded));
%% prime = minutes = feet, U+2032 ISOtech
decode_for_xml("&prime;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8242, Decoded));
%% double prime = seconds = inches, U+2033 ISOtech
decode_for_xml("&Prime;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8243, Decoded));
%% overline = spacing overscore, U+203E NEW
decode_for_xml("&oline;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8254, Decoded));
%% fraction slash, U+2044 NEW
decode_for_xml("&frasl;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8260, Decoded));

%% Letterlike Symbols
%% script capital P = power set = Weierstrass p, U+2118 ISOamso
decode_for_xml("&weierp;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8472, Decoded));
%% blackletter capital I = imaginary part, U+2111 ISOamso
decode_for_xml("&image;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8465, Decoded));
%% blackletter capital R = real part symbol, U+211C ISOamso
decode_for_xml("&real;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8476, Decoded));
%% trade mark sign, U+2122 ISOnum
decode_for_xml("&trade;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8482, Decoded));
%% alef symbol = first transfinite cardinal, U+2135 NEW
%% alef symbol is NOT the same as hebrew letter alef, U+05D0 although the
%% same glyph could be used to depict both characters
decode_for_xml("&alefsym;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8501, Decoded));

%% Arrows
%% leftwards arrow, U+2190 ISOnum
decode_for_xml("&larr;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8592, Decoded));
%% upwards arrow, U+2191 ISOnum
decode_for_xml("&uarr;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8593, Decoded));
%% rightwards arrow, U+2192 ISOnum
decode_for_xml("&rarr;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8594, Decoded));
%% downwards arrow, U+2193 ISOnum
decode_for_xml("&darr;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8595, Decoded));
%% left right arrow, U+2194 ISOamsa
decode_for_xml("&harr;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8596, Decoded));
%% downwards arrow with corner leftwards = carriage return, U+21B5 NEW
decode_for_xml("&crarr;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8629, Decoded));
%% leftwards double arrow, U+21D0 ISOtech
%% ISO 10646 does not say that lArr is the same as the 'is implied by' arrow
%% but also does not have any other character for that function. So lArr can
%% be used for 'is implied by' as ISOtech suggests
decode_for_xml("&lArr;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8656, Decoded));
%% upwards double arrow, U+21D1 ISOamsa
decode_for_xml("&uArr;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8657, Decoded));
%% rightwards double arrow, U+21D2 ISOtech
%% ISO 10646 does not say this is the 'implies' character but does not have
%% another character with this function so rArr can be used for 'implies' as
%% ISOtech suggests
decode_for_xml("&rArr;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8658, Decoded));
%% downwards double arrow, U+21D3 ISOamsa
decode_for_xml("&dArr;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8659, Decoded));
%% left right double arrow, U+21D4 ISOamsa
decode_for_xml("&hArr;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8660, Decoded));

%% Mathematical Operators
%% for all, U+2200 ISOtech
decode_for_xml("&forall;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8704, Decoded));
%% partial differential, U+2202 ISOtech
decode_for_xml("&part;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8706, Decoded));
%% there exists, U+2203 ISOtech
decode_for_xml("&exist;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8707, Decoded));
%% empty set = null set = diameter, U+2205 ISOamso
decode_for_xml("&empty;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8709, Decoded));
%% nabla = backward difference, U+2207 ISOtech
decode_for_xml("&nabla;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8711, Decoded));
%% element of, U+2208 ISOtech
decode_for_xml("&isin;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8712, Decoded));
%% not an element of, U+2209 ISOtech
decode_for_xml("&notin;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8713, Decoded));
%% contains as member, U+220B ISOtech
%% should there be a more memorable name than 'ni'?
decode_for_xml("&ni;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8715, Decoded));
%% n-ary product = product sign, U+220F ISOamsb
%% prod is NOT the same character as U+03A0 'greek capital letter pi' though
%% the same glyph might be used for both
decode_for_xml("&prod;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8719, Decoded));
%% n-ary summation, U+2211 ISOamsb
%% sum is NOT the same character as U+03A3 'greek capital letter sigma' though
%% the same glyph might be used for both
decode_for_xml("&sum;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8721, Decoded));
%% minus sign, U+2212 ISOtech
decode_for_xml("&minus;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8722, Decoded));
%% asterisk operator, U+2217 ISOtech
decode_for_xml("&lowast;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8727, Decoded));
%% square root = radical sign, U+221A ISOtech
decode_for_xml("&radic;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8730, Decoded));
%% proportional to, U+221D ISOtech
decode_for_xml("&prop;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8733, Decoded));
%% infinity, U+221E ISOtech
decode_for_xml("&infin;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8734, Decoded));
%% angle, U+2220 ISOamso
decode_for_xml("&ang;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8736, Decoded));
%% logical and = wedge, U+2227 ISOtech
decode_for_xml("&and;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8743, Decoded));
%% logical or = vee, U+2228 ISOtech
decode_for_xml("&or;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8744, Decoded));
%% intersection = cap, U+2229 ISOtech
decode_for_xml("&cap;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8745, Decoded));
%% union = cup, U+222A ISOtech
decode_for_xml("&cup;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8746, Decoded));
%% integral, U+222B ISOtech
decode_for_xml("&int;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8747, Decoded));
%% therefore, U+2234 ISOtech
decode_for_xml("&there4;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8756, Decoded));
%% tilde operator = varies with = similar to, U+223C ISOtech
%% tilde operator is NOT the same character as the tilde, U+007E, although
%% the same glyph might be used to represent both
decode_for_xml("&sim;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8764, Decoded));
%% approximately equal to, U+2245 ISOtech
decode_for_xml("&cong;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8773, Decoded));
%% almost equal to = asymptotic to, U+2248 ISOamsr
decode_for_xml("&asymp;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8776, Decoded));
%% not equal to, U+2260 ISOtech
decode_for_xml("&ne;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8800, Decoded));
%% identical to, U+2261 ISOtech
decode_for_xml("&equiv;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8801, Decoded));
%% less-than or equal to, U+2264 ISOtech
decode_for_xml("&le;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8804, Decoded));
%% greater-than or equal to, U+2265 ISOtech
decode_for_xml("&ge;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8805, Decoded));
%% subset of, U+2282 ISOtech
decode_for_xml("&sub;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8834, Decoded));
%% superset of, U+2283 ISOtech
%% note that nsup, 'not a superset of, U+2285' is not covered by the Symbol
%% font encoding and is not included. Should it be, for symmetry? It is in
%% ISOamsn
decode_for_xml("&sup;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8835, Decoded));
%% not a subset of, U+2284 ISOamsn
decode_for_xml("&nsub;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8836, Decoded));
%% subset of or equal to, U+2286 ISOtech
decode_for_xml("&sube;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8838, Decoded));
%% superset of or equal to, U+2287 ISOtech
decode_for_xml("&supe;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8839, Decoded));
%% circled plus = direct sum, U+2295 ISOamsb
decode_for_xml("&oplus;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8853, Decoded));
%% circled times = vector product, U+2297 ISOamsb
decode_for_xml("&otimes;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8855, Decoded));
%% up tack = orthogonal to = perpendicular, U+22A5 ISOtech
decode_for_xml("&perp;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8869, Decoded));
%% dot operator, U+22C5 ISOamsb
%% dot operator is NOT the same character as U+00B7 middle dot
decode_for_xml("&sdot;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8901, Decoded));

%% Miscellaneous Technical
%% left ceiling = apl upstile, U+2308 ISOamsc
decode_for_xml("&lceil;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8968, Decoded));
%% right ceiling, U+2309 ISOamsc
decode_for_xml("&rceil;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8969, Decoded));
%% left floor = apl downstile, U+230A ISOamsc
decode_for_xml("&lfloor;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8970, Decoded));
%% right floor, U+230B ISOamsc
decode_for_xml("&rfloor;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(8971, Decoded));
%% left-pointing angle bracket = bra, U+2329 ISOtech
%% lang is NOT the same character as U+003C 'less than' or U+2039 'single
%% left-pointing angle quotation mark'
decode_for_xml("&lang;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(9001, Decoded));
decode_for_xml("&rang;" ++ T, Decoded) -> decode_for_xml(T, concat_as_utf8(9002, Decoded)); %% right-pointing angle bracket = ket, U+232A
%% ISOtech -->
%% rang is NOT the same character as U+003E 'greater than' or U+203A 'single right-pointing angle quotation mark' -->
%% Geometric Shapes -->
decode_for_xml("◊" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#9674, Decoded)); %% lozenge, U+25CA ISOpub -->
%% Miscellaneous Symbols -->
decode_for_xml("♠" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#9824, Decoded)); %% black spade suit, U+2660 ISOpub -->
%% black here seems to mean filled as opposed to hollow -->
decode_for_xml("♣" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#9827, Decoded)); %% black club suit = shamrock, U+2663 ISOpub -->
decode_for_xml("♥" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#9829, Decoded)); %% black heart suit = valentine, U+2665 ISOpub -->
decode_for_xml("♦" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#9830, Decoded)); %% black diamond suit, U+2666 ISOpub -->
%% Anything that is not a recognised entity reference is copied through as is.
decode_for_xml([H|T], Decoded) -> decode_for_xml(T, [H|Decoded]).

%% @spec concat_as_utf8(C::integer(), Decoded::[integer()]) -> [integer()]
%% @doc Push the UTF-8 byte sequence for code point C onto Decoded.
%%
%% Decoded is an accumulator kept in reverse order (decode_for_xml/2 reverses
%% it once at the end), so the bytes are pushed lead byte first and therefore
%% appear last-byte-first in the returned list.
%%
%% Fails with `if_clause' for surrogates (16#D800..16#DFFF), for 16#FFFE and
%% 16#FFFF, and for values of 16#80000000 and above; fails with
%% `function_clause' for negative or non-integer C.
%%
%% NOTE(review): the 5- and 6-byte branches follow the original, pre-RFC 3629
%% UTF-8 layout; none of the entity code points used in this module reach them.
concat_as_utf8(C, Decoded) when is_integer(C), C >= 0 ->
    if
        C < 128 ->
            %% 0yyyyyyy
            [C | Decoded];
        C < 16#800 ->
            %% 110xxxxy 10yyyyyy
            B1 = 16#C0 + (C bsr 6),
            B2 = 128 + (C band 16#3F),
            [B2, B1 | Decoded];
        C < 16#10000 ->
            %% 1110xxxx 10xyyyyy 10yyyyyy
            %% Guard reads as: C < D800, or (C > DFFF and C < FFFE).
            %% There is deliberately no fall-through branch: invalid code
            %% points crash instead of being encoded.
            if
                C < 16#D800; C > 16#DFFF, C < 16#FFFE ->
                    B1 = 16#E0 + (C bsr 12),
                    B2 = 128 + ((C bsr 6) band 16#3F),
                    B3 = 128 + (C band 16#3F),
                    [B3, B2, B1 | Decoded]
            end;
        C < 16#200000 ->
            %% 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy
            B1 = 16#F0 + (C bsr 18),
            B2 = 128 + ((C bsr 12) band 16#3F),
            B3 = 128 + ((C bsr 6) band 16#3F),
            B4 = 128 + (C band 16#3F),
            [B4, B3, B2, B1 | Decoded];
        C < 16#4000000 ->
            %% 111110xx 10xxxyyy 10yyyyyy 10yyyyyy 10yyyyyy
            B1 = 16#F8 + (C bsr 24),
            B2 = 128 + ((C bsr 18) band 16#3F),
            B3 = 128 + ((C bsr 12) band 16#3F),
            B4 = 128 + ((C bsr 6) band 16#3F),
            B5 = 128 + (C band 16#3F),
            [B5, B4, B3, B2, B1 | Decoded];
        C < 16#80000000 ->
            %% 1111110x 10xxxxyy 10yyyyyy 10yyyyyy 10yyyyyy 10yyyyyy
            B1 = 16#FC + (C bsr 30),
            B2 = 128 + ((C bsr 24) band 16#3F),
            B3 = 128 + ((C bsr 18) band 16#3F),
            B4 = 128 + ((C bsr 12) band 16#3F),
            B5 = 128 + ((C bsr 6) band 16#3F),
            B6 = 128 + (C band 16#3F),
            [B6, B5, B4, B3, B2, B1 | Decoded]
    end.

%% reference only:
%% @spec decode(EncodedHtml::string()) -> string()
%% @doc Replace the basic named entity references and decimal numeric
%% references (`&#NNN;') in EncodedHtml with the characters they stand for.
%% Not exported and not called from the rest of this module.
decode(EncodedHtml) ->
    decode(EncodedHtml, []).

%% Decoded is the reversed output so far. The named clauses must come before
%% the "&#" clause only for readability; none of them is a prefix of it.
decode("&nbsp;" ++ T, Decoded) -> decode(T, [$\040 | Decoded]); % space
decode("&quot;" ++ T, Decoded) -> decode(T, [$\042 | Decoded]); % "
decode("&amp;" ++ T, Decoded)  -> decode(T, [$\046 | Decoded]); % &
decode("&apos;" ++ T, Decoded) -> decode(T, [$\047 | Decoded]); % '
decode("&lt;" ++ T, Decoded)   -> decode(T, [$\074 | Decoded]); % <
decode("&gt;" ++ T, Decoded)   -> decode(T, [$\076 | Decoded]); % >
decode("&#" ++ T, Decoded) ->
    %% Decimal numeric reference: everything up to the next ";" is the code.
    {Rest, Char} = match_ascii(T, []),
    decode(Rest, [Char | Decoded]);
decode([H | T], Decoded) ->
    decode(T, [H | Decoded]);
decode([], Decoded) ->
    lists:reverse(Decoded).

%% @spec match_ascii(string(), string()) -> {Rest::string(), Code::integer()}
%% @doc Collect characters up to the first ";" and convert them to an integer.
%% Ascii accumulates the digits in reverse order. Crashes with
%% `function_clause' if the input ends before a ";" is seen, and with `badarg'
%% (from list_to_integer/1) if the collected text is not a decimal integer.
match_ascii(";" ++ T, Ascii) ->
    {T, list_to_integer(lists:reverse(Ascii))};
match_ascii([H | T], Ascii) ->
    match_ascii(T, [H | Ascii]).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% @doc create an ets table that contains html entity refs as xmerl rules
%% Usage: xmerl_scan:string(XmlText, [{rules, get_xmerl_rules()}])
%%
-define(TabName, html_xmerl_rules).
%% NOTE(review): this DTD text is a single space as seen here; it presumably
%% held the HTML character entity declarations -- confirm against the
%% upstream file before relying on the rules table.
-define(CharEntityRefDTD, " ").

%% @spec get_xmerl_rules() -> atom() | term()
%% @doc Return the xmerl rules table, creating and filling it on first use.
get_xmerl_rules() ->
    %% The ets table will be deleted if the process that created it quits,
    %% even though the table is created as public.
    %% To get a long-living ets table, create a separate long-living process
    %% and create the ets table from that process.
    case ets:info(?TabName) of
        undefined -> init_xmerl_rules();
        _ -> ?TabName
    end.

%% Create the named table and let xmerl_scan populate it as its rules store
%% while scanning a stub document whose DTD comes from dtd_fetch_fun/0.
init_xmerl_rules() ->
    Tab = ets:new(?TabName, [set, public, named_table]),
    %% Best effort: the scan result (or any failure of the scan) is discarded
    %% on purpose; only the side effect on the rules table is of interest.
    catch xmerl_scan:string(" ", [{fetch_fun, dtd_fetch_fun()}, {rules, ?TabName}]),
    Tab.

%% Fetch fun for xmerl_scan that ignores the requested URI and always serves
%% the in-module DTD text.
dtd_fetch_fun() ->
    fun(_, State) ->
        {ok, {string, ?CharEntityRefDTD}, State}
    end.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% tests

%% @doc Timing smoke run: read the sample document, parse it with xmerl using
%% the rules table from get_xmerl_rules/0, and print the elapsed time in
%% microseconds to the `user' device. The measured span covers obtaining the
%% rules table, printing its info and the scan itself. Crashes with `badmatch'
%% if the sample file cannot be read.
xmerl_rules_test_() ->
    {ok, XmlBin} = file:read_file("../pole/data/fodor.xml"),
    XmlText = binary_to_list(XmlBin),
    %% timer:tc/1 replaces the deprecated now/0 + timer:now_diff/2 pair.
    {ParsingTime, _Parsed} =
        timer:tc(fun() ->
            Rules = get_xmerl_rules(),
            io:fwrite(user, "Info of rules table: ~p~n", [ets:info(?TabName)]),
            {_DocElement, _Rest} = xmerl_scan:string(XmlText, [{rules, Rules}])
        end),
    io:fwrite(user, "Timer Parsing: ~B~n", [ParsingTime]).

%% @doc Timing smoke run: read the sample document, expand its HTML entity
%% references with decode_for_xml/1, then parse the result with xmerl and
%% print the elapsed time in microseconds to the `user' device. Only the xmerl
%% scan is timed; the entity decoding happens before the clock starts.
decode_for_xml_test_() ->
    {ok, XmlBin} = file:read_file("../pole/data/fodor.xml"),
    XmlText = binary_to_list(XmlBin),
    XmlText1 = decode_for_xml(XmlText),
    {ParsingTime, _Parsed} =
        timer:tc(fun() ->
            {_DocElement, _Rest} = xmerl_scan:string(XmlText1)
        end),
    io:fwrite(user, "Timer Parsing: ~B~n", [ParsingTime]).