-module(html_entity_refs).
-vsn('0.1').
-author('dcaoyuan@gmail.com').
-export([decode_for_xml/1]).
-export([get_xmerl_rules/0]).
-export([decode_for_xml_test_/0,
xmerl_rules_test_/0]).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% @spec decode_for_xml(Html::string()) -> string()
%% @doc Decode the HTML named entity references found in `Html' into
%% the characters they stand for. Presumably the output is UTF-8 encoded,
%% going by the name of the `concat_as_utf8' helper that the worker
%% clauses call (TODO confirm against that helper).
%%
%% The XML special entity references (quot, amp, lt and gt) are
%% deliberately left untouched, so the result can still be embedded in XML.
%%
%% This is only the public entry point: it starts decode_for_xml/2 with an
%% empty accumulator, and that function does the actual work.
decode_for_xml(Html) ->
    EmptyAcc = [],
    decode_for_xml(Html, EmptyAcc).
decode_for_xml([], Decoded) -> lists:reverse(Decoded);
decode_for_xml(" " ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#160, Decoded)); %% no-break space = non-breaking space, U+00A0 ISOnum -->
decode_for_xml("¡" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#161, Decoded)); %% inverted exclamation mark, U+00A1 ISOnum -->
decode_for_xml("¢" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#162, Decoded)); %% cent sign, U+00A2 ISOnum -->
decode_for_xml("£" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#163, Decoded)); %% pound sign, U+00A3 ISOnum -->
decode_for_xml("¤" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#164, Decoded)); %% currency sign, U+00A4 ISOnum -->
decode_for_xml("¥" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#165, Decoded)); %% yen sign = yuan sign, U+00A5 ISOnum -->
decode_for_xml("¦" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#166, Decoded)); %% broken bar = broken vertical bar,
decode_for_xml("§" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#167, Decoded)); %% section sign, U+00A7 ISOnum -->
decode_for_xml("¨" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#168, Decoded)); %% diaeresis = spacing diaeresis,
decode_for_xml("©" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#169, Decoded)); %% copyright sign, U+00A9 ISOnum -->
decode_for_xml("ª" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#170, Decoded)); %% feminine ordinal indicator, U+00AA ISOnum -->
decode_for_xml("«" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#171, Decoded)); %% left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
decode_for_xml("¬" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#172, Decoded)); %% not sign, U+00AC ISOnum -->
decode_for_xml("" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#173, Decoded)); %% soft hyphen = discretionary hyphen, U+00AD ISOnum -->
decode_for_xml("®" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#174, Decoded)); %% registered sign = registered trade mark sign, U+00AE ISOnum -->
decode_for_xml("¯" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#175, Decoded)); %% macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
decode_for_xml("°" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#176, Decoded)); %% degree sign, U+00B0 ISOnum -->
decode_for_xml("±" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#177, Decoded)); %% plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
decode_for_xml("²" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#178, Decoded)); %% superscript two = superscript digit two = squared, U+00B2 ISOnum -->
decode_for_xml("³" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#179, Decoded)); %% superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
decode_for_xml("´" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#180, Decoded)); %% acute accent = spacing acute, U+00B4 ISOdia -->
decode_for_xml("µ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#181, Decoded)); %% micro sign, U+00B5 ISOnum -->
decode_for_xml("¶" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#182, Decoded)); %% pilcrow sign = paragraph sign, U+00B6 ISOnum -->
decode_for_xml("·" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#183, Decoded)); %% middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
decode_for_xml("¸" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#184, Decoded)); %% cedilla = spacing cedilla, U+00B8 ISOdia -->
decode_for_xml("¹" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#185, Decoded)); %% superscript one = superscript digit one, U+00B9 ISOnum -->
decode_for_xml("º" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#186, Decoded)); %% masculine ordinal indicator, U+00BA ISOnum -->
decode_for_xml("»" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#187, Decoded)); %% right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
decode_for_xml("¼" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#188, Decoded)); %% vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
decode_for_xml("½" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#189, Decoded)); %% vulgar fraction one half = fraction one half, U+00BD ISOnum -->
decode_for_xml("¾" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#190, Decoded)); %% vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
decode_for_xml("¿" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#191, Decoded)); %% inverted question mark = turned question mark, U+00BF ISOnum -->
decode_for_xml("À" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#192, Decoded)); %% latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1 -->
decode_for_xml("Á" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#193, Decoded)); %% latin capital letter A with acute, U+00C1 ISOlat1 -->
decode_for_xml("Â" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#194, Decoded)); %% latin capital letter A with circumflex, U+00C2 ISOlat1 -->
decode_for_xml("Ã" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#195, Decoded)); %% latin capital letter A with tilde, U+00C3 ISOlat1 -->
decode_for_xml("Ä" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#196, Decoded)); %% latin capital letter A with diaeresis, U+00C4 ISOlat1 -->
decode_for_xml("Å" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#197, Decoded)); %% latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1 -->
decode_for_xml("Æ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#198, Decoded)); %% latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 -->
decode_for_xml("Ç" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#199, Decoded)); %% latin capital letter C with cedilla, U+00C7 ISOlat1 -->
decode_for_xml("È" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#200, Decoded)); %% latin capital letter E with grave, U+00C8 ISOlat1 -->
decode_for_xml("É" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#201, Decoded)); %% latin capital letter E with acute, U+00C9 ISOlat1 -->
decode_for_xml("Ê" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#202, Decoded)); %% latin capital letter E with circumflex, U+00CA ISOlat1 -->
decode_for_xml("Ë" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#203, Decoded)); %% latin capital letter E with diaeresis, U+00CB ISOlat1 -->
decode_for_xml("Ì" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#204, Decoded)); %% latin capital letter I with grave, U+00CC ISOlat1 -->
decode_for_xml("Í" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#205, Decoded)); %% latin capital letter I with acute, U+00CD ISOlat1 -->
decode_for_xml("Î" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#206, Decoded)); %% latin capital letter I with circumflex, U+00CE ISOlat1 -->
decode_for_xml("Ï" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#207, Decoded)); %% latin capital letter I with diaeresis, U+00CF ISOlat1 -->
decode_for_xml("Ð" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#208, Decoded)); %% latin capital letter ETH, U+00D0 ISOlat1 -->
decode_for_xml("Ñ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#209, Decoded)); %% latin capital letter N with tilde, U+00D1 ISOlat1 -->
decode_for_xml("Ò" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#210, Decoded)); %% latin capital letter O with grave, U+00D2 ISOlat1 -->
decode_for_xml("Ó" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#211, Decoded)); %% latin capital letter O with acute, U+00D3 ISOlat1 -->
decode_for_xml("Ô" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#212, Decoded)); %% latin capital letter O with circumflex, U+00D4 ISOlat1 -->
decode_for_xml("Õ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#213, Decoded)); %% latin capital letter O with tilde, U+00D5 ISOlat1 -->
decode_for_xml("Ö" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#214, Decoded)); %% latin capital letter O with diaeresis, U+00D6 ISOlat1 -->
decode_for_xml("×" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#215, Decoded)); %% multiplication sign, U+00D7 ISOnum -->
decode_for_xml("Ø" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#216, Decoded)); %% latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1 -->
decode_for_xml("Ù" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#217, Decoded)); %% latin capital letter U with grave, U+00D9 ISOlat1 -->
decode_for_xml("Ú" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#218, Decoded)); %% latin capital letter U with acute, U+00DA ISOlat1 -->
decode_for_xml("Û" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#219, Decoded)); %% latin capital letter U with circumflex, U+00DB ISOlat1 -->
decode_for_xml("Ü" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#220, Decoded)); %% latin capital letter U with diaeresis, U+00DC ISOlat1 -->
decode_for_xml("Ý" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#221, Decoded)); %% latin capital letter Y with acute, U+00DD ISOlat1 -->
decode_for_xml("Þ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#222, Decoded)); %% latin capital letter THORN, U+00DE ISOlat1 -->
decode_for_xml("ß" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#223, Decoded)); %% latin small letter sharp s = ess-zed, U+00DF ISOlat1 -->
decode_for_xml("à" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#224, Decoded)); %% latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 -->
decode_for_xml("á" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#225, Decoded)); %% latin small letter a with acute, U+00E1 ISOlat1 -->
decode_for_xml("â" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#226, Decoded)); %% latin small letter a with circumflex, U+00E2 ISOlat1 -->
decode_for_xml("ã" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#227, Decoded)); %% latin small letter a with tilde, U+00E3 ISOlat1 -->
decode_for_xml("ä" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#228, Decoded)); %% latin small letter a with diaeresis, U+00E4 ISOlat1 -->
decode_for_xml("å" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#229, Decoded)); %% latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1 -->
decode_for_xml("æ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#230, Decoded)); %% latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 -->
decode_for_xml("ç" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#231, Decoded)); %% latin small letter c with cedilla, U+00E7 ISOlat1 -->
decode_for_xml("è" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#232, Decoded)); %% latin small letter e with grave, U+00E8 ISOlat1 -->
decode_for_xml("é" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#233, Decoded)); %% latin small letter e with acute, U+00E9 ISOlat1 -->
decode_for_xml("ê" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#234, Decoded)); %% latin small letter e with circumflex, U+00EA ISOlat1 -->
decode_for_xml("ë" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#235, Decoded)); %% latin small letter e with diaeresis, U+00EB ISOlat1 -->
decode_for_xml("ì" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#236, Decoded)); %% latin small letter i with grave, U+00EC ISOlat1 -->
decode_for_xml("í" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#237, Decoded)); %% latin small letter i with acute, U+00ED ISOlat1 -->
decode_for_xml("î" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#238, Decoded)); %% latin small letter i with circumflex, U+00EE ISOlat1 -->
decode_for_xml("ï" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#239, Decoded)); %% latin small letter i with diaeresis, U+00EF ISOlat1 -->
decode_for_xml("ð" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#240, Decoded)); %% latin small letter eth, U+00F0 ISOlat1 -->
decode_for_xml("ñ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#241, Decoded)); %% latin small letter n with tilde, U+00F1 ISOlat1 -->
decode_for_xml("ò" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#242, Decoded)); %% latin small letter o with grave, U+00F2 ISOlat1 -->
decode_for_xml("ó" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#243, Decoded)); %% latin small letter o with acute, U+00F3 ISOlat1 -->
decode_for_xml("ô" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#244, Decoded)); %% latin small letter o with circumflex, U+00F4 ISOlat1 -->
decode_for_xml("õ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#245, Decoded)); %% latin small letter o with tilde, U+00F5 ISOlat1 -->
decode_for_xml("ö" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#246, Decoded)); %% latin small letter o with diaeresis, U+00F6 ISOlat1 -->
decode_for_xml("÷" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#247, Decoded)); %% division sign, U+00F7 ISOnum
decode_for_xml("ø" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#248, Decoded)); %% latin small letter o with stroke = latin small letter o slash, U+00F8 ISOlat1 -->
decode_for_xml("ù" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#249, Decoded)); %% latin small letter u with grave, U+00F9 ISOlat1 -->
decode_for_xml("ú" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#250, Decoded)); %% latin small letter u with acute, U+00FA ISOlat1 -->
decode_for_xml("û" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#251, Decoded)); %% latin small letter u with circumflex, U+00FB ISOlat1 -->
decode_for_xml("ü" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#252, Decoded)); %% latin small letter u with diaeresis, U+00FC ISOlat1 -->
decode_for_xml("ý" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#253, Decoded)); %% latin small letter y with acute, U+00FD ISOlat1 -->
decode_for_xml("þ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#254, Decoded)); %% latin small letter thorn, U+00FE ISOlat1 -->
decode_for_xml("ÿ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#255, Decoded)); %% latin small letter y with diaeresis, U+00FF ISOlat1 -->
%% Special characters for HTML -->
%% C0 Controls and Basic Latin -->
%% @notice Keep these characters as they are: XML itself requires them to stay entity-encoded.
%decode_for_xml(""" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#34, Decoded)); %% quotation mark = APL quote, U+0022 ISOnum -->
%decode_for_xml("&" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#38, Decoded)); %% ampersand, U+0026 ISOnum -->
%decode_for_xml("<" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#60, Decoded)); %% less-than sign, U+003C ISOnum -->
%decode_for_xml(">" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#62, Decoded)); %% greater-than sign, U+003E ISOnum -->
%% Latin Extended-A -->
decode_for_xml("Œ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#338, Decoded)); %% latin capital ligature OE, U+0152 ISOlat2 -->
decode_for_xml("œ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#339, Decoded)); %% latin small ligature oe, U+0153 ISOlat2 -->
%% ligature is a misnomer, this is a separate character in some languages -->
decode_for_xml("Š" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#352, Decoded)); %% latin capital letter S with caron, U+0160 ISOlat2 -->
decode_for_xml("š" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#353, Decoded)); %% latin small letter s with caron, U+0161 ISOlat2 -->
decode_for_xml("Ÿ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#376, Decoded)); %% latin capital letter Y with diaeresis, U+0178 ISOlat2 -->
%% Spacing Modifier Letters -->
decode_for_xml("ˆ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#710, Decoded)); %% modifier letter circumflex accent, U+02C6 ISOpub -->
decode_for_xml("˜" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#732, Decoded)); %% small tilde, U+02DC ISOdia -->
%% General Punctuation -->
decode_for_xml(" " ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8194, Decoded)); %% en space, U+2002 ISOpub -->
decode_for_xml(" " ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8195, Decoded)); %% em space, U+2003 ISOpub -->
decode_for_xml(" " ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8201, Decoded)); %% thin space, U+2009 ISOpub -->
decode_for_xml("" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8204, Decoded)); %% zero width non-joiner, U+200C NEW RFC 2070 -->
decode_for_xml("" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8205, Decoded)); %% zero width joiner, U+200D NEW RFC 2070 -->
decode_for_xml("" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8206, Decoded)); %% left-to-right mark, U+200E NEW RFC 2070 -->
decode_for_xml("" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8207, Decoded)); %% right-to-left mark, U+200F NEW RFC 2070 -->
decode_for_xml("–" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8211, Decoded)); %% en dash, U+2013 ISOpub -->
decode_for_xml("—" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8212, Decoded)); %% em dash, U+2014 ISOpub -->
decode_for_xml("‘" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8216, Decoded)); %% left single quotation mark, U+2018 ISOnum -->
decode_for_xml("’" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8217, Decoded)); %% right single quotation mark, U+2019 ISOnum -->
decode_for_xml("‚" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8218, Decoded)); %% single low-9 quotation mark, U+201A NEW -->
decode_for_xml("“" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8220, Decoded)); %% left double quotation mark, U+201C ISOnum -->
decode_for_xml("”" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8221, Decoded)); %% right double quotation mark, U+201D ISOnum -->
decode_for_xml("„" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8222, Decoded)); %% double low-9 quotation mark, U+201E NEW -->
decode_for_xml("†" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8224, Decoded)); %% dagger, U+2020 ISOpub -->
decode_for_xml("‡" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8225, Decoded)); %% double dagger, U+2021 ISOpub -->
decode_for_xml("‰" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8240, Decoded)); %% per mille sign, U+2030 ISOtech -->
decode_for_xml("‹" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8249, Decoded)); %% single left-pointing angle quotation mark, U+2039 ISO proposed -->
%% lsaquo is proposed but not yet ISO standardized -->
decode_for_xml("›" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8250, Decoded)); %% single right-pointing angle quotation mark, U+203A ISO proposed -->
%% rsaquo is proposed but not yet ISO standardized -->
decode_for_xml("€" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8364, Decoded)); %% euro sign, U+20AC NEW -->
%% Mathematical, Greek and Symbolic characters for HTML -->
%% Latin Extended-B -->
decode_for_xml("ƒ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#402, Decoded)); %% latin small f with hook = function = florin, U+0192 ISOtech -->
%% Greek -->
decode_for_xml("Α" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#913, Decoded)); %% greek capital letter alpha, U+0391 -->
decode_for_xml("Β" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#914, Decoded)); %% greek capital letter beta, U+0392 -->
decode_for_xml("Γ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#915, Decoded)); %% greek capital letter gamma, U+0393 ISOgrk3 -->
decode_for_xml("Δ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#916, Decoded)); %% greek capital letter delta, U+0394 ISOgrk3 -->
decode_for_xml("Ε" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#917, Decoded)); %% greek capital letter epsilon, U+0395 -->
decode_for_xml("Ζ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#918, Decoded)); %% greek capital letter zeta, U+0396 -->
decode_for_xml("Η" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#919, Decoded)); %% greek capital letter eta, U+0397 -->
decode_for_xml("Θ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#920, Decoded)); %% greek capital letter theta, U+0398 ISOgrk3 -->
decode_for_xml("Ι" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#921, Decoded)); %% greek capital letter iota, U+0399 -->
decode_for_xml("Κ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#922, Decoded)); %% greek capital letter kappa, U+039A -->
decode_for_xml("Λ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#923, Decoded)); %% greek capital letter lambda, U+039B ISOgrk3 -->
decode_for_xml("Μ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#924, Decoded)); %% greek capital letter mu, U+039C -->
decode_for_xml("Ν" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#925, Decoded)); %% greek capital letter nu, U+039D -->
decode_for_xml("Ξ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#926, Decoded)); %% greek capital letter xi, U+039E ISOgrk3 -->
decode_for_xml("Ο" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#927, Decoded)); %% greek capital letter omicron, U+039F -->
decode_for_xml("Π" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#928, Decoded)); %% greek capital letter pi, U+03A0 ISOgrk3 -->
decode_for_xml("Ρ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#929, Decoded)); %% greek capital letter rho, U+03A1 -->
%% there is no Sigmaf, and no U+03A2 character either -->
decode_for_xml("Σ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#931, Decoded)); %% greek capital letter sigma, U+03A3 ISOgrk3 -->
decode_for_xml("Τ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#932, Decoded)); %% greek capital letter tau, U+03A4 -->
decode_for_xml("Υ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#933, Decoded)); %% greek capital letter upsilon, U+03A5 ISOgrk3 -->
decode_for_xml("Φ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#934, Decoded)); %% greek capital letter phi, U+03A6 ISOgrk3 -->
decode_for_xml("Χ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#935, Decoded)); %% greek capital letter chi, U+03A7 -->
decode_for_xml("Ψ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#936, Decoded)); %% greek capital letter psi, U+03A8 ISOgrk3 -->
decode_for_xml("Ω" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#937, Decoded)); %% greek capital letter omega, U+03A9 ISOgrk3 -->
decode_for_xml("α" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#945, Decoded)); %% greek small letter alpha, U+03B1 ISOgrk3 -->
decode_for_xml("β" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#946, Decoded)); %% greek small letter beta, U+03B2 ISOgrk3 -->
decode_for_xml("γ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#947, Decoded)); %% greek small letter gamma, U+03B3 ISOgrk3 -->
decode_for_xml("δ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#948, Decoded)); %% greek small letter delta, U+03B4 ISOgrk3 -->
decode_for_xml("ε" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#949, Decoded)); %% greek small letter epsilon, U+03B5 ISOgrk3 -->
decode_for_xml("ζ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#950, Decoded)); %% greek small letter zeta, U+03B6 ISOgrk3 -->
decode_for_xml("η" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#951, Decoded)); %% greek small letter eta, U+03B7 ISOgrk3 -->
decode_for_xml("θ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#952, Decoded)); %% greek small letter theta, U+03B8 ISOgrk3 -->
decode_for_xml("ι" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#953, Decoded)); %% greek small letter iota, U+03B9 ISOgrk3 -->
decode_for_xml("κ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#954, Decoded)); %% greek small letter kappa, U+03BA ISOgrk3 -->
decode_for_xml("λ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#955, Decoded)); %% greek small letter lambda, U+03BB ISOgrk3 -->
decode_for_xml("μ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#956, Decoded)); %% greek small letter mu, U+03BC ISOgrk3 -->
decode_for_xml("ν" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#957, Decoded)); %% greek small letter nu, U+03BD ISOgrk3 -->
decode_for_xml("ξ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#958, Decoded)); %% greek small letter xi, U+03BE ISOgrk3 -->
decode_for_xml("ο" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#959, Decoded)); %% greek small letter omicron, U+03BF NEW -->
decode_for_xml("π" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#960, Decoded)); %% greek small letter pi, U+03C0 ISOgrk3 -->
decode_for_xml("ρ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#961, Decoded)); %% greek small letter rho, U+03C1 ISOgrk3 -->
decode_for_xml("ς" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#962, Decoded)); %% greek small letter final sigma, U+03C2 ISOgrk3 -->
decode_for_xml("σ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#963, Decoded)); %% greek small letter sigma, U+03C3 ISOgrk3 -->
decode_for_xml("τ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#964, Decoded)); %% greek small letter tau, U+03C4 ISOgrk3 -->
decode_for_xml("υ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#965, Decoded)); %% greek small letter upsilon, U+03C5 ISOgrk3 -->
decode_for_xml("φ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#966, Decoded)); %% greek small letter phi, U+03C6 ISOgrk3 -->
decode_for_xml("χ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#967, Decoded)); %% greek small letter chi, U+03C7 ISOgrk3 -->
decode_for_xml("ψ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#968, Decoded)); %% greek small letter psi, U+03C8 ISOgrk3 -->
decode_for_xml("ω" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#969, Decoded)); %% greek small letter omega, U+03C9 ISOgrk3 -->
decode_for_xml("ϑ"++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#977, Decoded)); %% greek small letter theta symbol, U+03D1 NEW -->
decode_for_xml("ϒ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#978, Decoded)); %% greek upsilon with hook symbol, U+03D2 NEW -->
decode_for_xml("ϖ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#982, Decoded)); %% greek pi symbol, U+03D6 ISOgrk3 -->
%% General Punctuation -->
decode_for_xml("•" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8226, Decoded)); %% bullet = black small circle, U+2022 ISOpub -->
%% bullet is NOT the same as bullet operator, U+2219 -->
decode_for_xml("…" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8230, Decoded)); %% horizontal ellipsis = three dot leader, U+2026 ISOpub -->
decode_for_xml("′" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8242, Decoded)); %% prime = minutes = feet, U+2032 ISOtech -->
decode_for_xml("″" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8243, Decoded)); %% double prime = seconds = inches, U+2033 ISOtech -->
decode_for_xml("‾" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8254, Decoded)); %% overline = spacing overscore, U+203E NEW -->
decode_for_xml("⁄" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8260, Decoded)); %% fraction slash, U+2044 NEW -->
%% Letterlike Symbols -->
decode_for_xml("℘" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8472, Decoded)); %% script capital P = power set = Weierstrass p, U+2118 ISOamso -->
decode_for_xml("ℑ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8465, Decoded)); %% blackletter capital I = imaginary part, U+2111 ISOamso -->
decode_for_xml("ℜ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8476, Decoded)); %% blackletter capital R = real part symbol, U+211C ISOamso -->
decode_for_xml("™" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8482, Decoded)); %% trade mark sign, U+2122 ISOnum -->
decode_for_xml("ℵ" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8501, Decoded)); %% alef symbol = first transfinite cardinal, U+2135 NEW -->
%% alef symbol is NOT the same as hebrew letter alef, U+05D0 although the same glyph could be used to depict both characters -->
%% Arrows -->
decode_for_xml("←" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8592, Decoded)); %% leftwards arrow, U+2190 ISOnum -->
decode_for_xml("↑" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8593, Decoded)); %% upwards arrow, U+2191 ISOnum-->
decode_for_xml("→" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8594, Decoded)); %% rightwards arrow, U+2192 ISOnum -->
decode_for_xml("↓" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8595, Decoded)); %% downwards arrow, U+2193 ISOnum -->
decode_for_xml("↔" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8596, Decoded)); %% left right arrow, U+2194 ISOamsa -->
decode_for_xml("↵" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8629, Decoded)); %% downwards arrow with corner leftwards = carriage return, U+21B5 NEW -->
decode_for_xml("⇐" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8656, Decoded)); %% leftwards double arrow, U+21D0 ISOtech -->
%% ISO 10646 does not say that lArr is the same as the 'is implied by' arrow but also does not have any other character for that function. So ? lArr can be used for 'is implied by' as ISOtech suggests -->
decode_for_xml("⇑" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8657, Decoded)); %% upwards double arrow, U+21D1 ISOamsa -->
decode_for_xml("⇒" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8658, Decoded)); %% rightwards double arrow, U+21D2 ISOtech -->
%% ISO 10646 does not say this is the 'implies' character but does not have another character with this function so ? rArr can be used for 'implies' as ISOtech suggests -->
decode_for_xml("⇓" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8659, Decoded)); %% downwards double arrow, U+21D3 ISOamsa -->
decode_for_xml("⇔" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8660, Decoded)); %% left right double arrow, U+21D4 ISOamsa -->
%% Mathematical Operators -->
decode_for_xml("∀" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8704, Decoded)); %% for all, U+2200 ISOtech -->
decode_for_xml("∂" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8706, Decoded)); %% partial differential, U+2202 ISOtech -->
decode_for_xml("∃" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8707, Decoded)); %% there exists, U+2203 ISOtech -->
decode_for_xml("∅" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8709, Decoded)); %% empty set = null set = diameter, U+2205 ISOamso -->
decode_for_xml("∇" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8711, Decoded)); %% nabla = backward difference, U+2207 ISOtech -->
decode_for_xml("∈" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8712, Decoded)); %% element of, U+2208 ISOtech -->
decode_for_xml("∉" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8713, Decoded)); %% not an element of, U+2209 ISOtech -->
decode_for_xml("∋" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8715, Decoded)); %% contains as member, U+220B ISOtech -->
%% should there be a more memorable name than 'ni'? -->
decode_for_xml("∏" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8719, Decoded)); %% n-ary product = product sign, U+220F ISOamsb -->
%% prod is NOT the same character as U+03A0 'greek capital letter pi' though the same glyph might be used for both -->
decode_for_xml("∑" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8721, Decoded)); %% n-ary sumation, U+2211 ISOamsb -->
%% sum is NOT the same character as U+03A3 'greek capital letter sigma' though the same glyph might be used for both -->
decode_for_xml("−" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8722, Decoded)); %% minus sign, U+2212 ISOtech -->
decode_for_xml("∗" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8727, Decoded)); %% asterisk operator, U+2217 ISOtech -->
decode_for_xml("√" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8730, Decoded)); %% square root = radical sign, U+221A ISOtech -->
decode_for_xml("∝" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8733, Decoded)); %% proportional to, U+221D ISOtech -->
decode_for_xml("∞" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8734, Decoded)); %% infinity, U+221E ISOtech -->
decode_for_xml("∠" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8736, Decoded)); %% angle, U+2220 ISOamso -->
decode_for_xml("∧" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8743, Decoded)); %% logical and = wedge, U+2227 ISOtech -->
decode_for_xml("∨" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8744, Decoded)); %% logical or = vee, U+2228 ISOtech -->
decode_for_xml("∩" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8745, Decoded)); %% intersection = cap, U+2229 ISOtech -->
decode_for_xml("∪" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8746, Decoded)); %% union = cup, U+222A ISOtech -->
decode_for_xml("∫" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8747, Decoded)); %% integral, U+222B ISOtech -->
decode_for_xml("∴" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8756, Decoded)); %% therefore, U+2234 ISOtech -->
decode_for_xml("∼" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8764, Decoded)); %% tilde operator = varies with = similar to, U+223C ISOtech -->
%% tilde operator is NOT the same character as the tilde, U+007E, although the same glyph might be used to represent both -->
decode_for_xml("≅" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8773, Decoded)); %% approximately equal to, U+2245 ISOtech -->
decode_for_xml("≈" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8776, Decoded)); %% almost equal to = asymptotic to, U+2248 ISOamsr -->
decode_for_xml("≠" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8800, Decoded)); %% not equal to, U+2260 ISOtech -->
decode_for_xml("≡" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8801, Decoded)); %% identical to, U+2261 ISOtech -->
decode_for_xml("≤" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8804, Decoded)); %% less-than or equal to, U+2264 ISOtech -->
decode_for_xml("≥" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8805, Decoded)); %% greater-than or equal to, U+2265 ISOtech -->
decode_for_xml("⊂" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8834, Decoded)); %% subset of, U+2282 ISOtech -->
decode_for_xml("⊃" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8835, Decoded)); %% superset of, U+2283 ISOtech -->
%% note that nsup, 'not a superset of, U+2285' is not covered by the Symbol font encoding and is not included. Should it be, for symmetry? It is in ISOamsn -->
decode_for_xml("⊄" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8836, Decoded)); %% not a subset of, U+2284 ISOamsn -->
decode_for_xml("⊆" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8838, Decoded)); %% subset of or equal to, U+2286 ISOtech -->
decode_for_xml("⊇" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8839, Decoded)); %% superset of or equal to, U+2287 ISOtech -->
decode_for_xml("⊕" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8853, Decoded)); %% circled plus = direct sum, U+2295 ISOamsb -->
decode_for_xml("⊗" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8855, Decoded)); %% circled times = vector product, U+2297 ISOamsb -->
decode_for_xml("⊥" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8869, Decoded)); %% up tack = orthogonal to = perpendicular, U+22A5 ISOtech -->
decode_for_xml("⋅" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8901, Decoded)); %% dot operator, U+22C5 ISOamsb -->
%% dot operator is NOT the same character as U+00B7 middle dot -->
%% Miscellaneous Technical -->
decode_for_xml("⌈" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8968, Decoded)); %% left ceiling = apl upstile, U+2308 ISOamsc -->
decode_for_xml("⌉" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8969, Decoded)); %% right ceiling, U+2309 ISOamsc -->
decode_for_xml("⌊" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8970, Decoded)); %% left floor = apl downstile, U+230A ISOamsc -->
decode_for_xml("⌋" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#8971, Decoded)); %% right floor, U+230B ISOamsc -->
decode_for_xml("〈" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#9001, Decoded)); %% left-pointing angle bracket = bra, U+2329 ISOtech -->
%% lang is NOT the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation mark' -->
decode_for_xml("〉" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#9002, Decoded)); %% right-pointing angle bracket = ket, U+232A ISOtech -->
%% rang is NOT the same character as U+003E 'greater than' or U+203A 'single right-pointing angle quotation mark' -->
%% Geometric Shapes -->
decode_for_xml("◊" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#9674, Decoded)); %% lozenge, U+25CA ISOpub -->
%% Miscellaneous Symbols -->
decode_for_xml("♠" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#9824, Decoded)); %% black spade suit, U+2660 ISOpub -->
%% black here seems to mean filled as opposed to hollow -->
decode_for_xml("♣" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#9827, Decoded)); %% black club suit = shamrock, U+2663 ISOpub -->
decode_for_xml("♥" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#9829, Decoded)); %% black heart suit = valentine, U+2665 ISOpub -->
decode_for_xml("♦" ++T, Decoded) -> decode_for_xml(T, concat_as_utf8(10#9830, Decoded)); %% black diamond suit, U+2666 ISOpub -->
decode_for_xml([H|T], Decoded) -> decode_for_xml(T, [H|Decoded]).
%% @spec concat_as_utf8(integer(), [integer()]) -> [integer()]
%% @doc Push the UTF-8 encoding of the character code C onto Decoded.
%%
%% Decoded is an accumulator that is built back to front (decode_for_xml/2
%% reverses it once the whole input has been consumed), so the bytes of C
%% are pushed with the lead byte deepest and the last continuation byte at
%% the head of the returned list.
%%
%% Codes that match no branch make the call fail with an `if_clause' error:
%% 16#D800..16#DFFF, 16#FFFE, 16#FFFF, and anything >= 16#80000000.
%% Negative or non-integer values fail with `function_clause'.
concat_as_utf8(C, Decoded) when is_integer(C), C >= 0 ->
    if
        C < 128 ->
            %% 0yyyyyyy
            [C | Decoded];
        C < 16#800 ->
            %% 110xxxxy 10yyyyyy
            B1 = 16#C0 + (C bsr 6),
            B2 = 128 + (C band 16#3F),
            [B2, B1 | Decoded];
        C < 16#10000 ->
            %% 1110xxxx 10xyyyyy 10yyyyyy
            %% This inner `if' has no fall-through branch on purpose: the
            %% excluded codes must crash here instead of reaching the
            %% four-byte branch below.
            if
                C < 16#D800; C > 16#DFFF, C < 16#FFFE ->
                    B1 = 16#E0 + (C bsr 12),
                    B2 = 128 + ((C bsr 6) band 16#3F),
                    B3 = 128 + (C band 16#3F),
                    [B3, B2, B1 | Decoded]
            end;
        C < 16#200000 ->
            %% 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy
            B1 = 16#F0 + (C bsr 18),
            B2 = 128 + ((C bsr 12) band 16#3F),
            B3 = 128 + ((C bsr 6) band 16#3F),
            B4 = 128 + (C band 16#3F),
            [B4, B3, B2, B1 | Decoded];
        C < 16#4000000 ->
            %% 111110xx 10xxxyyy 10yyyyyy 10yyyyyy 10yyyyyy
            B1 = 16#F8 + (C bsr 24),
            B2 = 128 + ((C bsr 18) band 16#3F),
            B3 = 128 + ((C bsr 12) band 16#3F),
            B4 = 128 + ((C bsr 6) band 16#3F),
            B5 = 128 + (C band 16#3F),
            [B5, B4, B3, B2, B1 | Decoded];
        C < 16#80000000 ->
            %% 1111110x 10xxxxyy 10yyyyyy 10yyyyyy 10yyyyyy 10yyyyyy
            B1 = 16#FC + (C bsr 30),
            B2 = 128 + ((C bsr 24) band 16#3F),
            B3 = 128 + ((C bsr 18) band 16#3F),
            B4 = 128 + ((C bsr 12) band 16#3F),
            B5 = 128 + ((C bsr 6) band 16#3F),
            B6 = 128 + (C band 16#3F),
            [B6, B5, B4, B3, B2, B1 | Decoded]
    end.
%% reference only:
%% @spec decode(string()) -> string()
%% @doc Decode a handful of entity references in EncodedHtml.
%%
%% Handles "&nbsp;" (turned into a plain space), "&quot;", "&amp;",
%% "&apos;", "&lt;", "&gt;" and decimal numeric references of the form
%% "&#NNN;". Everything else is copied through unchanged. This function is
%% not exported.
decode(EncodedHtml) -> decode(EncodedHtml, []).

%% Worker for decode/1. Decoded is the output collected so far, in reverse
%% order. The end-of-input clause comes first and every entity clause
%% matches on its full "&...;" prefix, so plain characters always reach the
%% final copy-through clause.
decode([], Decoded) -> lists:reverse(Decoded);
decode("&nbsp;" ++ T, Decoded) -> decode(T, [$\040 | Decoded]); % space
decode("&quot;" ++ T, Decoded) -> decode(T, [$\042 | Decoded]); % "
decode("&amp;" ++ T, Decoded) -> decode(T, [$\046 | Decoded]); % &
decode("&apos;" ++ T, Decoded) -> decode(T, [$\047 | Decoded]); % '
decode("&lt;" ++ T, Decoded) -> decode(T, [$\074 | Decoded]); % <
decode("&gt;" ++ T, Decoded) -> decode(T, [$\076 | Decoded]); % >
decode("&#" ++ T, Decoded) ->
    %% Numeric reference: match_ascii/2 reads the digits up to the ";" and
    %% returns the remaining input together with the integer value.
    {Rest, Char} = match_ascii(T, []),
    decode(Rest, [Char | Decoded]);
decode([H | T], Decoded) -> decode(T, [H | Decoded]).
%% @spec match_ascii(string(), string()) -> {string(), integer()}
%% @doc Read characters up to the first ";" and convert them to an integer.
%%
%% The second argument holds the characters collected so far in reverse
%% order (callers start with []). Returns {Rest, Value}, where Rest is the
%% input after the ";". Fails with `function_clause' when the input ends
%% before a ";" is found, and with `badarg' (from list_to_integer/1) when
%% the collected characters are not an integer.
match_ascii([$; | Rest], RevDigits) ->
    Digits = lists:reverse(RevDigits),
    {Rest, list_to_integer(Digits)};
match_ascii([Ch | Rest], RevDigits) ->
    match_ascii(Rest, [Ch | RevDigits]).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% @doc create an ets table that contains html entity refs as xmerl rules
%% Usage: xmerl_scan:string(XmlText, [{rules, html_entity_refs:get_xmerl_rules()}])
%%
%% Name of the named ETS table created by init_xmerl_rules/0 and passed to
%% xmerl_scan as its `rules' option.
-define(TabName, html_xmerl_rules).
%% DTD text that the fun built by dtd_fetch_fun/0 hands back to the scanner.
%% NOTE(review): the literal below contains only a newline, so it declares
%% no entities; presumably the character entity declarations are meant to
%% live here -- confirm against the intended DTD.
-define(CharEntityRefDTD, "
").
%% Return the xmerl rules table, creating it on first use.
%%
%% When the named table does not exist yet it is built by
%% init_xmerl_rules/0; otherwise the table name itself is returned.
get_xmerl_rules() ->
    %% An ETS table is deleted when the process that created it exits, even
    %% if the table is public. For a table that outlives the caller, create
    %% it from a separate long-lived process instead.
    case ets:info(?TabName) =:= undefined of
        true -> init_xmerl_rules();
        false -> ?TabName
    end.
%% Create the public named ETS table and run one xmerl scan that uses it as
%% the `rules' table and dtd_fetch_fun/0 as the fetch fun. Returns whatever
%% ets:new/2 returned for the table.
init_xmerl_rules() ->
    RulesTab = ets:new(?TabName, [set, public, named_table]),
    ScanOpts = [{fetch_fun, dtd_fetch_fun()}, {rules, ?TabName}],
    %% Best effort on purpose: the outcome of the scan, including any
    %% failure, is ignored. Presumably only the entries the scanner leaves
    %% in the rules table matter here -- confirm.
    _ = (catch xmerl_scan:string("
", ScanOpts)),
    RulesTab.
%% Build the fetch fun given to xmerl_scan through the `fetch_fun' option.
dtd_fetch_fun() ->
    fun fetch_char_entity_dtd/2.

%% Whatever is being fetched, answer with the DTD text from
%% ?CharEntityRefDTD as an in-memory string and hand the scanner state back
%% untouched.
fetch_char_entity_dtd(_Requested, ScanState) ->
    {ok, {string, ?CharEntityRefDTD}, ScanState}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% tests
%% @doc Manual timing run: parse ../pole/data/fodor.xml with the xmerl rules
%% table and print the elapsed time in microseconds.
%%
%% The timed section covers obtaining the rules table, printing its
%% ets:info/1 and the scan itself. Fails with `badmatch' when the data file
%% cannot be read. Returns the result of the final io:fwrite/3 call.
%% NOTE(review): the trailing underscore is EUnit's test-generator naming,
%% yet this function does the work directly -- confirm how it is meant to
%% be run.
xmerl_rules_test_() ->
    {ok, XmlBin} = file:read_file("../pole/data/fodor.xml"),
    XmlText = binary_to_list(XmlBin),
    %% timer:tc/1 runs the fun in the calling process and reports the
    %% elapsed microseconds; it replaces the deprecated now/0 bookkeeping.
    {ParsingTime, {_DocElement, _Rest}} =
        timer:tc(fun() ->
            Rules = get_xmerl_rules(),
            io:fwrite(user, "Info of rules table: ~p~n", [ets:info(?TabName)]),
            xmerl_scan:string(XmlText, [{rules, Rules}])
        end),
    %% Debug aid: io:fwrite(user, "DocElement: ~p~n", [_DocElement]),
    io:fwrite(user, "Timer Parsing: ~B~n", [ParsingTime]).
%% @doc Manual timing run: pre-decode ../pole/data/fodor.xml with
%% decode_for_xml/1, parse the result with xmerl and print the parse time
%% in microseconds.
%%
%% Only the xmerl_scan:string/1 call is timed; reading and decoding the
%% file happen before the measurement starts. Fails with `badmatch' when
%% the data file cannot be read. Returns the result of the final
%% io:fwrite/3 call.
decode_for_xml_test_() ->
    {ok, XmlBin} = file:read_file("../pole/data/fodor.xml"),
    XmlText = decode_for_xml(binary_to_list(XmlBin)),
    %% timer:tc/1 reports the elapsed microseconds; it replaces the
    %% deprecated now/0 bookkeeping.
    {ParsingTime, {_DocElement, _Rest}} =
        timer:tc(fun() -> xmerl_scan:string(XmlText) end),
    %% Debug aid: io:fwrite(user, "DocElement: ~p~n", [_DocElement]),
    io:fwrite(user, "Timer Parsing: ~B~n", [ParsingTime]).