偶尔会在数据中看到诸如 &#39; 这样的字符,它们是 HTML 转义字符(字符实体),特征如下:
第一种:以 &# 开头,中间是一串数字,以 ; 结尾
第二种:以 & 开头,中间是一串字符,以 ; 结尾
比如最常见的 &nbsp; 或者等价的 &#160;
浏览器遇到这些转义符,会自动还原成原来的字符,但如何通过代码识别? org.apache.commons.lang.StringEscapeUtils.unescapeHtml 的实现给出了很好的说明
遇到上面的第一种情况,中间是数字的,直接将数字(unicode 码点)转为 char
遇到第二种情况,中间是字符,只能查映射表了:从映射表中找到字符对应的数字,再转换为 char。看看代码就一目了然了
看看HTML40如何定义的
// Builds the shared HTML 4.0 entity table when the class is initialized.
static {
    HTML40 = new Entities();
    fillWithHtml40Entities(HTML40);
}

/**
 * Adds the three HTML 4.0 entity tables to the given entity set:
 * {@code BASIC_ARRAY}, {@code ISO8859_1_ARRAY} and {@code HTML40_ARRAY},
 * in that order.
 *
 * @param entities the entity set to add the tables to
 */
static void fillWithHtml40Entities(Entities entities) {
    entities.addEntities(BASIC_ARRAY);
    entities.addEntities(ISO8859_1_ARRAY);
    entities.addEntities(HTML40_ARRAY);
}
再看看BASIC_ARRAY、ISO8859_1_ARRAY、HTML40_ARRAY 分别是什么
BASIC_ARRAY
private static final String[][] BASIC_ARRAY = {{ quot , 34 }, // - double-quote { amp , 38 }, // - ampersand { lt , 60 }, // - less-than { gt , 62 }, // - greater-than };
ISO8859_1_ARRAY
static final String[][] ISO8859_1_ARRAY = {{ nbsp , 160 }, // non-breaking space { iexcl , 161 }, // inverted exclamation mark { cent , 162 }, // cent sign { pound , 163 }, // pound sign { curren , 164 }, // currency sign { yen , 165 }, // yen sign = yuan sign { brvbar , 166 }, // broken bar = broken vertical bar { sect , 167 }, // section sign { uml , 168 }, // diaeresis = spacing diaeresis { copy , 169 }, // ? - copyright sign { ordf , 170 }, // feminine ordinal indicator { laquo , 171 }, // left-pointing double angle quotation mark = left pointing guillemet { not , 172 }, // not sign { shy , 173 }, // soft hyphen = discretionary hyphen { reg , 174 }, // ? - registered trademark sign { macr , 175 }, // macron = spacing macron = overline = APL overbar { deg , 176 }, // degree sign { plusmn , 177 }, // plus-minus sign = plus-or-minus sign { sup2 , 178 }, // superscript two = superscript digit two = squared { sup3 , 179 }, // superscript three = superscript digit three = cubed { acute , 180 }, // acute accent = spacing acute { micro , 181 }, // micro sign { para , 182 }, // pilcrow sign = paragraph sign { middot , 183 }, // middle dot = Georgian comma = Greek middle dot { cedil , 184 }, // cedilla = spacing cedilla { sup1 , 185 }, // superscript one = superscript digit one { ordm , 186 }, // masculine ordinal indicator { raquo , 187 }, // right-pointing double angle quotation mark = right pointing guillemet { frac14 , 188 }, // vulgar fraction one quarter = fraction one quarter { frac12 , 189 }, // vulgar fraction one half = fraction one half { frac34 , 190 }, // vulgar fraction three quarters = fraction three quarters { iquest , 191 }, // inverted question mark = turned question mark { Agrave , 192 }, // ? - uppercase A, grave accent { Aacute , 193 }, // ? - uppercase A, acute accent { Acirc , 194 }, // ? - uppercase A, circumflex accent { Atilde , 195 }, // ? - uppercase A, tilde { Auml , 196 }, // ? - uppercase A, umlaut { Aring , 197 }, // ? 
- uppercase A, ring { AElig , 198 }, // ? - uppercase AE { Ccedil , 199 }, // ? - uppercase C, cedilla { Egrave , 200 }, // ? - uppercase E, grave accent { Eacute , 201 }, // ? - uppercase E, acute accent { Ecirc , 202 }, // ? - uppercase E, circumflex accent { Euml , 203 }, // ? - uppercase E, umlaut { Igrave , 204 }, // ? - uppercase I, grave accent { Iacute , 205 }, // ? - uppercase I, acute accent { Icirc , 206 }, // ? - uppercase I, circumflex accent { Iuml , 207 }, // ? - uppercase I, umlaut { ETH , 208 }, // ? - uppercase Eth, Icelandic { Ntilde , 209 }, // ? - uppercase N, tilde { Ograve , 210 }, // ? - uppercase O, grave accent { Oacute , 211 }, // ? - uppercase O, acute accent { Ocirc , 212 }, // ? - uppercase O, circumflex accent { Otilde , 213 }, // ? - uppercase O, tilde { Ouml , 214 }, // ? - uppercase O, umlaut { times , 215 }, // multiplication sign { Oslash , 216 }, // ? - uppercase O, slash { Ugrave , 217 }, // ? - uppercase U, grave accent { Uacute , 218 }, // ? - uppercase U, acute accent { Ucirc , 219 }, // ? - uppercase U, circumflex accent { Uuml , 220 }, // ? - uppercase U, umlaut { Yacute , 221 }, // ? - uppercase Y, acute accent { THORN , 222 }, // ? - uppercase THORN, Icelandic { szlig , 223 }, // ? - lowercase sharps, German { agrave , 224 }, // ? - lowercase a, grave accent { aacute , 225 }, // ? - lowercase a, acute accent { acirc , 226 }, // ? - lowercase a, circumflex accent { atilde , 227 }, // ? - lowercase a, tilde { auml , 228 }, // ? - lowercase a, umlaut { aring , 229 }, // ? - lowercase a, ring { aelig , 230 }, // ? - lowercase ae { ccedil , 231 }, // ? - lowercase c, cedilla { egrave , 232 }, // ? - lowercase e, grave accent { eacute , 233 }, // ? - lowercase e, acute accent { ecirc , 234 }, // ? - lowercase e, circumflex accent { euml , 235 }, // ? - lowercase e, umlaut { igrave , 236 }, // ? - lowercase i, grave accent { iacute , 237 }, // ? - lowercase i, acute accent { icirc , 238 }, // ? 
- lowercase i, circumflex accent { iuml , 239 }, // ? - lowercase i, umlaut { eth , 240 }, // ? - lowercase eth, Icelandic { ntilde , 241 }, // ? - lowercase n, tilde { ograve , 242 }, // ? - lowercase o, grave accent { oacute , 243 }, // ? - lowercase o, acute accent { ocirc , 244 }, // ? - lowercase o, circumflex accent { otilde , 245 }, // ? - lowercase o, tilde { ouml , 246 }, // ? - lowercase o, umlaut { divide , 247 }, // division sign { oslash , 248 }, // ? - lowercase o, slash { ugrave , 249 }, // ? - lowercase u, grave accent { uacute , 250 }, // ? - lowercase u, acute accent { ucirc , 251 }, // ? - lowercase u, circumflex accent { uuml , 252 }, // ? - lowercase u, umlaut { yacute , 253 }, // ? - lowercase y, acute accent { thorn , 254 }, // ? - lowercase thorn, Icelandic { yuml , 255 }, // ? - lowercase y, umlaut };
HTML40_ARRAY
static final String[][] HTML40_ARRAY = { // !-- Latin Extended-B -- { fnof , 402 }, // latin small f with hook = function= florin, U+0192 ISOtech -- // !-- Greek -- { Alpha , 913 }, // greek capital letter alpha, U+0391 -- { Beta , 914 }, // greek capital letter beta, U+0392 -- { Gamma , 915 }, // greek capital letter gamma,U+0393 ISOgrk3 -- { Delta , 916 }, // greek capital letter delta,U+0394 ISOgrk3 -- { Epsilon , 917 }, // greek capital letter epsilon, U+0395 -- { Zeta , 918 }, // greek capital letter zeta, U+0396 -- { Eta , 919 }, // greek capital letter eta, U+0397 -- { Theta , 920 }, // greek capital letter theta,U+0398 ISOgrk3 -- { Iota , 921 }, // greek capital letter iota, U+0399 -- { Kappa , 922 }, // greek capital letter kappa, U+039A -- { Lambda , 923 }, // greek capital letter lambda,U+039B ISOgrk3 -- { Mu , 924 }, // greek capital letter mu, U+039C -- { Nu , 925 }, // greek capital letter nu, U+039D -- { Xi , 926 }, // greek capital letter xi, U+039E ISOgrk3 -- { Omicron , 927 }, // greek capital letter omicron, U+039F -- { Pi , 928 }, // greek capital letter pi, U+03A0 ISOgrk3 -- { Rho , 929 }, // greek capital letter rho, U+03A1 -- // !-- there is no Sigmaf, and no U+03A2 character either -- { Sigma , 931 }, // greek capital letter sigma,U+03A3 ISOgrk3 -- { Tau , 932 }, // greek capital letter tau, U+03A4 -- { Upsilon , 933 }, // greek capital letter upsilon,U+03A5 ISOgrk3 -- { Phi , 934 }, // greek capital letter phi,U+03A6 ISOgrk3 -- { Chi , 935 }, // greek capital letter chi, U+03A7 -- { Psi , 936 }, // greek capital letter psi,U+03A8 ISOgrk3 -- { Omega , 937 }, // greek capital letter omega,U+03A9 ISOgrk3 -- { alpha , 945 }, // greek small letter alpha,U+03B1 ISOgrk3 -- { beta , 946 }, // greek small letter beta, U+03B2 ISOgrk3 -- { gamma , 947 }, // greek small letter gamma,U+03B3 ISOgrk3 -- { delta , 948 }, // greek small letter delta,U+03B4 ISOgrk3 -- { epsilon , 949 }, // greek small letter epsilon,U+03B5 ISOgrk3 -- { zeta , 950 }, // greek 
small letter zeta, U+03B6 ISOgrk3 -- { eta , 951 }, // greek small letter eta, U+03B7 ISOgrk3 -- { theta , 952 }, // greek small letter theta,U+03B8 ISOgrk3 -- { iota , 953 }, // greek small letter iota, U+03B9 ISOgrk3 -- { kappa , 954 }, // greek small letter kappa,U+03BA ISOgrk3 -- { lambda , 955 }, // greek small letter lambda,U+03BB ISOgrk3 -- { mu , 956 }, // greek small letter mu, U+03BC ISOgrk3 -- { nu , 957 }, // greek small letter nu, U+03BD ISOgrk3 -- { xi , 958 }, // greek small letter xi, U+03BE ISOgrk3 -- { omicron , 959 }, // greek small letter omicron, U+03BF NEW -- { pi , 960 }, // greek small letter pi, U+03C0 ISOgrk3 -- { rho , 961 }, // greek small letter rho, U+03C1 ISOgrk3 -- { sigmaf , 962 }, // greek small letter final sigma,U+03C2 ISOgrk3 -- { sigma , 963 }, // greek small letter sigma,U+03C3 ISOgrk3 -- { tau , 964 }, // greek small letter tau, U+03C4 ISOgrk3 -- { upsilon , 965 }, // greek small letter upsilon,U+03C5 ISOgrk3 -- { phi , 966 }, // greek small letter phi, U+03C6 ISOgrk3 -- { chi , 967 }, // greek small letter chi, U+03C7 ISOgrk3 -- { psi , 968 }, // greek small letter psi, U+03C8 ISOgrk3 -- { omega , 969 }, // greek small letter omega,U+03C9 ISOgrk3 -- { thetasym , 977 }, // greek small letter theta symbol,U+03D1 NEW -- { upsih , 978 }, // greek upsilon with hook symbol,U+03D2 NEW -- { piv , 982 }, // greek pi symbol, U+03D6 ISOgrk3 -- // !-- General Punctuation -- { bull , 8226 }, // bullet = black small circle,U+2022 ISOpub -- // !-- bullet is NOT the same as bullet operator, U+2219 -- { hellip , 8230 }, // horizontal ellipsis = three dot leader,U+2026 ISOpub -- { prime , 8242 }, // prime = minutes = feet, U+2032 ISOtech -- { Prime , 8243 }, // double prime = seconds = inches,U+2033 ISOtech -- { oline , 8254 }, // overline = spacing overscore,U+203E NEW -- { frasl , 8260 }, // fraction slash, U+2044 NEW -- // !-- Letterlike Symbols -- { weierp , 8472 }, // script capital P = power set= Weierstrass p, U+2118 ISOamso -- { image 
, 8465 }, // blackletter capital I = imaginary part,U+2111 ISOamso -- { real , 8476 }, // blackletter capital R = real part symbol,U+211C ISOamso -- { trade , 8482 }, // trade mark sign, U+2122 ISOnum -- { alefsym , 8501 }, // alef symbol = first transfinite cardinal,U+2135 NEW -- // !-- alef symbol is NOT the same as hebrew letter alef,U+05D0 although the // same glyph could be used to depict both characters -- // !-- Arrows -- { larr , 8592 }, // leftwards arrow, U+2190 ISOnum -- { uarr , 8593 }, // upwards arrow, U+2191 ISOnum-- { rarr , 8594 }, // rightwards arrow, U+2192 ISOnum -- { darr , 8595 }, // downwards arrow, U+2193 ISOnum -- { harr , 8596 }, // left right arrow, U+2194 ISOamsa -- { crarr , 8629 }, // downwards arrow with corner leftwards= carriage return, U+21B5 NEW -- { lArr , 8656 }, // leftwards double arrow, U+21D0 ISOtech -- // !-- ISO 10646 does not say that lArr is the same as the is implied by // arrow but also does not have any other character for that function. // So ? 
lArr canbe used for is implied by as ISOtech suggests -- { uArr , 8657 }, // upwards double arrow, U+21D1 ISOamsa -- { rArr , 8658 }, // rightwards double arrow,U+21D2 ISOtech -- // !-- ISO 10646 does not say this is the implies character but does not // have another character with this function so ?rArr can be used for // implies as ISOtech suggests -- { dArr , 8659 }, // downwards double arrow, U+21D3 ISOamsa -- { hArr , 8660 }, // left right double arrow,U+21D4 ISOamsa -- // !-- Mathematical Operators -- { forall , 8704 }, // for all, U+2200 ISOtech -- { part , 8706 }, // partial differential, U+2202 ISOtech -- { exist , 8707 }, // there exists, U+2203 ISOtech -- { empty , 8709 }, // empty set = null set = diameter,U+2205 ISOamso -- { nabla , 8711 }, // nabla = backward difference,U+2207 ISOtech -- { isin , 8712 }, // element of, U+2208 ISOtech -- { notin , 8713 }, // not an element of, U+2209 ISOtech -- { ni , 8715 }, // contains as member, U+220B ISOtech -- // !-- should there be a more memorable name than ni ? 
-- { prod , 8719 }, // n-ary product = product sign,U+220F ISOamsb -- // !-- prod is NOT the same character as U+03A0 greek capital letter pi // though the same glyph might be used for both -- { sum , 8721 }, // n-ary summation, U+2211 ISOamsb -- // !-- sum is NOT the same character as U+03A3 greek capital letter sigma // though the same glyph might be used for both -- { minus , 8722 }, // minus sign, U+2212 ISOtech -- { lowast , 8727 }, // asterisk operator, U+2217 ISOtech -- { radic , 8730 }, // square root = radical sign,U+221A ISOtech -- { prop , 8733 }, // proportional to, U+221D ISOtech -- { infin , 8734 }, // infinity, U+221E ISOtech -- { ang , 8736 }, // angle, U+2220 ISOamso -- { and , 8743 }, // logical and = wedge, U+2227 ISOtech -- { or , 8744 }, // logical or = vee, U+2228 ISOtech -- { cap , 8745 }, // intersection = cap, U+2229 ISOtech -- { cup , 8746 }, // union = cup, U+222A ISOtech -- { int , 8747 }, // integral, U+222B ISOtech -- { there4 , 8756 }, // therefore, U+2234 ISOtech -- { sim , 8764 }, // tilde operator = varies with = similar to,U+223C ISOtech -- // !-- tilde operator is NOT the same character as the tilde, U+007E,although // the same glyph might be used to represent both -- { cong , 8773 }, // approximately equal to, U+2245 ISOtech -- { asymp , 8776 }, // almost equal to = asymptotic to,U+2248 ISOamsr -- { ne , 8800 }, // not equal to, U+2260 ISOtech -- { equiv , 8801 }, // identical to, U+2261 ISOtech -- { le , 8804 }, // less-than or equal to, U+2264 ISOtech -- { ge , 8805 }, // greater-than or equal to,U+2265 ISOtech -- { sub , 8834 }, // subset of, U+2282 ISOtech -- { sup , 8835 }, // superset of, U+2283 ISOtech -- // !-- note that nsup, not a superset of, U+2283 is not covered by the // Symbol font encoding and is not included. Should it be, for symmetry? 
// It is in ISOamsn -- !ENTITY nsub , 8836 }, // not a subset of, U+2284 ISOamsn -- { sube , 8838 }, // subset of or equal to, U+2286 ISOtech -- { supe , 8839 }, // superset of or equal to,U+2287 ISOtech -- { oplus , 8853 }, // circled plus = direct sum,U+2295 ISOamsb -- { otimes , 8855 }, // circled times = vector product,U+2297 ISOamsb -- { perp , 8869 }, // up tack = orthogonal to = perpendicular,U+22A5 ISOtech -- { sdot , 8901 }, // dot operator, U+22C5 ISOamsb -- // !-- dot operator is NOT the same character as U+00B7 middle dot -- // !-- Miscellaneous Technical -- { lceil , 8968 }, // left ceiling = apl upstile,U+2308 ISOamsc -- { rceil , 8969 }, // right ceiling, U+2309 ISOamsc -- { lfloor , 8970 }, // left floor = apl downstile,U+230A ISOamsc -- { rfloor , 8971 }, // right floor, U+230B ISOamsc -- { lang , 9001 }, // left-pointing angle bracket = bra,U+2329 ISOtech -- // !-- lang is NOT the same character as U+003C less than or U+2039 single left-pointing angle quotation // mark -- { rang , 9002 }, // right-pointing angle bracket = ket,U+232A ISOtech -- // !-- rang is NOT the same character as U+003E greater than or U+203A // single right-pointing angle quotation mark -- // !-- Geometric Shapes -- { loz , 9674 }, // lozenge, U+25CA ISOpub -- // !-- Miscellaneous Symbols -- { spades , 9824 }, // black spade suit, U+2660 ISOpub -- // !-- black here seems to mean filled as opposed to hollow -- { clubs , 9827 }, // black club suit = shamrock,U+2663 ISOpub -- { hearts , 9829 }, // black heart suit = valentine,U+2665 ISOpub -- { diams , 9830 }, // black diamond suit, U+2666 ISOpub -- // !-- Latin Extended-A -- { OElig , 338 }, // -- latin capital ligature OE,U+0152 ISOlat2 -- { oelig , 339 }, // -- latin small ligature oe, U+0153 ISOlat2 -- // !-- ligature is a misnomer, this is a separate character in some languages -- { Scaron , 352 }, // -- latin capital letter S with caron,U+0160 ISOlat2 -- { scaron , 353 }, // -- latin small letter s with caron,U+0161 
ISOlat2 -- { Yuml , 376 }, // -- latin capital letter Y with diaeresis,U+0178 ISOlat2 -- // !-- Spacing Modifier Letters -- { circ , 710 }, // -- modifier letter circumflex accent,U+02C6 ISOpub -- { tilde , 732 }, // small tilde, U+02DC ISOdia -- // !-- General Punctuation -- { ensp , 8194 }, // en space, U+2002 ISOpub -- { emsp , 8195 }, // em space, U+2003 ISOpub -- { thinsp , 8201 }, // thin space, U+2009 ISOpub -- { zwnj , 8204 }, // zero width non-joiner,U+200C NEW RFC 2070 -- { zwj , 8205 }, // zero width joiner, U+200D NEW RFC 2070 -- { lrm , 8206 }, // left-to-right mark, U+200E NEW RFC 2070 -- { rlm , 8207 }, // right-to-left mark, U+200F NEW RFC 2070 -- { ndash , 8211 }, // en dash, U+2013 ISOpub -- { mdash , 8212 }, // em dash, U+2014 ISOpub -- { lsquo , 8216 }, // left single quotation mark,U+2018 ISOnum -- { rsquo , 8217 }, // right single quotation mark,U+2019 ISOnum -- { sbquo , 8218 }, // single low-9 quotation mark, U+201A NEW -- { ldquo , 8220 }, // left double quotation mark,U+201C ISOnum -- { rdquo , 8221 }, // right double quotation mark,U+201D ISOnum -- { bdquo , 8222 }, // double low-9 quotation mark, U+201E NEW -- { dagger , 8224 }, // dagger, U+2020 ISOpub -- { Dagger , 8225 }, // double dagger, U+2021 ISOpub -- { permil , 8240 }, // per mille sign, U+2030 ISOtech -- { lsaquo , 8249 }, // single left-pointing angle quotation mark,U+2039 ISO proposed -- // !-- lsaquo is proposed but not yet ISO standardized -- { rsaquo , 8250 }, // single right-pointing angle quotation mark,U+203A ISO proposed -- // !-- rsaquo is proposed but not yet ISO standardized -- { euro , 8364 }, // -- euro sign, U+20AC NEW -- };
再扩展下
从前面可以看到,转义字符中间的那段数字就是 unicode 码点,所以这种转义字符可以随意构造,并不限于上面的定义。比如“中”的 unicode 码点是 20013,那么构造一个转义字符 &#20013;,经过浏览器的渲染就变回“中”了。虽然只为显示一个字符不必这么绕,但在一些不方便传输特殊字符的场景下就可以派上用场了
相信看了这些案例你已经掌握了方法,更多精彩请关注php 其它相关文章!
相关阅读:
怎样让textarea文本域宽度和高度width及height自动适应
html超级链接a的click事件之后跳转href所指向的地址
以上就是html的转义字符怎样通过代码识别的详细内容,html教程
郑重声明:本文版权归原作者所有,转载文章仅为传播更多信息之目的,如作者信息标记有误,请第一时间联系我们修改或删除,多谢。
新闻热点
疑难解答