What's the proper way to remove entity escapes from a string?

J

John Nagle

I have XML replies in a DOM which contain entity escapes,
like "&amp;amp;". What's the proper way to replace them with
the ordinary characters? Preferably something that will work in
most browsers?

I know about ".innerText", but that's not portable; some
browsers convert escapes when reading from innerText and some
don't.

John Nagle
 
E

Elegie

John Nagle wrote:

Hi,
I have XML replies in a DOM which contain entity escapes,
like "&amp;amp;". What's the proper way to replace them with
the ordinary characters? Preferably something that will work in
most browsers?

It's best to let the browser do the translation, and get back the
translated value.

An HTML entity is used to represent some character: the HTML parser
makes the conversion while building the DOM tree, after which the entity
is completely gone, the DOM tree being an object, made of typed nodes.
Therefore, a sound approach to solve your issue is to pass the entity to
the parser (using the innerHTML property, for instance) and get back the
value directly from the DOM tree (using DOM methods).

Tested IE7, FF2, O9. Returns the expanded string, or the empty string if
not supported.

---
<script type="text/javascript">
/**
 * Expands the entity references (e.g. "&lt;", "&#8364;") found in this
 * string by handing each one to the browser's HTML parser and reading the
 * parsed character back from the resulting text node.
 *
 * @returns {string} the string with its entities expanded, or the empty
 *     string when the required DOM features are not available.
 */
String.prototype.expandHtmlEntities = function () {
  if (!document.createElement) {
    return "";
  }

  // Detached scratch element: it is only ever used as a parsing sandbox.
  var scratch = document.createElement("div");
  if (
    typeof scratch.innerHTML == "undefined" ||
    typeof scratch.firstChild == "undefined"
  ) {
    return "";
  }

  return this.valueOf().replace(/&[a-z0-9#]+;/gi, function (entity) {
    // Let the parser translate the entity, then read the text node it built.
    scratch.innerHTML = entity;
    return scratch.firstChild.nodeValue;
  });
};

// Demo: expand the entities in a sample string and show the result.
alert("Test : &lt;€&gt;".expandHtmlEntities());
</script>
 
B

Bart Van der Donck

John said:
I have XML replies in a DOM which contain entity escapes,
like "&amp;". What's the proper way to replace them with
the ordinary characters? Preferably something that will work
in most browsers?

Valid XML knows only 5 (default) character entities: 'quot', 'amp',
'apos', 'lt' and 'gt'.
So:

// The five entities predefined by XML, mapped to their character codes.
var ent = {};
ent['quot'] = 34;
ent['amp'] = 38;
ent['apos'] = 39;
ent['lt'] = 60;
ent['gt'] = 62;

/**
 * Replaces named and numeric character references in `text`.
 *
 * All references are resolved in a single pass, so the output of one
 * replacement is never decoded again ("&amp;lt;" yields "&lt;", not "<").
 *
 * @param {string} text  input that may contain "&name;", "&#123;" or
 *     "&#x7B;" references.
 * @param {Object} table map of entity name -> character code.
 * @returns {string} `text` with every known reference replaced; unknown
 *     names and out-of-range numbers are left untouched.
 */
var decodeEntities = function (text, table) {
  var hasOwn = Object.prototype.hasOwnProperty;
  return text.replace(
    /&(?:#(\d+)|#x([0-9a-f]+)|([a-z][a-z0-9]*));/gi,
    function (match, dec, hex, name) {
      var code;
      if (name) {
        // Exact case first; fall back to lower case so "&AMP;" still works.
        // hasOwnProperty keeps inherited keys ("constructor") out of it.
        if (hasOwn.call(table, name)) {
          code = table[name];
        } else if (hasOwn.call(table, name.toLowerCase())) {
          code = table[name.toLowerCase()];
        } else {
          return match;
        }
      } else {
        code = dec ? parseInt(dec, 10) : parseInt(hex, 16);
      }
      if (code > 0x10FFFF) {
        return match; // not a Unicode code point
      }
      if (code > 0xFFFF) {
        // fromCharCode works on 16-bit units: emit a surrogate pair.
        code -= 0x10000;
        return String.fromCharCode(0xD800 + (code >> 10), 0xDC00 + (code & 0x3FF));
      }
      return String.fromCharCode(code);
    }
  );
};

var xml = '<root>a &lt; &apos;</root>';
xml = decodeEntities(xml, ent);
alert(xml)

You can use more, but those need to be declared in your XML's DTD in
order to remain valid.

In practice this boundary is often vague. I would suggest adding the HTML4
character entities as well, to be on the safe side:

// XML's predefined entities plus the HTML 4 named character entities,
// mapped to their character codes. Names are case-sensitive
// (e.g. 'Alpha' vs 'alpha', 'Dagger' vs 'dagger').
var ent = {};
ent['apos'] = 39; // from XML, not present in HTML4
ent['quot'] = 34;
ent['amp'] = 38;
ent['lt'] = 60;
ent['gt'] = 62;
ent['nbsp'] = 160;
ent['iexcl'] = 161;
ent['cent'] = 162;
ent['pound'] = 163;
ent['curren'] = 164;
ent['yen'] = 165;
ent['brvbar'] = 166;
ent['sect'] = 167;
ent['uml'] = 168;
ent['copy'] = 169;
ent['ordf'] = 170;
ent['laquo'] = 171;
ent['not'] = 172;
ent['shy'] = 173;
ent['reg'] = 174;
ent['macr'] = 175;
ent['deg'] = 176;
ent['plusmn'] = 177;
ent['sup2'] = 178;
ent['sup3'] = 179;
ent['acute'] = 180;
ent['micro'] = 181;
ent['para'] = 182;
ent['middot'] = 183;
ent['cedil'] = 184;
ent['sup1'] = 185;
ent['ordm'] = 186;
ent['raquo'] = 187;
ent['frac14'] = 188;
ent['frac12'] = 189;
ent['frac34'] = 190;
ent['iquest'] = 191;
ent['Agrave'] = 192;
ent['Aacute'] = 193;
ent['Acirc'] = 194;
ent['Atilde'] = 195;
ent['Auml'] = 196;
ent['Aring'] = 197;
ent['AElig'] = 198;
ent['Ccedil'] = 199;
ent['Egrave'] = 200;
ent['Eacute'] = 201;
ent['Ecirc'] = 202;
ent['Euml'] = 203;
ent['Igrave'] = 204;
ent['Iacute'] = 205;
ent['Icirc'] = 206;
ent['Iuml'] = 207;
ent['ETH'] = 208;
ent['Ntilde'] = 209;
ent['Ograve'] = 210;
ent['Oacute'] = 211;
ent['Ocirc'] = 212;
ent['Otilde'] = 213;
ent['Ouml'] = 214;
ent['times'] = 215;
ent['Oslash'] = 216;
ent['Ugrave'] = 217;
ent['Uacute'] = 218;
ent['Ucirc'] = 219;
ent['Uuml'] = 220;
ent['Yacute'] = 221;
ent['THORN'] = 222;
ent['szlig'] = 223;
ent['agrave'] = 224;
ent['aacute'] = 225;
ent['acirc'] = 226;
ent['atilde'] = 227;
ent['auml'] = 228;
ent['aring'] = 229;
ent['aelig'] = 230;
ent['ccedil'] = 231;
ent['egrave'] = 232;
ent['eacute'] = 233;
ent['ecirc'] = 234;
ent['euml'] = 235;
ent['igrave'] = 236;
ent['iacute'] = 237;
ent['icirc'] = 238;
ent['iuml'] = 239;
ent['eth'] = 240;
ent['ntilde'] = 241;
ent['ograve'] = 242;
ent['oacute'] = 243;
ent['ocirc'] = 244;
ent['otilde'] = 245;
ent['ouml'] = 246;
ent['divide'] = 247;
ent['oslash'] = 248;
ent['ugrave'] = 249;
ent['uacute'] = 250;
ent['ucirc'] = 251;
ent['uuml'] = 252;
ent['yacute'] = 253;
ent['thorn'] = 254;
ent['yuml'] = 255;
ent['OElig'] = 338;
ent['oelig'] = 339;
ent['Scaron'] = 352;
ent['scaron'] = 353;
ent['Yuml'] = 376;
ent['fnof'] = 402;
ent['circ'] = 710;
ent['tilde'] = 732;
ent['Alpha'] = 913;
ent['Beta'] = 914;
ent['Gamma'] = 915;
ent['Delta'] = 916;
ent['Epsilon'] = 917;
ent['Zeta'] = 918;
ent['Eta'] = 919;
ent['Theta'] = 920;
ent['Iota'] = 921;
ent['Kappa'] = 922;
ent['Lambda'] = 923;
ent['Mu'] = 924;
ent['Nu'] = 925;
ent['Xi'] = 926;
ent['Omicron'] = 927;
ent['Pi'] = 928;
ent['Rho'] = 929;
ent['Sigma'] = 931;
ent['Tau'] = 932;
ent['Upsilon'] = 933;
ent['Phi'] = 934;
ent['Chi'] = 935;
ent['Psi'] = 936;
ent['Omega'] = 937;
ent['alpha'] = 945;
ent['beta'] = 946;
ent['gamma'] = 947;
ent['delta'] = 948;
ent['epsilon'] = 949;
ent['zeta'] = 950;
ent['eta'] = 951;
ent['theta'] = 952;
ent['iota'] = 953;
ent['kappa'] = 954;
ent['lambda'] = 955;
ent['mu'] = 956;
ent['nu'] = 957;
ent['xi'] = 958;
ent['omicron'] = 959;
ent['pi'] = 960;
ent['rho'] = 961;
ent['sigmaf'] = 962;
ent['sigma'] = 963;
ent['tau'] = 964;
ent['upsilon'] = 965;
ent['phi'] = 966;
ent['chi'] = 967;
ent['psi'] = 968;
ent['omega'] = 969;
ent['thetasym'] = 977;
ent['upsih'] = 978;
ent['piv'] = 982;
ent['ensp'] = 8194;
ent['emsp'] = 8195;
ent['thinsp'] = 8201;
ent['zwnj'] = 8204;
ent['zwj'] = 8205;
ent['lrm'] = 8206;
ent['rlm'] = 8207;
ent['ndash'] = 8211;
ent['mdash'] = 8212;
ent['lsquo'] = 8216;
ent['rsquo'] = 8217;
ent['sbquo'] = 8218;
ent['ldquo'] = 8220;
ent['rdquo'] = 8221;
ent['bdquo'] = 8222;
ent['dagger'] = 8224;
ent['Dagger'] = 8225;
ent['bull'] = 8226;
ent['hellip'] = 8230;
ent['permil'] = 8240;
ent['prime'] = 8242;
ent['Prime'] = 8243;
ent['lsaquo'] = 8249;
ent['rsaquo'] = 8250;
ent['oline'] = 8254;
ent['frasl'] = 8260;
ent['euro'] = 8364;
ent['image'] = 8465;
ent['weierp'] = 8472;
ent['real'] = 8476;
ent['trade'] = 8482;
ent['alefsym'] = 8501;
ent['larr'] = 8592;
ent['uarr'] = 8593;
ent['rarr'] = 8594;
ent['darr'] = 8595;
ent['harr'] = 8596;
ent['crarr'] = 8629;
ent['lArr'] = 8656;
ent['uArr'] = 8657;
ent['rArr'] = 8658;
ent['dArr'] = 8659;
ent['hArr'] = 8660;
ent['forall'] = 8704;
ent['part'] = 8706;
ent['exist'] = 8707;
ent['empty'] = 8709;
ent['nabla'] = 8711;
ent['isin'] = 8712;
ent['notin'] = 8713;
ent['ni'] = 8715;
ent['prod'] = 8719;
ent['sum'] = 8721;
ent['minus'] = 8722;
ent['lowast'] = 8727;
ent['radic'] = 8730;
ent['prop'] = 8733;
ent['infin'] = 8734;
ent['ang'] = 8736;
ent['and'] = 8743;
ent['or'] = 8744;
ent['cap'] = 8745;
ent['cup'] = 8746;
ent['int'] = 8747;
ent['there4'] = 8756;
ent['sim'] = 8764;
ent['cong'] = 8773;
ent['asymp'] = 8776;
ent['ne'] = 8800;
ent['equiv'] = 8801;
ent['le'] = 8804;
ent['ge'] = 8805;
ent['sub'] = 8834;
ent['sup'] = 8835;
ent['nsub'] = 8836;
ent['sube'] = 8838;
ent['supe'] = 8839;
ent['oplus'] = 8853;
ent['otimes'] = 8855;
ent['perp'] = 8869;
ent['sdot'] = 8901;
ent['lceil'] = 8968;
ent['rceil'] = 8969;
ent['lfloor'] = 8970;
ent['rfloor'] = 8971;
ent['lang'] = 9001;
ent['rang'] = 9002;
ent['loz'] = 9674;
ent['spades'] = 9824;
ent['clubs'] = 9827;
ent['hearts'] = 9829;
ent['diams'] = 9830;

/**
 * Replaces named and numeric character references in `text`.
 *
 * All references are resolved in a single pass, so the output of one
 * replacement is never decoded again ("&amp;lt;" yields "&lt;", not "<").
 *
 * @param {string} text  input that may contain "&name;", "&#123;" or
 *     "&#x7B;" references.
 * @param {Object} table map of entity name -> character code.
 * @returns {string} `text` with every known reference replaced; unknown
 *     names and out-of-range numbers are left untouched.
 */
var decodeEntities = function (text, table) {
  var hasOwn = Object.prototype.hasOwnProperty;
  return text.replace(
    /&(?:#(\d+)|#x([0-9a-f]+)|([a-z][a-z0-9]*));/gi,
    function (match, dec, hex, name) {
      var code;
      if (name) {
        // Exact case first, so "&alpha;" and "&Alpha;" stay distinct; fall
        // back to lower case so sloppy input such as "&AMP;" still works.
        // hasOwnProperty keeps inherited keys ("constructor") out of it.
        if (hasOwn.call(table, name)) {
          code = table[name];
        } else if (hasOwn.call(table, name.toLowerCase())) {
          code = table[name.toLowerCase()];
        } else {
          return match;
        }
      } else {
        code = dec ? parseInt(dec, 10) : parseInt(hex, 16);
      }
      if (code > 0x10FFFF) {
        return match; // not a Unicode code point
      }
      if (code > 0xFFFF) {
        // fromCharCode works on 16-bit units: emit a surrogate pair.
        code -= 0x10000;
        return String.fromCharCode(0xD800 + (code >> 10), 0xDC00 + (code & 0x3FF));
      }
      return String.fromCharCode(code);
    }
  );
};

var xml = '<root>&loz; &lt; &AMP; &lt; a ॥</root>';
xml = decodeEntities(xml, ent);
alert(xml)

Info:
http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references

Hope this helps,
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only takes a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

No members online now.

Forum statistics

Threads
473,763
Messages
2,569,562
Members
45,038
Latest member
OrderProperKetocapsules

Latest Threads

Top