Unescape numeric XML Entities with Lua

Question

What's a good implementation for unescaping numeric HTML/XML entities, e.g. 
 and replacing them with the ASCII equivalent?

Expressed as a unit test:

local orig = "It&#039;s the &#34;end&#34; &ok;&#10;"
local fixd = unescape(orig) -- Implement this
assert( fixd == "It's the \"end\" &ok;\n" )

Phrogz · Accepted Answer · 2016-11-14T16:31:43.197

Here's a simple implementation that also handles the core named XML entities:

function unescape(str)
  str = string.gsub( str, '&lt;', '<' )
  str = string.gsub( str, '&gt;', '>' )
  str = string.gsub( str, '&quot;', '"' )
  str = string.gsub( str, '&apos;', "'" )
  str = string.gsub( str, '&#(%d+);', function(n) return string.char(n) end )
  str = string.gsub( str, '&#x(%d+);', function(n) return string.char(tonumber(n,16)) end )
  str = string.gsub( str, '&amp;', '&' ) -- Be sure to do this after all others
  return str
end

print(unescape("&#34;Hello&quot; &apos;World&#39;")) --> "Hello" 'World'

However, note that this fails for one pathological case: a numeric ampersand entity followed by the text amp;:

print(unescape("Ampersand entity is &#38;amp;")) --> Ampersand entity is &
-- The result should actually be                     Ampersand entity is &amp;

We can fix this edge case by handling all entities at once, but the code gets a good bit uglier:

function unescape(str)
  local map={ ["lt"]="<", ["gt"]=">", ["amp"]="&", ["quot"]='"', ["apos"]="'" }
  str = string.gsub( str, '(&(#?x?)([%d%a]+);)', function(orig,n,s)
    return (n=='' and map[s])
           or (n=="#x" and tonumber(s,16)) and string.char(tonumber(s,16))
           or (n=="#" and tonumber(s)) and string.char(s)
           or orig
  end )
  return str
end

print(unescape("Ampersand entity is &#38;amp;")) --> Ampersand entity is &amp;

Finally, we can unwrap it for a little more speed:

local gsub, char = string.gsub, string.char
local entityMap  = {["lt"]="<",["gt"]=">",["amp"]="&",["quot"]='"',["apos"]="'"}
local entitySwap = function(orig,n,s)
  return (n=='' and entityMap[s])
         or (n=="#" and tonumber(s)) and string.char(s)
         or (n=="#x" and tonumber(s,16)) and string.char(tonumber(s,16))
         or orig
end
function unescape(str)
  return (gsub( str, '(&(#?x?)([%d%a]+);)', entitySwap ))
end

Should check `n == ""` for the `entityMap` case (or you could accidentally match ` mp;`) — daurnimator, Nov 10 '16 at 15:27

score 3 · Answer 2 · answered Jul 19 '16 at 13:36

For the handful of programmers who may need to escape accents when downloading french html content, here is a more extensive version of the above function.

local function unescape(str)
    str = string.gsub( str, '&nbsp;', ' ')
    str = string.gsub( str, '&iexcl;', '¡')
    str = string.gsub( str, '&cent;', '¢')
    str = string.gsub( str, '&pound;', '£')
    str = string.gsub( str, '&curren;', '¤')
    str = string.gsub( str, '&yen;', '¥')
    str = string.gsub( str, '&brvbar;', '¦')
    str = string.gsub( str, '&sect;', '§')
    str = string.gsub( str, '&uml;', '¨')
    str = string.gsub( str, '&copy;', '©')
    str = string.gsub( str, '&ordf;', 'ª')
    str = string.gsub( str, '&laquo;', '«')
    str = string.gsub( str, '&not;', '¬')
    str = string.gsub( str, '&shy;', '')
    str = string.gsub( str, '&reg;', '®')
    str = string.gsub( str, '&macr;', '¯')
    str = string.gsub( str, '&deg;', '°')
    str = string.gsub( str, '&plusmn;', '±')
    str = string.gsub( str, '&sup2;', '²')
    str = string.gsub( str, '&sup3;', '³')
    str = string.gsub( str, '&acute;', '´')
    str = string.gsub( str, '&micro;', 'µ')
    str = string.gsub( str, '&para;', '¶')
    str = string.gsub( str, '&middot;', '·')
    str = string.gsub( str, '&cedil;', '¸')
    str = string.gsub( str, '&sup1;', '¹')
    str = string.gsub( str, '&ordm;', 'º')
    str = string.gsub( str, '&raquo;', '»')
    str = string.gsub( str, '&frac14;', '¼')
    str = string.gsub( str, '&frac12;', '½')
    str = string.gsub( str, '&frac34;', '¾')
    str = string.gsub( str, '&iquest;', '¿')
    str = string.gsub( str, '&Agrave;', 'À')
    str = string.gsub( str, '&Aacute;', 'Á')
    str = string.gsub( str, '&Acirc;', 'Â')
    str = string.gsub( str, '&Atilde;', 'Ã')
    str = string.gsub( str, '&Auml;', 'Ä')
    str = string.gsub( str, '&Aring;', 'Å')
    str = string.gsub( str, '&AElig;', 'Æ')
    str = string.gsub( str, '&Ccedil;', 'Ç')
    str = string.gsub( str, '&Egrave;', 'È')
    str = string.gsub( str, '&Eacute;', 'É')
    str = string.gsub( str, '&Ecirc;', 'Ê')
    str = string.gsub( str, '&Euml;', 'Ë')
    str = string.gsub( str, '&Igrave;', 'Ì')
    str = string.gsub( str, '&Iacute;', 'Í')
    str = string.gsub( str, '&Icirc;', 'Î')
    str = string.gsub( str, '&Iuml;', 'Ï')
    str = string.gsub( str, '&ETH;', 'Ð')
    str = string.gsub( str, '&Ntilde;', 'Ñ')
    str = string.gsub( str, '&Ograve;', 'Ò')
    str = string.gsub( str, '&Oacute;', 'Ó')
    str = string.gsub( str, '&Ocirc;', 'Ô')
    str = string.gsub( str, '&Otilde;', 'Õ')
    str = string.gsub( str, '&Ouml;', 'Ö')
    str = string.gsub( str, '&times;', '×')
    str = string.gsub( str, '&Oslash;', 'Ø')
    str = string.gsub( str, '&Ugrave;', 'Ù')
    str = string.gsub( str, '&Uacute;', 'Ú')
    str = string.gsub( str, '&Ucirc;', 'Û')
    str = string.gsub( str, '&Uuml;', 'Ü')
    str = string.gsub( str, '&Yacute;', 'Ý')
    str = string.gsub( str, '&THORN;', 'Þ')
    str = string.gsub( str, '&szlig;', 'ß')
    str = string.gsub( str, '&agrave;', 'à')
    str = string.gsub( str, '&aacute;', 'á')
    str = string.gsub( str, '&acirc;', 'â')
    str = string.gsub( str, '&atilde;', 'ã')
    str = string.gsub( str, '&auml;', 'ä')
    str = string.gsub( str, '&aring;', 'å')
    str = string.gsub( str, '&aelig;', 'æ')
    str = string.gsub( str, '&ccedil;', 'ç')
    str = string.gsub( str, '&egrave;', 'è')
    str = string.gsub( str, '&eacute;', 'é')
    str = string.gsub( str, '&ecirc;', 'ê')
    str = string.gsub( str, '&euml;', 'ë')
    str = string.gsub( str, '&igrave;', 'ì')
    str = string.gsub( str, '&iacute;', 'í')
    str = string.gsub( str, '&icirc;', 'î')
    str = string.gsub( str, '&iuml;', 'ï')
    str = string.gsub( str, '&eth;', 'ð')
    str = string.gsub( str, '&ntilde;', 'ñ')
    str = string.gsub( str, '&ograve;', 'ò')
    str = string.gsub( str, '&oacute;', 'ó')
    str = string.gsub( str, '&ocirc;', 'ô')
    str = string.gsub( str, '&otilde;', 'õ')
    str = string.gsub( str, '&ouml;', 'ö')
    str = string.gsub( str, '&divide;', '÷')
    str = string.gsub( str, '&oslash;', 'ø')
    str = string.gsub( str, '&ugrave;', 'ù')
    str = string.gsub( str, '&uacute;', 'ú')
    str = string.gsub( str, '&ucirc;', 'û')
    str = string.gsub( str, '&uuml;', 'ü')
    str = string.gsub( str, '&yacute;', 'ý')
    str = string.gsub( str, '&thorn;', 'þ')
    str = string.gsub( str, '&yuml;', 'ÿ')
    str = string.gsub( str, '&euro;', '€')
    str = string.gsub( str, '&#(%d+);', function(n) return string.char(n) end )
    str = string.gsub( str, '&#x(%d+);', function(n) return string.char(tonumber(n,16)) end )
    str = string.gsub( str, '&amp;', '&' ) -- Be sure to do this after all others
    return str
end

Note that technically this list is appropriate for HTML, not XML. But thanks for adding it! — Phrogz, Jul 19 '16 at 13:43

score 0 · Answer 3 · answered Mar 11 '19 at 22:26

0

There's now htmlEntities-for-lua module available via LuaRocks, you should be using it instead. It only uses 3 string.gsub iterations in it's code, so it's faster and more complete.

htmlEntities = require('htmlEntities')

print(htmlEntities.decode(yourString))

Or use it as a drop-in replacement for your current functions:

myUnescape = require('htmlEntities').decode

answered Mar 11 '19 at 22:26

BotOfWar

588
5
14

A great looking library for HTML entities, but inappropriate for XML entities. – Phrogz Mar 12 '19 at 03:29
Thanks, I see the subtle difference between XML and XHTML/HTML now. I will move it into an own question-answer later. I was too focused on Yarillo's answer. – BotOfWar Mar 12 '19 at 08:56

Unescape numeric XML Entities with Lua

3 Answers3