It's pretty easy to write your own HTML entity decoder. Just scan the string looking for &, read up to the following ;, then interpret the result. If it's "amp", "lt", "gt", or "quot", replace it with the relevant character. If it starts with #, it's a numeric entity: if the # is followed by an "x", treat the rest as hexadecimal, otherwise as decimal. Read the number, and then insert the corresponding character into your string (if you're writing to an NSMutableString, you can use [str appendFormat:@"%C", thechar]). Note that %C formats a single 16-bit unichar, so a code point above U+FFFF has to be appended as a UTF-16 surrogate pair instead. NSScanner can make the string scanning pretty easy, especially since it already knows how to read hex numbers.
I just whipped up a function that should do this for you. Note, I haven't actually tested this, so you should run it through its paces:
/**
 * Returns a copy of `input` with HTML character references decoded.
 *
 * Recognised forms:
 *   - the named entities `&amp;`, `&quot;`, `&lt;` and `&gt;`
 *   - decimal numeric references such as `&#65;`
 *   - hexadecimal numeric references such as `&#x41;`
 *
 * Anything else that starts with `&` (an unknown name, a missing `;`, or a
 * numeric reference that is not a usable Unicode scalar value) is left in the
 * output exactly as it appeared in the input, including the `&` itself.
 *
 * @param input The string to decode. May be nil or empty.
 * @return The decoded string; an empty string when `input` is nil or empty.
 */
- (NSString *)stringByDecodingHTMLEntitiesInString:(NSString *)input {
    if ([input length] == 0) {
        // Messaging nil yields 0, so this covers both nil and @"".
        return @"";
    }

    NSMutableString *results = [NSMutableString stringWithCapacity:[input length]];
    NSScanner *scanner = [NSScanner scannerWithString:input];
    // Whitespace is significant in the text being copied, so skip nothing.
    [scanner setCharactersToBeSkipped:nil];

    while (![scanner isAtEnd]) {
        NSString *temp = nil;

        // Copy everything up to the next ampersand verbatim.
        if ([scanner scanUpToString:@"&" intoString:&temp]) {
            [results appendString:temp];
        }
        if (![scanner scanString:@"&" intoString:NULL]) {
            // No ampersand left; the loop condition will now terminate.
            continue;
        }

        BOOL valid = YES;
        unsigned c = 0;
        // Position just after the '&'. On failure we rewind to here so that
        // the text following the '&' is rescanned instead of being swallowed.
        NSUInteger savedLocation = [scanner scanLocation];

        if ([scanner scanString:@"#" intoString:NULL]) {
            // Numeric character reference.
            if ([scanner scanString:@"x" intoString:NULL]) {
                // hexadecimal
                unsigned int value = 0;
                if ([scanner scanHexInt:&value]) {
                    c = value;
                } else {
                    valid = NO;
                }
            } else {
                // decimal
                int value = 0;
                if ([scanner scanInt:&value] && value >= 0) {
                    c = (unsigned)value;
                } else {
                    valid = NO;
                }
            }
            if (![scanner scanString:@";" intoString:NULL]) {
                // not ;-terminated
                valid = NO;
            }
            // Reject values that cannot be represented as a character:
            // NUL, the UTF-16 surrogate range, and anything past U+10FFFF
            // (which also catches scans that overflowed).
            if (valid && (c == 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF))) {
                valid = NO;
            }
        } else {
            // Named entity.
            if (![scanner scanUpToString:@";" intoString:&temp]) {
                // "&;" is not a valid entity
                valid = NO;
            } else if (![scanner scanString:@";" intoString:NULL]) {
                // there was no trailing ;
                valid = NO;
            } else if ([temp isEqualToString:@"amp"]) {
                c = '&';
            } else if ([temp isEqualToString:@"quot"]) {
                c = '"';
            } else if ([temp isEqualToString:@"lt"]) {
                c = '<';
            } else if ([temp isEqualToString:@"gt"]) {
                c = '>';
            } else {
                // unknown entity
                valid = NO;
            }
        }

        if (!valid) {
            // Not an entity we understand: emit the '&' literally and resume
            // scanning right after it. The rest of the text is then copied by
            // the next iteration, and any real entity inside it still decodes.
            [results appendString:@"&"];
            [scanner setScanLocation:savedLocation];
            continue;
        }

        if (c > 0xFFFF) {
            // Outside the Basic Multilingual Plane: NSString stores UTF-16,
            // so the code point must be written as a surrogate pair.
            unsigned offset = c - 0x10000;
            unichar pair[2];
            pair[0] = (unichar)(0xD800 + (offset >> 10));
            pair[1] = (unichar)(0xDC00 + (offset & 0x3FF));
            [results appendString:[NSString stringWithCharacters:pair length:2]];
        } else {
            [results appendFormat:@"%C", (unichar)c];
        }
    }
    return results;
}