0

I want to parse some XML documents that I am receiving as strings.

import lxml.etree
import re
from lxml.html.soupparser import fromstring,parse

def _encoding_detection_error(raw, tried):
    """Build the UnicodeDecodeError raised when no encoding could be detected.

    UnicodeDecodeError requires five constructor arguments
    (encoding, object, start, end, reason); calling it with only a message
    raises TypeError instead of the intended exception.

    :param raw: the markup that could not be decoded.
    :param tried: the encodings UnicodeDammit attempted. BeautifulSoup 3
        stores plain names; bs4 stores (encoding, errors) tuples, so both
        shapes are accepted here.
    :returns: a ready-to-raise UnicodeDecodeError instance.
    """
    names = ', '.join(
        str(entry[0]) if isinstance(entry, tuple) else str(entry)
        for entry in tried)
    reason = "Failed to detect encoding, tried [%s]" % names
    # The exception's ``object`` argument must be a byte string.
    payload = raw if isinstance(raw, bytes) else b""
    return UnicodeDecodeError("unknown", payload, 0, len(payload), reason)


try:
    from bs4 import UnicodeDammit             # BeautifulSoup 4

    def decode_html(html_string):
        """Decode *html_string* to unicode text using bs4's UnicodeDammit.

        :param html_string: markup as a byte string (or already-decoded text).
        :returns: the decoded unicode markup.
        :raises UnicodeDecodeError: if no candidate encoding could decode it.
        """
        converted = UnicodeDammit(html_string)
        # Compare against None rather than truthiness so that an empty
        # document is returned as-is instead of being reported as an
        # encoding-detection failure.
        if converted.unicode_markup is None:
            raise _encoding_detection_error(
                html_string, converted.tried_encodings)
        return converted.unicode_markup

except ImportError:
    from BeautifulSoup import UnicodeDammit   # BeautifulSoup 3

    def decode_html(html_string):
        """Decode *html_string* to unicode text using BeautifulSoup 3.

        :param html_string: markup as a byte string (or already-decoded text).
        :returns: the decoded unicode markup.
        :raises UnicodeDecodeError: if no candidate encoding could decode it.
        """
        converted = UnicodeDammit(html_string, isHTML=True)
        # See the bs4 branch: only a missing result counts as a failure.
        if converted.unicode is None:
            raise _encoding_detection_error(
                html_string, converted.triedEncodings)
        return converted.unicode


def _join_paragraph_text(paragraphs):
    """Return the leading text of each element, whitespace-normalised and
    joined with single spaces.

    Elements without leading text (``.text is None``) contribute an empty
    string instead of raising AttributeError.
    """
    return " ".join(
        # r'\s+' collapses each whitespace run to one space. The previous
        # pattern '[\s+]' was a character class that also replaced literal
        # '+' characters and left runs of whitespace uncollapsed.
        re.sub(r'\s+', ' ', (para.text or '').strip())
        for para in paragraphs)


def tryMe(inString):
    """Extract <p3> text from a document, cutting off at a marker paragraph.

    The markup is decoded with ``decode_html`` and parsed with the
    soupparser ``fromstring``. The markers 'ABC' and then 'XYZ' are tried in
    order: for the first marker whose XPath query yields any nodes, the text
    of those nodes is returned. If neither query yields nodes, the text of
    every <p3> in the document is returned.

    :param inString: the document markup, as accepted by ``decode_html``.
    :returns: the selected <p3> texts joined by single spaces.
    """
    root = fromstring(decode_html(inString))

    for marker in ('ABC', 'XYZ'):
        # From the <p3> containing the marker, select the <p3> descendants
        # of every <p1> on the XPath ``preceding`` axis.
        nodes = root.xpath(
            "./doc/p1/p2/p3[contains(text(),'%s')]//preceding::p1//p3"
            % marker)
        if nodes:
            return _join_paragraph_text(nodes)
        # Report the marker that was actually missing (the two messages
        # were previously swapped).
        print("No %s" % marker)

    # Neither marker produced a selection: fall back to every <p3>.
    return _join_paragraph_text(root.xpath(".//p3"))

Basically, I want to look for a <p3> tag whose text contains ABC. If this node is found, I ignore everything that comes after it; hence the XPath. Otherwise, I look for a <p3> tag whose text contains XYZ. If this is found, I ignore everything that comes after it. Otherwise, I just process all the <p3> nodes and return.

This works fine for UTF-8 documents but fails for UTF-16. For any UTF-16 document, I always get an empty string, even though I can see that there are <p3> nodes with text like ABC and XYZ. I noticed that instead of the expected

<p3>ABC</p3>

the utf-16 document text appears as

&lt;p3&gt;ABC&lt;/p3&gt;

Hence lxml.etree is not able to parse it as proper XML.

How should I solve this?

AbtPst
  • 7,778
  • 17
  • 91
  • 172

0 Answers0