I have HTML code like this:
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width">
<title>test</title>
</head>
<body>
<h3><a href="#" name='title1'>Title</a></h3>
<div>para1</div>
<div>para2</div>
<div>para3</div>
<h3><a href="#" name='title2'>Title</a></h3>
<div>para4</div>
<div>para5</div>
</body>
</html>
What I want is:
<div>para1</div>
<div>para2</div>
<div>para3</div>
In other words, I want only the first section of this HTML (the elements that follow the first heading, up to the next heading) and I need to ignore the second section.
For now, this is how I have worked it out:
#!/usr/bin/env python
# encoding: utf-8
import unittest
from lxml import etree
class SearchPara(unittest.TestCase):
    """Extract the elements between the 'title1' heading and the next <h3>."""

    def setUp(self):
        """Read the HTML fixture 'test.html' into self.html."""
        with open('test.html') as f:
            self.html = f.read()

    def test_parse_html(self):
        """Print the serialized siblings that follow the 'title1' heading.

        Collection starts after the <h3> whose <a> child has name='title1'
        and stops at the next <h3> sibling (or at the end of the parent).
        """
        page = etree.HTML(self.html)
        # Select the <h3> itself instead of selecting the <a> and stepping
        # back up with '/..'.
        headings = page.xpath("//h3[a[@name='title1']]")
        if not headings:
            # xpath() returns a (possibly empty) list, never None.  Use a bare
            # return: unittest ignores, and newer versions deprecate, values
            # returned from test methods.
            return

        parts = []
        # One query yields every following element sibling in document
        # order; the section ends at the first <h3> among them.
        for sibling in headings[0].xpath('following-sibling::*'):
            if sibling.tag == 'h3':
                break
            # encoding='unicode' makes tostring() return text rather than
            # bytes, so the pieces can be joined into a str on Python 3.
            parts.append(etree.tostring(sibling, encoding='unicode'))

        paras = ''.join(parts)
        print(paras)

    def tearDown(self):
        """Nothing to clean up; setUp closes the file it opens."""
        pass
# Run the test case through unittest's command-line runner, but only when
# this file is executed directly rather than imported.
if __name__ == "__main__":
    unittest.main()
As you can see, this is a little complicated. Is there a better way to do this?
`. And since there is no further ``, nothing matches.