In parse_html() I am trying to parse for the text of each element and get the len() of text in each element. I want to generate a script that parses the length of text in each element and when the cumulative text length reaches a set size parameter, it truncates the rest of the text in the document. My problem is in the child.text/tag1.text/tag2.text/tag3.text. len() doesn't seem to be working on these. Is there a way I can pull the numerical length of these text strings?
import sys
import imaplib
import getpass
import email
import email.header
import datetime
from bs4 import BeautifulSoup
import re
from lxml import etree, html
from io import StringIO, BytesIO
from lxml.html.clean import clean_html, Cleaner, word_break
from lxml.etree import HTML
from lxml.html import HtmlMixin
EMAIL_ACCOUNT = "sample@gmai.com"
EMAIL_FOLDER = "INBOX"
def process_mailbox(M):
"""
Do something with emails messages in the folder.
For the sake of this example, print some headers.
"""
rv, data = M.search(None, "ALL")
if rv != 'OK':
print "No messages found!"
return
for num in data[0].split():
rv, data = M.fetch(num, '(RFC822)')
if rv != 'OK':
print "ERROR getting message", num
return
msg = email.message_from_string(data[0][1])
decode = email.header.decode_header(msg['Subject'])[0]
subject = unicode(decode[0])
body = msg.get_payload(decode=True)
print 'Message %s: %s' % (num, subject)
print 'Raw Date:', msg['Date']
print 'Body:', body
if msg.is_multipart():
html = None
print "Checking for html or text"
for part in msg.get_payload():
if part.get_content_charset() is None:
charset = chardet.detect(srt(part))['encoding']
else:
charset = part.get_content_charset()
if part.get_content_type() == 'text/plain':
text = unicode(part.get_payload(decode=True),str(charset),"ignore").encode('utf8','replace')
f = open('email.txt', 'w')
f.write(text)
f.close
if part.get_content_type() == 'text/html':
html = unicode(part.get_payload(decode=True),str(charset),"ignore").encode('utf8','replace')
f = open('email.html','w')
f.write(html)
f.close
if part.get('Content-Disposition') is None:
continue
filename = part.get_filename()
if not os.path.isfile(filename) :
fp = open(filename, 'wb')
fp.write(part.get_payload(decode=True))
fp.close()
return 0
if html is None:
return text.strip()
else:
return html.strip()
# Now convert to local date-time
date_tuple = email.utils.parsedate_tz(msg['Date'])
if date_tuple:
local_date = datetime.datetime.fromtimestamp(
email.utils.mktime_tz(date_tuple))
print "Local Date:", \
local_date.strftime("%a, %d %b %Y %H:%M:%S")
def parse_html():
#htmldoc = open('email.html', 'r+')
#doc = htmldoc.read()
VALID_TAGS = ['iframe', 'video', 'o>', 'li', 'sub', 'sup', 'source', 'br', 'h3', 'h4', 'h6', 'hr', 'q', 'mark','wbr', 'audio','strong', 'em', 'p','ul', 'li', 'br', 'blockquote', 'pre', 'del', 'h3', 'body', 'header', 'html', 'title', 'div', 'img', 'a']
parser = etree.HTMLParser()
tree = etree.parse("email.html", parser)
#results = etree.tostring(tree.getroot(), pretty_print=True, method="html")
page = html.tostring(tree)
cleaner = Cleaner(page_structure=False, add_nofollow=True, style=True, links=True, safe_attrs_only=True)
clean_page = cleaner.clean_html(page)
root = tree.getroot()
child = root[0]
print len(root)
children = list(root)
for child in root:
print child.tag
print child.attrib
print child.text
for tag1 in child:
print tag1.tag
print tag1.attrib
print tag1.text
for tag2 in tag1:
print tag2.tag
print tag2.attrib
print tag2.text
for tag3 in tag2:
print tag3.tag
print tag3.attrib
print tag3.text
M = imaplib.IMAP4_SSL('imap.gmail.com')
try:
rv, data = M.login(EMAIL_ACCOUNT, getpass.getpass())
except imaplib.IMAP4.error:
print "LOGIN FAILED!!! "
sys.exit(1)
print rv, data
rv, mailboxes = M.list()
if rv == 'OK':
print "Mailboxes:"
print mailboxes
rv, data = M.select(EMAIL_FOLDER)
if rv == 'OK':
print "Processing mailbox...\n"
process_mailbox(M)
parse_html()
M.close()
else:
print "ERROR: Unable to open mailbox ", rv
M.logout()
This is the error I get when I try to use len()
TypeError: object of type 'NoneType' has no len()
Also, if you know anything about how to do that truncating with lxml.html, I'd appreciate being pointed in the right direction. Thanks.