"Cannot use string pattern on bytes like object" when trying to parse htmls for emails

Question

So I have a script I've been working with for a few days trying to get a list of emails from a csv I have, but now I've run into this roadblock. Here is the code:

import sys
try:
    import urllib.request as urllib2
except ImportError:
    import urllib2
import re
import csv

list1 = []
list2 = []
list3 = []

def addList():
    with open('file.csv', 'rt') as f:
        reader = csv.reader(f)
        for row in reader:
            for s in row:
                list2.append(s)

def getAddress(url):
    http = "http://"
    https = "https://"

    if http in url:
        return url
    elif https in url:
        return url
    else:
        url = "http://" + url
        return url

def parseAddress(url):
    global list3
    try:
      website = urllib2.urlopen(getAddress(url))
      html = website.read()

      addys = re.findall('''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?''', html, flags=re.IGNORECASE)

      global list1
      list1.append(addys)

    except urllib2.HTTPError as err:
        print ("Cannot retrieve URL: HTTP Error Code: "), err.code
        list3.append(url)
    except urllib2.URLError as err:
        print ("Cannot retrive URL: ") + err.reason[1]
        list3.append(url)

def execute():
    global list2
    addList()
    totalNum = len(list2)
    atNum = 1
    for s in list2:
        parseAddress(s)
        print ("Processing ") + str(atNum) + (" out of ") + str(totalNum)
        atNum = atNum + 1

    print ("Completed. Emails parsed: ") + str(len(list1)) + "."


### MAIN

def main():
    global list2
    execute()
    global list1
    myFile = open("finishedFile.csv", "w+")
    wr = csv.writer(myFile, quoting=csv.QUOTE_ALL)
    for s in list1:
        wr.writerow(s)
    myFile.close
    global list3
    failFile = open("failedSites.csv", "w+")
    write = csv.writer(failFile, quoting=csv.QUOTE_ALL)
    for j in list3:
        write.writerow(j)
    failFile.close

main()

and when I run it I get this error:

    Traceback (most recent call last):
  File "pagescanner.py", line 85, in <module>
    main()
  File "pagescanner.py", line 71, in main
    execute()
  File "pagescanner.py", line 60, in execute
    parseAddress(s)
  File "pagescanner.py", line 42, in parseAddress
    addys = re.findall('''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?''', html, flags=re.IGNORECASE)
  File "/usr/lib/python3.5/re.py", line 213, in findall
    return _compile(pattern, flags).findall(string)
TypeError: cannot use a string pattern on a bytes-like object

So I've figured out that I need to figure out how to encode the html string into bytes for the encoding, and Tyler's answer below helped me do so but now I'm getting this error:

Traceback (most recent call last):
  File "/usr/lib/python3.5/urllib/request.py", line 1254, in do_open
    h.request(req.get_method(), req.selector, req.data, headers)
  File "/usr/lib/python3.5/http/client.py", line 1107, in request
    self._send_request(method, url, body, headers)
  File "/usr/lib/python3.5/http/client.py", line 1152, in _send_request
    self.endheaders(body)
  File "/usr/lib/python3.5/http/client.py", line 1103, in endheaders
    self._send_output(message_body)
  File "/usr/lib/python3.5/http/client.py", line 934, in _send_output
    self.send(msg)
  File "/usr/lib/python3.5/http/client.py", line 877, in send
    self.connect()
  File "/usr/lib/python3.5/http/client.py", line 849, in connect
    (self.host,self.port), self.timeout, self.source_address)
  File "/usr/lib/python3.5/socket.py", line 712, in create_connection
    raise err
  File "/usr/lib/python3.5/socket.py", line 703, in create_connection
    sock.connect(sa)
OSError: [Errno 22] Invalid argument

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "pagescanner.py", line 39, in parseAddress
    website = urllib2.urlopen(getAddress(url))
  File "/usr/lib/python3.5/urllib/request.py", line 163, in urlopen
    return opener.open(url, data, timeout)
  File "/usr/lib/python3.5/urllib/request.py", line 466, in open
    response = self._open(req, data)
  File "/usr/lib/python3.5/urllib/request.py", line 484, in _open
    '_open', req)
  File "/usr/lib/python3.5/urllib/request.py", line 444, in _call_chain
    result = func(*args)
  File "/usr/lib/python3.5/urllib/request.py", line 1282, in http_open
    return self.do_open(http.client.HTTPConnection, req)
  File "/usr/lib/python3.5/urllib/request.py", line 1256, in do_open
    raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 22] Invalid argument>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "pagescanner.py", line 85, in <module>
    main()
  File "pagescanner.py", line 71, in main
    execute()
  File "pagescanner.py", line 60, in execute
    parseAddress(s)
  File "pagescanner.py", line 51, in parseAddress
    print ("Cannot retrive URL: ") + err.reason[1]
TypeError: 'OSError' object is not subscriptable

Does this mean that one of the urls from the list isn't a valid url? I thought I had finally removed all fo the bad urls from my csv file but I may need to take another look

score 1 · Answer 1 · answered Jan 14 '20 at 19:29

To answer your question, you just need to decode the response properly. Instead of html = website.read() try html = website.read().decode('utf-8')

See Convert bytes to a string

I'll also recommend a couple things that might make your life a little easier. urllib.parse makes dealing with URLs much less of a headache and tends to make things a lot more readable when you inevitably encounter a bug somewhere.

https://docs.python.org/3.5/library/urllib.parse.html

The requests library is also pretty much the gold standard for dealing with HTTP requests and might help solve a bit of the confusion around encoding and other overhead from the standard urllib.request.

https://requests.readthedocs.io/en/master/

And beautifulsoup is a fantastic tool for dealing with HTML.

https://www.crummy.com/software/BeautifulSoup/bs4/doc/#

Thank you! That worked for that issue but now I'm getting another error thats long enough for me to have to post another question. This code will be the death of me :/ — user12692764, Jan 14 '20 at 21:44

"Cannot use string pattern on bytes like object" when trying to parse htmls for emails

1 Answers1