
I have some code for a web scraper which exports to Excel. The guy who helped me with it uses Python 2.7, but I have only been learning for a couple of months and am using 3.5. Would anyone be able to help me adapt this code, please?

#import urllib2 as urllib
#import urllib.request
import requests
from bs4 import BeautifulSoup
import datetime
import xlsxwriter
import sys

# Web scraping
def make_soup(url):
    #the_page = urllib.request.urlopen(url)
    res = requests.get(url)
    the_page = res.text
    soup_data = BeautifulSoup(the_page, "html.parser")
    return soup_data


soup = make_soup('http://www.url.co.uk/')


def getNames():
    ret_names_list = ['Names']
    for record in soup.findAll('tr'):
        for data in record.findAll('td'):
            for td_in_data in data.findAll('td', {"class": "propname"}):
                #import unicodedata
                td_in_data = td_in_data.text
                td_in_data = td_in_data.encode(sys.stdout.encoding, errors='replace')
                #unicodedata.normalize('NFKD', td_in_data).encode('ascii','ignore')

                print(td_in_data)
                ret_names_list.append(td_in_data)
    return ret_names_list


def getRooms():
    ret_rooms_list = ['Rooms']
    for record in soup.findAll('tr'):
        for data in record.findAll('td'):
            for td_in_data in data.findAll('span', {"class": "beds"}):
                print(td_in_data.text)
                td_in_data = td_in_data.text
                td_in_data = td_in_data.encode(sys.stdout.encoding, errors='replace')
                ret_rooms_list.append(td_in_data)
    return ret_rooms_list


def getRents():
    ret_rents_list = ['Rents']
    for record in soup.findAll('tr'):
        for data in record.findAll('td'):
            for td_in_data in data.findAll('td', {"class": "rentprice"}):
                print(td_in_data.text)
                td_in_data = td_in_data.text
                td_in_data = td_in_data.encode(sys.stdout.encoding, errors='replace')
                ret_rents_list.append(td_in_data)
    return ret_rents_list

''' To do: get the scraped data to an Excel doc.'''

# Create a workbook and add a worksheet.
if __name__ == '__main__':
    todays_date = str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M") )+ '.xlsx'
    todays_date = todays_date.replace(" ", "_").replace(":", "_")

    workbook = xlsxwriter.Workbook(todays_date)
    worksheet = workbook.add_worksheet()

    # Data to Excel.
    excel_dump = zip(getNames(), getRents(), getRooms())

    #Excel_dump = (
    #    ['Name', getNames()],
    #    ['Rent',   getRents()],
    #    ['Rooms',  getRooms()]
    #)

    # Start from the first cell. Rows and columns are zero indexed.
    row = 0
    col = 0


    # Iterate over the data and write it out row by row.
    for name, rent, room in excel_dump:
        try:
            reload(sys)
            sys.setdefaultencoding('Cp1252')
            worksheet.write(col, row, str(name))
            worksheet.write(col+1, row, rent)
            worksheet.write(col+2, row, room)
        except Exception as e:
            raise e
        #col += 1
        row += 1

    workbook.close()

The error messages I am getting for this are:

^Scraped data in memory^
Traceback (most recent call last):
  File "C:/Users/joseph.devlin/PycharmProjects/Web_Scraping/scraper.py", line 91, in <module>
    raise e
  File "C:/Users/joseph.devlin/PycharmProjects/Web_Scraping/scraper.py", line 85, in <module>
    reload(sys)
NameError: name 'reload' is not defined
Exception ignored in: <...>
Traceback (most recent call last):
  File "C:\Users\joseph.devlin\AppData\Roaming\Python\Python35\site-packages\xlsxwriter\workbook.py", line 149, in __del__
Exception: Exception caught in workbook destructor. Explicit close() may be required for workbook.

Process finished with exit code 1

I am currently doing my own research as to how to fix this, but any help would be appreciated!

Maverick
  • Please see http://stackoverflow.com/a/28127538/4014959 – PM 2Ring Jan 20 '17 at 15:03
  • Your issue is with `reload(sys)`, as indicated in the error message. In Python 3, `reload` is no longer a built-in: it lives in the `importlib` module from 3.4 onwards (in 3.0 to 3.3 the module is called `imp`). – Steven Rumbalski Jan 20 '17 at 15:49
  • Your next issue will be with the next line, `sys.setdefaultencoding('Cp1252')`. `sys.setdefaultencoding()` is not available in Python 3, and doing an `importlib.reload(sys)` doesn't bring it back. The `xlsxwriter` docs [give an example](http://xlsxwriter.readthedocs.io/example_unicode_polish_utf8.html) of how to properly handle non-UTF-8 encodings. – Steven Rumbalski Jan 20 '17 at 16:10
  • Thank you @StevenRumbalski - that is exactly what happened! I get that I need to use the codecs module here, but I'm not 100% sure where to put it. In the example they use it when they open the file; I've tried it when creating the workbook and before `sys.setdefaultencoding('Cp1252')`, but no luck so far. – Maverick Jan 20 '17 at 16:22
  • All of your text should be in Unicode, including the text you get from bs4; you shouldn't need to set any encodings. – Antti Haapala -- Слава Україні Jan 21 '17 at 06:14
  • Any code relying on `sys.setdefaultencoding()` is extremely suspect anyway; you should not rely on implicit encoding and decoding and handle your Unicode values explicitly. Python 3 enforces this already. – Martijn Pieters Jan 21 '17 at 12:19
  • Thank you @AnttiHaapala - I removed encoding and it worked perfectly – Maverick Jan 21 '17 at 12:20

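Putting the comment advice together (drop `reload(sys)` and `sys.setdefaultencoding()`, keep the scraped values as plain `str` with no `.encode(...)` calls, and close the workbook explicitly), the write step could look something like the sketch below. It is untested, and the one-record-per-row layout, the `write_workbook` helper name, and the stand-in data are assumptions rather than part of the original code.

import datetime

import xlsxwriter


def write_workbook(names, rents, rooms):
    # Build a filename like 2017-01-20_15_03.xlsx
    filename = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M") + '.xlsx'
    workbook = xlsxwriter.Workbook(filename)
    worksheet = workbook.add_worksheet()
    try:
        # xlsxwriter's write() takes (row, col, data); one record per row.
        for row, (name, rent, room) in enumerate(zip(names, rents, rooms)):
            worksheet.write(row, 0, name)
            worksheet.write(row, 1, rent)
            worksheet.write(row, 2, room)
    finally:
        # Explicit close(), as the workbook destructor warning suggests.
        workbook.close()


if __name__ == '__main__':
    # Stand-in data; in the real script these lists would come from
    # getNames(), getRents() and getRooms() with the .encode(...) lines removed.
    write_workbook(['Names', 'Flat 1'], ['Rents', '500'], ['Rooms', '2'])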