1

Maybe this question was asked before but since I could not find a proper answer, I dare to ask a similar one. My problem is, I have been trying to scrape a Turkish car sale web site which is named 'Sahibinden'. I use jupyter notebook and sublime editors.Once I try to get the data written in a csv file, the Turkish letter changes to different characters. I tried. 'UTF-8 Encoding', '# -- coding: utf-8 --', ISO 8859-9, etc. but I could not solve the problem. The other issue is that Sublime editor does not create the csv file despite I did not have any problem on the jupyter notebook. You will find the csv file output in the image link. If someone can reply me I would appreciate it.

Note: the program works and no problem once I run print command on the editors.

Thanks a lot.

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys 
from selenium.common.exceptions import TimeoutException
import unicodedata


with open ('result1.csv','w') as f:

   f.write('brand, model, year, oil_type, gear, odometer, body, hp, 
            eng_dim, color, warranty, condition, price, safe, 
            in_fea, outs_fea, mul_fea,pai_fea, rep_fea, acklm \n')

chrome_path = r"C:\Users\Mike\Desktop\chromedriver.exe"    

driver = webdriver.Chrome(chrome_path)


def final_page(fn_20):

    for lur in fn_20:

        driver.get(lur)

        brand = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[3]/span''')
        brand = brand.text
        brand = brand.encode("utf-8")
        print (brand)

        model = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[5]/span''')
        model = model.text
        model = model.encode("utf-8")

        print (model)

        year = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[6]/span''')
        year = year.text
        year = year.encode("utf-8")

        print (year)

        oil_type = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[7]/span''')
        oil_type = oil_type.text
        oil_type = oil_type.encode("utf-8")

        print (oil_type)

        gear = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[8]/span''')
        gear = gear.text
        gear = gear.encode("utf-8")

        print (gear)

        odometer = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[9]/span''')
        odometer = odometer.text
        odometer = odometer.encode("utf-8")

        print (odometer)

        body = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[10]/span''')
        body = body.text
        body = body.encode("utf-8")

        print (body)

        hp = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[11]/span''')
        hp = hp.text
        hp = hp.encode("utf-8")

        print (hp)

        eng_dim = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[12]/span''')
        eng_dim = eng_dim.text
        eng_dim = eng_dim.encode("utf-8")

        print (eng_dim)

        color = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[14]/span''')
        color = color.text
        color = color.encode("utf-8")

        print (color)

        warranty = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[15]/span''')
        warranty = warranty.text
        warranty = warranty.encode("utf-8")

        print (warranty)

        condition = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[19]/span''')
        condition = condition.text
        condition = condition.encode("utf-8")

        print (condition)

        price = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/h3''')
        price = price.text 
        price = price.encode("utf-8")

        print (price)

        safe = ''
        safety1 = driver.find_elements_by_xpath('''//div[@id='classifiedProperties']/ul[1]/li[@class='selected']''')

    for ur in safety1:
        ur1 = ur.text
        ur1 = ur1.encode("utf-8")
        safe +=ur1 + ', '

    print (safe)

        in_fea = ''
        in_features = driver.find_elements_by_xpath('''//div[@id='classifiedProperties']/ul[2]/li[@class='selected']''')
        for ins in in_features:
            ins1 = ins.text
            ins1 = ins1.encode("utf-8")
            in_fea += ins1 + ', '

        print (in_fea)

        outs_fea = ''
        out_features = driver.find_elements_by_xpath('''//div[@id='classifiedProperties']/ul[3]/li[@class='selected']''')
        for outs in out_features:
            out1 = outs.text
            out1 = out1.encode("utf-8")
            outs_fea += out1 + ', '

        print (outs_fea)

        mul_fea = ''
        mult_features = driver.find_elements_by_xpath('''//div[@id='classifiedProperties']/ul[4]/li[@class='selected']''')
        for mults in mult_features:
            mul = mults.text
            mul = mul.encode("utf-8")
            mul_fea += mul + ', '

        print (mul_fea)

        pai_fea = ''
        paint = driver.find_elements_by_xpath('''//div[@class='classified-pair custom-area ']/ul[1]/li[@class='selected']''')
        for pai in paint:
            pain = pai.text
            pain = pain.encode("utf-8")
            pai_fea += pain + ', '

        print (pai_fea)

        rep_fea = ''
        replcd = driver.find_elements_by_xpath('''//div[@class='classified-pair custom-area']/ul[2]/li[@class='selected']''')
        for rep in replcd:
            repa = rep.text
            repa = repa.encode("utf-8")
            rep_fea += rep + ', '

        print (rep_fea)

        acklm = driver.find_element_by_xpath('''//div[@id='classified-detail']/div[@class='uiBox'][1]/div[@id='classifiedDescription']''')
        acklm = acklm.text
        acklm = acklm.encode("utf-8")

        print (acklm)

        try:

            with open ('result1.csv', 'a') as f:

                f.write (brand + ',' [enter image description here][1]+ model + ',' + year + ',' + oil_type + ',' + gear + ',' + odometer + ',' + body + ',' + hp + ',' + eng_dim + ',' + color + ',' + warranty + ',' + condition + ',' + price + ',' + safe + ',' + in_fea + ',' + outs_fea + ',' + mul_fea + ',' + pai_fea + ',' + rep_fea + ',' + acklm + '\n')                

        except Exception as e:

            print (e)

driver.close
Mike
  • 185
  • 1
  • 9
  • first check if the generated file "result1.csv", can be converted using notepad++, normally if can be converted by notepad++ then it can be fixed by – forqzy Feb 26 '18 at 00:58
  • Actually what i understand with 'convert' is that once it is opened with notepad, same problem occurs again. Could you please guide me in the next steps – Mike Feb 26 '18 at 01:08
  • please check this https://stackoverflow.com/questions/8898294/convert-utf-8-with-bom-to-utf-8-with-no-bom-in-python – forqzy Feb 26 '18 at 01:11
  • I previously checked this link out but could not figure out how to embed the solutions written over there to my code. The issue is that I do not read any data from a csv but write on it. Could you please provide me the main codes I need to use? – Mike Feb 26 '18 at 01:19
  • This is what I have got: --> 171 with open ("result1.csv", "a", "utf-8-sig") as f: 172 173 f.write (brand + ',' + serial + ',' + model + ',' + year + ',' + oil_type + ',' + gear + ',' + odometer + ',' + body + ',' + hp + ',' + eng_dim + ',' + color + ',' + warranty + ',' + seller + ',' + condition + ',' + price + ',' + safe + ',' + in_fea + ',' + outs_fea + ',' + mul_fea + ',' + pai_fea + ',' + rep_fea + ',' + acklm + '\n') TypeError: an integer is required – Mike Feb 26 '18 at 02:20

1 Answers1

0
import codecs

file = codecs.open("utf_test", "w", "utf-8")
file.write(u'\ufeff')
file.write("test with utf-8")
file.write("字符")
file.close()

or this also works for me

with codecs.open("utf_test", "w", "utf-8-sig") as temp:
    temp.write("this is a utf-test\n")
    temp.write(u"test")
forqzy
  • 389
  • 2
  • 11
  • 170 --> 171 with open ("result1.csv", "a", "utf-8-sig") as f: 172 173 f.write (brand + ',' + serial + ',' + model + ',' + year + ',' + oil_type + ',' + gear + ',' + odometer + ',' + body + ',' + hp + ',' + eng_dim + ',' + color + ',' + warranty + ',' + seller + ',' + condition + ',' + price + ',' + safe + ',' + in_fea + ',' + outs_fea + ',' + mul_fea + ',' + pai_fea + ',' + rep_fea + ',' + acklm + '\n') TypeError: an integer is required – Mike Feb 26 '18 at 02:21
  • Unfortunately I received 'ascii' codec can't decode byte 0xc4 in position 41: ordinal not in range(128) error – Mike Feb 26 '18 at 02:37