I am web-scraping 2 table from 2 different sites. I want to append a new column (called WHEREFROM in the header) with a web-scraping text, in my code i called it "name".
My code is here:
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import urllib2
import unicodecsv as csv
import os
import sys
import io
import time
import datetime
import pandas as pd
from bs4 import BeautifulSoup
import re
import contextlib
import selenium.webdriver.support.ui as ui
filename=r'output.csv'
resultcsv=open(filename,"wb")
output=csv.writer(resultcsv, delimiter=';',quotechar = '"', quoting=csv.QUOTE_NONNUMERIC, encoding='latin-1')
output.writerow(['TIME','FLIGHT','FROM','AIRLANE','AIRCRAFT','STATUS','WHEREFROM', 'ACTUALDATE'])
def scrape(urls):
browser = webdriver.Firefox()
for url in urls:
browser.get(url)
html = browser.page_source
soup=BeautifulSoup(html,"html.parser")
table = soup.find('table', { "class" : "table table-condensed table-hover data-table m-n-t-15" })
soup2=BeautifulSoup(html,"html.parser")
name = soup2.find('div' , attrs={'class' : 'row m-t-l m-l-l'})
datatable=[]
for record in table.find_all('tr', class_="hidden-xs hidden-sm ng-scope"):
temp_data = []
for data in record.find_all("td"):
temp_data.append(data.text.encode('latin-1'))
newlist = filter(None, temp_data)
datatable.append(newlist)
print name
output.writerows(datatable)
resultcsv.close()
time.sleep(10)
browser.close()
urls = ["https://www.flightradar24.com/data/airports/bud/arrivals", "https://www.flightradar24.com/data/airports/fco/arrivals"]
scrape(urls)
resultcsv.close()
How can I do this in a loop, and how can I do this correctly? Because after that I am writing these data to csv, where the delimiter is ; .
But after web-scraping tables there isn't any ; in the last text, so I think I have to insert a ; in this last text too?!
I am talking about this:
"1:15 PM";" KL1975";"Amsterdam (AMS)-";"KLM";"B737 (PH-BGT) ";"Landed 1:01 PM"
EDITED with the actual date (not working, format issue):
df = pd.DataFrame(newlist)
now = time.strftime('%d-%m-%Y')
df['ACTUALDATE'] = now
#df.rows = header
df.to_csv('output.csv', sep=';', encoding='latin-1', index=False)
I wrote it in the loop, to see the actual date (hours-minutes too, but this is only the day)