1

I am scraping a table that's online and gets updated every 15 minutes. I am trying to write a script that uses the date and time columns to check if an entry already exists, and if not, insert the new data.

However the IF statement at the very bottom always says "no new data to insert" even if the table has been updated. If I delete the IF statement altogether and just have it insert_data(), it will insert the data as expected, so I know the mechanics up to there work.

Can anyone see what is wrong?

import requests
import pandas as pd
import psycopg2
import datetime
import time
from bs4 import BeautifulSoup

# GET THE DATA
url = 'https://harvardforest.fas.harvard.edu/weather/15-minute-tables-metric-units'

resp = requests.get(url)

soup = BeautifulSoup(resp.content, 'html.parser')

fishbox = soup.find('pre')
for tag in fishbox.find_all():
    tag.extract()

table = fishbox.text.strip()
#tableS = table.split('\n')
print(table.split('\r\n')[-1])

# INSERT THE DATA
def create_table():
    '''Create table in database.'''
    try:
        conn = psycopg2.connect('''
                                dbname = 'sensor'
                                user = 'user'
                                password = 'pass'
                                port = '5432'
                                host = 'database-1.sadasdasd.us-east-1.rds.amazonaws.com'
                                ''')
        cur = conn.cursor()
        # Create table for database.
        cur.execute('''create table if not exists fisherMetStation (
        timestamp TIMESTAMP NOT NULL, 
        Date text,
        Time text,
        AirT_C text,
        Rh text,
        DewP text,
        Prec text,
        SlrR text,
        ParR text,
        NetR text,
        Bar text,
        Wspd text,
        Wdir text,
        Gspd text,
        S10T text
    )''')
    except Exception as e:
        print(e)
        return
    conn.commit()
    conn.close()

create_table()

def insert_data(timestamp, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14):
    '''Insert data into the database.'''
    conn = psycopg2.connect('''
                            dbname = 'sensor'
                            user = 'user'
                            password = 'pass'
                            port = '5432'
                            host = 'database-1.sadasdasd.us-east-1.rds.amazonaws.com'
                            ''')
    cur = conn.cursor()
    cur.execute('''INSERT INTO fisherMetStation
                (timestamp, Date, Time, AirT_C, Rh, DewP, Prec, SlrR, ParR, NetR, Bar, Wspd, Wdir, Gspd, S10T) 
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''', (timestamp, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14))
    conn.commit()
    conn.close()

def read_data():
    '''Insert data into the database.'''
    conn = psycopg2.connect('''
                            dbname = 'sensor'
                            user = 'user'
                            password = 'pass'
                            port = '5432'
                            host = 'database-1.sadasdasd.us-east-1.rds.amazonaws.com'
                            ''')
    cur = conn.cursor()
    cur.execute('''select Date, Time from fisherMetStation ORDER BY timestamp DESC LIMIT 1''')
    records = cur.fetchall() 
    print(records[0][0], records[0][1])
    conn.commit()
    conn.close()

data1 = table.split('\r\n')[-1]
data2 = data1.split()
time_now = datetime.datetime.today()
timestamp = datetime.datetime.strftime(time_now, "%Y-%m-%d %H:%M:%S")
sensor1 = data2[0]
sensor2 = data2[1]
sensor3 = data2[2]
sensor4 = data2[3]
sensor5 = data2[4]
sensor6 = data2[5]
sensor7 = data2[6]
sensor8 = data2[7]
sensor9 = data2[8]
sensor10 = data2[9]
sensor11 = data2[10]
sensor12 = data2[11]
sensor13 = data2[12]
sensor14 = data2[13]

# Save data in a Postgresql database.
while True: 
    # run every 10 minutes
    time.sleep(600)
    if read_data() != print(sensor1, sensor2):
        insert_data(timestamp, sensor1, sensor2, sensor3, sensor4, sensor5, sensor6, sensor7, sensor8, sensor9, sensor10, sensor11, sensor12, sensor13, sensor14)
        print("data inserted")    
    else:
        print("no new data to insert")
santo
  • 418
  • 1
  • 3
  • 13
LevelChart8
  • 207
  • 1
  • 8
  • 1
    `read_data` and `print` do not return a value. You're basically comparing `None` with `None`. You can't just compare two expressions that `print`. the `==` operator compares expressions' values, not what they `print` – Omer Tuchfeld Mar 14 '20 at 18:28
  • try f"{sensor1}{sensor2}" if you are using python3 – Kristian Mar 14 '20 at 18:28
  • Both read_data() and print() return None, so it’s no surprise that the if fails. – quamrana Mar 14 '20 at 18:28
  • @Kristian - thanks! That works for the incoming data. But what would I use in place oof `read_data()`? – LevelChart8 Mar 14 '20 at 18:32
  • @LevelChart8 use a return statement in read_data, not a print statement. Be sure to close your conn before you return. – Kristian Mar 14 '20 at 18:49
  • @Kristian - like this? `return(f"{records[0][0]}{records[0][1]}")` – LevelChart8 Mar 14 '20 at 18:50
  • Don't use `except Exception` like that, see https://stackoverflow.com/questions/54948548/what-is-wrong-with-using-a-bare-except. – AMC Mar 14 '20 at 19:37

1 Answers1

1
#####you should return records in read_data######
def read_data():
    # your code
    records = cur.fetchall()
    result=[]
    for rec in records[0]:
        result.append(rec)
    print(records[0][0], records[0][1])
    conn.commit()
    conn.close()
    return result

#compare time in records with sensor1(time) and date in records with sensor(2) if any of them do not match then execute if
result=read_data()
if result[0] != sensor1 or result[1] !=  sensor2:
        insert_data(timestamp, sensor1, sensor2, sensor3, sensor4, sensor5, sensor6, sensor7, sensor8, sensor9, sensor10, sensor11, sensor12, sensor13, sensor14)
        print("data inserted")    
    else:
        print("no new data to insert")