I have written the following code to load data from a Postgres database and do some operations on it. There are about 1 million rows, and the kernel keeps dying. When I limit the data size to about 10k rows, it works.
import psycopg2
import numpy as np
import pandas as pd
import creds
## ****** LOAD PSQL DATABASE ***** ##
# Sets up a connection to the postgres server.
conn_string = (
    f"host={creds.PGHOST} port=5432 dbname={creds.PGDATABASE} "
    f"user={creds.PGUSER} password={creds.PGPASSWORD}"
)
conn=psycopg2.connect(conn_string)
print("Connected!")
# Create a cursor object
cursor = conn.cursor()
sql_command = "SELECT * FROM events;"
print(sql_command)
# Load the data
data = pd.read_sql(sql_command, conn)
# Taking a subset of the data until the algorithm is perfected.
# seed = np.random.seed(42)
# n = data.shape[0]
# ix = np.random.choice(n,10000)
# df_tmp = data.iloc[ix]
# Taking the source and destination and combining it into a list in another column
# df_tmp['accounts'] = df_tmp.apply(lambda x: [x['source'], x['destination']], axis=1)
data['accounts'] = data.apply(lambda x: (x['source'], x['destination']), axis=1)
# Cumulative set of all accounts seen up to and including each row
data['accounts_acc'] = data['accounts'].cumsum().apply(set)
Is there a more efficient way of doing this that doesn't keep failing?
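For reference, this is the result I am trying to produce, shown on a tiny made-up example (the column names match my table; the values themselves are invented):

import pandas as pd

# Tiny made-up example of the intended result (values are invented)
df = pd.DataFrame({
    'source':      ['a', 'b', 'a'],
    'destination': ['b', 'c', 'd'],
})

# Same two steps as above, just on the small frame
df['accounts'] = df.apply(lambda x: (x['source'], x['destination']), axis=1)
df['accounts_acc'] = df['accounts'].cumsum().apply(set)

# accounts_acc should hold the set of all accounts seen so far:
# row 0: {'a', 'b'}
# row 1: {'a', 'b', 'c'}
# row 2: {'a', 'b', 'c', 'd'}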