Definitely use pandas Dataframes. Here are references and very simple Dataframe examples.
#reference: http://pandas.pydata.org/pandas-docs/stable/10min.html
#reference: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html.
import numpy as np
import pandas as pd
def df_dupes(df_in):
'''
Returns [object,count] pairs for each unique item in the dataframe.
'''
# import pandas
if isinstance(df_in, list) or isinstance(df_in, tuple):
import pandas as pd
df_in = pd.DataFrame(df_in)
return df_in.groupby(df_in.columns.tolist(),as_index=False).size()
def df_filter_example(df):
'''
In [96]: df
Out[96]:
A B C D
0 1 4 9 1
1 4 5 0 2
2 5 5 1 0
3 1 3 9 6
'''
import pandas as pd
df=pd.DataFrame([[1,4,9,1],[4,5,0,2],[5,5,1,0],[1,3,9,6]],columns=['A','B','C','D'])
return df[(df.A == 1) & (df.D == 6)]
def df_compare(df1, df2, compare_col_list, join_type):
'''
df_compare compares 2 dataframes.
Returns left, right, inner or outer join
df1 is the first/left dataframe
df2 is the second/right dataframe
compare_col_list is a lsit of column names that must match between df1 and df2
join_type = 'inner', 'left', 'right' or 'outer'
'''
import pandas as pd
return pd.merge(df1, df2, how=join_type,
on=compare_col_list)
def df_compare_examples():
import numpy as np
import pandas as pd
df1=pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]], columns = ['c1', 'c2', 'c3'])
''' c1 c2 c3
0 1 2 3
1 4 5 6
2 7 8 9 '''
df2=pd.DataFrame([[4,5,6],[7,8,9],[10,11,12]], columns = ['c1', 'c2', 'c3'])
''' c1 c2 c3
0 4 5 6
1 7 8 9
2 10 11 12 '''
# One can see that df1 contains 1 row ([1,2,3]) not in df3 and
# df2 contains 1 rown([10,11,12]) not in df1.
# Assume c1 is not relevant to the comparison. So, we merge on cols 2 and 3.
df_merge = pd.merge(df1,df2,how='outer',on=['c2','c3'])
print(df_merge)
''' c1_x c2 c3 c1_y
0 1 2 3 NaN
1 4 5 6 4
2 7 8 9 7
3 NaN 11 12 10 '''
''' One can see that columns c2 and c3 are returned. We also received
columns c1_x and c1_y, where c1_X is the value of column c1
in the first dataframe and c1_y is the value of c1 in the second
dataframe. As such,
any row that contains c1_y = NaN is a row from df1 not in df2 &
any row that contains c1_x = NaN is a row from df2 not in df1. '''
df1_unique = pd.merge(df1,df2,how='left',on=['c2','c3'])
df1_unique = df1_unique[df1_unique['c1_y'].isnull()]
print(df1_unique)
df2_unique = pd.merge(df1,df2,how='right',on=['c2','c3'])
print(df2_unique)
df_common = pd.merge(df1,df2,how='inner',on=['c2','c3'])
print(df_common)
def delete_column_example():
print 'create df'
import pandas as pd
df = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]], columns=['a','b','c'])
print 'drop (delete/remove) column'
col_name = 'b'
df.drop(col_name, axis=1, inplace=True) # or df = df.drop('col_name, 1)
def delete_rows_example():
print '\n\ncreate df'
import pandas as pd
df = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]], columns=['col_1','col_2','col_3'])
print(df)
print '\n\nappend rows'
df= df.append(pd.DataFrame([[11,22,33]], columns=['col_1','col_2','col_3']))
print(df)
print '\n\ndelete rows where (based on) column value'
df = df[df.col_1 == 4]
print(df)