I am trying to create a Python script that will help me and my colleagues comparing the versions of the same .csv file.
This is my code so far:
import pandas as pd
file1 = (r'C:\Jarek S\results_old.csv')
file2 = (r'C:\Jarek S\results_new.csv')
file3 = (r'C:\Jarek S\results_diff.csv')
cols_to_show = ['emp_id', 'Onbr', 'Measure', 'Status', 'Start', 'End', 'SM', 'Rev',
'PY', 'table', 'terr_id', 'first_nm', 'last_nm', 'job_cd',
'job_title_nm', 'ctry_cd', 'District', 'Fac_nm']
old = pd.read_csv(file1)
new = pd.read_csv(file2)
def report_diff(x):
return x[0] if x[1] == x[0] else '{0} --> {1}'.format(*x)
#old.reset_index(inplace=True, drop=True)
#new.reset_index(inplace=True, drop=True)
old['version'] = 'old'
new['version'] = 'new'
full_set = pd.concat([old, new], ignore_index=True)
changes = full_set.drop_duplicates(subset=cols_to_show, keep='last')
dupe_emp_ids = changes.set_index('emp_id').index.get_duplicates()
dupes = changes[changes['emp_id'].isin(dupe_emp_ids)]
change_new = dupes[(dupes['version'] == 'new')]
change_old = dupes[(dupes['version'] == 'old')]
change_new = change_new.drop(['version'], axis=1)
change_old = change_old.drop(['version'], axis=1)
change_new.set_index('emp_id', inplace=True)
change_old.set_index('emp_id', inplace=True)
diff_panel = pd.Panel(dict(df1=change_old, df2=change_new))
diff_output = diff_panel.apply(report_diff, axis=0)
changes['duplicate'] = changes['emp_id'].isin(dupe_emp_ids)
removed_emp_ids = changes[(changes['duplicate'] == False) & (changes['version'] == 'old')]
removed_emp_ids.set_index('emp_id', inplace=True)
new_emp_id_set = full_set.drop_duplicates(subset=cols_to_show)
new_emp_id_set['duplicate'] = new_emp_id_set['emp_id'].isin(dupe_emp_ids)
added_emp_ids = new_emp_id_set[(new_emp_id_set['duplicate'] == False) & (new_emp_id_set['version'] == 'new')]
added_emp_ids.set_index('emp_id', inplace=True)
df = pd.concat([diff_output, removed_emp_ids, added_emp_ids], keys=('changed', 'removed', 'added'))
df[cols_to_show].to_csv(file3)
What I am trying to do here is compare the files using the column emp_nbr as a reference. Unfortunately, I am relatively new to the python scripting and the only thing I'm getting is the error below:
---------------------------------------------------------------------------
InvalidIndexError Traceback (most recent call last)
<ipython-input-23-7c071d4996a5> in <module>()
----> 1 diff_panel = pd.Panel(dict(df1=change_old, df2=change_new))
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\panel.py in __init__(self, data, items, major_axis, minor_axis, copy, dtype)
152
153 self._init_data(data=data, items=items, major_axis=major_axis,
--> 154 minor_axis=minor_axis, copy=copy, dtype=dtype)
155
156 def _init_data(self, data, copy, dtype, **kwargs):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\panel.py in _init_data(self, data, copy, dtype, **kwargs)
177 mgr = data
178 elif isinstance(data, dict):
--> 179 mgr = self._init_dict(data, passed_axes, dtype=dtype)
180 copy = False
181 dtype = None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\panel.py in _init_dict(self, data, axes, dtype)
214 # extract axis for remaining axes & create the slicemap
215 raxes = [self._extract_axis(self, data, axis=i) if a is None else a
--> 216 for i, a in enumerate(axes)]
217 raxes_sm = self._extract_axes_for_slice(self, raxes)
218
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\panel.py in <listcomp>(.0)
214 # extract axis for remaining axes & create the slicemap
215 raxes = [self._extract_axis(self, data, axis=i) if a is None else a
--> 216 for i, a in enumerate(axes)]
217 raxes_sm = self._extract_axes_for_slice(self, raxes)
218
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\panel.py in _extract_axis(self, data, axis, intersect)
1504 # 2. the indices are not aligned.
1505 index = _get_objs_combined_axis(data.values(), axis=axis,
-> 1506 intersect=intersect, sort=None)
1507
1508 if have_raw_arrays:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\api.py in _get_objs_combined_axis(objs, intersect, axis, sort)
52 if hasattr(obj, '_get_axis')]
53 if obs_idxes:
---> 54 return _get_combined_index(obs_idxes, intersect=intersect, sort=sort)
55
56
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\api.py in _get_combined_index(indexes, intersect, sort)
67 index = index.intersection(other)
68 else:
---> 69 index = _union_indexes(indexes, sort=sort)
70 index = _ensure_index(index)
71
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\api.py in _union_indexes(indexes, sort)
105 else:
106 for other in indexes[1:]:
--> 107 result = result.union(other)
108 return result
109 elif kind == 'array':
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in union(self, other)
2766 result.extend([x for x in rvals if x not in value_set])
2767 else:
-> 2768 indexer = self.get_indexer(other)
2769 indexer, = (indexer == -1).nonzero()
2770
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_indexer(self, target, method, limit, tolerance)
3227
3228 if not self.is_unique:
-> 3229 raise InvalidIndexError('Reindexing only valid with uniquely'
3230 ' valued Index objects')
3231
InvalidIndexError: Reindexing only valid with uniquely valued Index objects
I've tried some modifications, but that doesn't help me at all. Can you please help me with the error below? Or at least point to the direction that'll help me to create a script that will compare two .csv files and save the result in the third file?
Thanks!