I am trying to append two data frames.
The dataframes have duplicate columns which should be merged into one with new values added as extra rows.
[a b c
1 2 3
4 5 6]
[b c d e
3 4 5 5
6 7 8 7]
Should be merged as
[a b c d e
1 2 3
4 5 6
3 4 5 5
6 7 8 7]
I'm tryign to use append for this as followed:
df1.append(df2, ignore_index=True)
which leads to the error that Reindexing is only valid with uniquely valued index objects
.
I don't understand what is going wrong, I don't care about the index at all. I tried to use reset_index()
before merging as well as other functions like concat, merge and join but no success.
How can I solve this issue?
code:
from numpy.core.fromnumeric import _wrapreduction
from numpy.lib.function_base import append
import pandas as pd
import os
import json
def map_prefix(prefix):
map = {
'ALPHAOMEGA' : 'ALPHAOMEGA',
'BIOMARCS' : 'BIOMARCS',
'DZS_WF': 'DZS_WF/HOORN',
'RAAK': 'RAAK',
'ROTTERDAMSTUDIE': 'RS',
'STEMI-GIPS-III': 'STEMI-GIPS-III',
'CHECK': 'CHECK',
'ERF-ERGO': 'ERF',
'TOMAAT': 'TOMAAT',
'TACTICS': 'TACTICS',
'FUNCTGENOMICS': 'FUNCTGENOMICS',
'VUNTR': 'VUNTR',
'NESDA': 'NESDA',
'DMS': 'DMS',
'CODAM': 'CODAM',
'UCORBIO': 'UCORBIO',
'LIFELINES': 'LIFELINES',
'PROSPER': 'PROSPER',
'LUMINA': 'CHARMLUMINA',
'GARP': 'GARP',
'VUMC_ADC': 'VUMC_ADC',
'HELIUS': 'HELIUS'
}
return map[prefix]
def main():
directory = '.'
workdirectory = ''
measurements = pd.read_csv(open(os.path.join(directory, "measurements.csv")))
measure_columns = [translate_headers(col) for col in list(measurements.columns)]
map_measures = dict(zip(measure_columns, list(measurements.columns)))
measurements = measurements.rename(columns=map_measures)
#print(measurements)
for filename in os.listdir(directory):
if not os.path.isdir(filename):
print(filename, 'not directory, skip')
continue
f = os.path.join(directory, filename)
# checking if it is a file
#print(f)
if not os.path.isfile(f):
if 'rawdata' in os.listdir(f):
workdirectory = os.path.join(f, 'rawdata')
files = os.listdir(workdirectory)
#print('rawdata', files)
else:
workdirectory = f
files = os.listdir(f)
files = removedupes(files)
#print(files)
for f in files:
print(f)
prefix = workdirectory.split('Wave')[1]
wave = prefix[0]
prefix = prefix[2::].split('/')[0]
sep = ''
#print(prefix, wave, workdirectory, f)
if f[-4:] == '.tsv':
sep = '\t'
elif f[-4:] == '.csv':
sep = ','
else:
#print('passing ', f[-4:])
continue
with open(os.path.join(workdirectory, f)) as file:
with open('log.txt', 'a') as log:
log.write('\t'.join([workdirectory, f, 'wave_'+ prefix+'\n']))
lines = file.readlines()
if "\t" in lines[0]:
sep = '\t'
elif "," in lines[0]:
sep = ','
elif ";" in lines[0]:
sep = ';'
head = lines[0].strip("\n").split(sep)
#print(head)
head = [translate_headers(head2.replace('"', '').lower()) for head2 in head]
#print(head)
newlines = []
for line in lines[1::]:
line = line.strip("\n").split(sep)
newlines.append(line)
df = pd.DataFrame(columns=head, data=newlines)
try:
df["sample_id"] = map_prefix(prefix) + "-" + df['sample_id'] + "-w" + wave
for i, col in enumerate(df.columns):
df.iloc[:, i] = df.iloc[:, i].str.replace('"', '')
df.columns = [column.replace('"', '') for column in df.columns]
except KeyError:
print("Oh NO!")
#print(df.head())
#print(measurements.head())
measurements = measurements.reset_index(drop=True)
df = df.reset_index(drop=True)
#print(list(df.index), list(measurements.index)[0:30])
print(measurements.shape, df.shape)
measurements = pd.concat([measurements, df], ignore_index=True)
#measurements = measurements.append(df, ignore_index=True)
#measurements = measurements.merge(df, how='outer', on='sample_id')
# try:
# #print(measurements.shape, df.shape)
# #print(measurements.info())
# except Exception as e:
# print('fail')
measurements.to_csv('/home/julian/Documents/NightingaleQuantificationLibary2020/measurements_added.csv', sep=',')
def removedupes(files):
filemap = {}
for file in files:
filemap[file[:-4]]=file[-4:]
return [k+v for k,v in filemap.items()]
def translate_headers(head):
with open('metabolics_map_v1.json', 'r') as j:
map = json.loads(j.read())
for k, v in map.items():
if head in v:
return k
return head
main()