1

I am trying to append two data frames.

The dataframes have duplicate columns which should be merged into one with new values added as extra rows.

[a b c
 1 2 3
 4 5 6]

[b c d e
 3 4 5 5
 6 7 8 7]

Should be merged as

[a b c d e
 1 2 3
 4 5 6
   3 4 5 5
   6 7 8 7]

I'm trying to use append for this as follows:

df1.append(df2, ignore_index=True)

which leads to the error that Reindexing is only valid with uniquely valued index objects. I don't understand what is going wrong, I don't care about the index at all. I tried to use reset_index() before merging as well as other functions like concat, merge and join but no success.

How can I solve this issue?

code:

from numpy.core.fromnumeric import _wrapreduction
from numpy.lib.function_base import append
import pandas as pd
import os
import json

def map_prefix(prefix):
    """Translate a raw cohort prefix (parsed from a directory name) to the
    canonical study name used when building sample IDs.

    Raises:
        KeyError: if *prefix* is not a known cohort, so an unexpected
            directory fails loudly instead of being silently mislabeled.
    """
    # Named ``prefix_map`` to avoid shadowing the builtin ``map``.
    prefix_map = {
        'ALPHAOMEGA': 'ALPHAOMEGA',
        'BIOMARCS': 'BIOMARCS',
        'DZS_WF': 'DZS_WF/HOORN',
        'RAAK': 'RAAK',
        'ROTTERDAMSTUDIE': 'RS',
        'STEMI-GIPS-III': 'STEMI-GIPS-III',
        'CHECK': 'CHECK',
        'ERF-ERGO': 'ERF',
        'TOMAAT': 'TOMAAT',
        'TACTICS': 'TACTICS',
        'FUNCTGENOMICS': 'FUNCTGENOMICS',
        'VUNTR': 'VUNTR',
        'NESDA': 'NESDA',
        'DMS': 'DMS',
        'CODAM': 'CODAM',
        'UCORBIO': 'UCORBIO',
        'LIFELINES': 'LIFELINES',
        'PROSPER': 'PROSPER',
        'LUMINA': 'CHARMLUMINA',
        'GARP': 'GARP',
        'VUMC_ADC': 'VUMC_ADC',
        'HELIUS': 'HELIUS'
        }
    return prefix_map[prefix]

def main():
    """Walk cohort directories under ``directory``, parse each raw data file
    into a DataFrame, prefix its sample IDs with cohort and wave, and append
    everything onto the base ``measurements.csv`` table, which is written out
    at the end.

    NOTE(review): assumes a layout like ``./<cohort>/[rawdata/]<file>`` where
    some ancestor path component contains ``Wave<digit>_<PREFIX>`` — confirm
    against the actual directory tree.
    """
    directory = '.'
    workdirectory = ''
    # Base measurements table whose columns the per-file frames are appended to.
    measurements = pd.read_csv(open(os.path.join(directory, "measurements.csv")))
    measure_columns = [translate_headers(col) for col in list(measurements.columns)]
    # NOTE(review): this maps translated name -> original name and then renames
    # with it, which only hits columns already bearing a translated name —
    # the mapping looks reversed (original -> translated was likely intended);
    # verify which direction the base table's headers should end up in.
    map_measures = dict(zip(measure_columns, list(measurements.columns)))
    measurements = measurements.rename(columns=map_measures)
    #print(measurements)
    for filename in os.listdir(directory):
        if not os.path.isdir(filename):
            print(filename, 'not directory, skip')
            continue
        f = os.path.join(directory, filename)
        # checking if it is a file
        #print(f)
        if not os.path.isfile(f):
            # Prefer a 'rawdata' subdirectory when present; otherwise read the
            # cohort directory itself.
            if 'rawdata' in os.listdir(f):
                workdirectory = os.path.join(f, 'rawdata')
                files = os.listdir(workdirectory)
                #print('rawdata', files)
            else:
                workdirectory = f
                files = os.listdir(f)
        # Collapse duplicate basenames that differ only in extension.
        files = removedupes(files)
        #print(files)

        for f in files:
            print(f)
            # Parse wave digit and cohort prefix out of the path: expects a
            # component like 'Wave3_PREFIX'. IndexError here means the path
            # did not contain 'Wave' — TODO confirm all inputs match.
            prefix = workdirectory.split('Wave')[1]
            wave = prefix[0]
            prefix = prefix[2::].split('/')[0]
            sep = ''
            #print(prefix, wave, workdirectory, f)
            # Initial separator guess from the extension; refined below by
            # sniffing the header line.
            if f[-4:] == '.tsv':
                sep = '\t'
            elif f[-4:] == '.csv':
                sep = ','
            else:
                #print('passing ', f[-4:])
                continue

            with open(os.path.join(workdirectory, f)) as file:
                # Append an audit line per processed file.
                with open('log.txt', 'a') as log:
                    log.write('\t'.join([workdirectory, f, 'wave_'+ prefix+'\n']))
                lines = file.readlines()
                # Separator sniffing: first match wins (tab before comma
                # before semicolon).
                if "\t" in lines[0]:
                    sep = '\t'
                elif "," in lines[0]:
                    sep = ','
                elif ";" in lines[0]:
                    sep = ';'
                head = lines[0].strip("\n").split(sep)
                #print(head)
                # Normalize headers to canonical names.
                # NOTE(review): if several raw headers translate to the same
                # canonical name, ``df`` gets duplicate column labels, and
                # pd.concat below raises "Reindexing only valid with uniquely
                # valued Index objects" — likely the reported error.
                head = [translate_headers(head2.replace('"', '').lower()) for head2 in head]
                #print(head)
                newlines = []
                for line in lines[1::]:
                    line = line.strip("\n").split(sep)
                    newlines.append(line)
                df = pd.DataFrame(columns=head, data=newlines)
                try:
                    # Build globally unique sample IDs: COHORT-<id>-w<wave>.
                    df["sample_id"] = map_prefix(prefix) + "-" + df['sample_id'] + "-w" + wave
                    # Strip stray quotes from every cell (assumes all columns
                    # are strings — TODO confirm) and from the column labels.
                    for i, col in enumerate(df.columns):
                        df.iloc[:, i] = df.iloc[:, i].str.replace('"', '')
                    df.columns = [column.replace('"', '') for column in df.columns]
                except KeyError:
                    # File without a recognizable sample_id column; left as-is.
                    print("Oh NO!")

            #print(df.head())
            #print(measurements.head())
            measurements = measurements.reset_index(drop=True)
            df = df.reset_index(drop=True)
            #print(list(df.index), list(measurements.index)[0:30])
            print(measurements.shape, df.shape)
            # Row-wise append; columns are unioned, missing values become NaN.
            measurements = pd.concat([measurements, df], ignore_index=True)
            #measurements = measurements.append(df, ignore_index=True)
            #measurements = measurements.merge(df, how='outer', on='sample_id')
            # try:
            #     #print(measurements.shape, df.shape)
                
            #     #print(measurements.info())
            # except Exception as e:
            #     print('fail')

    # NOTE(review): hard-coded absolute output path — consider making this
    # configurable.
    measurements.to_csv('/home/julian/Documents/NightingaleQuantificationLibary2020/measurements_added.csv', sep=',')

def removedupes(files):
    """Collapse filenames sharing the same stem (name minus its final four
    characters, i.e. the extension), keeping the extension of the last
    occurrence. Stems are returned in first-seen order.
    """
    extension_by_stem = {name[:-4]: name[-4:] for name in files}
    return [stem + ext for stem, ext in extension_by_stem.items()]

# Cache for the header-translation map: the original re-read and re-parsed
# the JSON file on every call (once per column per input file); load it once.
_METABOLICS_MAP = None

def translate_headers(head):
    """Map a raw column header to its canonical name.

    The mapping file ``metabolics_map_v1.json`` holds
    ``{canonical: [aliases, ...]}`` entries; *head* is returned unchanged
    when no entry lists it.
    """
    global _METABOLICS_MAP
    if _METABOLICS_MAP is None:
        with open('metabolics_map_v1.json', 'r') as j:
            _METABOLICS_MAP = json.load(j)
    for canonical, aliases in _METABOLICS_MAP.items():
        if head in aliases:
            return canonical
    return head
    

main()
Moopsish
  • 117
  • 10

2 Answers

1

The problem is duplicated column names — like the two a columns here. The solution is to deduplicate them using GroupBy.cumcount with DataFrame.stack, applied to every DataFrame that has duplicates:

#duplicated columns names - sample data
df1.columns = ['a','a','b']
print (df1)
   a  a  b
0  1  2  3
1  4  5  6

s1 = df1.columns.to_series()
df1.columns = [df1.columns, s1.groupby(s1).cumcount()]

#if no duplicated columns names, should omit
#s2 = df2.columns.to_series()
#df2.columns = [df2.columns, s2.groupby(s2).cumcount()]

df11 = df1.stack().reset_index(drop=True) 
print (df11)
   a    b
0  1  3.0
1  2  NaN
2  4  6.0
3  5  NaN

#df22 = df2.stack().reset_index(drop=True) 
#print (df22)
   b  c  d  e
0  3  4  5  5
1  6  7  8  7

df = df11.append(df22, ignore_index=True)
#if no duplicated columnsnames for df2
#df = df11.append(df2, ignore_index=True)
print (df)
     a    b    c    d    e
0  1.0  3.0  NaN  NaN  NaN
1  2.0  NaN  NaN  NaN  NaN
2  4.0  6.0  NaN  NaN  NaN
3  5.0  NaN  NaN  NaN  NaN
4  NaN  3.0  4.0  5.0  5.0
5  NaN  6.0  7.0  8.0  7.0
jezrael
  • 822,522
  • 95
  • 1,334
  • 1,252
0

Why do you want to use append here? You can simply use pandas.concat that is specialized for this task:

df1 = pd.DataFrame({'a': [1,2], 'b': [2,5], 'c': [3,6]})
df2 = pd.DataFrame({'b': [3,6], 'c': [4,7], 'd': [5,8], 'e': [5,7]})
>>> pd.concat([df1,df2], ignore_index=True)
     a  b  c    d    e
0  1.0  2  3  NaN  NaN
1  2.0  5  6  NaN  NaN
2  NaN  3  4  5.0  5.0
3  NaN  6  7  8.0  7.0
mozway
  • 194,879
  • 13
  • 39
  • 75
  • Because concat gives me the same index error. "Reindexing only valid with uniquely valued Index objects." – Moopsish Jul 29 '21 at 14:37
  • I see, you should update your question as the provided dataframes do not allow to reproduce the problem. – mozway Jul 29 '21 at 14:59
  • yeah, I found this out now reproducing it with other datasets. I find it hard to share the dataframes as there are many different ones in different formats im trying to merge. I updated the question to include my code now. – Moopsish Jul 29 '21 at 15:21