Approach based on Pandas outer merge
- Outer merge of df1 & df2 creates all required rows
- Group by Applicant so we can aggregate count
- Use a function to aggregate and produce values for desired output rows
- To keep index of df1 after merge, use method from to keep index after merge
Code
import pandas as pd
import numpy as np
def process(df1, df2):
' Overall function for generating desired output '
def create_result(df, columns = ["Match (Y/N)", "Matched Row reference", "Count"]):
'''
Creates the desired columns of df2
Input:
df - Dataframe from groupby
columns - column names for df2
Output:
Pandas Series corresponding to row in df2
'''
cnt = df['Name'].count() # Number of items in group
if cnt > 0:
# Convert index to comma delimited list, numbered from 1 (i.e. int(x) + 1)
indexes = ','.join(str(int(x) + 1) for x in df.index.to_list())
else:
indexes = "NA" # empty dataframe
lst = ["Y" if cnt > 0 else 'N',
indexes,
df.shape[0] if cnt > 0 else 0]
return pd.Series(lst, index = columns)
# Merge df1 with df2 but
# add method from [to keep index after merge](https://stackoverflow.com/questions/11976503/how-to-keep-index-when-using-pandas-merge/11982843#11982843)
# to have the index of df1 in the merge result
return (df1
.reset_index()
.merge(df2, left_on = "Name", right_on = 'Applicant', how = "outer")
.set_index('index')
.groupby(['Applicant'])
.apply(lambda grp_df: create_result(grp_df)))
Usage
from io import StringIO
s = '''Name Score Year
Pat 82 1990
Chris 38 1993
Pat 92 1994
Noris 88 1997
Mit 62 1999
Chen 58 1996'''
df1 = pd.read_csv(StringIO(s), sep = '\t', engine = 'python')
s = '''Applicant
Pat
Chris
Meet'''
df2 = pd.read_csv(StringIO(s), sep = '\t', engine = 'python')
from pprint import pprint as pp
pp(process(df1, df2)) # process and pretty print result
Output
Match (Y/N) Matched Row reference Count
Applicant
Chris Y 2 1
Meet N NA 0
Pat Y 1,3 2