I'm writing the intermediate files just to check the results, but my set union does not look correct. I need the union of diagnosisFilters = [filterRDiagnosisM, filterTDiagnosisM, filterPDiagnosisM, filterADiagnosisM]. It currently returns 2 additional rows:
843786 , M , 12.45 , 15.7 , 82.57 , 477.1
869691 , M , 11.8 ,16.58 , 78.99 , 432
The original file looks like this:
# Three-row sample showing what the input data looks like.
sample_rows = {
    'id': ['842302', '842303', '842304'],
    'diagnosis': ['M', 'B', 'M'],
    'radius_mean': [20.57, 17.3, 13.3],
    'perimeter_mean': [20.57, 20.57, 20.57],
    'area_mean': [206.57, 206.57, 240.57],
}
df = pd.DataFrame(sample_rows)
(The file is breastCancerDataReducedDimensions.csv.)
Below is my set union code:
# The four frames filtered to diagnosis == 'M', one per measurement.
diagnosisFilters = [filterRDiagnosisM,filterTDiagnosisM,filterPDiagnosisM,filterADiagnosisM]
# Fold the list left to right with pairwise outer merges keyed on "id" only.
# NOTE(review): each frame also has a 'diagnosis' column that is not part of
# the key, so pandas will suffix the repeats instead of sharing one column.
diagnosisResult = reduce(lambda left,right: pd.merge(left,right,how='outer',on=["id"]), diagnosisFilters)
I'll leave the whole script here for context until I find a cleaner way to show the problem without the entire file:
import pandas as pd
from functools import reduce
import os
#FOLDERS
# Output directories: threshold-filter CSVs, per-diagnosis CSVs, and the
# merged/compared results.
vf = './valueFilters'
df = './diagnosisFilters'
results = './results'
# exist_ok=True makes creation idempotent and removes the check-then-create
# race of a separate os.path.exists() test.
for folder in (vf, df, results):
    os.makedirs(folder, exist_ok=True)
#DATAFRAMES
# Load the source table once, then carve out one narrow frame per measurement;
# each keeps 'id' and 'diagnosis' next to its single feature column.
my_csv = pd.read_csv('breastCancerDataReducedDimensions.csv')
radius, texture, perimeter, area = (
    pd.DataFrame(my_csv[['id', 'diagnosis', feature]])
    for feature in ('radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean')
)
#RADIUS
# Rows whose radius_mean is at least 13.0.
filterRadius = radius.loc[radius['radius_mean'].ge(13.0)]
filterRadius.to_csv(f'{vf}/q3_gte_13.csv', index=False)
#B
# Rows with diagnosis == 'B'.
filterRDiagnosisB = radius.loc[radius['diagnosis'].eq('B')]
filterRDiagnosisB.to_csv(f'{df}/q3_B.csv', index=False)
#M
# Rows with diagnosis == 'M'.
filterRDiagnosisM = radius.loc[radius['diagnosis'].eq('M')]
filterRDiagnosisM.to_csv(f'{df}/q3_M.csv', index=False)
#TEXTURE
# Rows whose texture_mean is at least 18.0.
filterTexture = texture.loc[texture['texture_mean'].ge(18.0)]
filterTexture.to_csv(f'{vf}/q4_gte_18.csv', index=False)
#B
# Rows with diagnosis == 'B'.
filterTDiagnosisB = texture.loc[texture['diagnosis'].eq('B')]
filterTDiagnosisB.to_csv(f'{df}/q4_B.csv', index=False)
#M
# Rows with diagnosis == 'M'.
filterTDiagnosisM = texture.loc[texture['diagnosis'].eq('M')]
filterTDiagnosisM.to_csv(f'{df}/q4_M.csv', index=False)
#PERIMETER
# Rows whose perimeter_mean is at least 85.0.
filterPerimeter = perimeter.loc[perimeter['perimeter_mean'].ge(85.0)]
filterPerimeter.to_csv(f'{vf}/q5_gte_85.csv', index=False)
#B
# Rows with diagnosis == 'B'.
filterPDiagnosisB = perimeter.loc[perimeter['diagnosis'].eq('B')]
filterPDiagnosisB.to_csv(f'{df}/q5_B.csv', index=False)
#M
# Rows with diagnosis == 'M'.
filterPDiagnosisM = perimeter.loc[perimeter['diagnosis'].eq('M')]
filterPDiagnosisM.to_csv(f'{df}/q5_M.csv', index=False)
#AREA
# Rows whose area_mean is at least 500.0.
filterArea = area.loc[area['area_mean'].ge(500.0)]
filterArea.to_csv(f'{vf}/q6_gte_500.csv', index=False)
#B
# Rows with diagnosis == 'B'.
filterADiagnosisB = area.loc[area['diagnosis'].eq('B')]
filterADiagnosisB.to_csv(f'{df}/q6_B.csv', index=False)
#M
# Rows with diagnosis == 'M'.
filterADiagnosisM = area.loc[area['diagnosis'].eq('M')]
filterADiagnosisM.to_csv(f'{df}/q6_M.csv', index=False)
#RESULT
# Every filtered frame carries both 'id' and 'diagnosis'. Merging on both keeps
# a single 'diagnosis' column. Merging on 'id' alone makes pandas suffix the
# repeats (diagnosis_x / diagnosis_y); those suffixed names collide when the
# fourth frame is folded in, and the merged columns no longer line up with the
# columns of my_csv.
KEY_COLUMNS = ['id', 'diagnosis']


def _merge_all(frames, how):
    """Fold *frames* into one DataFrame by pairwise merges on KEY_COLUMNS."""
    return reduce(
        lambda left, right: pd.merge(left, right, how=how, on=KEY_COLUMNS),
        frames,
    )


def _rows_missing_from(left, right, on):
    """Return the rows of *left* whose *on* values match no row of *right*."""
    # De-duplicate the lookup side so the left join cannot multiply rows.
    lookup = right[on].drop_duplicates()
    marked = left.merge(lookup, how='left', on=on, indicator=True)
    return marked.loc[marked['_merge'] == 'left_only', list(left.columns)]


# Intersection: rows that pass all four thresholds.
intValueFilters = [filterRadius, filterTexture, filterPerimeter, filterArea]
valueResult = _merge_all(intValueFilters, how='inner')
valueResult.to_csv(results+'/NewResult.csv', index=False)
# Columns shared by every frame compared below.
compareColumns = list(valueResult.columns)

#CHECK RESULT - METHOD 1
# Union of the four diagnosis == 'M' frames.
diagnosisFilters = [filterRDiagnosisM, filterTDiagnosisM, filterPDiagnosisM, filterADiagnosisM]
diagnosisResult = _merge_all(diagnosisFilters, how='outer')
diagnosisResult.to_csv(results+'/SubsetMResult.csv', index=False)
# 'M' rows absent from valueResult, i.e. rows that miss at least one threshold.
difference1 = _rows_missing_from(diagnosisResult, valueResult, compareColumns)
difference1.to_csv(results+'/difference1.csv', index=False)

#CHECK RESULT - METHOD 2
# Same check starting from the unsplit table; only the shared columns are
# compared, but the full original rows are written out.
originalResult = my_csv[my_csv['diagnosis'] == 'M']
originalResult.to_csv(results+'/OriginalResult.csv', index=False)
difference2 = _rows_missing_from(originalResult, valueResult, compareColumns)
difference2.to_csv(results+'/difference2.csv', index=False)

#COMPLETE
print('complete')