I am working on the classical titanic dataset in kaggle and there are four dataframes in my project the test and training set and their copys to perform some operations on them. So my problem is even though I make the same operations on my test and train datasets my test datasets "Cabin" column becomes Nan. I couldn't understand the problem.
Code block 1:
training=pd.read_csv("../input/titanic/train.csv")
copytrain = training
training=training.drop(columns=["Ticket","PassengerId"])
test=pd.read_csv("../input/titanic/test.csv")
test=test.drop(columns=["Ticket","PassengerId"])
copytest=test
Code block 2:
for i in range(len(training["Cabin"])):
if "A" in str(training.loc[i,"Cabin"]):
training.loc[i,"Cabin"] = "A"
if "B" in str(training.loc[i,"Cabin"]):
training.loc[i,"Cabin"] = "B"
if "C" in str(training.loc[i,"Cabin"]):
training.loc[i,"Cabin"] = "C"
if "D" in str(training.loc[i,"Cabin"]):
training.loc[i,"Cabin"] = "D"
if "E" in str(training.loc[i,"Cabin"]):
training.loc[i,"Cabin"] = "E"
if "F" in str(training.loc[i,"Cabin"]):
training.loc[i,"Cabin"] = "F"
if "G" in str(training.loc[i,"Cabin"]):
training.loc[i,"Cabin"] = "G"
for i in range(len(test["Cabin"])):
if "A" in str(test.loc[i,"Cabin"]):
test.loc[i,"Cabin"] = "A"
if "B" in str(test.loc[i,"Cabin"]):
test.loc[i,"Cabin"] = "B"
if "C" in str(test.loc[i,"Cabin"]):
test.loc[i,"Cabin"] = "C"
if "D" in str(test.loc[i,"Cabin"]):
test.loc[i,"Cabin"] = "D"
if "E" in str(test.loc[i,"Cabin"]):
test.loc[i,"Cabin"] = "E"
if "F" in str(test.loc[i,"Cabin"]):
test.loc[i,"Cabin"] = "F"
if "G" in str(test.loc[i,"Cabin"]):
test.loc[i,"Cabin"] = "G"
training.loc[training["Cabin"]=="T","Cabin"]="A"
copytrain.loc[copytrain["Cabin"]=="T","Cabin"]="A"
test.loc[test["Cabin"]=="T","Cabin"]="A"
copytest.loc[test["Cabin"]=="T","Cabin"]="A"
training.loc[training.Cabin.isnull(),"Cabin"]="U"
test.loc[test.Cabin.isnull(),"Cabin"]="U"
test.Cabin.value_counts()
output:
U 327
C 35
B 18
D 13
E 11
A 7
F 6
G 1
Name: Cabin, dtype: int64
Code block 3:
copytrain["Cabin"]= copytrain["Cabin"].str.replace("A","")
copytrain["Cabin"]= copytrain["Cabin"].str.replace("B","")
copytrain["Cabin"]= copytrain["Cabin"].str.replace("C","")
copytrain["Cabin"]= copytrain["Cabin"].str.replace("D","")
copytrain["Cabin"]= copytrain["Cabin"].str.replace("E","")
copytrain["Cabin"]= copytrain["Cabin"].str.replace("F","")
copytrain["Cabin"]= copytrain["Cabin"].str.replace("G","")
#-------------------------------------------------------
copytest["Cabin"]= copytest["Cabin"].str.replace("A","")
copytest["Cabin"]= copytest["Cabin"].str.replace("B","")
copytest["Cabin"]= copytest["Cabin"].str.replace("C","")
copytest["Cabin"]= copytest["Cabin"].str.replace("D","")
copytest["Cabin"]= copytest["Cabin"].str.replace("E","")
copytest["Cabin"]= copytest["Cabin"].str.replace("F","")
copytest["Cabin"]= copytest["Cabin"].str.replace("G","")
copytrain.loc[(copytrain["Cabin"].str.len()==5) | (copytrain["Cabin"].str.len()==8),"Cabin"]=copytrain.Cabin.str.slice(stop=2)
copytrain.loc[(copytrain["Cabin"].str.len()==7) | (copytrain["Cabin"].str.len()==11),"Cabin"]=copytrain.Cabin.str.slice(stop=3)
#----------------------------------------------------------------------------------------------------------------------------
copytest.loc[(copytest["Cabin"].str.len()==5) | (copytest["Cabin"].str.len()==8),"Cabin"]=copytest.Cabin.str.slice(stop=2)
copytest.loc[(copytest["Cabin"].str.len()==7) | (copytest["Cabin"].str.len()==11),"Cabin"]=copytest.Cabin.str.slice(stop=3)
copytrain["Cabin"]=pd.to_numeric(copytrain["Cabin"],errors="coerce")
copytest["Cabin"]=pd.to_numeric(copytest["Cabin"],errors="coerce")
test.Cabin.value_counts()
output:
Series([], Name: Cabin, dtype: int64)
Here are the first five rows of the training and test dataframes:
Surv Pclass Sex Age SibSp Parch Fare Cabin Embarked
0 3 male 22.0 1 0 7.2500 U S
1 1 female 38.0 1 0 71.2833 C C
1 3 female 26.0 0 0 7.9250 U S
1 1 female 35.0 1 0 53.1000 C S
0 3 male 35.0 0 0 8.0500 U S`
Pclass Sex Age SibSp Parch Fare Cabin Embarked
3 male 34.5 0 0 7.8292 U Q
3 female 47.0 1 0 7.0000 U S
2 male 62.0 0 0 9.6875 U Q
3 male 27.0 0 0 8.6625 U S
3 female 22.0 1 1 12.2875 U S
Sometimes in jupyter the code is really buggy but this time I tried deleting the blocks and it didn't work.