I have training data like the sample below, where all the information is packed into a single column. The dataset has over 300,000 rows.
id features label
1 name=John Matthew;age=25;1.=Post Graduate;2.=Football Player; 1
2 name=Mark clark;age=21;1.=Under Graduate;Interest=Video Games; 1
3 name=David;age=12;1:=High School;2:=Cricketer;native=america; 2
4 name=George;age=11;1:=High School;2:=Carpenter;married=yes 2
.
.
300000 name=Kevin;age=16;1:=High School;2:=Driver;Smoker=No 3
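So each features value is just a run of semicolon-separated key=value pairs, and a single row should parse into a plain dict, e.g. (a quick sketch, not part of my real code):

s = "name=John Matthew;age=25;1.=Post Graduate;2.=Football Player;"
parsed = dict(p.split("=", 1) for p in s.split(";") if p)
# {'name': 'John Matthew', 'age': '25', '1.': 'Post Graduate', '2.': 'Football Player'}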
Now I need to convert this training data into the format below:
id name age 1 2 Interest married Smoker
1 John Matthew 25 Post Graduate Football Player NaN NaN NaN
2 Mark clark 21 Under Graduate NaN Video Games NaN NaN
.
.
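If I already had one parsed dict per row, pandas would fill the missing keys with NaN by itself, which is exactly the shape I want (hypothetical toy data, just to show the target):

import pandas as pd

records = [
    {"name": "John Matthew", "age": "25", "1": "Post Graduate", "2": "Football Player"},
    {"name": "Mark clark", "age": "21", "1": "Under Graduate", "Interest": "Video Games"},
]
print(pd.DataFrame(records))  # missing keys show up as NaN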
Is there an efficient way to do this? I tried the code below, but it took 3 hours to complete:
#Getting the proper features from the features column
from collections import Counter
import numpy as np

cols = {}
for choices in set_label:
    collection_list = []
    array = train["features"][train["label"] == choices].values
    for value in array:  # iterate over every row; range(1, len(array)) skipped the first one
        var_split = value.split(";")
        try:
            # each piece is "key=value"; drop the empty piece left by a trailing ";"
            d = dict(s.split("=", 1) for s in var_split if s)
            collection_list.extend(d.keys())
        except ValueError:
            pass  # malformed row, skip it
    count = Counter(collection_list)
    for k, v in count.most_common(5):
        key = k.replace(":", "").replace(" ", "_").lower()
        cols[key] = v

columns_add = list(cols.keys())
train = train.reindex(columns=np.append(train.columns.values, columns_add))
print(train.columns)
print(train.shape)
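For reference, the key collection above can also be written as a single Counter pass per label (a sketch using the same train and set_label, untimed):

from collections import Counter

for choices in set_label:
    features = train.loc[train["label"] == choices, "features"]
    count = Counter(
        # note: keys are normalized before counting here, unlike the loop above
        p.split("=", 1)[0].replace(":", "").replace(" ", "_").lower()
        for value in features
        for p in value.split(";")
        if "=" in p
    )
    for key, v in count.most_common(5):
        cols[key] = v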
#Adding the values for the newly created columns
for row in train.itertuples():
    new_dict = {}
    value = train.loc[row.Index, "features"]
    v_split = value.split(";")
    try:
        # parse the "key=value" pairs, ignoring the empty piece left by a trailing ";"
        dummy_dict = dict(s.split("=", 1) for s in v_split if s)
        for k, v in dummy_dict.items():
            new_key = k.replace(":", "").replace(" ", "_").lower()
            new_dict[new_key] = v
    except ValueError:
        pass  # malformed row, skip it
    for k, v in new_dict.items():
        if k in train.columns:
            train.loc[row.Index, k] = v
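Would something along these lines be the right direction instead of the row-by-row loop: parse every features string once, let pandas build the wide frame from the list of dicts, and join it back? A rough sketch on the original three-column train, untested at scale:

import pandas as pd

def parse_features(value):
    # "k=v;k=v;..." -> dict with normalized keys
    pairs = (p.split("=", 1) for p in value.split(";") if "=" in p)
    return {k.replace(":", "").replace(" ", "_").lower(): v for k, v in pairs}

parsed = pd.DataFrame([parse_features(v) for v in train["features"]], index=train.index)
train = train.join(parsed)  # assumes the parsed column names don't clash with existing ones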
Is there a useful function I can apply here for more efficient feature extraction?