How does one run k-means clustering on multiple columns of structured data?
In the example below, it has been done on a single column (name):
tfidf_matrix = tfidf_vectorizer.fit_transform(df_new['name'])
Here only name is used, but say we wanted to use both name and country. Should I append country to the same column, as follows?
df_new['name'] = df_new['name'] + " " + df_new['country']
tfidf_matrix = tfidf_vectorizer.fit_transform(df_new['name'])
It works from a code perspective, and I am still trying to understand the results (I actually have many columns in the data), but I wonder whether that is the right way to fit when there is more than one column.
import os
import pandas as pd
import re
import numpy as np
# Load the raw records; the path is relative to the working directory.
df = pd.read_csv(
    'sample-data.csv',
)
def split_description(string):
    """Return the leading name portion of a description string.

    The name is everything that precedes the first ' - ' separator. When
    the separator does not occur, the whole string is returned unchanged.

    Args:
        string: The description text to split.

    Returns:
        The text before the first ' - ', or the full input if there is none.
    """
    # partition() cuts at the first occurrence only, which matches a
    # split with maxsplit=1 followed by taking element 0.
    name, _, _ = string.partition(' - ')
    return name
# Build the working frame: the extracted name of every description plus
# the original record id.
df_new = pd.DataFrame()
df_new['name'] = df['description'].apply(split_description)
df_new['id'] = df['id']
def remove(name):
    """Strip ASCII digits from a name and normalise its whitespace.

    Args:
        name: The text to clean.

    Returns:
        The text with every character 0-9 removed, leading/trailing
        whitespace dropped, and each internal whitespace run collapsed
        to a single space.
    """
    new_name = re.sub(r"[0-9]", '', name)
    # Removing digits can leave doubled or dangling spaces behind;
    # split() + join() collapses and trims them in one pass.
    new_name = ' '.join(new_name.split())
    return new_name
# Clean every extracted name: drop digits and tidy the whitespace.
df_new['name'] = df_new['name'].apply(remove)
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over word 1- to 4-grams of the cleaned names. English stop words
# are dropped, and terms appearing in fewer than 1% or more than 80% of
# the documents are excluded from the vocabulary.
tfidf_vectorizer = TfidfVectorizer(
    use_idf=True,
    stop_words='english',
    ngram_range=(1, 4),
    min_df=0.01,
    max_df=0.8,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(df_new['name'])
print(tfidf_matrix.shape)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2; prefer get_feature_names_out() and fall back only on old releases.
# tolist() keeps the printed output a plain list of strings, as before.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    print(tfidf_vectorizer.get_feature_names_out().tolist())
else:
    print(tfidf_vectorizer.get_feature_names())
from sklearn.metrics.pairwise import cosine_similarity
# Turn pairwise cosine similarity between the TF-IDF rows into a
# distance by subtracting it from 1.
dist = 1.0 - cosine_similarity(
    tfidf_matrix,
)
print(dist)
from sklearn.cluster import KMeans
# Fit one KMeans model for every candidate cluster count from 1 to 19,
# with a fixed seed so repeated runs give the same models.
num_clusters = range(1, 20)
KM = [
    KMeans(n_clusters=k, random_state=1).fit(tfidf_matrix)
    for k in num_clusters
]