I am using following code to run kmeans algorithm on Iris flower dataset- https://github.com/marcoscastro/kmeans/blob/master/kmeans.cpp
I have modified the above code to read input from files. Below is my code -
#include <iostream>
#include <vector>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include <algorithm>
#include <fstream>
using namespace std;
class Point
{
private:
int id_point, id_cluster;
vector<double> values;
int total_values;
string name;
public:
Point(int id_point, vector<double>& values, string name = "")
{
this->id_point = id_point;
total_values = values.size();
for(int i = 0; i < total_values; i++)
this->values.push_back(values[i]);
this->name = name;
this->id_cluster = -1;
}
int getID()
{
return id_point;
}
void setCluster(int id_cluster)
{
this->id_cluster = id_cluster;
}
int getCluster()
{
return id_cluster;
}
double getValue(int index)
{
return values[index];
}
int getTotalValues()
{
return total_values;
}
void addValue(double value)
{
values.push_back(value);
}
string getName()
{
return name;
}
};
class Cluster
{
private:
int id_cluster;
vector<double> central_values;
vector<Point> points;
public:
Cluster(int id_cluster, Point point)
{
this->id_cluster = id_cluster;
int total_values = point.getTotalValues();
for(int i = 0; i < total_values; i++)
central_values.push_back(point.getValue(i));
points.push_back(point);
}
void addPoint(Point point)
{
points.push_back(point);
}
bool removePoint(int id_point)
{
int total_points = points.size();
for(int i = 0; i < total_points; i++)
{
if(points[i].getID() == id_point)
{
points.erase(points.begin() + i);
return true;
}
}
return false;
}
double getCentralValue(int index)
{
return central_values[index];
}
void setCentralValue(int index, double value)
{
central_values[index] = value;
}
Point getPoint(int index)
{
return points[index];
}
int getTotalPoints()
{
return points.size();
}
int getID()
{
return id_cluster;
}
};
class KMeans
{
private:
int K; // number of clusters
int total_values, total_points, max_iterations;
vector<Cluster> clusters;
// return ID of nearest center (uses euclidean distance)
int getIDNearestCenter(Point point)
{
double sum = 0.0, min_dist;
int id_cluster_center = 0;
for(int i = 0; i < total_values; i++)
{
sum += pow(clusters[0].getCentralValue(i) -
point.getValue(i), 2.0);
}
min_dist = sqrt(sum);
for(int i = 1; i < K; i++)
{
double dist;
sum = 0.0;
for(int j = 0; j < total_values; j++)
{
sum += pow(clusters[i].getCentralValue(j) -
point.getValue(j), 2.0);
}
dist = sqrt(sum);
if(dist < min_dist)
{
min_dist = dist;
id_cluster_center = i;
}
}
return id_cluster_center;
}
public:
KMeans(int K, int total_points, int total_values, int max_iterations)
{
this->K = K;
this->total_points = total_points;
this->total_values = total_values;
this->max_iterations = max_iterations;
}
void run(vector<Point> & points)
{
if(K > total_points)
return;
vector<int> prohibited_indexes;
printf("Inside run \n");
// choose K distinct values for the centers of the clusters
printf(" K distinct cluster\n");
for(int i = 0; i < K; i++)
{
while(true)
{
int index_point = rand() % total_points;
if(find(prohibited_indexes.begin(), prohibited_indexes.end(),
index_point) == prohibited_indexes.end())
{
printf("i= %d\n",i);
prohibited_indexes.push_back(index_point);
points[index_point].setCluster(i);
Cluster cluster(i, points[index_point]);
clusters.push_back(cluster);
break;
}
}
}
int iter = 1;
printf(" Each point to nearest cluster\n");
while(true)
{
bool done = true;
// associates each point to the nearest center
for(int i = 0; i < total_points; i++)
{
int id_old_cluster = points[i].getCluster();
int id_nearest_center = getIDNearestCenter(points[i]);
if(id_old_cluster != id_nearest_center)
{
if(id_old_cluster != -1)
clusters[id_old_cluster].removePoint(points[i].getID());
points[i].setCluster(id_nearest_center);
clusters[id_nearest_center].addPoint(points[i]);
done = false;
}
}
// recalculating the center of each cluster
for(int i = 0; i < K; i++)
{
for(int j = 0; j < total_values; j++)
{
int total_points_cluster = clusters[i].getTotalPoints();
double sum = 0.0;
if(total_points_cluster > 0)
{
for(int p = 0; p < total_points_cluster; p++)
sum += clusters[i].getPoint(p).getValue(j);
clusters[i].setCentralValue(j, sum / total_points_cluster);
}
}
}
if(done == true || iter >= max_iterations)
{
cout << "Break in iteration " << iter << "\n\n";
break;
}
iter++;
}
// shows elements of clusters
for(int i = 0; i < K; i++)
{
int total_points_cluster = clusters[i].getTotalPoints();
cout << "Cluster " << clusters[i].getID() + 1 << endl;
for(int j = 0; j < total_points_cluster; j++)
{
cout << "Point " << clusters[i].getPoint(j).getID() + 1 << ": ";
for(int p = 0; p < total_values; p++)
cout << clusters[i].getPoint(j).getValue(p) << " ";
string point_name = clusters[i].getPoint(j).getName();
if(point_name != "")
cout << "- " << point_name;
cout << endl;
}
cout << "Cluster values: ";
for(int j = 0; j < total_values; j++)
cout << clusters[i].getCentralValue(j) << " ";
cout << "\n\n";
}
}
};
int main(int argc, char *argv[])
{
srand(time(NULL));
int total_points, total_values, K, max_iterations, has_name;
ifstream inFile("datafile.txt");
if (!inFile) {
cerr << "Unable to open file datafile.txt";
exit(1); // call system to stop
}
inFile >> total_points >> total_values >> K >> max_iterations >> has_name;
cout << "Details- \n";
vector<Point> points;
string point_name,str;
int i=0;
while(inFile.eof())
{
string temp;
vector<double> values;
for(int j = 0; j < total_values; j++)
{
double value;
inFile >> value;
values.push_back(value);
}
if(has_name)
{
inFile >> point_name;
Point p(i, values, point_name);
points.push_back(p);
i++;
}
else
{
inFile >> temp;
Point p(i, values);
points.push_back(p);
i++;
}
}
inFile.close();
KMeans kmeans(K, total_points, total_values, max_iterations);
kmeans.run(points);
return 0;
}
Output of code is -
Details-
15043100000Inside run
K distinct cluster i= 0
Segmentation fault
When I run it in gdb, the error shown is -
Program received signal SIGSEGV, Segmentation fault.
0x0000000000401db6 in Point::setCluster (this=0x540, id_cluster=0)
at kmeans.cpp:41
41 this->id_cluster = id_cluster;
I am stuck at this as I cannot find the cause for this segmentation fault.
My dataset file looks like -
150 4 3 10000 1
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
. . .
7.0,3.2,4.7,1.4,Iris-versicolor
6.4,3.2,4.5,1.5,Iris-versicolor
6.9,3.1,4.9,1.5,Iris-versicolor
5.5,2.3,4.0,1.3,Iris-versicolor
6.5,2.8,4.6,1.5,Iris-versicolor
. . .