1

I am using following code to run kmeans algorithm on Iris flower dataset- https://github.com/marcoscastro/kmeans/blob/master/kmeans.cpp

I have modified the above code to read input from files. Below is my code -

#include <iostream>
#include <vector>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include <algorithm>
#include <fstream>
using namespace std;

class Point
{
private:
    int id_point, id_cluster;
    vector<double> values;
    int total_values;
    string name;

public:
    Point(int id_point, vector<double>& values, string name = "")
    {
        this->id_point = id_point;
        total_values = values.size();

        for(int i = 0; i < total_values; i++)
            this->values.push_back(values[i]);

        this->name = name;
        this->id_cluster = -1;
    }

    int getID()
    {
        return id_point;
    }

    void setCluster(int id_cluster)
    {
        this->id_cluster = id_cluster;
    }

    int getCluster()
    {
        return id_cluster;
    }

    double getValue(int index)
    {
        return values[index];
    }

    int getTotalValues()
    {
        return total_values;
    }

    void addValue(double value)
    {
        values.push_back(value);
    }

    string getName()
    {
        return name;
    }
};

class Cluster
{
private:
    int id_cluster;
    vector<double> central_values;
    vector<Point> points;

public:
    Cluster(int id_cluster, Point point)
    {
        this->id_cluster = id_cluster;

        int total_values = point.getTotalValues();

        for(int i = 0; i < total_values; i++)
            central_values.push_back(point.getValue(i));

        points.push_back(point);
    }

    void addPoint(Point point)
    {
        points.push_back(point);
    }

    bool removePoint(int id_point)
    {
        int total_points = points.size();

        for(int i = 0; i < total_points; i++)
        {
            if(points[i].getID() == id_point)
            {
                points.erase(points.begin() + i);
                return true;
            }
        }
        return false;
    }

    double getCentralValue(int index)
    {
        return central_values[index];
    }

    void setCentralValue(int index, double value)
    {
        central_values[index] = value;
    }

    Point getPoint(int index)
    {
        return points[index];
    }

    int getTotalPoints()
    {
        return points.size();
    }

    int getID()
    {
        return id_cluster;
    }
};

class KMeans
{
private:
    int K; // number of clusters
    int total_values, total_points, max_iterations;
    vector<Cluster> clusters;

    // return ID of nearest center (uses euclidean distance)
    int getIDNearestCenter(Point point)
    {
        double sum = 0.0, min_dist;
        int id_cluster_center = 0;

        for(int i = 0; i < total_values; i++)
        {
            sum += pow(clusters[0].getCentralValue(i) -
                       point.getValue(i), 2.0);
        }

        min_dist = sqrt(sum);

        for(int i = 1; i < K; i++)
        {
            double dist;
            sum = 0.0;

            for(int j = 0; j < total_values; j++)
            {
                sum += pow(clusters[i].getCentralValue(j) -
                           point.getValue(j), 2.0);
            }

            dist = sqrt(sum);

            if(dist < min_dist)
            {
                min_dist = dist;
                id_cluster_center = i;
            }
        }

        return id_cluster_center;
    }

public:
    KMeans(int K, int total_points, int total_values, int max_iterations)
    {
        this->K = K;
        this->total_points = total_points;
        this->total_values = total_values;
        this->max_iterations = max_iterations;
    }

    void run(vector<Point> & points)
    {
        if(K > total_points)
            return;

        vector<int> prohibited_indexes;
                printf("Inside run \n"); 
        // choose K distinct values for the centers of the clusters
        printf(" K distinct cluster\n");
        for(int i = 0; i < K; i++)
        {
            while(true)
            {
                int index_point = rand() % total_points;

                if(find(prohibited_indexes.begin(), prohibited_indexes.end(),
                        index_point) == prohibited_indexes.end())
                {
                    printf("i= %d\n",i);
                    prohibited_indexes.push_back(index_point);
                    points[index_point].setCluster(i);
                    Cluster cluster(i, points[index_point]);
                    clusters.push_back(cluster);
                    break;
                }
            }
        }

        int iter = 1;
        printf(" Each point to nearest  cluster\n");
        while(true)
        {
            bool done = true;

            // associates each point to the nearest center
            for(int i = 0; i < total_points; i++)
            {
                int id_old_cluster = points[i].getCluster();
                int id_nearest_center = getIDNearestCenter(points[i]);

                if(id_old_cluster != id_nearest_center)
                {
                    if(id_old_cluster != -1)
                        clusters[id_old_cluster].removePoint(points[i].getID());

                    points[i].setCluster(id_nearest_center);
                    clusters[id_nearest_center].addPoint(points[i]);
                    done = false;
                }
            }

            // recalculating the center of each cluster
            for(int i = 0; i < K; i++)
            {
                for(int j = 0; j < total_values; j++)
                {
                    int total_points_cluster = clusters[i].getTotalPoints();
                    double sum = 0.0;

                    if(total_points_cluster > 0)
                    {
                        for(int p = 0; p < total_points_cluster; p++)
                            sum += clusters[i].getPoint(p).getValue(j);
                        clusters[i].setCentralValue(j, sum / total_points_cluster);
                    }
                }
            }

            if(done == true || iter >= max_iterations)
            {
                cout << "Break in iteration " << iter << "\n\n";
                break;
            }

            iter++;
        }

        // shows elements of clusters
        for(int i = 0; i < K; i++)
        {
            int total_points_cluster =  clusters[i].getTotalPoints();

            cout << "Cluster " << clusters[i].getID() + 1 << endl;
            for(int j = 0; j < total_points_cluster; j++)
            {
                cout << "Point " << clusters[i].getPoint(j).getID() + 1 << ": ";
                for(int p = 0; p < total_values; p++)
                    cout << clusters[i].getPoint(j).getValue(p) << " ";

                string point_name = clusters[i].getPoint(j).getName();

                if(point_name != "")
                    cout << "- " << point_name;

                cout << endl;
            }

            cout << "Cluster values: ";

            for(int j = 0; j < total_values; j++)
                cout << clusters[i].getCentralValue(j) << " ";

            cout << "\n\n";
        }
    }
};

int main(int argc, char *argv[])
{
    srand(time(NULL));

    int total_points, total_values, K, max_iterations, has_name;

    ifstream inFile("datafile.txt");

    if (!inFile) {
        cerr << "Unable to open file datafile.txt";
        exit(1);   // call system to stop
        }

        inFile >> total_points >> total_values >> K >> max_iterations >> has_name;
        cout << "Details- \n";
        vector<Point> points;

        string point_name,str;
        int i=0;
        while(inFile.eof())
        {

            string temp;
            vector<double> values;
            for(int j = 0; j < total_values; j++)
                {
                double value;
                inFile >> value;
                values.push_back(value);
                } 
            if(has_name)
            {
                inFile >> point_name;
                Point p(i, values, point_name);
                points.push_back(p);
                i++;  
            }
            else
            {   
                inFile >> temp;
                Point p(i, values);
                points.push_back(p);
                i++; 
            }

        }

    inFile.close();

    KMeans kmeans(K, total_points, total_values, max_iterations);
    kmeans.run(points);

    return 0;
}

Output of code is -

 Details- 
 15043100000Inside run
 K distinct cluster i= 0  
 Segmentation fault

When I run it in gdb, the error shown is -

Program received signal SIGSEGV, Segmentation fault.
0x0000000000401db6 in Point::setCluster (this=0x540, id_cluster=0)
    at kmeans.cpp:41
41                      this->id_cluster = id_cluster;

I am stuck at this as I cannot find the cause for this segmentation fault.

My dataset file looks like -

150 4 3 10000 1
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa 
. . .
7.0,3.2,4.7,1.4,Iris-versicolor
6.4,3.2,4.5,1.5,Iris-versicolor
6.9,3.1,4.9,1.5,Iris-versicolor
5.5,2.3,4.0,1.3,Iris-versicolor
6.5,2.8,4.6,1.5,Iris-versicolor 
. . .
shubhkr1
  • 94
  • 2
  • 15
  • Can you please format your post according to [Minimal, Complete, and Verifiable example](https://stackoverflow.com/help/mcve)? – Smit Ycyken Mar 20 '18 at 19:03
  • Why the explicit `this->` all over the place? If you didn't name your function arguments the same as your member variables you wouldn't need it. *My* convention is to simply prefix member variables with "m_" but anything that results in different names will work. – Jesper Juhl Mar 20 '18 at 19:04
  • If you used vector::at() to access the elements in the vector, you would have received a std::out_of_range exception instead of a segmentation fault. Usage of at() instead of [ ] is a sure-fire way of checking these errors. – PaulMcKenzie Mar 20 '18 at 19:40

2 Answers2

1

in KMeans::run(vector<Point>&) you call points[index_point].setCluster(i); without any guarantee that index_point is within bounds.

index_point is determined by int index_point = rand() % total_points;, and total_points is retrieved from the input file "datafile.txt" which could be anything. It certainly does not have to match points.size(), but it should. Make sure it does, or just use points.size() instead.

A bit offtopic, but using rand() and only using modulo is almost always wrong. If you use C++11 or newer, please consider using std::uniform_int_distribution.

KompjoeFriek
  • 3,572
  • 1
  • 22
  • 35
  • I am using int index_point = rand() % total_points; and value of total_points is taken from datafile.txt (here it is 150) . Does not this guarantee that index_point will be always less than total_points – shubhkr1 Mar 20 '18 at 19:21
  • What i mean is that `total_points` is not guaranteed to match the number of items in `points`. – KompjoeFriek Mar 20 '18 at 19:23
0

points[index_point].setCluster(i); could be accessing the vector out of bounds. The code you quoted actually always sets a number of total_points in the vector points before calling run, while your modified code just reads until end of file and has no guarantees that the number of total points passed to the constructor of KMeans matches the value of entries in points. Either fix your file I/O or fix the logic of bounds checking.

Rudolfs Bundulis
  • 11,636
  • 6
  • 33
  • 71