1

This is a sample implementation of KD-tree. Where I first take number of dimensions, number of points, number of clusters to be formed. Bi-partition function calculates centroid, dimension which has max variance. Now based on the max dimensions mean I start splitting the points. This program works fine when input is (dimensions-2,points-20,clusters-4). But does not work for (dimensions-2,points-20,clusters-8). When I debug the program it gives proper output.But when I run the program it stops working.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <stdbool.h>

int *gendata(int num);
void bipartition_fn(int dimensions,int nodes,int i0, int im, int *data,int *cluster_size,int *cluster_start, int *cluster_bdry, int *cluster_centroid);
void kdtree_fn(int dimensions, int nodes, int k, int *data,int *k_cluster_size,int **k_cluster_centroid,int *k_cluster_start,int **k_cluster_bdry );

int main()
{
    int dimensions,nodes,i,k,j;
    printf("enter the number of dimensions");
    scanf("%d",&dimensions);
    printf("enter the total number of elements in multiples of dimensions");
    scanf("%d", &nodes);
    printf("enter number of clusters");
    scanf("%d",&k);
    int *data;
    int *k_cluster_size;
    int **k_cluster_centroid;
    int *k_cluster_start;
    int **k_cluster_bdry;
    data = gendata(nodes); /*dynamic array generation for data points*/

    k_cluster_bdry=(int **)malloc(k*sizeof(int *));
    for(i=0;i<(2*k-2);i++)
        *(k_cluster_bdry+i)=(int *)malloc(2*dimensions*sizeof(int));
    k_cluster_centroid=(int **)malloc(k*sizeof(int *));
    for(i=0;i<(2*k-2);i++)
        *(k_cluster_centroid+i)=(int *)malloc(dimensions*sizeof(int));
    k_cluster_size=malloc((2*k-2)*sizeof(int));
    k_cluster_start = malloc((2*k-2)*sizeof(int));

    /*calling the kdtree function*/
    kdtree_fn(dimensions, nodes, k, data, k_cluster_size, k_cluster_centroid, k_cluster_start, k_cluster_bdry);

    /*printing the cluster size */

    printf("cluster size \n");
    for(i=k-2; i<(2*k - 2); i++){
        printf("%d ", k_cluster_size[i]);
    }

    free(data);
    free(k_cluster_bdry);
    free(k_cluster_centroid);
    free(k_cluster_size);
    free(k_cluster_start);
    return 0;
}


void kdtree_fn(int dimensions, int nodes, int k, int *data,int *k_cluster_size,int **k_cluster_centroid,int *k_cluster_start,int **k_cluster_bdry){

    int i,j,d0,dm,x,m=0,l,n=0,check=1,s,temp=0;
    d0 = 0, dm =nodes ;
    int *cluster_size, *cluster_start;
    int *cluster_bdry;
    int *cluster_centroid;
    int *query;
    int *res;

    query = (int *)malloc(sizeof(int)*dimensions);
    res = (int *)malloc(sizeof(int)*dimensions);
    cluster_centroid = (int*)malloc(dimensions*sizeof(int));
    cluster_bdry = (int*)malloc(4*dimensions*sizeof(int));
    cluster_size=(int*)malloc(2*sizeof(int));
    cluster_start = (int*)malloc(2*sizeof(int));

/* iterating k-1 times to form k clusters */
    for(x=0 ; x<k-1; x++){

    bipartition_fn(dimensions, nodes, d0, dm, data, cluster_size, cluster_start, cluster_bdry, cluster_centroid);

    for( i=0;i<dimensions; i++){
    k_cluster_centroid[x][i] = cluster_centroid[i];
    }
    for( i=0;i<2; i++){
    k_cluster_size[m] = cluster_size[i];
    k_cluster_start[m] = cluster_start[i];
    m++;
    }
    int p=0,r=0;
    while(p<2){
        l=0;
        i=0;
        while(i<2*dimensions){
            k_cluster_bdry[temp][l] = cluster_bdry[r];
            l++;
            i++;
            r++;
        }
        temp++;
        p++;
    }

    s = pow(2,check);


    if(x == 0 ||(x%(s-2)) == 0){
        d0 =0;
        nodes = k_cluster_size[n];
        check++;
        n++;
    }
    else{
        d0 = d0+k_cluster_size[n-1];
        nodes = k_cluster_size[n];
        n++;
    }
    }

    free(cluster_bdry);
    free(cluster_centroid);
    free(cluster_size);
    free(cluster_start);
}

/*Each bipartition function gives 2 clusters*/
void bipartition_fn(int dimensions,int nodes,int d0, int dm, int *data,int *cluster_size,int *cluster_start, int *cluster_bdry, int *cluster_centroid){

    int i,j,x,k;
    int  node = nodes/dimensions;
    int sum,min,max;
    int *cluster_assign;
    cluster_assign = malloc(nodes*sizeof(int));
    int *assign;
    assign= (int *)malloc(node*sizeof(int));
   // printf("nodes: %d \n", nodes);

        /*calculate centroid and boundaries*/
    i=0;
    j=d0;
    while(i<dimensions){
            sum=0;
        while(j<(d0+nodes)){
            sum = sum+data[j];
            j = j+dimensions;
        }
    cluster_centroid[i] = sum/node;
        i = i+1;
        j=d0+i;
    }

    /* Calculate variance of each dimension and find dimension with maximum variance*/
     int var[dimensions],g,h;
     h=d0;
     g=0;
    while(g<dimensions){
        sum = 0;
        while(h<(d0+nodes)){
           sum = sum +((cluster_centroid[g] - data[h])*(cluster_centroid[g] - data[h]));
           h=h+dimensions;
        }
        var[g] = sum/node;
        g=g+1;
        h=(d0+g);
    }
    int large = var[0];
    int max_dimension =0;
    int p;
    for(p=0; p<dimensions; p++){
        if(var[p]>large){
            large = var[p];
            max_dimension = p;
        }
   }


   /* find mean of maximum variance*/
    int mean = cluster_centroid[max_dimension];
    //printf("mean %d \n",mean);
    i=d0+max_dimension;
    x=0;
    while(i<(d0+nodes)){
        if(data[i] < mean){
           assign[x]=0;
        }
        else{
            assign[x]=1;
        }
        x++;
        i= i+dimensions;
    }

    /* Rearranging the points based on mean points lesser than mean goes to left and greater than mean goes to right*/
    x=0;
    int count=0;
    int y=0;
    for(i=0; i<node; i++){
        if(assign[y] == 0){
                count++;
            for(j=dimensions*i; j<dimensions*(i+1); j++){

                cluster_assign[x] = data[d0+j];
                x++;
            }
        }
        y++;
    }
    cluster_size[0] = count*dimensions;
    cluster_start[0]= d0;

    count=0;
    y=0;
    for(i=0; i<node; i++){
        if(assign[y]!=0){
            count++;
            for(j=dimensions*i; j<dimensions*(i+1); j++){
                cluster_assign[x] = data[d0+j];
                x++;
            }
        }
        y++;
    }
    cluster_size[1] = count*dimensions;
    cluster_start[1]= d0+cluster_size[0];

    int temp1,temp2;
    x=0;
    p=0;
    while(p<2){
        j=cluster_start[p];
        i=0;
        while(i<dimensions){
            min=data[j];
            max=data[j];
            temp1=cluster_start[p];
            temp2=cluster_size[p];
            while(j < temp1+temp2){

                if(data[j]<min)
                    min = data[j];
                if(data[j]>max)
                    max= data[j];
                j = j+dimensions;
            }
            cluster_bdry[x]=min;
            x=x+1;
            cluster_bdry[x]=max;
            x=x+1;
            i = i+1;
            j=temp1+i;
        }
        p++;
    }
    /*printf("bou");
    for(i=0; i<4*dimensions; i++){
        printf("%d ",cluster_bdry[i]);
    } */
    free(cluster_assign);
    free(assign);

}



/*Initialize data array*/
int *gendata(int num)
{
    int *ptr = (int *)malloc(sizeof(int)*num);
    int j = 0;
    if(ptr != NULL)
    {
        for(j = 0; j < num; j++)
        {
            ptr[j] = -50 + rand()%101;
        }
    }
    return ptr;
}
Moni
  • 39
  • 6
  • 1
    `k_cluster_bdry=(int **)malloc(k*sizeof(int *)); for(i=0;i<(2*k-2);i++)` : `i` exceeds `k`. So `*(k_cluster_centroid+i)=...` occurs out-of-bounds error. – BLUEPIXY Sep 17 '17 at 06:41
  • why does it work fine for few inputs? why does it work fine when it is debugged? why not for others? – Moni Sep 17 '17 at 06:43
  • 2
    What operating system, compiler and compilation option? And read carefully about [undefined behavior](https://en.wikipedia.org/wiki/Undefined_behavior) – Basile Starynkevitch Sep 17 '17 at 06:44
  • Thanks a lot. It solved the issue. Its working fine now – Moni Sep 17 '17 at 06:50
  • 1
    One of the beautifully dangerous things about undefined behaviour is precisely that one of the acceptable responses is to behave more or less as expected. That means that on some machines, you may not spot the problem, but on others, you will. It's also a reason for compiling the code with as many compilers as possible on as many machines as possible with as many compilation warnings enabled as you can stomach — precisely to try and avoid undefined behaviour. Note that adding or removing optimization options can affect the behaviour if there is undefined behaviour. – Jonathan Leffler Sep 17 '17 at 06:52
  • The obligatory - there is no need to cast the return of `malloc`, it is unnecessary. See: [**Do I cast the result of malloc?**](http://stackoverflow.com/q/605845/995714) – David C. Rankin Sep 17 '17 at 07:06
  • @JonathanLeffler Meanwhile, GCC and Clang support Undefined Behavior Sanitizer on some platforms. When a program is compiled with it, it shows warnings about some kinds of triggered UB when executed. – Anatoly Trosinenko Sep 17 '17 at 08:35
  • Thank you @DavidC.Rankin but it should'nt be an issue even if we use a return type. – Moni Sep 18 '17 at 17:49
  • You misunderstood, `k_cluster_bdry=(int **)malloc(k*sizeof(int *));` should simply be `k_cluster_bdry=malloc(k*sizeof(*k_cluster_bdry));` – David C. Rankin Sep 18 '17 at 20:12

0 Answers0