I am trying to implement k-means algorithm on CUDA using Tesla card on external Unix. I read input file and store coordinates of all data points in dataX and dataY arrays. The next step is to select every centreInterval-th point and store it in another array allocated in GPU memory. However, I have no idea how may I even check what's the problem if all I can get is 'Segmentation error' and from obvious reasons can't print any kind of output from kernel.
EDIT 2: I simplified this example to the shortest possible solution. I found my solution during process, but decided to provide the version, which was not solved yet in this question to make more clear what caused the problem.
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
#define BLOCK_SIZE 16
// My kernel - Selects some centres at the beginning of algorithm and stores it at appropriate place
__global__ void kMeansSelectInitialCentres(float* d_dataX, float* d_dataY, float* d_centresX, float* d_centresY, int centreInterval) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int idx = i * centreInterval;
d_centresX[i] = d_dataX[idx];
d_centresY[i] = d_dataY[idx];
}
// Simplified example
int main(int argn, char ** argc) {
// My data - let's say it is 32 floats in each
int dataSize = 32;
float* dataX = new float[dataSize];
float* dataY = new float[dataSize];
// Fill arrays with numbers
for (int i = 0; i < dataSize; i++) {
dataX[i] = i;
dataY[i] = i;
}
// Interval - we select first number, then 1 + N * centreInterval
int centreInterval = 2;
// There I will store my results in program
int centreSize = dataSize / centreInterval;
float* centresX = new float[centreSize];
float* centresY = new float[centreSize];
// Pointers to the arrays stored in GPU memory
float* d_dataX;
float* d_dataY;
float* d_centresX;
float* d_centresY;
// Allocate memory for those arrays
// Calculate how much space in memory do we need for this
size_t d_centreSize = sizeof(float) * centreSize;
size_t d_dataSize = sizeof(float) * dataSize;
// Memory for raw data
cudaMalloc((void**)&d_dataX, d_dataSize);
cudaMalloc((void**)&d_dataY, d_dataSize);
// Copy raw data to the device memory so we can operate on it freely
cudaMemcpy(d_dataY, dataY, d_dataSize, cudaMemcpyHostToDevice);
cudaMemcpy(d_dataX, dataX, d_dataSize, cudaMemcpyHostToDevice);
// Memory for centre results
cudaMalloc((void**)&d_centresX, d_dataSize);
cudaMalloc((void**)&d_centresY, d_dataSize);
// Call kernel
dim3 dimBlock(BLOCK_SIZE);
dim3 dimGridK((centreSize + dimBlock.x) / dimBlock.x);
kMeansSelectInitialCentres <<<dimGridK, dimBlock>>> (d_dataX, d_dataY, d_centresX, d_centresY, centreInterval);
// Check results - we get every n-th point
float* check_x = new float[centreSize];
float* check_y = new float[centreSize];
cudaMemcpy(check_x, d_centresX, d_dataSize, cudaMemcpyDeviceToHost);
cudaMemcpy(check_y, d_centresY, d_dataSize, cudaMemcpyDeviceToHost);
printf("X: ");
for (int i = 0; i < centreSize; i++)
printf("%.2f ", check_x[i]);
printf("\nY: ");
for (int i = 0; i < centreSize; i++)
printf("%.2f ", check_y[i]);
printf("\n");
}
Main question: What is wrong with this kernel / check-out of data?
Side question: Is there any fair way to debug program kernels in such situations?