When we do, with numpy (in Python):
my_array[::4] = 0
is it:
for (int i = 0; i < my_array_size; i += 4) {
my_array[i] = 0;
}
in C?
Or is there SIMD done by numpy?
If so, what are actual cases where rewriting Python code in C is useful, since I wouldn't expect anyone (unless they have a lot of knowledge) to be able to do SIMD manually in C (or C++ for that matter)?
I'm asking this question because I can't make the Sieve of Eratosthenes significantly faster on C than on Python and I was curious as to why. Here are my implementations (Jérôme Richard suggested using bits instead of bytes for the boolean array which is a good idea):
#include <stdlib.h>
#include <stdbool.h>
#include <math.h>
#include <stdint.h>
typedef struct
{
int *primes;
int size;
} Result;
Result EratosthenesSieveC(int n)
{
uint8_t *nbs = (uint8_t *)malloc((n + 1) / 8 + 1);
int i, j, count = 0;
double limit = sqrt(n);
// Initialize the array with true everywhere except for 0 and 1
nbs[0] = nbs[1] = false;
for (i = 2; i <= n; i++)
{
nbs[i / 8] |= 1 << (i % 8);
}
// Apply the Sieve of Eratosthenes algorithm
for (i = 2; i <= limit; i++)
{
if (nbs[i / 8] & (1 << (i % 8)))
{
for (j = i * i; j <= n; j += i)
{
if (nbs[j / 8] & (1 << (j % 8)))
{
nbs[j / 8] &= ~(1 << (j % 8));
count++; // how many numbers are not prime
}
}
}
}
// Allocate memory for the array of primes
int *primes = (int *)malloc((n + 1 - count - 2) * sizeof(int));
// Store the prime numbers in the 'primes' array
int index = 0;
for (i = 2; i <= n; i++)
{
if (nbs[i / 8] & (1 << (i % 8)))
{
primes[index++] = i;
}
}
// Free the memory used by the 'nbs' array
free(nbs);
Result result;
result.primes = primes;
result.size = n - count - 1;
return result; // Note: I never really deallocate the memory of result which might be causing memory leaks??
}
Numpy:
def EratosthenesSieveFullNumpy(n):
nbs = np.ones(n+1, dtype=bool)
nbs[:2] = 0
for i in range(2, int(n**0.5)+1):
if nbs[i]:
nbs[i*i::i] = 0
return np.where(nbs)[0]
It's worth noting this implementation is really bad:
def EratosthenesSieveNumpy(n):
nbs = np.ones(n+1, dtype=bool)
nbs[:2] = 0
for i in range(2, int(n**0.5)+1):
if nbs[i]:
for j in range(i*i, n+1, i):
nbs[j] = 0
return np.where(nbs)[0]
Which is why I suspected SIMD to be the reason why I couldn't make my C code faster.
To import my C code in Python I used:
class Result(ctypes.Structure):
_fields_ = [('primes', ctypes.POINTER(ctypes.c_int)), ('size', ctypes.c_int)]
lib = ctypes.cdll.LoadLibrary(my_path_of_compiled_shared_library)
lib.EratosthenesSieveC.restype = Result
def EratosthenesSieveC(n, lib):
result = lib.EratosthenesSieveC(n)
return result.primes[:result.size]
My results are: https://i.imgur.com/1vUJDv7.png