Reading a 1 GB file into memory all at once incurs a large memory and I/O overhead. Although I'm not very familiar with AVX2, I read some articles on the Internet and came up with the following solution, which I have tested and found to work.
My solution reads the file in chunks of 512 bytes (blocks of 128 floats) and then sums the vectors pairwise (16 vectors in total per block), so that at the end we get a final `__m256` vector. By casting it to a `float*` we can sum its individual components to get the final result.
The case where the file size is not a multiple of 128 floats is handled after the block loop by summing the remaining floats individually.
The code is commented, but if you have any suggestions for adding more explanation to the answer, feel free to make them.
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#include <immintrin.h>
/* Forward declarations: the test-file generator and the AVX summation routine. */
int make_floatf(char *filename, int nblocks);
float avx_sfadd(char *filename);

/* Scratch buffer that receives the errno message printed by PERROR(). */
char error_buf[1024];
/* Reports the current errno as "Error: <message>" on stdout, closes the
 * caller's stream and returns -1 from the enclosing function.
 *
 * Requires a `FILE *fp` variable in scope where the macro is expanded.
 * `fp` may be NULL (e.g. right after a failed fopen); it is only closed
 * when it is open, because fclose(NULL) is undefined behavior.
 *
 * The message is produced with strerror() and copied into error_buf:
 * strerror_r() exists in two incompatible variants, and the GNU one is
 * allowed to leave the supplied buffer untouched.
 */
#define PERROR() \
do { \
    snprintf(error_buf, sizeof error_buf, "%s", strerror(errno)); \
    printf("Error: %s\n", error_buf); \
    if (fp) \
        fclose(fp); \
    return -1; \
} while(0)
/* Generates a binary file made of `nblocks` blocks of 128 floats, every
 * float having the value 1.0.
 *
 * filename: path of the file to create (truncated if it already exists).
 * nblocks:  number of 512-byte blocks to write; a value <= 0 produces an
 *           empty file.
 *
 * Returns 0 on success. On failure prints "Error: <message>" to stdout,
 * closes the file if it was opened, and returns -1.
 */
int make_floatf(char *filename, int nblocks)
{
    float block[128]; /* one 512-byte block; small enough to live on the stack */
    const size_t block_floats = sizeof block / sizeof block[0];

    FILE *fp = fopen(filename, "wb");
    if (!fp) {
        /* Nothing to close here: calling fclose(NULL) would be undefined. */
        printf("Error: %s\n", strerror(errno));
        return -1;
    }

    /* The block content never changes, so fill it once up front. */
    for (size_t i = 0; i < block_floats; i++)
        block[i] = 1.0f;

    for (int j = 0; j < nblocks; j++) {
        if (fwrite(block, sizeof block[0], block_floats, fp) < block_floats) {
            int saved_errno = errno; /* fclose() may overwrite errno */
            fclose(fp);
            printf("Error: %s\n", strerror(saved_errno));
            return -1;
        }
    }

    /* fclose() flushes buffered data, so a failure here means lost writes. */
    if (fclose(fp) != 0) {
        printf("Error: %s\n", strerror(errno));
        return -1;
    }
    return 0;
}
/* Reports why avx_sfadd() could not finish, closes the stream and yields
 * the error return value (-1).
 */
static float sfadd_fail(FILE *fp)
{
    int saved_errno = errno; /* fclose() below may overwrite errno */
    int hit_eof = feof(fp);

    fclose(fp);
    printf("Error: %s\n",
           hit_eof ? "unexpected end of file" : strerror(saved_errno));
    return -1;
}

/* Sums every float stored in a raw binary file using AVX.
 *
 * The file is read in 512-byte blocks (128 floats = 16 vectors of 8
 * floats). Inside each block the vectors are added pairwise and folded
 * into an 8-lane running sum. Floats left over after the last complete
 * block are added one by one; trailing bytes that do not form a whole
 * float are ignored. All accumulation is done in single precision.
 *
 * Prints the file size, the block count and the leftover float count to
 * stdout before summing.
 *
 * Returns the total. On failure (file cannot be opened, inspected or fully
 * read) prints "Error: <message>" to stdout and returns -1; note that this
 * value cannot be told apart from a genuine sum of -1.
 */
#if defined(__GNUC__) || defined(__clang__)
__attribute__((target("avx"))) /* lets the intrinsics build without -mavx */
#endif
float avx_sfadd(char *filename)
{
    enum { BLOCK_FLOATS = 128, LANES = 8 };
    float block[BLOCK_FLOATS]; /* holds one 128-float block */
    float lanes[LANES];        /* scalar copy of the vector accumulator */

    FILE *fp = fopen(filename, "rb");
    if (!fp) {
        /* Nothing to close here: calling fclose(NULL) would be undefined. */
        printf("Error: %s\n", strerror(errno));
        return -1;
    }

    struct stat stat_buf;
    if (stat(filename, &stat_buf) != 0)
        return sfadd_fail(fp);

    size_t fsize = (size_t)stat_buf.st_size;
    size_t nblocks = fsize / sizeof block;
    size_t rem_floats = (fsize % sizeof block) / sizeof(float);
    printf("File size: %zu\nnblocks:%zu\nnremfloats: %zu\n",
           fsize, nblocks, rem_floats);

    __m256 sum = _mm256_setzero_ps();
    for (size_t i = 0; i < nblocks; i++) {
        if (fread(block, sizeof(float), BLOCK_FLOATS, fp) < BLOCK_FLOATS)
            return sfadd_fail(fp);

        /* Add the 16 vectors of the block two at a time. */
        for (int j = 0; j < BLOCK_FLOATS; j += 2 * LANES) {
            __m256 v1 = _mm256_loadu_ps(block + j);
            __m256 v2 = _mm256_loadu_ps(block + j + LANES);
            sum = _mm256_add_ps(sum, _mm256_add_ps(v1, v2));
        }
    }

    /* Tail of the file that does not fill a complete block. The read is
     * counted in floats so the check matches the unit being compared.
     */
    float final_sum = 0;
    if (rem_floats > 0) {
        if (fread(block, sizeof(float), rem_floats, fp) < rem_floats)
            return sfadd_fail(fp);
        for (size_t j = 0; j < rem_floats; j++)
            final_sum += block[j];
    }

    /* Collapse the 8 lanes of the vector accumulator into the scalar result. */
    _mm256_storeu_ps(lanes, sum);
    for (int k = 0; k < LANES; k++)
        final_sum += lanes[k];

    fclose(fp);
    return final_sum;
}
/* Command-line entry point.
 *
 *   ./main filename nblocks   create `filename` holding nblocks blocks of
 *                             128 floats (e.g. ./main floats.bin 1024)
 *   ./main filename           print the sum of the floats in `filename`
 *                             (e.g. ./main floats.bin)
 *
 * Returns 0 on success or when only the usage line is printed, and 1 when
 * the block count is not a valid non-negative integer or the file could
 * not be created.
 */
int main(int argc, char **argv)
{
    if (argc < 2) {
        puts("./main filename [nblocks]");
        return 0;
    }

    if (argc == 3) {
        /* strtol instead of atoi so that malformed input is detected. */
        char *end = NULL;
        errno = 0;
        long nblocks = strtol(argv[2], &end, 10);
        if (end == argv[2] || *end != '\0' || errno == ERANGE ||
            nblocks < 0 || nblocks > INT_MAX) {
            printf("Error: invalid block count '%s'\n", argv[2]);
            return 1;
        }

        if (make_floatf(argv[1], (int)nblocks) != 0)
            return 1;
        puts("File has been created sucessfully\n");
        return 0;
    }

    /* Any other argument count: sum the floats of the named file. */
    printf("avx_sum = %f\n", avx_sfadd(argv[1]));
    return 0;
}