My question is about performance of using AVX instructions versus a naive approach.
I am getting the same — and correct — answer from my AVX approach as I am getting from my naive approach, but the answer takes slightly longer with AVX instructions, so I'd like to know what I'm doing wrong/inefficiently with vectorized code.
This question is a bit too complex to offer a self-contained compilable unit of code, and I'm sorry about that. However, I have functional code snippets below that I hope are fairly straightforward and decently styled, which are hopefully easy enough to follow to deal with the question at hand.
Some environment details:
- Both approaches are compiled with Clang (Apple LLVM version 8.1.0 (clang-802.0.42)).
- I am compiling with the `-mavx` flag.
- My hardware (MacBook Pro with an Intel Core i7 processor) claims to support AVX instructions.
I have a program where a user provides a multiline text file, each line containing a comma-separated string of numbers, i.e., a list of n-dimensional vectors, where n is arbitrary for the file, but (barring bad input) is the same value n for every line.
For example:
0,4,6,1,2,22,0,2,30,...,39,14,0,3,3,3,1,3,0,3,2,1
0,0,1,1,0,0,0,8,0,1,...,6,0,0,4,0,0,0,0,7,0,8,2,0
...
1,0,1,0,1,0,0,2,0,1,...,2,0,0,0,0,0,2,1,1,0,2,0,0
I generate some statistical scores from comparisons of these vectors, e.g., Pearson correlation, but the score function could be anything, say, something simple, like an arithmetic mean.
Naive approach
Each of these vectors is put into a pointer to a struct called `signal_t`:
/* One parsed input line: a vector of scores plus its summary statistic. */
typedef struct signal {
uint32_t n;      /* number of entries in data (comma count + 1 for the line) */
score_t* data;   /* heap-allocated array of n parsed values */
score_t mean;    /* arithmetic mean of data; stays NAN if data contains NaN */
} signal_t;
The `score_t` type is just a typedef to `float`:
typedef float score_t;
To start, I parse the string into `float` (`score_t`) values and calculate the arithmetic mean:
signal_t* s = NULL;
s = malloc(sizeof(signal_t));
if (!s) {
fprintf(stderr, "Error: Could not allocate space for signal pointer!\n");
exit(EXIT_FAILURE);
}
s->n = 1;
s->data = NULL;
s->mean = NAN;
for (uint32_t idx = 0; idx < strlen(vector_string); idx++) {
if (vector_string[idx] == ',') {
s->n++;
}
}
s->data = malloc(sizeof(*s->data) * s->n);
if (!s->data) {
fprintf(stderr, "Error: Could not allocate space for signal data pointer!\n");
exit(EXIT_FAILURE);
}
char* start = vector_string;
char* end = vector_string;
char entry_buf[ENTRY_MAX_LEN];
uint32_t entry_idx = 0;
bool finished_parsing = false;
bool data_contains_nan = false;
do {
end = strchr(start, ',');
if (!end) {
end = vector_string + strlen(vector_string);
finished_parsing = true;
}
memcpy(entry_buf, start, end - start);
entry_buf[end - start] = '\0';
sscanf(entry_buf, "%f", &s->data[entry_idx++]);
if (isnan(s->data[entry_idx - 1])) {
data_contains_nan = true;
}
start = end + 1;
} while (!finished_parsing);
if (!data_contains_nan) {
s->mean = pt_mean_signal(s->data, s->n);
}
The arithmetic mean is pretty straightforward:
/* Arithmetic mean of the len values in d (scalar reference version). */
score_t pt_mean_signal(score_t* d, uint32_t len)
{
/* Walk the array with a pointer pair and divide the total once. */
score_t total = 0.0f;
const score_t* stop = d + len;
for (const score_t* p = d; p < stop; ++p) {
total += *p;
}
return total / len;
}
Naive performance
On running this kind of approach on a file of 10k vector-strings, I get a runtime of 6.58s.
AVX approach
I have a modified `signal_t` struct called `signal_avx_t`:
/* AVX variant of signal_t: the values are packed eight-at-a-time into
 * __m256 vectors. */
typedef struct signal_avx {
uint32_t n_raw;  /* number of scalar entries parsed from the line */
uint32_t n;      /* number of __m256 chunks: ceil(n_raw / 8) */
__m256* data;    /* NOTE(review): __m256 requires 32-byte alignment; plain
                  * malloc does not guarantee that — confirm allocation */
score_t mean;    /* arithmetic mean over the n_raw values; NAN if any is NaN */
} signal_avx_t;
This stores pointers to `__m256` addresses. Each `__m256` stores eight single-precision `float` values. For convenience, I define a constant called `AVX_FLOAT_N` to store this multiple, e.g.:
#define AVX_FLOAT_N 8
Here is how I am parsing the vector-string and storing it in `__m256` chunks. It is very similar to the naive approach, except that I now read values eight at a time into a buffer, write the buffer to an `__m256`, and repeat until there are no more values to write. I then calculate the mean:
signal_avx_t* s = NULL;
s = malloc(sizeof(signal_avx_t));
if (!s) {
fprintf(stderr, "Error: Could not allocate space for signal_avx pointer!\n");
exit(EXIT_FAILURE);
}
s->n_raw = 1;
s->n = 0;
s->data = NULL;
s->mean = NAN;
for (uint32_t idx = 0; idx < strlen(vector_string); idx++) {
if (vector_string[idx] == ',') {
s->n_raw++;
}
}
score_t signal_buf[AVX_FLOAT_N];
s->n = (uint32_t) ceil((float)(s->n_raw) / AVX_FLOAT_N);
s->data = malloc(sizeof(*s->data) * s->n);
if (!s->data) {
fprintf(stderr, "Error: Could not allocate space for signal_avx data pointer!\n");
exit(EXIT_FAILURE);
}
char* start = id;
char* end = id;
char entry_buf[ENTRY_MAX_LEN];
uint32_t entry_idx = 0;
uint32_t data_idx = 0;
bool finished_parsing = false;
bool data_contains_nan = false;
do {
end = strchr(start, ',');
if (!end) {
end = vector_string + strlen(vector_string);
finished_parsing = true;
}
memcpy(entry_buf, start, end - start);
entry_buf[end - start] = '\0';
sscanf(entry_buf, "%f", &signal_buf[entry_idx++ % AVX_FLOAT_N]);
if (isnan(signal_buf[(entry_idx - 1) % AVX_FLOAT_N])) {
data_contains_nan = true;
}
start = end + 1;
/* I write every eight floats to an __m256 chunk of memory */
if (entry_idx % AVX_FLOAT_N == 0) {
s->data[data_idx++] = _mm256_setr_ps(signal_buf[0],
signal_buf[1],
signal_buf[2],
signal_buf[3],
signal_buf[4],
signal_buf[5],
signal_buf[6],
signal_buf[7]);
}
} while (!finished_parsing);
if (!data_contains_nan) {
/* write any leftover floats to the last `__m256` */
if (entry_idx % AVX_FLOAT_N != 0) {
for (uint32_t idx = entry_idx % AVX_FLOAT_N; idx < AVX_FLOAT_N; idx++) {
signal_buf[idx] = 0;
}
s->data[data_idx++] = _mm256_setr_ps(signal_buf[0],
signal_buf[1],
signal_buf[2],
signal_buf[3],
signal_buf[4],
signal_buf[5],
signal_buf[6],
signal_buf[7]);
}
s->mean = pt_mean_signal_avx(s->data, s->n, s->n_raw);
}
AVX mean function
Here is the function I wrote to generate the arithmetic mean:
/* Arithmetic mean over len_raw scalar values packed into len __m256
 * chunks (the final chunk is zero-padded, so summing all lanes is safe).
 *
 * Two independent accumulators are used so consecutive vaddps
 * instructions are not serialized on a single register's latency chain;
 * note this changes the floating-point summation order, so results may
 * differ from the scalar sum in the last ulps. */
score_t pt_mean_signal_avx(__m256* d, uint32_t len, uint32_t len_raw)
{
__m256 acc0 = _mm256_setzero_ps();
__m256 acc1 = _mm256_setzero_ps();
uint32_t idx = 0;
for (; idx + 1 < len; idx += 2) {
acc0 = _mm256_add_ps(acc0, d[idx]);
acc1 = _mm256_add_ps(acc1, d[idx + 1]);
}
if (idx < len) {
acc0 = _mm256_add_ps(acc0, d[idx]);
}
const __m256 v_sum = _mm256_add_ps(acc0, acc1);
/* Extract the lanes with an explicit store instead of casting &v_sum
 * to score_t* — the cast relies on implementation-defined aliasing of
 * the vector type. */
score_t lanes[AVX_FLOAT_N];
_mm256_storeu_ps(lanes, v_sum);
score_t s = 0.0f;
for (uint32_t lane = 0; lane < AVX_FLOAT_N; lane++) {
s += lanes[lane];
}
return s / len_raw;
}
AVX performance
On running the AVX-based approach on a file of 10k vector-strings, I get a runtime of 6.86s, about 5% slower. This difference is roughly constant, regardless of the size of the input.
Summary
My expectation was that, by using AVX instructions and by vectorizing loops, I'd get a speed bump, not that the performance would be marginally worse.
Is there anything in the code snippets that suggests misuse of the __m256
datatype and related intrinsic functions, for the purposes of calculating a basic summary statistic?
Mainly, I'd like to figure out what I'm doing wrong here, before progressing to more complicated scoring functions between larger datasets. Thanks for any constructive advice!