I have a vector of 1024*4608 elements (Original_signal), which is stored in a one-dimensional array.
I expand Original_signal into Expand_signal (1024*32*4608 elements) by copying every block of 1024 elements 32 times.
Then I use a Com_array of 1024*32 elements to do an element-to-element multiplication with Expand_signal, and compute 1024-point FFTs of the resulting array.
The core code is as follows:
//initialize Original_signal
MKL_Complex8 *Original_signal = new MKL_Complex8[1024*4608];
for (int i=0; i<4608; i++)
{
for (int j=0; j<1024; j++)
{
Original_signal[j+i*1024].real=rand();
Original_signal[j+i*1024].imag=rand();
}
}
// Build Com_array: 32 rows of 1024 complex factors, where
//   Com_array[j + i*1024] = cos(phase) + 1i*sin(phase),
//   phase = 2*pi*(i-16)/10 * j*j.
MKL_Complex8 *Com_array= new MKL_Complex8[32*1024];
for (int i=0; i<32; i++)
{
    // Per-row coefficient; constant across the inner loop.
    const double rate = 2*pi*(i-16.0)/10.0;
    for (int j=0; j<1024; j++)
    {
        // The original wrote `j^2`, but `^` is bitwise XOR in C/C++ (and binds
        // looser than `*`), so it never squared j. Square explicitly instead.
        // NOTE(review): assumes the intended phase is quadratic in j — confirm.
        const double phase = rate*((double)j*(double)j);
        // The phase grows to roughly 1e7 rad, far beyond what a float argument
        // can resolve, so evaluate in double and narrow only the result.
        Com_array[j+i*1024].real=(float)cos(phase);
        Com_array[j+i*1024].imag=(float)sin(phase);
    }
}
//element-to-element multiplication
// For each of the 4608 rows of Original_signal: replicate the row 32 times
// into Temp_signal, multiply it element by element with Com_array, and store
// the 32*1024 products in the matching slice of Expand_signal.
MKL_Complex8 *Temp_signal= new MKL_Complex8[1024*32];
MKL_Complex8 *Expand_signal= new MKL_Complex8[1024*32*4608];
gettimeofday(&Bgn_Time, 0);
for (int i=0; i<4608; i++)
{
    // Stage 32 back-to-back copies of row i.
    // NOTE(review): this copy could be avoided by multiplying row i against
    // each 1024-element row of Com_array directly.
    for (int j=0; j<32; j++)
    {
        memcpy(Temp_signal+j*1024, Original_signal+i*1024, 1024*sizeof(MKL_Complex8));
    }
    // vmcMul is the explicit-mode variant and requires a fifth `mode`
    // argument; the four-argument form is vcMul, which uses the global VM mode.
    vcMul(1024*32, Temp_signal, Com_array, Expand_signal+(size_t)i*1024*32);
}
gettimeofday(&End_Time, 0);
// Elapsed wall-clock time in microseconds.
double time_used = (double)(End_Time.tv_sec-Bgn_Time.tv_sec)*1000000+(double)(End_Time.tv_usec-Bgn_Time.tv_usec);
// The argument was previously inside the format string, leaving %f without
// a matching value (undefined behaviour).
printf("element-to-element multiplication use time %fus\n", time_used);
//FFT
// Batched in-place forward FFT over Expand_signal: 32*4608 single-precision
// complex transforms of length 1024, stored back to back.
DFTI_DESCRIPTOR_HANDLE h_FFT = 0;
DftiCreateDescriptor(&h_FFT, DFTI_SINGLE, DFTI_COMPLEX, 1, (MKL_LONG)1024);
// DftiSetValue is variadic, so integer settings are cast to MKL_LONG to make
// sure the argument has the width the library reads.
DftiSetValue(h_FFT, DFTI_NUMBER_OF_TRANSFORMS, (MKL_LONG)(32*4608));
DftiSetValue(h_FFT, DFTI_INPUT_DISTANCE, (MKL_LONG)1024);
// NOTE(review): output distance set to match the input distance; believed
// harmless for an in-place transform — confirm against the MKL version in use.
DftiSetValue(h_FFT, DFTI_OUTPUT_DISTANCE, (MKL_LONG)1024);
DftiCommitDescriptor(h_FFT);
gettimeofday(&Bgn_Time, 0);
DftiComputeForward(h_FFT,Expand_signal);
gettimeofday(&End_Time, 0);
// Elapsed wall-clock time in microseconds. A separate variable is used
// because `time_used` is already declared earlier in the same scope.
double fft_time_used = (double)(End_Time.tv_sec-Bgn_Time.tv_sec)*1000000+(double)(End_Time.tv_usec-Bgn_Time.tv_usec);
// The argument was previously inside the format string, leaving %f without
// a matching value (undefined behaviour).
printf("FFT use time %fus\n", fft_time_used);
The element-to-element multiplication takes 700 ms (after removing the memcpy cost), and the FFT takes 500 ms.
An N-point FFT needs (N/2)*log2(N) complex multiplications, while the element-to-element multiplication needs N.
In this project N = 1024, so in theory the FFT should be 5 times slower than the element-to-element multiplication. Why is it faster in practice?
Is there any way to speed up the project?
(Note that Com_array is symmetrical.)