Vectorizing this is only going to help if you perform the conversion many times. I used Agner Fog's vectorclass library to do it (http://www.agner.org/optimize/vectorclass.zip); it is a set of C++ classes that wrap the SSE/AVX instructions. You'll probably also get better answers if you add the SSE and AVX tags to your question.
You'll also get better results if you can ensure that the arrays are 16-byte or 32-byte aligned. In the code below it would also help either to make the width of the arrays equal to 64 (even if you are only going to use 60 elements) or to make the length of the array a multiple of 64, so that the scalar clean-up loop is not needed.
#include <stdio.h>
#include "vectorclass.h"
// Scalar reference implementation.
// Copies the low 16 bits of each of the 60 ints in InArray[0] into OutArray.
// InArray[1] is never read.
void foo(int InArray[2][60], short OutArray[60]) {
    const int *src = InArray[0];
    short *dst = OutArray;
    short *const dst_end = OutArray + 60;
    while (dst != dst_end) {
        // Mask first so only the low half-word takes part in the narrowing.
        *dst++ = static_cast<short>(*src++ & 0xffff);
    }
}
void foo_vec8s(int InArray[2][60], short OutArray[60]) {
int i=0;
for (; i <(60-8); i+=8) {
Vec8s v1 = Vec8s().load(&InArray[0][i]);
Vec8s v2 = Vec8s().load(&InArray[0][i+4]);
Vec8s out = blend8s<0,2,4,6,8,10,12,14>(v1,v2);
out.store(&OutArray[i]);
}
//clean up since arrays are not a multiple of 64
for (;i < 60; i++) {
OutArray[i] = (short)(InArray[0][i] & 0xffff);
}
}
// Prints `count` shorts separated by single spaces, followed by a newline.
static void print_row(const short *values, int count) {
    for (int k = 0; k < count; ++k) {
        printf("%d ", values[k]);
    }
    printf("\n");
}

// Demo driver: runs the scalar and the vectorised conversion on the same
// input and prints both results so they can be compared by eye.
int main() {
    int InArray[2][60];
    // Put the index in the low 16 bits and set every high bit, so the output
    // shows whether the upper half-word is dropped. Row 1 is left unset
    // because neither conversion reads it.
    for (int k = 0; k < 60; ++k) {
        InArray[0][k] = k | 0xffff0000;
    }

    short scalar_out[60] = {0};
    foo(InArray, scalar_out);
    print_row(scalar_out, 60);

    short vector_out[60] = {0};
    foo_vec8s(InArray, vector_out);
    print_row(vector_out, 60);
}