Based on @Alex Cohn's answer, I have implemented it in the JNI part, trying to take advantage of direct byte access and the resulting performance benefits. I am leaving it here in case it is as useful to someone else as @Alex's answer was for me. It is almost the same algorithm, written in C++, for an image in YUV_420_888 format:
uchar* yuvToNV21(jbyteArray yBuf, jbyteArray uBuf, jbyteArray vBuf, jbyte *fullArrayNV21,
int width, int height, int yRowStride, int yPixelStride, int uRowStride,
int uPixelStride, int vRowStride, int vPixelStride, JNIEnv *env) {
/* Check that our frame has right format, as specified at android docs for
* YUV_420_888 (https://developer.android.com/reference/android/graphics/ImageFormat?authuser=2#YUV_420_888):
* - Plane Y not overlaped with UV, and always with pixelStride = 1
* - Planes U and V have the same rowStride and pixelStride (overlaped or not)
*/
if(yPixelStride != 1 || uPixelStride != vPixelStride || uRowStride != vRowStride) {
jclass Exception = env->FindClass("java/lang/Exception");
env->ThrowNew(Exception, "Invalid YUV_420_888 byte structure. Not agree with https://developer.android.com/reference/android/graphics/ImageFormat?authuser=2#YUV_420_888");
}
int ySize = width*height;
int uSize = env->GetArrayLength(uBuf);
int vSize = env->GetArrayLength(vBuf);
int newArrayPosition = 0; //Posicion por la que vamos rellenando el array NV21
if (fullArrayNV21 == nullptr) {
fullArrayNV21 = new jbyte[ySize + uSize + vSize];
}
if(yRowStride == width) {
//Best case. No padding, copy direct
env->GetByteArrayRegion(yBuf, newArrayPosition, ySize, fullArrayNV21);
newArrayPosition = ySize;
}else {
// Padding at plane Y. Copy Row by Row
long yPlanePosition = 0;
for(; newArrayPosition<ySize; newArrayPosition += width) {
env->GetByteArrayRegion(yBuf, yPlanePosition, width, fullArrayNV21 + newArrayPosition);
yPlanePosition += yRowStride;
}
}
// Check UV channels in order to know if they are overlapped (best case)
// If they are overlapped, U and B first bytes are consecutives and pixelStride = 2
long uMemoryAdd = (long)&uBuf;
long vMemoryAdd = (long)&vBuf;
long diff = std::abs(uMemoryAdd - vMemoryAdd);
if(vPixelStride == 2 && diff == 8) {
if(width == vRowStride) {
// Best Case: Valid NV21 representation (UV overlapped, no padding). Copy direct
env->GetByteArrayRegion(uBuf, 0, uSize, fullArrayNV21 + ySize);
env->GetByteArrayRegion(vBuf, 0, vSize, fullArrayNV21 + ySize + uSize);
}else {
// UV overlapped, but with padding. Copy row by row (too much performance improvement compared with copy byte-by-byte)
int limit = height/2 - 1;
for(int row = 0; row<limit; row++) {
env->GetByteArrayRegion(uBuf, row * vRowStride, width, fullArrayNV21 + ySize + (row * width));
}
}
}else {
//WORST: not overlapped UV. Copy byte by byte
for(int row = 0; row<height/2; row++) {
for(int col = 0; col<width/2; col++) {
int vuPos = col*uPixelStride + row*uRowStride;
env->GetByteArrayRegion(vBuf, vuPos, 1, fullArrayNV21 + newArrayPosition);
newArrayPosition++;
env->GetByteArrayRegion(uBuf, vuPos, 1, fullArrayNV21 + newArrayPosition);
newArrayPosition++;
}
}
}
return (uchar*)fullArrayNV21;
}
I'm sure that some improvements can still be made, but I have tested it on a lot of devices, and it works with very good performance and stability.