OK, here's my native code that evolved after much banging of the head.
My difficulty was that I didn't understand planar image formats until I saw two references that explain them (the links are not preserved here):

Here are the 2 functions I eventually wrote:
// Rotate a luma (Y) image plane by 90 degrees clockwise.
//
//              (dst direction)
//                  ------>
//   dst  -> +-------------+
//           |^            |
//           |^ (base dir) |
//           |^            |
//   base -> +-------------+ <- endp
//
// The source is walked one column at a time, left to right, and each
// column is read from the bottom row up to the top row. The bytes are
// appended to dst in that order, so the output plane has `height` bytes
// per row and `width` rows.
//
//   src    - source plane, `width` bytes per row, `height` rows
//   dst    - destination; receives width * height bytes; must not overlap src
//   size   - number of bytes in the source plane; the bottom row is taken
//            to be the `width` bytes that end at src + size
//   width  - bytes per source row
//   height - number of source rows
//
// Nothing is written when width or height is 0, or when size is too
// small to hold width * height bytes.
//////////////////////////////////////////////////////////
void rotateLumaPlane90(const unsigned char *src, unsigned char *dst,
                       size_t size, size_t width, size_t height)
{
    // size / width < height  <=>  size < width * height, without overflow.
    if (width == 0 || height == 0 || size / width < height) {
        return;
    }

    // Offset of the first byte of the bottom row.
    const size_t bottom_row = size - width;

    for (size_t col = 0; col < width; col++) {
        const size_t column_base = bottom_row + col;
        // Index upwards from the bottom row instead of decrementing a
        // pointer, so no pointer before the start of src is ever formed.
        for (size_t row = 0; row < height; row++) {
            *dst++ = src[column_base - row * width];
        }
    }
}
//
// Rotate an interleaved chroma plane (as in NV12) by 90 degrees clockwise.
//
// The chroma plane holds interleaved chroma values; one pair of chroma
// bytes maps to a 2x2 block of 4 pixels:
//
//   Y1 Y2 Y3 Y4
//   Y5 Y6 Y7 Y8      U1,V1 -> chroma values for block Y1 Y2
//   Y9 Ya Yb Yc                                        Y5 Y6
//   Yd Ye Yf Yg
//   -----------      U2,V2 -> chroma values for block Y3 Y4
//   U1 V1 U2 V2                                        Y7 Y8
//   U3 V3 U4 V4
//
// Each 2-byte pair is moved as a unit and the order of the two bytes
// inside a pair is kept as-is.
//
//   src    - source chroma plane, `width` bytes (width / 2 pairs) per row
//   dst    - destination; the output occupies the bytes ending at
//            dst + size; must not overlap src
//   size   - number of bytes in the chroma plane
//   width  - bytes per source chroma row
//   height - number of chroma rows (half the image height for 4:2:0 data)
//
// Nothing is written when there is no complete pair per row, when
// height is 0, or when size cannot hold (width / 2) * height pairs.
//////////////////////////////////////////////////////////
void rotateChromaPlane90(const unsigned char *src, unsigned char *dst,
                         size_t size, size_t width, size_t height)
{
    const size_t pairs_per_row = width / 2;

    if (pairs_per_row == 0 || height == 0) {
        return;
    }
    // (size / 2) / pairs_per_row < height  <=>  size < 2 * pairs * height.
    if ((size / 2) / pairs_per_row < height) {
        return;
    }

    // src is visited starting at the upper-right pair, moving down to the
    // bottom, then one pair-column to the left and down again; dst is
    // filled from its end towards its start.
    size_t dst_offset = size;
    for (size_t pair_col = pairs_per_row; pair_col-- > 0; ) {
        // Pairs start at even byte offsets: the right-most pair of a row
        // begins at 2 * (pairs_per_row - 1), not at width - 1.
        const size_t col_offset = 2 * pair_col;
        for (size_t row = 0; row < height; row++) {
            const size_t src_offset = row * width + col_offset;
            dst_offset -= 2;
            dst[dst_offset] = src[src_offset];
            dst[dst_offset + 1] = src[src_offset + 1];
        }
    }
}
And here is a sample of calling these functions from Android native code:
// First rotate the Y (luma) plane: it is read from the start of
// encode_buffer and the rotated bytes are written to the start of
// rotate_buffer.
// NOTE(review): presumably yPlaneSize == gInputWidth * gInputHeight --
// confirm against where yPlaneSize is computed.
rotateLumaPlane90((unsigned char *) encode_buffer,
rotate_buffer,
yPlaneSize,
gInputWidth,
gInputHeight);
// Now rotate the interleaved U and V plane, which is read at offset
// yPlaneSize in encode_buffer and written at the same offset in
// rotate_buffer. The plane size passed is yPlaneSize / 2 and the height
// passed is gInputHeight / 2 (chroma rows, not image rows), while the
// width passed is still gInputWidth.
rotateChromaPlane90((unsigned char *) encode_buffer + yPlaneSize,
rotate_buffer + yPlaneSize,
yPlaneSize / 2,
gInputWidth,
gInputHeight/2);
Notice that the last parameter to rotateChromaPlane90 is half the height of the original image (gInputHeight / 2), because the chroma plane has half as many rows as the luma plane. I should probably change the chroma rotate function to make that less error-prone.
When I switched to the back-facing camera, I found that I needed to rotate 90° in the opposite direction (i.e. 270°), so I also have a 270° variant:
// Rotate a luma (Y) image plane by 270 degrees clockwise
// (i.e. 90 degrees counter-clockwise).
//
//           +-------------+
//           |^            |
//           |^ (base dir) |
//           |^            |
//   base -> +-------------+ <- endp
//                          ^
//            <----------   |
//             (dst dir)   dst
//
// The source is walked one column at a time, left to right, each column
// from the bottom row up to the top row, exactly as for the 90 degree
// rotation; the difference is that dst is filled backwards, starting at
// its last byte (dst + size - 1).
//
//   src    - source plane, `width` bytes per row, `height` rows
//   dst    - destination; the output occupies the bytes ending at
//            dst + size; must not overlap src
//   size   - number of bytes in the plane; the bottom source row is taken
//            to be the `width` bytes that end at src + size
//   width  - bytes per source row
//   height - number of source rows
//
// Nothing is written when width or height is not positive, or when size
// is too small to hold width * height bytes.
//////////////////////////////////////////////////////////
void rotateLumaPlane270(unsigned char *src,
                        register unsigned char *dst,
                        int size, int width, int height)
{
    // size / width < height  <=>  size < width * height, without overflow;
    // a negative size is rejected by the same comparison.
    if (width <= 0 || height <= 0 || size / width < height) {
        return;
    }

    const size_t row_bytes = (size_t) width;
    const size_t row_count = (size_t) height;
    // Offset of the first byte of the bottom row.
    const size_t bottom_row = (size_t) size - row_bytes;
    // One past the next dst byte to write; counts down to 0 at most.
    size_t out = (size_t) size;

    for (size_t col = 0; col < row_bytes; col++) {
        const size_t column_base = bottom_row + col;
        // Index-based access keeps every address inside src and dst;
        // no pointer is stepped past either end of a buffer.
        for (size_t row = 0; row < row_count; row++) {
            dst[--out] = src[column_base - row * row_bytes];
        }
    }
}
//
// Rotate an interleaved chroma plane (as in NV21 / NV12) by 270 degrees
// clockwise (i.e. 90 degrees counter-clockwise).
//
// The chroma plane holds interleaved chroma values; one pair of chroma
// bytes maps to a 2x2 block of 4 pixels:
//
//   Y1 Y2 Y3 Y4
//   Y5 Y6 Y7 Y8      U1,V1 -> chroma values for block Y1 Y2
//   Y9 Ya Yb Yc                                        Y5 Y6
//   Yd Ye Yf Yg
//   -----------      U2,V2 -> chroma values for block Y3 Y4
//   U1 V1 U2 V2                                        Y7 Y8
//   U3 V3 U4 V4
//
// Each 2-byte pair is moved as a unit and the order of the two bytes
// inside a pair is kept as-is, so the routine does not care whether the
// pair is stored U,V or V,U.
//
//   src    - source chroma plane, `width` bytes (width / 2 pairs) per row
//   dst    - destination; receives 2 * (width / 2) * height bytes starting
//            at dst[0]; must not overlap src
//   size   - accepted for symmetry with the other rotate functions; it is
//            not consulted
//   width  - bytes per source chroma row
//   height - number of chroma rows (half the image height for 4:2:0 data)
//
// Nothing is written when there is no complete pair per row or when
// height is not positive.
//////////////////////////////////////////////////////////
void rotateChromaPlane270(unsigned char *src,
                          register unsigned char *dst,
                          int size, int width, int height)
{
    (void) size;

    const int pairs_per_row = width / 2;
    if (pairs_per_row <= 0 || height <= 0) {
        return;
    }

    // src is visited starting at the upper-right pair, moving down to the
    // bottom, then one pair-column to the left and down again, all the
    // way to and including the left-most column; dst is filled from
    // offset 0 upwards.
    size_t out = 0;
    for (int pair_col = pairs_per_row - 1; pair_col >= 0; pair_col--) {
        // Pairs start at even byte offsets: the right-most pair of a row
        // begins at 2 * (pairs_per_row - 1), not at width - 1.
        const size_t col_offset = 2 * (size_t) pair_col;
        for (int row = 0; row < height; row++) {
            const size_t in = (size_t) row * (size_t) width + col_offset;
            dst[out++] = src[in];
            dst[out++] = src[in + 1];
        }
    }
}