Missing from the problem statement is the ability to compare the first element of A with the first element of B.
Using a linked list will allow a fast rotate. Using a merge sort should be fairly fast.
Use rotate first to last to treat A and B as queues. Treat each queue as two queues, an input (front) queue and an output (back) queue. Use counts to keep track of the boundary between input and output part of each queue. A remaining run count will be needed for each input queue for the merge logic.
For the initial pass, separate the elements into two queues, get an element from A, append to B, get another element from A, append to A. Once this is done, then A and B contain runs of size 1.
Merge runs from A and B, again alternating merged run output between A and B so when done with a merge pass, the queues are ready for the next pass.
Eventually all the elements end up on one list and the sort is done.
update - I wrote a test program using C++ std::queue for the queues, and it takes less than 0.4 seconds to sort 1 million elements (32 bit mode, 32 bit integers) on my system (Intel 3770k 3.5 ghz). A C program using linked lists would involve less overhead than std::queue, so a bit faster.
Example C++ program for proof of concept, it's using two std::queues, using front / pop / push to effectively do a rotate. Pointers aren't needed for the input structures (qs0, qs1), but it makes the input and output references consistent. The output pointer to structure (pqso) alternates between the two queue structures (qs0, qs1). There are two goto's used to branch to common "cleanup" code for the inner loop, which copies the rest of the "other" run and breaks out of the inner loop. This example was based on an old merge sort program, and since this is a "proof of concept", I didn't bother changing it.
#include <iostream>
#include <queue>
#include <cstdlib>
#include <ctime>
typedef unsigned int uint32_t;
#define min(a, b) (((a) < (b)) ? (a) : (b))
typedef struct{
std::queue <uint32_t> *pq; // ptr to queue
size_t fcnt; // front count
size_t bcnt; // back count
size_t rcnt; // run count
}QS;
void msort2q(std::queue <uint32_t> &q0, std::queue <uint32_t> &q1, size_t n)
{
QS qs0; // queue 0
QS qs1; // queue 1
QS *pqsi0 = &qs0; // input 0
QS *pqsi1 = &qs1; // input 1
QS *pqso; // output
size_t s; // run size
if(n < 2)
return;
pqsi0->pq = &q0;
pqsi0->fcnt = n;
pqsi0->bcnt = 0;
pqsi1->pq = &q1;
pqsi1->fcnt = 0;
pqsi1->bcnt = 0;
pqso = pqsi1;
while(1){ // split q0
pqso->pq->push(pqsi0->pq->front());
pqsi0->pq->pop();
pqso->bcnt++;
pqsi0->fcnt--;
if(pqsi0->fcnt == 0)
break;
pqso = (pqso == pqsi0) ? pqsi1 : pqsi0;
}
pqsi0->fcnt = pqsi0->bcnt;
pqsi0->bcnt = 0;
pqsi1->fcnt = pqsi1->bcnt;
pqsi1->bcnt = 0;
s = 1;
while(s < n){ // merge sort
while(1){ // merge a pair of runs
pqsi0->rcnt = min(s, pqsi0->fcnt);
pqsi0->fcnt -= pqsi0->rcnt;
pqsi1->rcnt = min(s, pqsi1->fcnt);
pqsi1->fcnt -= pqsi1->rcnt;
if(pqsi0->rcnt == 0) // if end run 0
goto copy1; // copy rest of 1 and break
if(pqsi1->rcnt == 0) // if end run 1
goto copy0; // copy rest of 0 and break
while(1){ // merge an element
if(pqsi0->pq->front() <= pqsi1->pq->front()){ // if q0 <= q1
pqso->pq->push(pqsi0->pq->front()); // move q0
pqso->bcnt++;
pqsi0->pq->pop();
pqsi0->rcnt--;
if(pqsi0->rcnt != 0) // if not end run 0
continue; // continue back to while
copy1: do{ // else copy rest of run 1 and break
pqso->pq->push(pqsi1->pq->front());
pqso->bcnt++;
pqsi1->pq->pop();
pqsi1->rcnt--;
}while (pqsi1->rcnt);
break;
} else { // else q1 < q0
pqso->pq->push(pqsi1->pq->front()); // move q1
pqso->bcnt++;
pqsi1->pq->pop();
pqsi1->rcnt--;
if(pqsi1->rcnt != 0) // if not end run 1
continue; // continue back to while
copy0: do{ // else copy rest of run 0 and break
pqso->pq->push(pqsi0->pq->front());
pqso->bcnt++;
pqsi0->pq->pop();
pqsi0->rcnt--;
}while (pqsi0->rcnt);
break;
}
}
pqso = (pqso == pqsi0) ? pqsi1 : pqsi0; // setup for next pair
if(pqsi0->fcnt == 0 && pqsi1->fcnt == 0) // if end pass break
break;
}
pqsi0->fcnt = pqsi0->bcnt; // setup for next pass
pqsi0->bcnt = 0;
pqsi1->fcnt = pqsi1->bcnt;
pqsi1->bcnt = 0;
s *= 2;
}
if(pqsi0->fcnt == 0) // swap if needed
std::swap(q0, q1);
}
#define NUMELEM (1024*1024)
int main()
{
clock_t dwTimeStart; // clock values
clock_t dwTimeStop;
uint32_t r;
size_t i;
std::queue <uint32_t> q0;
std::queue <uint32_t> q1;
// generate data
for(i = 0; i < NUMELEM; i++ )
{
r = (((uint32_t)((rand()>>4) & 0xff))<< 0);
r += (((uint32_t)((rand()>>4) & 0xff))<< 8);
r += (((uint32_t)((rand()>>4) & 0xff))<<16);
r += (((uint32_t)((rand()>>4) & 0xff))<<24);
q0.push(r) ;
}
// sort data
dwTimeStart = clock();
msort2q(q0, q1, NUMELEM);
dwTimeStop = clock();
std::cout << "Number of ticks " << (dwTimeStop-dwTimeStart) << std::endl;
// check data
while(1){
r = q0.front();
q0.pop();
if(q0.size() == 0)
break;
if(r > q0.front()){
std::cout << "error" << std::endl;
break;
}
}
return 0;
}