The paper that you referenced is a previous work from my group. Since you asked about copyright, the code is available under GPL2 from our group webpage: http://padas.ices.utexas.edu/software
A new code (pvfmm, which I am working on) is also available for download from the above link and is available under GPL3. This one has a simpler and very efficient algorithm for balancing.
The algorithm is described in this paper: http://padas.ices.utexas.edu/static/papers/pvfmm.pdf
I have adapted the code below to remove distributed memory parallelism.
The function balanceOctree
implements the algorithm.
#include <vector>
#include <set>
#include <cstdlib>
#include <cmath>
#include <stdint.h>
#include <omp.h>
#include <algorithm>
#include <cstring>
#ifndef MAX_DEPTH
#define MAX_DEPTH 30
#endif
#if MAX_DEPTH < 7
#define UINT_T uint8_t
#define INT_T int8_t
#elif MAX_DEPTH < 15
#define UINT_T uint16_t
#define INT_T int16_t
#elif MAX_DEPTH < 31
#define UINT_T uint32_t
#define INT_T int32_t
#elif MAX_DEPTH < 63
#define UINT_T uint64_t
#define INT_T int64_t
#endif
namespace omp_par{
template <class T,class StrictWeakOrdering>
void merge(T A_,T A_last,T B_,T B_last,T C_,int p,StrictWeakOrdering comp){
typedef typename std::iterator_traits<T>::difference_type _DiffType;
typedef typename std::iterator_traits<T>::value_type _ValType;
_DiffType N1=A_last-A_;
_DiffType N2=B_last-B_;
if(N1==0 && N2==0) return;
if(N1==0 || N2==0){
_ValType* A=(N1==0? &B_[0]: &A_[0]);
_DiffType N=(N1==0? N2 : N1 );
#pragma omp parallel for
for(int i=0;i<p;i++){
_DiffType indx1=( i *N)/p;
_DiffType indx2=((i+1)*N)/p;
memcpy(&C_[indx1], &A[indx1], (indx2-indx1)*sizeof(_ValType));
}
return;
}
//Split both arrays ( A and B ) into n equal parts.
//Find the position of each split in the final merged array.
int n=10;
_ValType* split=new _ValType[p*n*2];
_DiffType* split_size=new _DiffType[p*n*2];
#pragma omp parallel for
for(int i=0;i<p;i++){
for(int j=0;j<n;j++){
int indx=i*n+j;
_DiffType indx1=(indx*N1)/(p*n);
split [indx]=A_[indx1];
split_size[indx]=indx1+(std::lower_bound(B_,B_last,split[indx],comp)-B_);
indx1=(indx*N2)/(p*n);
indx+=p*n;
split [indx]=B_[indx1];
split_size[indx]=indx1+(std::lower_bound(A_,A_last,split[indx],comp)-A_);
}
}
//Find the closest split position for each thread that will
//divide the final array equally between the threads.
_DiffType* split_indx_A=new _DiffType[p+1];
_DiffType* split_indx_B=new _DiffType[p+1];
split_indx_A[0]=0;
split_indx_B[0]=0;
split_indx_A[p]=N1;
split_indx_B[p]=N2;
#pragma omp parallel for
for(int i=1;i<p;i++){
_DiffType req_size=(i*(N1+N2))/p;
int j=std::lower_bound(&split_size[0],&split_size[p*n],req_size,std::less<_DiffType>())-&split_size[0];
if(j>=p*n)
j=p*n-1;
_ValType split1 =split [j];
_DiffType split_size1=split_size[j];
j=(std::lower_bound(&split_size[p*n],&split_size[p*n*2],req_size,std::less<_DiffType>())-&split_size[p*n])+p*n;
if(j>=2*p*n)
j=2*p*n-1;
if(abs(split_size[j]-req_size)<abs(split_size1-req_size)){
split1 =split [j];
split_size1=split_size[j];
}
split_indx_A[i]=std::lower_bound(A_,A_last,split1,comp)-A_;
split_indx_B[i]=std::lower_bound(B_,B_last,split1,comp)-B_;
}
delete[] split;
delete[] split_size;
//Merge for each thread independently.
#pragma omp parallel for
for(int i=0;i<p;i++){
T C=C_+split_indx_A[i]+split_indx_B[i];
std::merge(A_+split_indx_A[i],A_+split_indx_A[i+1],B_+split_indx_B[i],B_+split_indx_B[i+1],C,comp);
}
delete[] split_indx_A;
delete[] split_indx_B;
}
template <class T,class StrictWeakOrdering>
void merge_sort(T A,T A_last,StrictWeakOrdering comp){
typedef typename std::iterator_traits<T>::difference_type _DiffType;
typedef typename std::iterator_traits<T>::value_type _ValType;
int p=omp_get_max_threads();
_DiffType N=A_last-A;
if(N<2*p){
std::sort(A,A_last,comp);
return;
}
//Split the array A into p equal parts.
_DiffType* split=new _DiffType[p+1];
split[p]=N;
#pragma omp parallel for
for(int id=0;id<p;id++){
split[id]=(id*N)/p;
}
//Sort each part independently.
#pragma omp parallel for
for(int id=0;id<p;id++){
std::sort(A+split[id],A+split[id+1],comp);
}
//Merge two parts at a time.
_ValType* B=new _ValType[N];
_ValType* A_=&A[0];
_ValType* B_=&B[0];
for(int j=1;j<p;j=j*2){
for(int i=0;i<p;i=i+2*j){
if(i+j<p){
merge(A_+split[i],A_+split[i+j],A_+split[i+j],A_+split[(i+2*j<=p?i+2*j:p)],B_+split[i],p,comp);
}else{
#pragma omp parallel for
for(int k=split[i];k<split[p];k++)
B_[k]=A_[k];
}
}
_ValType* tmp_swap=A_;
A_=B_;
B_=tmp_swap;
}
//The final result should be in A.
if(A_!=&A[0]){
#pragma omp parallel for
for(int i=0;i<N;i++)
A[i]=A_[i];
}
//Free memory.
delete[] split;
delete[] B;
}
template <class T>
void merge_sort(T A,T A_last){
typedef typename std::iterator_traits<T>::value_type _ValType;
merge_sort(A,A_last,std::less<_ValType>());
}
template <class T, class I>
T reduce(T* A, I cnt){
T sum=0;
#pragma omp parallel for reduction(+:sum)
for(I i = 0; i < cnt; i++)
sum+=A[i];
return sum;
}
template <class T, class I>
void scan(T* A, T* B,I cnt){
int p=omp_get_max_threads();
if(cnt<(I)100*p){
for(I i=1;i<cnt;i++)
B[i]=B[i-1]+A[i-1];
return;
}
I step_size=cnt/p;
#pragma omp parallel for
for(int i=0; i<p; i++){
int start=i*step_size;
int end=start+step_size;
if(i==p-1) end=cnt;
if(i!=0)B[start]=0;
for(I j=(I)start+1; j<(I)end; j++)
B[j]=B[j-1]+A[j-1];
}
T* sum=new T[p];
sum[0]=0;
for(int i=1;i<p;i++)
sum[i]=sum[i-1]+B[i*step_size-1]+A[i*step_size-1];
#pragma omp parallel for
for(int i=1; i<p; i++){
int start=i*step_size;
int end=start+step_size;
if(i==p-1) end=cnt;
T sum_=sum[i];
for(I j=(I)start; j<(I)end; j++)
B[j]+=sum_;
}
delete[] sum;
}
}
class MortonId{
public:
MortonId();
MortonId(MortonId m, unsigned char depth);
template <class T>
MortonId(T x_f,T y_f, T z_f, unsigned char depth=MAX_DEPTH);
template <class T>
MortonId(T* coord, unsigned char depth=MAX_DEPTH);
unsigned int GetDepth() const;
template <class T>
void GetCoord(T* coord);
MortonId NextId() const;
MortonId getAncestor(unsigned char ancestor_level) const;
/**
* \brief Returns the deepest first descendant.
*/
MortonId getDFD(unsigned char level=MAX_DEPTH) const;
void NbrList(std::vector<MortonId>& nbrs,unsigned char level, int periodic) const;
std::vector<MortonId> Children() const;
int operator<(const MortonId& m) const;
int operator>(const MortonId& m) const;
int operator==(const MortonId& m) const;
int operator!=(const MortonId& m) const;
int operator<=(const MortonId& m) const;
int operator>=(const MortonId& m) const;
int isAncestor(MortonId const & other) const;
private:
UINT_T x,y,z;
unsigned char depth;
};
inline MortonId::MortonId():x(0), y(0), z(0), depth(0){}
inline MortonId::MortonId(MortonId m, unsigned char depth_):x(m.x), y(m.y), z(m.z), depth(depth_){}
template <class T>
inline MortonId::MortonId(T x_f,T y_f, T z_f, unsigned char depth_): depth(depth_){
UINT_T max_int=((UINT_T)1)<<(MAX_DEPTH);
x=(UINT_T)floor(x_f*max_int);
y=(UINT_T)floor(y_f*max_int);
z=(UINT_T)floor(z_f*max_int);
}
template <class T>
inline MortonId::MortonId(T* coord, unsigned char depth_): depth(depth_){
UINT_T max_int=((UINT_T)1)<<(MAX_DEPTH);
x=(UINT_T)floor(coord[0]*max_int);
y=(UINT_T)floor(coord[1]*max_int);
z=(UINT_T)floor(coord[2]*max_int);
}
template <class T>
inline void MortonId::GetCoord(T* coord){
UINT_T max_int=((UINT_T)1)<<(MAX_DEPTH);
T s=1.0/((T)max_int);
coord[0]=x*s;
coord[1]=y*s;
coord[2]=z*s;
}
inline unsigned int MortonId::GetDepth() const{
return depth;
}
inline MortonId MortonId::NextId() const{
MortonId m=*this;
UINT_T mask=((UINT_T)1)<<(MAX_DEPTH-depth);
int i;
for(i=depth;i>=0;i--){
m.x=(m.x^mask);
if((m.x & mask))
break;
m.y=(m.y^mask);
if((m.y & mask))
break;
m.z=(m.z^mask);
if((m.z & mask))
break;
mask=(mask<<1);
}
m.depth=i;
return m;
}
inline MortonId MortonId::getAncestor(unsigned char ancestor_level) const{
MortonId m=*this;
m.depth=ancestor_level;
UINT_T mask=(((UINT_T)1)<<(MAX_DEPTH))-(((UINT_T)1)<<(MAX_DEPTH-ancestor_level));
m.x=(m.x & mask);
m.y=(m.y & mask);
m.z=(m.z & mask);
return m;
}
inline MortonId MortonId::getDFD(unsigned char level) const{
MortonId m=*this;
m.depth=level;
return m;
}
inline int MortonId::operator<(const MortonId& m) const{
if(x==m.x && y==m.y && z==m.z) return depth<m.depth;
UINT_T x_=(x^m.x);
UINT_T y_=(y^m.y);
UINT_T z_=(z^m.z);
if((z_>x_ || ((z_^x_)<x_ && (z_^x_)<z_)) && (z_>y_ || ((z_^y_)<y_ && (z_^y_)<z_)))
return z<m.z;
if(y_>x_ || ((y_^x_)<x_ && (y_^x_)<y_))
return y<m.y;
return x<m.x;
}
inline int MortonId::operator>(const MortonId& m) const{
if(x==m.x && y==m.y && z==m.z) return depth>m.depth;
UINT_T x_=(x^m.x);
UINT_T y_=(y^m.y);
UINT_T z_=(z^m.z);
if((z_>x_ || ((z_^x_)<x_ && (z_^x_)<z_)) && (z_>y_ || ((z_^y_)<y_ && (z_^y_)<z_)))
return z>m.z;
if((y_>x_ || ((y_^x_)<x_ && (y_^x_)<y_)))
return y>m.y;
return x>m.x;
}
inline int MortonId::operator==(const MortonId& m) const{
return (x==m.x && y==m.y && z==m.z && depth==m.depth);
}
inline int MortonId::operator!=(const MortonId& m) const{
return !(*this==m);
}
inline int MortonId::operator<=(const MortonId& m) const{
return !(*this>m);
}
inline int MortonId::operator>=(const MortonId& m) const{
return !(*this<m);
}
inline int MortonId::isAncestor(MortonId const & other) const {
return other.depth>depth && other.getAncestor(depth)==*this;
}
void MortonId::NbrList(std::vector<MortonId>& nbrs, unsigned char level, int periodic) const{
nbrs.clear();
static int dim=3;
static int nbr_cnt=(int)(pow(3.0,dim)+0.5);
static UINT_T maxCoord=(((UINT_T)1)<<(MAX_DEPTH));
UINT_T mask=maxCoord-(((UINT_T)1)<<(MAX_DEPTH-level));
UINT_T pX=x & mask;
UINT_T pY=y & mask;
UINT_T pZ=z & mask;
MortonId mid_tmp;
mask=(((UINT_T)1)<<(MAX_DEPTH-level));
for(int i=0; i<nbr_cnt; i++){
INT_T dX = ((i/1)%3-1)*mask;
INT_T dY = ((i/3)%3-1)*mask;
INT_T dZ = ((i/9)%3-1)*mask;
INT_T newX=(INT_T)pX+dX;
INT_T newY=(INT_T)pY+dY;
INT_T newZ=(INT_T)pZ+dZ;
if(!periodic){
if(newX>=0 && newX<(INT_T)maxCoord)
if(newY>=0 && newY<(INT_T)maxCoord)
if(newZ>=0 && newZ<(INT_T)maxCoord){
mid_tmp.x=newX; mid_tmp.y=newY; mid_tmp.z=newZ;
mid_tmp.depth=level;
nbrs.push_back(mid_tmp);
}
}else{
if(newX<0) newX+=maxCoord; if(newX>=(INT_T)maxCoord) newX-=maxCoord;
if(newY<0) newY+=maxCoord; if(newY>=(INT_T)maxCoord) newY-=maxCoord;
if(newZ<0) newZ+=maxCoord; if(newZ>=(INT_T)maxCoord) newZ-=maxCoord;
mid_tmp.x=newX; mid_tmp.y=newY; mid_tmp.z=newZ;
mid_tmp.depth=level;
nbrs.push_back(mid_tmp);
}
}
}
std::vector<MortonId> MortonId::Children() const{
static int dim=3;
static int c_cnt=(1UL<<dim);
static UINT_T maxCoord=(((UINT_T)1)<<(MAX_DEPTH));
std::vector<MortonId> child(c_cnt);
UINT_T mask=maxCoord-(((UINT_T)1)<<(MAX_DEPTH-depth));
UINT_T pX=x & mask;
UINT_T pY=y & mask;
UINT_T pZ=z & mask;
mask=(((UINT_T)1)<<(MAX_DEPTH-(depth+1)));
for(int i=0; i<c_cnt; i++){
child[i].x=pX+mask*((i/1)%2);
child[i].y=pY+mask*((i/2)%2);
child[i].z=pZ+mask*((i/4)%2);
child[i].depth=depth+1;
}
return child;
}
//list must be sorted.
void lineariseList(std::vector<MortonId> & list) {
if(!list.empty()) {// Remove duplicates and ancestors.
std::vector<MortonId> tmp;
for(unsigned int i = 0; i < (list.size()-1); i++) {
if( (!(list[i].isAncestor(list[i+1]))) && (list[i] != list[i+1]) ) {
tmp.push_back(list[i]);
}
}
tmp.push_back(list[list.size()-1]);
list.swap(tmp);
}
}
void balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &out,
unsigned int maxDepth, bool periodic) {
unsigned int dim=3;
int omp_p=omp_get_max_threads();
if(in.size()==1){
out=in;
return 0;
}
//Build level-by-level set of nodes.
std::vector<std::set<MortonId> > nodes((maxDepth+1)*omp_p);
#pragma omp parallel for
for(int p=0;p<omp_p;p++){
size_t a=( p *in.size())/omp_p;
size_t b=((p+1)*in.size())/omp_p;
for(size_t i=a;i<b;){
size_t d=in[i].GetDepth();
if(d==0){i++; continue;}
MortonId pnode=in[i].getAncestor(d-1);
nodes[d-1+(maxDepth+1)*p].insert(pnode);
while(i<b && d==in[i].GetDepth() && pnode==in[i].getAncestor(d-1)) i++;
}
//Add new nodes level-by-level.
std::vector<MortonId> nbrs;
unsigned int num_chld=1UL<<dim;
for(unsigned int l=maxDepth;l>=1;l--){
//Build set of parents of balancing nodes.
std::set<MortonId> nbrs_parent;
std::set<MortonId>::iterator start=nodes[l+(maxDepth+1)*p].begin();
std::set<MortonId>::iterator end =nodes[l+(maxDepth+1)*p].end();
for(std::set<MortonId>::iterator node=start; node != end;){
node->NbrList(nbrs, l, periodic);
int nbr_cnt=nbrs.size();
for(int i=0;i<nbr_cnt;i++)
nbrs_parent.insert(nbrs[i].getAncestor(l-1));
node++;
}
//Get the balancing nodes.
std::set<MortonId>& ancestor_nodes=nodes[l-1+(maxDepth+1)*p];
start=nbrs_parent.begin();
end =nbrs_parent.end();
ancestor_nodes.insert(start,end);
}
//Remove ancestors nodes. (optional)
for(unsigned int l=1;l<=maxDepth;l++){
std::set<MortonId>::iterator start=nodes[l +(maxDepth+1)*p].begin();
std::set<MortonId>::iterator end =nodes[l +(maxDepth+1)*p].end();
std::set<MortonId>& ancestor_nodes=nodes[l-1+(maxDepth+1)*p];
for(std::set<MortonId>::iterator node=start; node != end; node++){
MortonId parent=node->getAncestor(node->GetDepth()-1);
ancestor_nodes.erase(parent);
}
}
}
//Resize out.
std::vector<size_t> node_cnt(omp_p,0);
std::vector<size_t> node_dsp(omp_p,0);
#pragma omp parallel for
for(int i=0;i<omp_p;i++){
for(unsigned int j=0;j<=maxDepth;j++)
node_cnt[i]+=nodes[j+i*(maxDepth+1)].size();
}
omp_par::scan(&node_cnt[0],&node_dsp[0], omp_p);
out.resize(node_cnt[omp_p-1]+node_dsp[omp_p-1]);
//Copy leaf nodes to out.
#pragma omp parallel for
for(int p=0;p<omp_p;p++){
size_t node_iter=node_dsp[p];
for(unsigned int l=0;l<=maxDepth;l++){
std::set<MortonId>::iterator start=nodes[l +(maxDepth+1)*p].begin();
std::set<MortonId>::iterator end =nodes[l +(maxDepth+1)*p].end();
for(std::set<MortonId>::iterator node=start; node != end; node++)
out[node_iter++]=*node;
}
}
//Sort, Linearise, Redistribute.
omp_par::merge_sort(&out[0], &out[out.size()]);
lineariseList(out);
{ // Add children
MortonId nxt_mid(0,0,0,0);
std::vector<MortonId> out1;
std::vector<MortonId> children;
for(size_t i=0;i<out.size();i++){
while(nxt_mid.getDFD()<out[i]){
while(nxt_mid.isAncestor(out[i])){
nxt_mid=nxt_mid.getAncestor(nxt_mid.GetDepth()+1);
}
out1.push_back(nxt_mid);
nxt_mid=nxt_mid.NextId();
}
children=out[i].Children();
for(size_t j=0;j<8;j++){
out1.push_back(children[j]);
}
nxt_mid=out[i].NextId();
}
while(nxt_mid.GetDepth()>0){
out1.push_back(nxt_mid);
nxt_mid=nxt_mid.NextId();
}
out.swap(out1);
}
}
The above code is optimized for performance, and OpenMP further complicates things, but the basic idea is this:
- Build a level-by-level set of tree nodes (MortonIds).
- Loop over the levels from the finest level to the root level.
- At each level, loop over all tree node at that level and compute MortonId for its parent's neighbours. Add these to the set of tree nodes at the coarser level (making sure not to add duplicate nodes).
- Finally, take all the tree nodes (from all levels), and sort them by their MortonId.
- Remove ancestor nodes to obtain the balanced linear octree.