I have particles moving in electrostatic potential stored as 3D grid (3D-texture) and I need to interpolate it smoothly to obtain smooth derivatives (forces). Native hardware interpolation is only trilinear, which is not good enough (derivatives are not smooth).
So I'm writing the software trilinear interpolation. It works like this:
- Read cube of 4x4x4 = 64 values form the texture memory
- interpolate each of the 16 lines along x-direction to obtain 4x4 set of points
- interpolate each of the 4 lines along y-direction to obtain 4 points
- interpolate the remaining 4 points along z-direction
As you can see the most time-consuming step is clearly reading the 64 values from global texture memory.
I know that when reading 3D-grid data from normal buffer like grid[ix+nx*(iy+ny*iz)]
then considerable performance boost can be achieved when reads close along fastest axis (i.e. x-axis) are grouped together, because these values are collocated int the same cache-line.
For example
int yz = nx*(ix+ny*iz);
float4 vals = (float4){ grid[ix+yz], grid[ix+1+yz],grid[ix+2+yz],grid[ix+3+yz] };
Now the questions are:
- Is something similar possible to do when using
image3d_t
in OpenCL (reading withread_imagef()
or some other function?) ? (I mean to do the subsequent reads in strategic order to collocate them in cash) - Or should I rather use buffer? I was thinking images are faster for random access then buffers, right ?
For completenes there is the OpenCL function, but it is not yet tested:
float8 hspline_basis( float x ){
// hermite spline with derivatives https://en.wikipedia.org/wiki/Cubic_Hermite_spline
float x2 = x*x;
float K = x2*(x - 1);
float d0 = K - x2 + x; // x3 - x2
float d1 = K ; // x3 - 2*x2 + x
return (float8){
// ------ values
d0*-0.5,
2*K - x2 + 1 - d1*-0.5, // 2*x3 - 3*x2 + 1
-2*K + x2 + d0* 0.5, // -2*x3 + 3*x2
d0*0.5,
// ----- derivatives
d0*-0.5,
2*K - d1*-0.5, // 6*x2 - 6*x
-2*K + d0* 0.5, // -6*x2 + 6*x
d0*0.5
};
}
float4 interpolate_tricubic( __read_only image3d_t im, float4 p0 ){
float4 dx=(float4){1.f,0.f,0.f,0.f};
float4 dy=(float4){0.f,1.f,0.f,0.f};
float4 dz=(float4){0.f,0.f,1.f,0.f};
float4 iu; float4 u = fract(p0,&iu);
float8 cx = hspline_basis(u.x);
float8 cy = hspline_basis(u.y);
float4 p;
p=iu -dz -dy; float2 S00= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu -dz ; float2 S01= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu -dz +dy; float2 S02= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu -dz+dy+dy; float2 S03= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
float3 S0 = S00.xyx*cy.s04.xxy + S01.xyx*cy.s15.xxy + S02.xyx*cy.s26.xxy + S03.xyx*cy.s37.xxy;
p=iu -dy; float2 S10= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu ; float2 S11= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu +dy; float2 S12= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu +dy+dy; float2 S13= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
float3 S1 = S10.xyx*cy.s04.xxy + S11.xyx*cy.s15.xxy + S12.xyx*cy.s26.xxy + S13.xyx*cy.s37.xxy;
p=iu +dz -dy; float2 S20= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu +dz ; float2 S21= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu +dz -dy; float2 S22= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu +dz+dy+dy; float2 S23= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
float3 S2 = S20.xyx*cy.s04.xxy + S21.xyx*cy.s15.xxy + S22.xyx*cy.s26.xxy + S23.xyx*cy.s37.xxy;
p=iu+dz+dz -dy; float2 S30= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu+dz+dz ; float2 S31= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu+dz+dz +dy; float2 S32= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu+dz+dz+dy+dy; float2 S33= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
float3 S3 = S30.xyx*cy.s04.xxy + S31.xyx*cy.s15.xxy + S32.xyx*cy.s26.xxy + S33.xyx*cy.s37.xxy;
float8 cz = hspline_basis(u.z);
return S0.xyzx*cz.s04.xxxy + S1.xyzx*cz.s15.xxxy + S2.xyzx*cz.s26.xxxy + S3.xyzx*cz.s37.xxxy;
}