I tried to implement Fortune's algorithm for Voronoi diagram generation on the plane, using native STL trees with transparent comparators. Almost all the bugs have been found by now, and the asymptotic complexity of the implementation is O(N * log(N)). Now I want to try some optimizations. From the profiling results (I use pprof from Google Performance Tools) I conclude that there are two really hot functions, and both of them express quite idiomatic math algorithms: calculating the ordinate of the intersection of two parabolic arcs for a given common directrix, and calculating the center and radius of the circle circumscribed about 3 points. The second one is of interest here. The circumscribed circle of 3 points (sites, in Voronoi diagram terms) defines a Voronoi diagram vertex and a so-called "circle event" in terms of Fortune's algorithm. The function for finding a vertex (based on these formulae, which are restated after the code for reference) is below:
#include <experimental/optional>
#include <cmath>
#include <immintrin.h>

using value_type = double;

struct alignas(__m128d) point
{
    value_type x, y;
};
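// Illustrative sanity checks (not part of my original code): the (__m128d *)
// casts in the intrinsics branch below rely on point being exactly a 16-byte,
// 16-byte-aligned pair of doubles.
static_assert(sizeof(point) == sizeof(__m128d), "point must match __m128d in size");
static_assert(alignof(point) == alignof(__m128d), "point must match __m128d in alignment");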
// vertex (defined elsewhere) pairs the circumcenter {x, y} with the circumradius R;
// eps is a tolerance defined elsewhere
std::experimental::optional< vertex >
make_vertex(point const & a,
            point const & b,
            point const & c)
{
#if 1
    // A, B, C are the squared distances of the three sites from the origin
    value_type const A = a.x * a.x + a.y * a.y;
    value_type const B = b.x * b.x + b.y * b.y;
    value_type const C = c.x * c.x + c.y * c.y;
    point const ca = {a.x - c.x, a.y - c.y};
    point const cb = {b.x - c.x, b.y - c.y};
    value_type const CA = A - C;
    value_type const CB = B - C;
    value_type x = CA * cb.y - CB * ca.y;
    value_type y = ca.x * CB - cb.x * CA;
    // cross product of (a - c) and (b - c): its sign encodes the orientation of
    // the triple, and (nearly) collinear triples are rejected
    value_type alpha = ca.x * cb.y - ca.y * cb.x;
    if (!(eps < -alpha)) {
        return {};
    }
    value_type beta = a.x * (b.y * C - c.y * B) - b.x * (a.y * C - c.y * A) + c.x * (a.y * B - b.y * A);
    beta /= alpha;
    alpha += alpha;
    x /= alpha;
    y /= alpha;
    using std::sqrt;
    value_type const R = sqrt(beta + x * x + y * y);
    return {{{x, y}, R}};
#else
    __m256d a_ = _mm256_broadcast_pd((__m128d *)&a);
    __m256d b_ = _mm256_broadcast_pd((__m128d *)&b);
    __m256d c_ = _mm256_broadcast_pd((__m128d *)&c);
    __m256d A = _mm256_mul_pd(a_, a_);
    A = _mm256_hadd_pd(A, A);
    __m256d B = _mm256_mul_pd(b_, b_);
    B = _mm256_hadd_pd(B, B);
    __m256d C = _mm256_mul_pd(c_, c_);
    C = _mm256_hadd_pd(C, C);
    __m256d byayaxbx = _mm256_permute_pd(_mm256_shuffle_pd(_mm256_sub_pd(a_, c_), _mm256_sub_pd(b_, c_), 0b0011), 0b1001);
    __m256d ABBA = _mm256_permute_pd(_mm256_sub_pd(_mm256_shuffle_pd(A, B, 0), C), 0b0110);
    __m256d xxyy = _mm256_mul_pd(byayaxbx, ABBA);
    xxyy = _mm256_hsub_pd(xxyy, xxyy);
    __m256d xyxy = _mm256_shuffle_pd(xxyy, _mm256_permute2f128_pd(xxyy, xxyy, 0x01), 0);
    __m256d alpha = _mm256_mul_pd(byayaxbx, _mm256_permute2f128_pd(byayaxbx, byayaxbx, 0x01));
    alpha = _mm256_hsub_pd(alpha, alpha);
    if (!(alpha[0] < -eps)) {
        return {};
    }
    __m256d tmp1 = _mm256_mul_pd(_mm256_permute_pd(a_, 0b1001), _mm256_permute2f128_pd(c_, _mm256_permute_pd(b_, 0b01), 0x20));
    __m256d tmp2 = _mm256_mul_pd(b_, _mm256_permute_pd(c_, 0b01));
    __m256d bacc = _mm256_permute_pd(_mm256_hsub_pd(_mm256_mul_pd(tmp1, _mm256_permute2f128_pd(B, C, 0x20)), _mm256_mul_pd(tmp2, A)), 0b0010);
    bacc = _mm256_div_pd(bacc, alpha);
    xyxy = _mm256_div_pd(xyxy, _mm256_add_pd(alpha, alpha));
    __m256d beta = _mm256_hadd_pd(bacc, _mm256_mul_pd(xyxy, xyxy));
    beta = _mm256_hadd_pd(beta, beta);
    beta = _mm256_sqrt_pd(_mm256_add_pd(_mm256_permute2f128_pd(bacc, bacc, 0x01), beta));
    return {{{xyxy[0], xyxy[1]}, beta[0]}};
#endif
}
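For reference, written out explicitly, the scalar branch computes the standard circumcircle formulae below (A, B, C, alpha, beta, x, y and R match the variable names above):

$$A = a_x^2 + a_y^2, \qquad B = b_x^2 + b_y^2, \qquad C = c_x^2 + c_y^2,$$
$$\alpha = (a_x - c_x)(b_y - c_y) - (a_y - c_y)(b_x - c_x), \qquad
\beta = \frac{1}{\alpha} \begin{vmatrix} a_x & a_y & A \\ b_x & b_y & B \\ c_x & c_y & C \end{vmatrix},$$
$$x = \frac{(A - C)(b_y - c_y) - (B - C)(a_y - c_y)}{2\alpha}, \qquad
y = \frac{(a_x - c_x)(B - C) - (b_x - c_x)(A - C)}{2\alpha}, \qquad
R = \sqrt{\beta + x^2 + y^2}.$$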
As you can see, I made an attempt to reimplement this function using intrinsics for Intel AVX instructions. Both the #if 1 and #if 0 parts are exactly equivalent and correct. The #if 0 version uses the full width of the ymm registers most of the time; only at the very end do the upper halves of some variables contain garbage.
I ran a couple of tests on a huge input (100000 points uniformly distributed over a bounded region of the plane): one with #if 1 and another with #if 0 (a reduced sketch of the benchmark setup is given below, after the disassembly). The first one gives a better runtime (0.28 seconds vs 0.31 seconds, averaged over many runs). Here is the disassembly for #if 1:
push %rbx
sub $0x10,%rsp
mov %rdi,%rbx
319 [1] value_type const A = a.x * a.x + a.y * a.y;
vmovsd (%rdx),%xmm11
vmovsd 0x8(%rdx),%xmm0
320 [1] value_type const B = b.x * b.x + b.y * b.y;
vmovsd (%rcx),%xmm9
vmovsd 0x8(%rcx),%xmm2
321 [1] value_type const C = c.x * c.x + c.y * c.y;
vmovsd (%r8),%xmm8
vmovsd 0x8(%r8),%xmm6
322 [1] point const ca = {a.x - c.x, a.y - c.y};
vunpcklpd %xmm9,%xmm0,%xmm1
vunpcklpd %xmm8,%xmm6,%xmm5
vsubpd %xmm5,%xmm1,%xmm7
323 [1] point const cb = {b.x - c.x, b.y - c.y};
vunpcklpd %xmm11,%xmm2,%xmm1
vsubpd %xmm5,%xmm1,%xmm1
328 [1] value_type alpha = ca.x * cb.y - ca.y * cb.x;
vpermilpd $0x1,%xmm1,%xmm5
vmulsd %xmm1,%xmm5,%xmm5
vpermilpd $0x1,%xmm7,%xmm3
vmulsd %xmm7,%xmm3,%xmm3
vsubsd %xmm3,%xmm5,%xmm12
329 [1] if (!(eps < -alpha)) {
mov (%rsi),%rax
vxorpd 0x1cd1(%rip),%xmm12,%xmm3 # 0x40e680
vucomisd (%rax),%xmm3
jbe 0x40ca85 <make_vertex()+309>
319 [2] value_type const A = a.x * a.x + a.y * a.y;
vunpcklpd %xmm9,%xmm11,%xmm3
vmulpd %xmm3,%xmm3,%xmm10
vunpcklpd %xmm2,%xmm0,%xmm3
vmulpd %xmm3,%xmm3,%xmm3
vaddpd %xmm3,%xmm10,%xmm3
321 [2] value_type const C = c.x * c.x + c.y * c.y;
vmulsd %xmm8,%xmm8,%xmm10
vmulsd %xmm6,%xmm6,%xmm4
vaddsd %xmm4,%xmm10,%xmm4
324 [1] value_type const CA = A - C;
vmovddup %xmm4,%xmm5
vsubpd %xmm5,%xmm3,%xmm5
326 [1] value_type x = CA * cb.y - CB * ca.y;
vmulpd %xmm5,%xmm1,%xmm1
vpermilpd $0x1,%xmm5,%xmm5
vmulpd %xmm5,%xmm7,%xmm5
vsubpd %xmm5,%xmm1,%xmm10
332 [1] value_type beta = a.x * (b.y * C - c.y * B) + b.x * (c.y * A - a.y * C) + c.x * (a.y * B - b.y * A);
vmulsd %xmm4,%xmm2,%xmm5
vpermilpd $0x1,%xmm3,%xmm7
vmulsd %xmm7,%xmm6,%xmm1
vsubsd %xmm1,%xmm5,%xmm1
vmulsd %xmm1,%xmm11,%xmm1
vmulsd %xmm6,%xmm3,%xmm5
vmulsd %xmm4,%xmm0,%xmm4
vsubsd %xmm4,%xmm5,%xmm4
vmulsd %xmm4,%xmm9,%xmm4
vaddsd %xmm4,%xmm1,%xmm1
vmulsd %xmm7,%xmm0,%xmm0
vmulsd %xmm3,%xmm2,%xmm2
vsubsd %xmm2,%xmm0,%xmm0
vmulsd %xmm0,%xmm8,%xmm0
vaddsd %xmm1,%xmm0,%xmm0
333 [1] beta /= alpha;
vdivsd %xmm12,%xmm0,%xmm0
334 [1] alpha += alpha;
vaddsd %xmm12,%xmm12,%xmm1
335 [1] x /= alpha;
vmovddup %xmm1,%xmm1
vdivpd %xmm1,%xmm10,%xmm2
338 [1] value_type const R = sqrt(beta + x * x + y * y);
vmulsd %xmm2,%xmm2,%xmm1
vaddsd %xmm0,%xmm1,%xmm0
vpermilpd $0x1,%xmm2,%xmm1
vmulsd %xmm1,%xmm1,%xmm1
vaddsd %xmm0,%xmm1,%xmm1
vsqrtsd %xmm1,%xmm1,%xmm0
370 [1] }
mov %rbx,%rax
add $0x10,%rsp
pop %rbx
retq
and for #if 0:
317 [1] {
vmovupd (%rdx),%xmm2
vmovupd (%rcx),%xmm1
vmovupd (%r8),%xmm4
350 [1] __m256d byayaxbx = _mm256_permute_pd(_mm256_shuffle_pd(_mm256_sub_pd(a_, c_), _mm256_sub_pd(b_, c_), 0b0011), 0b1001);
vsubpd %xmm4,%xmm2,%xmm0
vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
vsubpd %xmm4,%xmm1,%xmm3
vinsertf128 $0x1,%xmm3,%ymm3,%ymm3
vpermilpd $0x1,%ymm3,%ymm3
vblendpd $0x6,%ymm0,%ymm3,%ymm5
355 [1] __m256d alpha = _mm256_mul_pd(byayaxbx, _mm256_permute2f128_pd(byayaxbx, byayaxbx, 0x01));
vperm2f128 $0x1,%ymm0,%ymm5,%ymm0
vmulpd %ymm0,%ymm5,%ymm0
356 [1] alpha = _mm256_hsub_pd(alpha, alpha);
vhsubpd %ymm0,%ymm0,%ymm0
357 [1] if (!(alpha[0] < -eps)) {
mov (%rsi),%rax
vmovsd (%rax),%xmm3
vxorpd 0x1cc6(%rip),%xmm3,%xmm3 # 0x40e660
vucomisd %xmm0,%xmm3
jbe 0x40ca6d <make_vertex()+285>
343 [1] __m256d c_ = _mm256_broadcast_pd((__m128d *)&c);
vinsertf128 $0x1,%xmm4,%ymm4,%ymm6
344 [1] __m256d A = _mm256_mul_pd(a_, a_);
vmulpd %xmm2,%xmm2,%xmm3
vinsertf128 $0x1,%xmm3,%ymm3,%ymm3
345 [1] A = _mm256_hadd_pd(A, A);
vhaddpd %ymm3,%ymm3,%ymm3
346 [1] __m256d B = _mm256_mul_pd(b_, b_);
vmulpd %xmm1,%xmm1,%xmm7
vinsertf128 $0x1,%xmm7,%ymm7,%ymm7
347 [1] B = _mm256_hadd_pd(B, B);
vhaddpd %ymm7,%ymm7,%ymm7
348 [1] __m256d C = _mm256_mul_pd(c_, c_);
vmulpd %xmm4,%xmm4,%xmm4
vinsertf128 $0x1,%xmm4,%ymm4,%ymm4
349 [1] C = _mm256_hadd_pd(C, C);
vhaddpd %ymm4,%ymm4,%ymm4
351 [1] __m256d ABBA = _mm256_permute_pd(_mm256_sub_pd(_mm256_shuffle_pd(A, B, 0), C), 0b0110);
vunpcklpd %ymm7,%ymm3,%ymm8
vsubpd %ymm4,%ymm8,%ymm8
vpermilpd $0x6,%ymm8,%ymm8
352 [1] __m256d xxyy = _mm256_mul_pd(byayaxbx, ABBA);
vmulpd %ymm8,%ymm5,%ymm5
353 [1] xxyy = _mm256_hsub_pd(xxyy, xxyy);
vhsubpd %ymm5,%ymm5,%ymm5
342 [1] __m256d b_ = _mm256_broadcast_pd((__m128d *)&b);
vinsertf128 $0x1,%xmm1,%ymm1,%ymm8
341 [1] __m256d a_ = _mm256_broadcast_pd((__m128d *)&a);
vinsertf128 $0x1,%xmm2,%ymm2,%ymm2
354 [1] __m256d xyxy = _mm256_shuffle_pd(xxyy, _mm256_permute2f128_pd(xxyy, xxyy, 0x01), 0);
vperm2f128 $0x23,%ymm5,%ymm0,%ymm9
vunpcklpd %ymm9,%ymm5,%ymm5
360 [1] __m256d tmp1 = _mm256_mul_pd(_mm256_permute_pd(a_, 0b1001), _mm256_permute2f128_pd(c_, _mm256_permute_pd(b_, 0b01), 0x20));
vpermilpd $0x9,%ymm2,%ymm2
vpermilpd $0x1,%xmm1,%xmm1
vinsertf128 $0x1,%xmm1,%ymm6,%ymm1
vmulpd %ymm1,%ymm2,%ymm1
361 [1] __m256d tmp2 = _mm256_mul_pd(b_, _mm256_permute_pd(c_, 0b01));
vpermilpd $0x1,%ymm6,%ymm2
vmulpd %ymm2,%ymm8,%ymm2
362 [1] __m256d bacc = _mm256_permute_pd(_mm256_hsub_pd(_mm256_mul_pd(tmp1, _mm256_permute2f128_pd(B, C, 0x20)), _mm256_mul_pd(tmp2, A)), 0b0010);
vinsertf128 $0x1,%xmm4,%ymm7,%ymm4
vmulpd %ymm4,%ymm1,%ymm1
vmulpd %ymm2,%ymm3,%ymm2
vhsubpd %ymm2,%ymm1,%ymm1
vpermilpd $0x2,%ymm1,%ymm1
363 [1] bacc = _mm256_div_pd(bacc, alpha);
vdivpd %ymm0,%ymm1,%ymm1
364 [1] xyxy = _mm256_div_pd(xyxy, _mm256_add_pd(alpha, alpha));
vaddpd %ymm0,%ymm0,%ymm0
vdivpd %ymm0,%ymm5,%ymm0
365 [1] __m256d beta = _mm256_hadd_pd(bacc, _mm256_mul_pd(xyxy, xyxy));
vmulpd %ymm0,%ymm0,%ymm2
vhaddpd %ymm2,%ymm1,%ymm2
366 [1] beta = _mm256_hadd_pd(beta, beta);
vhaddpd %ymm2,%ymm2,%ymm2
367 [1] beta = _mm256_sqrt_pd(_mm256_add_pd(_mm256_permute2f128_pd(bacc, bacc, 0x01), beta));
vperm2f128 $0x1,%ymm0,%ymm1,%ymm1
vaddpd %ymm1,%ymm2,%ymm1
vsqrtpd %ymm1,%ymm1
370 [1] }
mov %rdi,%rax
vzeroupper
retq
I use clang version 4.0.0 (trunk 282862); the crucial command-line flags are -O3 -march=native -mavx, and the CPU is an Intel(R) Core(TM) i7-2670QM.
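For completeness, my test setup is roughly the following sketch, reduced to exercise make_vertex alone rather than the whole sweep; the seed, the bounding box, and the way triples are picked are illustrative, not my exact test:

// Reduced benchmark sketch (illustrative): generates 100000 uniformly
// distributed points and times make_vertex over consecutive triples.
// Assumes point, value_type, eps and make_vertex from the snippet above are in scope.
#include <chrono>
#include <cstdio>
#include <random>
#include <vector>

int main()
{
    std::mt19937_64 gen(42); // fixed seed, illustrative
    std::uniform_real_distribution< value_type > coord(-1000.0, 1000.0); // bounding box, illustrative
    std::vector< point > points(100000);
    for (point & p : points) {
        p = {coord(gen), coord(gen)};
    }
    std::size_t vertices = 0;
    auto const start = std::chrono::steady_clock::now();
    for (std::size_t i = 2; i < points.size(); ++i) {
        if (make_vertex(points[i - 2], points[i - 1], points[i])) {
            ++vertices; // count accepted triples so the calls are not optimized away
        }
    }
    auto const stop = std::chrono::steady_clock::now();
    std::chrono::duration< double > const elapsed = stop - start;
    std::printf("%zu vertices in %.3f s\n", vertices, elapsed.count());
}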
What are my main misconceptions here?
Should I use inline assembly instead for better control?