
I have implemented Fortune's algorithm for generating Voronoi diagrams in the plane, using native STL trees with transparent comparators. Almost all the bugs have been found by now, and the asymptotic complexity of the implementation is O(N * log(N)). Now I want to try some optimizations. From the profiling results (I use pprof from Google Performance Tools) I conclude that there are two really hot functions, and both of them express quite idiomatic math algorithms: calculating the ordinate of the intersection of two parabolic arcs for a given common directrix, and calculating the center and radius of the circle circumscribed about 3 points. The second one is of interest here. The circumscribed circle of 3 points (sites, in Voronoi diagram terms) defines a Voronoi diagram vertex and a so-called "circle event" in terms of Fortune's algorithm. The function for finding a vertex (based on the standard circumcircle formulae, which are written out after the code) is below:

#include <experimental/optional>
#include <cmath>

#include <immintrin.h> // AVX intrinsics for the #if 0 branch

// `vertex` (a point plus the circumradius R) and the tolerance `eps` are defined
// elsewhere in the project.

using value_type = double;

struct alignas(__m128d) point
{
    value_type x, y;
};

std::experimental::optional< vertex >
make_vertex(point const & a,
            point const & b,
            point const & c)
{
#if 1
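    // A, B, C are the squared distances of the three sites from the origin.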
    value_type const A = a.x * a.x + a.y * a.y;
    value_type const B = b.x * b.x + b.y * b.y;
    value_type const C = c.x * c.x + c.y * c.y;
    point const ca = {a.x - c.x, a.y - c.y};
    point const cb = {b.x - c.x, b.y - c.y};
    value_type const CA = A - C;
    value_type const CB = B - C;
    value_type x = CA * cb.y - CB * ca.y;
    value_type y = ca.x * CB - cb.x * CA;
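    // alpha = cross(a - c, b - c): twice the signed area of the triangle (a, b, c).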
    value_type alpha = ca.x * cb.y - ca.y * cb.x;
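    // Accept only alpha < -eps: reject near-collinear triples and the wrong
    // orientation for a circle event.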
    if (!(eps < -alpha)) {
        return {};
    }
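    // Numerator of beta: the 3x3 determinant | a.x a.y A ; b.x b.y B ; c.x c.y C |,
    // expanded along its first column.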
    value_type beta = a.x * (b.y * C - c.y * B) - b.x * (a.y * C - c.y * A) + c.x * (a.y * B - b.y * A);
    beta /= alpha;
    alpha += alpha;
    x /= alpha;
    y /= alpha;
    using std::sqrt;
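    // circumradius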
    value_type const R = sqrt(beta + x * x + y * y);
    return {{{x, y}, R}};
#else
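    // Same computation on full-width ymm registers; the variable names encode the
    // intended lane contents, e.g. byayaxbx = {b.y - c.y, a.y - c.y, a.x - c.x, b.x - c.x}
    // (listed from element 0 upwards).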
    __m256d a_ = _mm256_broadcast_pd((__m128d *)&a);
    __m256d b_ = _mm256_broadcast_pd((__m128d *)&b);
    __m256d c_ = _mm256_broadcast_pd((__m128d *)&c);
    __m256d A = _mm256_mul_pd(a_, a_);
    A = _mm256_hadd_pd(A, A);
    __m256d B = _mm256_mul_pd(b_, b_);
    B = _mm256_hadd_pd(B, B);
    __m256d C = _mm256_mul_pd(c_, c_);
    C = _mm256_hadd_pd(C, C);
    __m256d byayaxbx = _mm256_permute_pd(_mm256_shuffle_pd(_mm256_sub_pd(a_, c_), _mm256_sub_pd(b_, c_), 0b0011), 0b1001);
    __m256d ABBA = _mm256_permute_pd(_mm256_sub_pd(_mm256_shuffle_pd(A, B, 0), C), 0b0110);
    __m256d xxyy = _mm256_mul_pd(byayaxbx, ABBA);
    xxyy = _mm256_hsub_pd(xxyy, xxyy);
    __m256d xyxy = _mm256_shuffle_pd(xxyy, _mm256_permute2f128_pd(xxyy, xxyy, 0x01), 0);
    __m256d alpha = _mm256_mul_pd(byayaxbx, _mm256_permute2f128_pd(byayaxbx, byayaxbx, 0x01));
    alpha = _mm256_hsub_pd(alpha, alpha);
    if (!(alpha[0] < -eps)) {
        return {};
    }
    __m256d tmp1 = _mm256_mul_pd(_mm256_permute_pd(a_, 0b1001), _mm256_permute2f128_pd(c_, _mm256_permute_pd(b_, 0b01), 0x20));
    __m256d tmp2 = _mm256_mul_pd(b_, _mm256_permute_pd(c_, 0b01));
    __m256d bacc = _mm256_permute_pd(_mm256_hsub_pd(_mm256_mul_pd(tmp1, _mm256_permute2f128_pd(B, C, 0x20)), _mm256_mul_pd(tmp2, A)), 0b0010);
    bacc = _mm256_div_pd(bacc, alpha);
    xyxy = _mm256_div_pd(xyxy, _mm256_add_pd(alpha, alpha));
    __m256d beta = _mm256_hadd_pd(bacc, _mm256_mul_pd(xyxy, xyxy));
    beta = _mm256_hadd_pd(beta, beta);
    beta = _mm256_sqrt_pd(_mm256_add_pd(_mm256_permute2f128_pd(bacc, bacc, 0x01), beta));
    return {{{xyxy[0], xyxy[1]}, beta[0]}};
#endif
}
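
Written out, the #if 1 branch computes the circumcenter (x_0, y_0) and circumradius R as

$$\alpha = (a_x - c_x)(b_y - c_y) - (a_y - c_y)(b_x - c_x), \qquad A = a_x^2 + a_y^2 \text{ (and similarly } B, C),$$
$$x_0 = \frac{A(b_y - c_y) + B(c_y - a_y) + C(a_y - b_y)}{2\alpha}, \qquad y_0 = \frac{A(c_x - b_x) + B(a_x - c_x) + C(b_x - a_x)}{2\alpha},$$
$$R^2 = x_0^2 + y_0^2 + \frac{1}{\alpha}\begin{vmatrix} a_x & a_y & A \\ b_x & b_y & B \\ c_x & c_y & C \end{vmatrix}.$$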

As you can see, I made an attempt to reimplement this function using intrinsics for Intel AVX instructions. Both the #if 1 and the #if 0 parts are exactly equivalent and correct. The #if 0 version uses the full width of the ymm registers most of the time; only at the very end do the upper halves of some variables contain garbage.

I ran a couple of tests on a huge input (100000 points uniformly distributed over a bounded part of the plane): one for #if 1 and another for #if 0. The first one gives a better runtime (0.28 seconds vs 0.31 seconds, averaged over many runs; a sketch of the timing harness is at the end of the question). Here is the disassembly for #if 1:

push   %rbx
sub    $0x10,%rsp
mov    %rdi,%rbx
        319 [1]            value_type const A = a.x * a.x + a.y * a.y;
vmovsd (%rdx),%xmm11
vmovsd 0x8(%rdx),%xmm0
        320 [1]            value_type const B = b.x * b.x + b.y * b.y;
vmovsd (%rcx),%xmm9
vmovsd 0x8(%rcx),%xmm2
        321 [1]            value_type const C = c.x * c.x + c.y * c.y;
vmovsd (%r8),%xmm8
vmovsd 0x8(%r8),%xmm6
        322 [1]            point const ca = {a.x - c.x, a.y - c.y};
vunpcklpd %xmm9,%xmm0,%xmm1
vunpcklpd %xmm8,%xmm6,%xmm5
vsubpd %xmm5,%xmm1,%xmm7
        323 [1]            point const cb = {b.x - c.x, b.y - c.y};
vunpcklpd %xmm11,%xmm2,%xmm1
vsubpd %xmm5,%xmm1,%xmm1
        328 [1]            value_type alpha = ca.x * cb.y - ca.y * cb.x;
vpermilpd $0x1,%xmm1,%xmm5
vmulsd %xmm1,%xmm5,%xmm5
vpermilpd $0x1,%xmm7,%xmm3
vmulsd %xmm7,%xmm3,%xmm3
vsubsd %xmm3,%xmm5,%xmm12
        329 [1]            if (!(eps < -alpha)) {
mov    (%rsi),%rax
vxorpd 0x1cd1(%rip),%xmm12,%xmm3        # 0x40e680
vucomisd (%rax),%xmm3
jbe    0x40ca85 <make_vertex()+309>
        319 [2]            value_type const A = a.x * a.x + a.y * a.y;
vunpcklpd %xmm9,%xmm11,%xmm3
vmulpd %xmm3,%xmm3,%xmm10
vunpcklpd %xmm2,%xmm0,%xmm3
vmulpd %xmm3,%xmm3,%xmm3
vaddpd %xmm3,%xmm10,%xmm3
        321 [2]            value_type const C = c.x * c.x + c.y * c.y;
vmulsd %xmm8,%xmm8,%xmm10
vmulsd %xmm6,%xmm6,%xmm4
vaddsd %xmm4,%xmm10,%xmm4
        324 [1]            value_type const CA = A - C;
vmovddup %xmm4,%xmm5
vsubpd %xmm5,%xmm3,%xmm5
        326 [1]            value_type x = CA * cb.y - CB * ca.y;
vmulpd %xmm5,%xmm1,%xmm1
vpermilpd $0x1,%xmm5,%xmm5
vmulpd %xmm5,%xmm7,%xmm5
vsubpd %xmm5,%xmm1,%xmm10
        332 [1]            value_type beta = a.x * (b.y * C - c.y * B) + b.x * (c.y * A - a.y * C) + c.x * (a.y * B - b.y * A);
vmulsd %xmm4,%xmm2,%xmm5
vpermilpd $0x1,%xmm3,%xmm7
vmulsd %xmm7,%xmm6,%xmm1
vsubsd %xmm1,%xmm5,%xmm1
vmulsd %xmm1,%xmm11,%xmm1
vmulsd %xmm6,%xmm3,%xmm5
vmulsd %xmm4,%xmm0,%xmm4
vsubsd %xmm4,%xmm5,%xmm4
vmulsd %xmm4,%xmm9,%xmm4
vaddsd %xmm4,%xmm1,%xmm1
vmulsd %xmm7,%xmm0,%xmm0
vmulsd %xmm3,%xmm2,%xmm2
vsubsd %xmm2,%xmm0,%xmm0
vmulsd %xmm0,%xmm8,%xmm0
vaddsd %xmm1,%xmm0,%xmm0
        333 [1]            beta /= alpha;
vdivsd %xmm12,%xmm0,%xmm0
        334 [1]            alpha += alpha;
vaddsd %xmm12,%xmm12,%xmm1
        335 [1]            x /= alpha;
vmovddup %xmm1,%xmm1
vdivpd %xmm1,%xmm10,%xmm2
        338 [1]            value_type const R = sqrt(beta + x * x + y * y);
vmulsd %xmm2,%xmm2,%xmm1
vaddsd %xmm0,%xmm1,%xmm0
vpermilpd $0x1,%xmm2,%xmm1
vmulsd %xmm1,%xmm1,%xmm1
vaddsd %xmm0,%xmm1,%xmm1
vsqrtsd %xmm1,%xmm1,%xmm0
        370 [1]        }
mov    %rbx,%rax
add    $0x10,%rsp
pop    %rbx
retq

and for #if 0:

        317 [1]        {
vmovupd (%rdx),%xmm2
vmovupd (%rcx),%xmm1
vmovupd (%r8),%xmm4
        350 [1]            __m256d byayaxbx = _mm256_permute_pd(_mm256_shuffle_pd(_mm256_sub_pd(a_, c_), _mm256_sub_pd(b_, c_), 0b0011), 0b1001);
vsubpd %xmm4,%xmm2,%xmm0
vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
vsubpd %xmm4,%xmm1,%xmm3
vinsertf128 $0x1,%xmm3,%ymm3,%ymm3
vpermilpd $0x1,%ymm3,%ymm3
vblendpd $0x6,%ymm0,%ymm3,%ymm5
        355 [1]            __m256d alpha = _mm256_mul_pd(byayaxbx, _mm256_permute2f128_pd(byayaxbx, byayaxbx, 0x01));
vperm2f128 $0x1,%ymm0,%ymm5,%ymm0
vmulpd %ymm0,%ymm5,%ymm0
        356 [1]            alpha = _mm256_hsub_pd(alpha, alpha);
vhsubpd %ymm0,%ymm0,%ymm0
        357 [1]            if (!(alpha[0] < -eps)) {
mov    (%rsi),%rax
vmovsd (%rax),%xmm3
vxorpd 0x1cc6(%rip),%xmm3,%xmm3        # 0x40e660
vucomisd %xmm0,%xmm3
jbe    0x40ca6d <make_vertex()+285>
        343 [1]            __m256d c_ = _mm256_broadcast_pd((__m128d *)&c);
vinsertf128 $0x1,%xmm4,%ymm4,%ymm6
        344 [1]            __m256d A = _mm256_mul_pd(a_, a_);
vmulpd %xmm2,%xmm2,%xmm3
vinsertf128 $0x1,%xmm3,%ymm3,%ymm3
        345 [1]            A = _mm256_hadd_pd(A, A);
vhaddpd %ymm3,%ymm3,%ymm3
        346 [1]            __m256d B = _mm256_mul_pd(b_, b_);
vmulpd %xmm1,%xmm1,%xmm7
vinsertf128 $0x1,%xmm7,%ymm7,%ymm7
        347 [1]            B = _mm256_hadd_pd(B, B);
vhaddpd %ymm7,%ymm7,%ymm7
        348 [1]            __m256d C = _mm256_mul_pd(c_, c_);
vmulpd %xmm4,%xmm4,%xmm4
vinsertf128 $0x1,%xmm4,%ymm4,%ymm4
        349 [1]            C = _mm256_hadd_pd(C, C);
vhaddpd %ymm4,%ymm4,%ymm4
        351 [1]            __m256d ABBA = _mm256_permute_pd(_mm256_sub_pd(_mm256_shuffle_pd(A, B, 0), C), 0b0110);
vunpcklpd %ymm7,%ymm3,%ymm8
vsubpd %ymm4,%ymm8,%ymm8
vpermilpd $0x6,%ymm8,%ymm8
        352 [1]            __m256d xxyy = _mm256_mul_pd(byayaxbx, ABBA);
vmulpd %ymm8,%ymm5,%ymm5
        353 [1]            xxyy = _mm256_hsub_pd(xxyy, xxyy);
vhsubpd %ymm5,%ymm5,%ymm5
        342 [1]            __m256d b_ = _mm256_broadcast_pd((__m128d *)&b);
vinsertf128 $0x1,%xmm1,%ymm1,%ymm8
        341 [1]            __m256d a_ = _mm256_broadcast_pd((__m128d *)&a);
vinsertf128 $0x1,%xmm2,%ymm2,%ymm2
        354 [1]            __m256d xyxy = _mm256_shuffle_pd(xxyy, _mm256_permute2f128_pd(xxyy, xxyy, 0x01), 0);
vperm2f128 $0x23,%ymm5,%ymm0,%ymm9
vunpcklpd %ymm9,%ymm5,%ymm5
        360 [1]            __m256d tmp1 = _mm256_mul_pd(_mm256_permute_pd(a_, 0b1001), _mm256_permute2f128_pd(c_, _mm256_permute_pd(b_, 0b01), 0x20));
vpermilpd $0x9,%ymm2,%ymm2
vpermilpd $0x1,%xmm1,%xmm1
vinsertf128 $0x1,%xmm1,%ymm6,%ymm1
vmulpd %ymm1,%ymm2,%ymm1
        361 [1]            __m256d tmp2 = _mm256_mul_pd(b_, _mm256_permute_pd(c_, 0b01));
vpermilpd $0x1,%ymm6,%ymm2
vmulpd %ymm2,%ymm8,%ymm2
        362 [1]            __m256d bacc = _mm256_permute_pd(_mm256_hsub_pd(_mm256_mul_pd(tmp1, _mm256_permute2f128_pd(B, C, 0x20)), _mm256_mul_pd(tmp2, A)), 0b0010);
vinsertf128 $0x1,%xmm4,%ymm7,%ymm4
vmulpd %ymm4,%ymm1,%ymm1
vmulpd %ymm2,%ymm3,%ymm2
vhsubpd %ymm2,%ymm1,%ymm1
vpermilpd $0x2,%ymm1,%ymm1
        363 [1]            bacc = _mm256_div_pd(bacc, alpha);
vdivpd %ymm0,%ymm1,%ymm1
        364 [1]            xyxy = _mm256_div_pd(xyxy, _mm256_add_pd(alpha, alpha));
vaddpd %ymm0,%ymm0,%ymm0
vdivpd %ymm0,%ymm5,%ymm0
        365 [1]            __m256d beta = _mm256_hadd_pd(bacc, _mm256_mul_pd(xyxy, xyxy));
vmulpd %ymm0,%ymm0,%ymm2
vhaddpd %ymm2,%ymm1,%ymm2
        366 [1]            beta = _mm256_hadd_pd(beta, beta);
vhaddpd %ymm2,%ymm2,%ymm2
        367 [1]            beta = _mm256_sqrt_pd(_mm256_add_pd(_mm256_permute2f128_pd(bacc, bacc, 0x01), beta));
vperm2f128 $0x1,%ymm0,%ymm1,%ymm1
vaddpd %ymm1,%ymm2,%ymm1
vsqrtpd %ymm1,%ymm1
        370 [1]        }
mov    %rdi,%rax
vzeroupper
retq

I use clang version 4.0.0 (trunk 282862); the crucial command-line flags are -O3 -march=native -mavx, and the CPU is an Intel(R) Core(TM) i7-2670QM.

What are my main misconceptions here?

Should I use inline assembly instead for better control?
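
For reference, a minimal harness of roughly the following shape can be used to time make_vertex in isolation. This is a simplified sketch, not the actual test mentioned above (that one times the whole diagram construction); it assumes make_vertex is callable as a free function and that eps is visible to it.

// Hypothetical standalone micro-benchmark for make_vertex.
#include <chrono>
#include <cstdio>
#include <random>
#include <vector>

int main()
{
    std::mt19937_64 gen(42);
    std::uniform_real_distribution< value_type > coord(0.0, 1.0);
    std::vector< point > points(100000);
    for (point & p : points) {
        p = {coord(gen), coord(gen)};
    }
    std::size_t vertices = 0; // observable result so the loop is not optimized away
    auto const start = std::chrono::steady_clock::now();
    for (std::size_t i = 2; i < points.size(); ++i) {
        if (make_vertex(points[i - 2], points[i - 1], points[i])) {
            ++vertices;
        }
    }
    auto const stop = std::chrono::steady_clock::now();
    std::printf("%zu vertices, %.6f s\n", vertices,
                std::chrono::duration< double >(stop - start).count());
}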

Tomilov Anatoliy
  • Not to be blunt, but I don't believe you. This looks very much like debug code; spilling to the stack and immediately reloading is not something that happens in optimized code, but it happens here *all the time*. I went ahead and compiled this myself just to confirm, and I got completely reasonable code with -O3. Not with Clang 4, I admit, I don't have it. I would be surprised if they actually have a regression this big, it would essentially render it unusable. – harold Nov 15 '16 at 15:07
  • @harold I made an edit. I compiled both versions in `RelWithDebInfo` mode. Now the code in both cases looks as I expected. But what am I doing wrong? Why is the performance different? – Tomilov Anatoliy Nov 15 '16 at 15:54
  • The line table doesn't seem to match the code well. Surely this is because the code is highly optimized (`-O3`). – Tomilov Anatoliy Nov 15 '16 at 15:58
  • Line numbers can do odd things sometimes, probably not a big deal. Anyway, the time is still correct? You have a lot of not-very-fast instructions here, such as horizontal operations and cross-lane things like `vinsertf128`, `vperm2f128`; perhaps you can use fewer of them – harold Nov 15 '16 at 16:03
  • @harold Yes. I switch to Debug mode only for `assert` and disassembly in my IDE. All time measurements were done in Release mode – Tomilov Anatoliy Nov 15 '16 at 16:07
  • @harold I see all the compiler-generated code is scalar. – Tomilov Anatoliy Nov 15 '16 at 16:08
  • @harold From [here](https://software.intel.com/sites/landingpage/IntrinsicsGuide/): vperm2f128 latency=3, throughput=1, and vinsertf128 latency=3, throughput=- (I don't know what "-" means). Are they really that slow? hsub has latency=5, as does the mul instruction. The div instruction beats them all. – Tomilov Anatoliy Nov 15 '16 at 17:47
  • Well they are not super slow, but for shuffle-type instructions they're slow, and it's overhead - it's not computing the result. So, you need some division and square root because they're fundamentally part of what you're computing (there are approximations too, but your use of double suggests that's not OK), but perhaps you can save some of the data movement, particularly the slower ones. – harold Nov 15 '16 at 17:51
  • Store your data in an array of x and a separate array of y, instead of as packed `point` pairs. Then compute two vertices in parallel with only vertical operations, instead of one vertex at once with tons of shuffling and horizontal ops. VHADDPD is slow: 2 shuffle uops + an add uop. It's especially bad with just AVX1, since you need separate lane-permutes and in-lane shuffles without AVX2 VPERMPD. – Peter Cordes Nov 15 '16 at 19:50
  • @PeterCordes I don't have 2 vertices at once. I don't think the algorithm can be parallelized. – Tomilov Anatoliy Nov 16 '16 at 04:12
  • `What are the main misconceptions of mine here?`: probably that instruction-count is most important. And maybe also ignoring the amount of instruction-level parallelism available in the scalar version: The CPU can and will execute independent operations in parallel. Packing things into vectors with shuffles means extra work on the latency critical-path. – Peter Cordes Nov 16 '16 at 04:19
  • Trying to use 256b vectors is probably also a mistake. It might be worth hand-vectorizing with 128b vectors and doing some scalar, some vector ops. Also, the HADD/HSUB instructions are rarely the best choice unless you're using them with two different input vectors. Your Intel CPU has a good decoded-uop cache, so worry more about uop count than instruction count. HADD is 2 shuffles + an ADD. (See http://agner.org/optimize/) – Peter Cordes Nov 16 '16 at 04:22
  • Here's an example of manual shuffle+add instead of HADD: https://github.com/pcordes/endless-sky/blob/SIMD-mask/source/Point.h#L400. That's part of my improved version of a 2D `Point` class for a game, which originally was only using SIMD when SSE3 was available. For more horizontal data movement ideas, see [this SSE/AVX horizontal-sums answer](http://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86/35270026#35270026) (a small shuffle+add sketch along these lines appears after this comment thread). – Peter Cordes Nov 16 '16 at 04:28
  • @PeterCordes Thank you – Tomilov Anatoliy Nov 16 '16 at 04:33
  • @PeterCordes The most tangled operation here is the 3x3 matrix determinant (for `beta`). I can't find AVX-optimal code for a 3x3 determinant. Can you help? – Tomilov Anatoliy Nov 16 '16 at 04:40
  • @Orient: Intel published [code for a 4x4 determinant](https://software.intel.com/en-us/articles/benefits-of-intel-avx-for-small-matrices) as an example of using AVX for small matrices. Maybe something useful there. 3x3 determinant would make a reasonable SO question on its own, if it hasn't already been asked (make sure to link to this question for context, because the surrounding code matters for choosing the best option in cases like this). – Peter Cordes Nov 16 '16 at 05:12
  • [This answer](http://stackoverflow.com/questions/29991373/determinant-calculation-with-simd) suggests using the scalar triple product, and says that efficient `cross()` and `dot()` implementations are available. Other stuff I found suggested that [Eigen](http://eigen.tuxfamily.org/index.php?title=Main_Page) might have an efficient implementation. I didn't find anything hand-specialized for Matrix3d determinants in the source, but I didn't check the compiler output for [`Matrix3d.determinant`](https://eigen.tuxfamily.org/dox/group__TutorialLinearAlgebra.html#title3). – Peter Cordes Nov 16 '16 at 05:30
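
Below is a sketch of the manual shuffle+add replacement for the HADD/HSUB-with-itself patterns discussed in the comments. The helper names are made up for illustration, and each substitution into the #if 0 kernel still needs to be checked lane by lane.

#include <immintrin.h>

// Equivalent to _mm256_hadd_pd(v, v): every element of the result holds the sum
// of its own 128-bit lane. One in-lane shuffle + one vertical add (2 uops)
// instead of HADD's 2 shuffles + 1 add.
static inline __m256d hadd_self(__m256d v)
{
    __m256d const swapped = _mm256_permute_pd(v, 0b0101); // {v1, v0, v3, v2}
    return _mm256_add_pd(v, swapped);
}

// Almost _mm256_hsub_pd(v, v): elements 0 and 2 hold v0 - v1 and v2 - v3 as
// expected, but elements 1 and 3 come out negated, so this is only a drop-in
// replacement where those elements are ignored (e.g. the alpha[0] sign test).
static inline __m256d hsub_self_even(__m256d v)
{
    __m256d const swapped = _mm256_permute_pd(v, 0b0101);
    return _mm256_sub_pd(v, swapped);
}

In the posted kernel, the hsub result for alpha is later reused in all four lanes as a divisor, so there the negated odd elements would matter; that is the kind of case-by-case check meant above.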

0 Answers