0

I wrote a method that is supposed to multiply a vector with a matrix and write the result to an output vector like this:

/*
 * Writes four doubles to vectorOut, computed from matrix and vectorIn:
 *
 *    vectorOut[r] = matrix[r]     * vectorIn[0]
 *                 + matrix[r + 1] * vectorIn[1]
 *                 + matrix[r + 2] * vectorIn[2]
 *                 + matrix[r + 3] * vectorIn[3]      for r = 0..3
 *
 * matrix[0..6] and vectorIn[0..3] are read; vectorOut[0..3] is written.
 * The whole computation is repeated 1000 times; the pass counter is not
 * used for indexing, so every pass goes through the same three pointers.
 *
 * The optimize attribute requests -O3 and -ftree-vectorize for this
 * function only.
 */
void __attribute__ ((optimize ("-O3", "-ftree-vectorize" )))
myMethod ( double * matrix , double const * vectorIn, double * vectorOut )
{
   int passesLeft = 1000;

   while ( passesLeft-- > 0 )
   {
      double const * src = vectorIn;
      double       * dst = vectorOut;

      for ( int row = 0; row != 4; ++row )
      {
         /* Window of four consecutive matrix entries starting at index row. */
         double const * coeff = matrix + row;

         /* Kept as one expression so the summation order is unchanged. */
         dst [ row ] = coeff [ 0 ] * src [ 0 ]
                     + coeff [ 1 ] * src [ 1 ]
                     + coeff [ 2 ] * src [ 2 ]
                     + coeff [ 3 ] * src [ 3 ];
      }
   }
}

However, although I compile with -O3 and -ftree-vectorize, it does not produce NEON instructions:

0x00000000004006c0 <+0>:    mov w3, #0x3e8                  // #1000
0x00000000004006c4 <+4>:    ldp d6, d7, [x1]
0x00000000004006c8 <+8>:    subs    w3, w3, #0x1
0x00000000004006cc <+12>:   ldp d2, d3, [x0]
0x00000000004006d0 <+16>:   ldp d5, d4, [x1,#16]
0x00000000004006d4 <+20>:   ldp d1, d0, [x0,#16]
0x00000000004006d8 <+24>:   fmul    d3, d3, d7
0x00000000004006dc <+28>:   fmadd   d2, d2, d6, d3
0x00000000004006e0 <+32>:   fmadd   d1, d1, d5, d2
0x00000000004006e4 <+36>:   fmadd   d0, d0, d4, d1
0x00000000004006e8 <+40>:   str d0, [x2]
0x00000000004006ec <+44>:   ldp d6, d7, [x1]
0x00000000004006f0 <+48>:   ldp d2, d3, [x0,#8]
0x00000000004006f4 <+52>:   ldp d5, d4, [x1,#16]
0x00000000004006f8 <+56>:   ldp d1, d0, [x0,#24]
0x00000000004006fc <+60>:   fmul    d3, d3, d7
0x0000000000400700 <+64>:   fmadd   d2, d2, d6, d3
0x0000000000400704 <+68>:   fmadd   d1, d1, d5, d2
0x0000000000400708 <+72>:   fmadd   d0, d0, d4, d1
0x000000000040070c <+76>:   str d0, [x2,#8]
0x0000000000400710 <+80>:   ldp d6, d7, [x1]
0x0000000000400714 <+84>:   ldp d2, d3, [x0,#16]
0x0000000000400718 <+88>:   ldp d1, d0, [x0,#32]
0x000000000040071c <+92>:   ldp d5, d4, [x1,#16]
0x0000000000400720 <+96>:   fmul    d3, d3, d7
0x0000000000400724 <+100>:  fmadd   d2, d2, d6, d3
0x0000000000400728 <+104>:  fmadd   d1, d1, d5, d2
0x000000000040072c <+108>:  fmadd   d0, d0, d4, d1
0x0000000000400730 <+112>:  str d0, [x2,#16]
0x0000000000400734 <+116>:  ldp d2, d3, [x0,#24]
0x0000000000400738 <+120>:  ldp d6, d7, [x1]
0x000000000040073c <+124>:  ldr d1, [x0,#40]
0x0000000000400740 <+128>:  ldp d5, d4, [x1,#16]
0x0000000000400744 <+132>:  ldr d0, [x0,#48]
0x0000000000400748 <+136>:  fmul    d3, d3, d7
0x000000000040074c <+140>:  fmadd   d2, d2, d6, d3
0x0000000000400750 <+144>:  fmadd   d1, d1, d5, d2
0x0000000000400754 <+148>:  fmadd   d0, d0, d4, d1
0x0000000000400758 <+152>:  str d0, [x2,#24]
0x000000000040075c <+156>:  b.ne    0x4006c4 <_Z13transformClipPdPKdS_+4>
0x0000000000400760 <+160>:  ret

Strangely, if I move the inner loop to a separate method, it will produce an optimized path with neon function calls.

The compiler is an AArch64 GCC for ARM: aarch64-linux-gnu-g++ (Linaro GCC 6.3-2017.05) 6.3.1 20170404.

Can someone explain to me why?

Regards

Desperado17
  • 835
  • 6
  • 12
  • The compiler uses heuristics to decide whether to optimize some code or not. In one case the heuristic decided to vectorize, in the other case it didn't. There may be some vectorization reports you want to have a look at to figure this one out. Probably has to do with the size of the function. – Matthieu Brucher Nov 29 '18 at 11:26
  • Do you know an error message that suggests such a thing? I could post the output of -fopt-info-vec-all but it is too long. – Desperado17 Nov 29 '18 at 11:34
  • Yes, that output would tell you what the compiler decided for what function and why. – Matthieu Brucher Nov 29 '18 at 11:55
  • Why beg in C if you can give orders in assembly? If you can't, learning assembly is way more time efficient than checking the disassembly in the end. And the final results are always way better as well. – Jake 'Alquimista' LEE Nov 29 '18 at 23:24
  • You might have problems with inputs aliasing outputs, so might need to litter a few "restrict" keywords around the place. – solidpixel Nov 30 '18 at 15:46
  • Ok, it does work with restrict and if I invoke O3 from the command line. Is there something O3 won't do with just a function-level attribute? – Desperado17 Dec 03 '18 at 12:06

0 Answers0