-1

I'm trying to implement RGB image to Gray image conversion, using C++ and arm neon especially inline assembly.

I write a run-ok-but-not-that-fast version with inline assembly. It compiles OK and run OK. For each pixel line processing, it can be described as Loop[AB].

I try to make Loop[AB] become A Loop[BA] B as a trick for optimization. However, it failed to compile, complains:

error: invalid symbol redefinition
            "L4End: \n"
             ^
<inline asm>:28:1: note: instantiated into assembly here
L4End: 
^
1 error generated.
template<int b_idx>
void rgb2gray_opt6(const unsigned char* rgb, int rgb_height, int rgb_width, int rgb_linebytes, unsigned char* gray, int gray_linebytes, const Option& opt)
{
    for (int i=0; i<rgb_height; i++)
    {
        const uchar* rgb_line = rgb + i*rgb_linebytes;
        uchar* gray_line = gray + i*gray_linebytes;

#if __ARM_NEON
        size_t nn = rgb_width >> 4;
        int remain = rgb_width - (nn << 4);
#else
        int remain = rgb_width;
#endif // __ARM_NEON

#if __ARM_NEON

#if 0  // this compiles OK, but may not that fast
        __asm__ volatile(
            "movi   v0.8b, #77                          \n"
            "movi   v1.8b, #151                         \n"
            "movi   v2.8b, #28                          \n"
            "0:                                         \n"
            "prfm  pldl1keep, [%[src], #512]            \n"
            "ld3    {v3.16b-v5.16b}, [%[src]], #48      \n"
            "ext    v6.16b, v3.16b, v3.16b, #8          \n"
            "ext    v7.16b, v4.16b, v4.16b, #8          \n"
            "ext    v8.16b, v5.16b, v5.16b, #8          \n"
            "umull  v9.8h,  v3.8b, v0.8b                \n"
            "umull  v10.8h, v6.8b, v0.8b                \n"
            "umull  v11.8h, v4.8b, v1.8b                \n"
            "umlal  v9.8h,  v5.8b, v2.8b                \n"
            "umull  v12.8h, v7.8b, v1.8b                \n"
            "umlal  v10.8h, v8.8b, v2.8b                \n"
            "addhn  v13.8b,  v9.8h,  v11.8h             \n"
            "addhn2 v13.16b, v10.8h, v12.8h             \n"
            "subs    %[neonLen], %[neonLen], #1         \n"
            "st1    {v13.16b}, [%[dst]], #16            \n"
            "bne        0b                              \n"

            :[src]        "+r"(rgb_line),
            [dst]        "+r"(gray_line),
            [neonLen]    "+r"(nn)
            :
            :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"
        );

#else // this is for fast speed purpose, it make Loop[AB] becomes A Loop[BA] B, but failed to compile

        asm volatile(
            "movi v0.8b, #77 \n"
            "movi v1.8b, #151 \n"
            "movi v2.8b, #28 \n"
            "prfm pldl1keep, [%0, #512] \n"
            "ld3  {v3.16b-v5.16b}, [%0], #48 \n"
            "ext  v6.16b, v3.16b, v3.16b, #8 \n"
            "ext  v7.16b, v4.16b, v4.16b, #8 \n"
            "ext  v8.16b, v5.16b, v5.16b, #8 \n"
            "subs %w2, %w2, #1 \n"
            "ble L4End \n"                      //!!! This line compile error
            "0: \n"
            "prfm  pldl1keep, [%0, #512] \n"
            "ld3   {v3.16b-v5.16b}, [%0], #48 \n"
            "ext   v6.16b, v3.16b, v3.16b, #8 \n"
            "ext   v7.16b, v4.16b, v4.16b, #8 \n"
            "ext   v8.16b, v5.16b, v5.16b, #8  \n"
            "umull v9.8h,  v3.8b, v0.8b \n"
            "umull v10.8h, v6.8b, v0.8b  \n"
            "umull v11.8h, v4.8b, v1.8b  \n"
            "umlal v9.8h,  v5.8b, v2.8b  \n"
            "umull v12.8h, v7.8b, v1.8b  \n"
            "umlal v10.8h, v8.8b, v2.8b  \n"
            "addhn v13.8b,  v9.8h,  v11.8h \n"
            "addhn2 v13.16b, v10.8h, v12.8h \n"
            "subs %w2, %w2, #1 \n"
            "st1 {v13.16b}, [%1], #16 \n"
            "bne L4Loop \n"
            "L4End: \n"
            "umull  v9.8h,  v3.8b, v0.8b \n"
            "umull  v10.8h, v6.8b, v0.8b \n"
            "umull  v11.8h, v4.8b, v1.8b \n"
            "umlal  v9.8h,  v5.8b, v2.8b \n"
            "umull  v12.8h, v7.8b, v1.8b \n"
            "umlal  v10.8h, v8.8b, v2.8b \n"
            "addhn  v13.8b,  v9.8h,  v11.8h \n"
            "addhn2 v13.16b, v10.8h, v12.8h \n"
            "st1    {v13.16b}, [%1], #16 \n"
            : "=r"(rgb_line), // %0
              "=r"(gray_line), // %1
              "=r"(nn) // %2
            : "0"(rgb_line),
              "1"(gray_line),
              "2"(nn)
            : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"
        );
#endif // if 0

#endif // __ARM_NEON

        for (int j=rgb_width-remain; j<remain; j++)
        {
            int src_idx = i*rgb_linebytes + j*3;
            int r = rgb[src_idx + 2 - b_idx];
            int g = rgb[src_idx + 1];
            int b = rgb[src_idx + b_idx];

            int dst_idx = i*gray_linebytes + j;
            gray[dst_idx] = (77*r + 151*g + 28*b) >> 8;
        }
    }
}

Environment I build this program with Android NDK-r21b, which is bundled with a fairly new version of clang.

Note This question is about, how to resolve a compile error related to inline assembly, instead of how to optimize a function. The comments that talking about "bad optimization idea" is not related to the purpose of this question.


Update

Since this question is labeled as duplicate, I put my answer here, based on the comments (The only answer's content is still not verified, if have time will try).

  1. Is string literal labels can be used in inline assembly?

Some function may, some not. In my pasted code, template<int b_idx> cause inline and duplicated label issue. Reomving template parameters resolve it.

  1. Is digital (local) labels ok?

Actually I tried digital labels first but don't know the difference between b and f. They means backward and forward. After knowing that, can easily correct error and compile ok.

  1. Still using literal labels?

Use some_label%= instead of some_label, %= makes it unique.

ChrisZZ
  • 1,521
  • 2
  • 17
  • 24
  • 5
    Strong recommendation: Do not use inline assembly for this unless you know exactly what you are doing. And if you have to, read the whole manual on the subject before beginning. The manual even has an explanation of the exact problem you have experienced and how to work around it (use `%=` to make labels unique). You will have a much better time just using regular old assembly. – fuz Aug 05 '21 at 16:15
  • You shouldn't be using `v8` to `v15` unless you absolutely have to. Read `aapcs` first. And I completely agree with fuz: why bother with inline assembly? – Jake 'Alquimista' LEE Aug 05 '21 at 16:35
  • 1
    [Why Premature Optimization Is the Root of All Evil](https://stackify.com/premature-optimization-evil/) – Superlokkus Aug 05 '21 at 16:42
  • 1
    You MAY NOT use standard labels in inline assembly. – Jake 'Alquimista' LEE Aug 05 '21 at 17:04
  • IMHO, the best process is to write the function in C++, then set the optimizations to high and print the assembly language for the function. The next step is to create a separate assembly language module for the function (based on the assembly language from the compiler). You can then tweak the compiler's generated assembly language to optimize it. Before you print the assembly language, you should write the C++ code to help the compiler with optimizations, such as using `const` and local variables. – Thomas Matthews Aug 05 '21 at 17:23
  • I've used my process described above and quarreled with the compiler. Profiling all the code (with test points and oscilloscopes), showed that the compiler's code generation was as optimal as my best efforts. For example, I was trying to make the compiler use the ARM `stm` and `ldm` instructions (compiler chose other instructions). – Thomas Matthews Aug 05 '21 at 17:26
  • @ThomasMatthews You might have a valid point. However, it doesn't apply to neon codes. BOTH `GCC` and `Clang` generate FUBAR machine codes out of intrinsics, especially when permutations are involved. And the auto vectorization is simply unusable. – Jake 'Alquimista' LEE Aug 05 '21 at 18:07
  • @ThomasMatthews And even for pure ARM codes, I have yet to see a compiler generating codes consisting of DSP instructions such as `uqsub`, `sel`, and `uxtb16`. They make a HUGE difference. – Jake 'Alquimista' LEE Aug 05 '21 at 18:10
  • @ThomasMatthews I think you just didn't see the whole code that I pasted. In the last 10 lines, there is the C/C++ version of the expected result for this function. That's the so called naive C/C++ implementation, and I am not only putting inline assembly in a C++ function. – ChrisZZ Aug 06 '21 at 00:51
  • Re: your update: the label part was already answered by the first comment, and is a duplicate of [Labels in GCC inline assembly](https://stackoverflow.com/q/3898435). If the compiler inlines your function at more than one place in the same compilation unit, the assembler will see a duplicate label because you just used a bare name. You wouldn't have this problem with a stand-alone function, so unless you're optimizing for a very tiny iteration count, in practice you should just use Jake's optimized version. – Peter Cordes Aug 06 '21 at 00:52
  • @ThomasMatthews Acutally, the pasted code is based on NDK Clang generated assembly, instead of just manually write some without inspecting what the compiler generates. If people just put what compiler generated optimized code, why don't they just use C++ version? Putting C++ compiler generated optimized assembly into a separate .S file won't boost the speed, and for android and iOS platform, you have to face different calling stuffs, but inline assembly is with unique and simple syntax. – ChrisZZ Aug 06 '21 at 00:54
  • @PeterCordes I was confused that, even if I switch to Debug build type, clean all previous build stuffs, re-compile, but still encounter that compile error. I though inline optimization shouldn't be enabled if using Debug mode (I'm using CMake) – ChrisZZ Aug 06 '21 at 00:58
  • @ChrisZZ: Placing assembly language in a separate file has nothing to do with improving execution speed. I find that I can avoid the inline syntax by throwing the assembly code into a separate file. Also, as a separate file, I can experiment with it without affecting the C++ file. In summary, there are several issues: 1)getting the inline assembly syntax correct; 2) Optimizing the assembly language; and 3) Implementing the C++ code to get the compiler to emit the assembly language you are desiring. Personally, I have great difficulty getting the syntax correct for inline assembly. – Thomas Matthews Aug 06 '21 at 00:59
  • 1
    It's in a `template` function, which may be forced to expand multiple times for different template params. Look at the actual compiler asm output file (which the assembler complained about) if you're curious. e.g. on https://godbolt.org/ or with `g++ -S` – Peter Cordes Aug 06 '21 at 01:20
  • @ThomasMatthews: For inline asm you left out (4) *accurately* describe the asm to the compiler with input/output/clobber constraints. Otherwise you can be leaving land-mines of undefined behaviour that will explode with future compiler versions or changes to surrounding code (e.g. that it inlines into, or even in parent functions if wrong constraints result in functions that violate the ABI). This is not something you can get right by trial and error, or verify by testing, and is the hardest / most dangerous part of using inline asm. https://stackoverflow.com/tags/inline-assembly/info – Peter Cordes Aug 06 '21 at 01:25
  • (It's usually not that hard to get the constraints right if you fully understand asm, and how compilers "think about" registers and how GNU C inline asm is designed in the first place, but early-clobber can be tricky. Looks like the code in this question is mostly getting things right, except for no early-clobber on the dummy outputs.) Anyway agreed that a separate `.S` file won't make anything faster; if anything maybe slightly slower or the same speed. But if your asm is a long-running loop, that's negligible. – Peter Cordes Aug 06 '21 at 01:28

1 Answers1

0
    .arch   armv8-a
    .global alqRGB2Grey
    .text

.balign 64
.func
// void alqRGB2Grey(uint8_t *pDst, uint8_t *pSrc, uint32_t length);

pDst    .req    x0
pSrc    .req    x1
length  .req    x2

rcoeff  .req    v29
gcoeff  .req    v30
bcoeff  .req    v31

alqRGB2Grey:
    movi    rcoeff.16b, #77
    sub     length, length, #32
    movi    gcoeff.16b, #151
    movi    bcoeff.16b, #28
    b       1f

.balign 64
1:
    ld3     {v0.16b, v1.16b, v2.16b}, [pSrc], #48
    subs    length, length, #32
    ld3     {v3.16b, v4.16b, v5.16b}, [pSrc], #48

    umull   v16.8h, v0.8b, rcoeff.8b
    umull2  v17.8h, v0.16b, rcoeff.16b
    umull   v18.8h, v3.8b, rcoeff.8b
    umull2  v19.8h, v3.16b, rcoeff.16b

    umlal   v16.8h, v1.8b, gcoeff.8b
    umlal2  v17.8h, v1.16b, gcoeff.16b
    umlal   v18.8h, v4.8b, gcoeff.8b
    umlal2  v19.8h, v4.16b, gcoeff.16b

    umlal   v16.8h, v2.8b, bcoeff.8b
    umlal2  v17.8h, v2.16b, bcoeff.16b
    umlal   v18.8h, v5.8b, bcoeff.8b
    umlal2  v19.8h, v5.16b, bcoeff.16b

    rshrn   v20.8b, v16.8h, #8
    rshrn2  v20.16b, v17.8h, #8
    rshrn   v21.8b, v18.8h, #8
    rshrn2  v21.16b, v19.8h, #8

    stp     q20, q21, [pDst], #32
    b.pl    1b

    add     pSrc, pSrc, length, lsl #1
    cmp     length, #-32
    add     pSrc, pSrc, length
    add     pDst, pDst, length
    b.gt    1b

    ret
.endfunc
.end

Above is the function you were trying to write in good old assembly, and as you can see, it's much more readable compared to the inline assembly version.
The .req directive helps immensely.

Things to take notice of:

  • You don't need ext
  • You don't need addhn
  • length has to be >=32 (put an assert)
  • You can get rid of the conditional branch inside the loop by pre-decrementing length prior to the loop, changing the loop branch to b.pl, and 'rewinding' the pointers outside of the loop.
  • no manual cache preloading: it hurts the performance on systems with automatic cache preloading enabled which is the majority.
Jake 'Alquimista' LEE
  • 6,197
  • 2
  • 17
  • 25