I have a few place in my code that could really use some speed up, when I try to use CM4 SIMD instructions the result is always slower than scalar version, for example, this is an alpha blending function I'm using a lot, it's not really slow but it serves as an example:
for (int y=0; y<h; y++) {
i=y*w;
for (int x=0; x<w; x++) {
uint spix = *srcp++;
uint dpix = dstp[i+x];
uint r=(alpha*R565(spix)+(256-alpha)*R565(dpix))>>8;
uint g=(alpha*G565(spix)+(256-alpha)*G565(dpix))>>8;
uint b=(alpha*B565(spix)+(256-alpha)*B565(dpix))>>8;
dstp[i+x]= RGB565(r, g, b);
}
}
R565, G565, B565 and RGB565 are macros that extract and pack RGB565 respectively, please ignore
Now I tried using __SMUAD
and see if anything changes, the result was slower (or same speed as the original code) even tried loop unrolling, with no luck:
uint v0, vr, vg, vb;
v0 = (alpha<<16)|(256-alpha);
for (int y=0; y<h; y++) {
i=y*w;
for (int x=0; x<w; x++) {
spix = *srcp++;
dpix = dstp[i+x];
uint vr = R565(spix)<<16 | R565(dpix);
uint vg = G565(spix)<<16 | G565(dpix);
uint vb = B565(spix)<<16 | B565(dpix);
uint r = __SMUAD(v0, vr)>>8;
uint g = __SMUAD(v0, vg)>>8;
uint b = __SMUAD(v0, vb)>>8;
dstp[i+x]= RGB565(r, g, b);
}
}
I know this has been asked before, but given the architectural differences, and the fact that none of the answers really solve my problem, I'm asking again. Thanks!
Update
Scalar disassembly:
Disassembly of section .text.blend:
00000000 <blend>:
0: e92d 0ff0 stmdb sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
4: 6846 ldr r6, [r0, #4]
6: 68c4 ldr r4, [r0, #12]
8: b086 sub sp, #24
a: 199e adds r6, r3, r6
c: 9601 str r6, [sp, #4]
e: 9200 str r2, [sp, #0]
10: 68ca ldr r2, [r1, #12]
12: f89d 5038 ldrb.w r5, [sp, #56] ; 0x38
16: 9204 str r2, [sp, #16]
18: 9a01 ldr r2, [sp, #4]
1a: 426e negs r6, r5
1c: 4293 cmp r3, r2
1e: b2f6 uxtb r6, r6
20: da5b bge.n da <blend+0xda>
22: 8809 ldrh r1, [r1, #0]
24: 6802 ldr r2, [r0, #0]
26: 9102 str r1, [sp, #8]
28: fb03 fb01 mul.w fp, r3, r1
2c: 9900 ldr r1, [sp, #0]
2e: 4411 add r1, r2
30: 9103 str r1, [sp, #12]
32: 0052 lsls r2, r2, #1
34: 9205 str r2, [sp, #20]
36: 9903 ldr r1, [sp, #12]
38: 9a00 ldr r2, [sp, #0]
3a: 428a cmp r2, r1
3c: fa1f fb8b uxth.w fp, fp
40: da49 bge.n d6 <blend+0xd6>
42: 4610 mov r0, r2
44: 4458 add r0, fp
46: f100 4000 add.w r0, r0, #2147483648 ; 0x80000000
4a: 9a04 ldr r2, [sp, #16]
4c: f8dd a014 ldr.w sl, [sp, #20]
50: 3801 subs r0, #1
52: eb02 0040 add.w r0, r2, r0, lsl #1
56: 44a2 add sl, r4
58: f834 1b02 ldrh.w r1, [r4], #2
5c: 8842 ldrh r2, [r0, #2]
5e: f3c1 07c4 ubfx r7, r1, #3, #5
62: f3c2 09c4 ubfx r9, r2, #3, #5
66: f001 0c07 and.w ip, r1, #7
6a: f3c1 2804 ubfx r8, r1, #8, #5
6e: fb07 f705 mul.w r7, r7, r5
72: 0b49 lsrs r1, r1, #13
74: fb06 7709 mla r7, r6, r9, r7
78: ea41 01cc orr.w r1, r1, ip, lsl #3
7c: f3c2 2904 ubfx r9, r2, #8, #5
80: f002 0c07 and.w ip, r2, #7
84: fb08 f805 mul.w r8, r8, r5
88: 0b52 lsrs r2, r2, #13
8a: fb01 f105 mul.w r1, r1, r5
8e: 097f lsrs r7, r7, #5
90: fb06 8809 mla r8, r6, r9, r8
94: ea42 02cc orr.w r2, r2, ip, lsl #3
98: fb06 1202 mla r2, r6, r2, r1
9c: f007 07f8 and.w r7, r7, #248 ; 0xf8
a0: f408 58f8 and.w r8, r8, #7936 ; 0x1f00
a4: 0a12 lsrs r2, r2, #8
a6: ea48 0107 orr.w r1, r8, r7
aa: ea41 3142 orr.w r1, r1, r2, lsl #13
ae: f3c2 02c2 ubfx r2, r2, #3, #3
b2: 430a orrs r2, r1
b4: 4554 cmp r4, sl
b6: f820 2f02 strh.w r2, [r0, #2]!
ba: d1cd bne.n 58 <blend+0x58>
bc: 9902 ldr r1, [sp, #8]
be: 448b add fp, r1
c0: 9901 ldr r1, [sp, #4]
c2: 3301 adds r3, #1
c4: 428b cmp r3, r1
c6: fa1f fb8b uxth.w fp, fp
ca: d006 beq.n da <blend+0xda>
cc: 9a00 ldr r2, [sp, #0]
ce: 9903 ldr r1, [sp, #12]
d0: 428a cmp r2, r1
d2: 4654 mov r4, sl
d4: dbb5 blt.n 42 <blend+0x42>
d6: 46a2 mov sl, r4
d8: e7f0 b.n bc <blend+0xbc>
da: b006 add sp, #24
dc: e8bd 0ff0 ldmia.w sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
e0: 4770 bx lr
e2: bf00 nop
SIMD disassembly:
sassembly of section .text.blend:
00000000 <blend>:
0: e92d 0ff0 stmdb sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
4: 6846 ldr r6, [r0, #4]
6: 68c4 ldr r4, [r0, #12]
8: b086 sub sp, #24
a: 199e adds r6, r3, r6
c: 9601 str r6, [sp, #4]
e: 9200 str r2, [sp, #0]
10: 68ca ldr r2, [r1, #12]
12: f89d 5038 ldrb.w r5, [sp, #56] ; 0x38
16: 9204 str r2, [sp, #16]
18: 9a01 ldr r2, [sp, #4]
1a: f5c5 7680 rsb r6, r5, #256 ; 0x100
1e: 4293 cmp r3, r2
20: ea46 4505 orr.w r5, r6, r5, lsl #16
24: da5d bge.n e2 <blend+0xe2>
26: 8809 ldrh r1, [r1, #0]
28: 6802 ldr r2, [r0, #0]
2a: 9102 str r1, [sp, #8]
2c: fb03 fb01 mul.w fp, r3, r1
30: 9900 ldr r1, [sp, #0]
32: 4411 add r1, r2
34: 9103 str r1, [sp, #12]
36: 0052 lsls r2, r2, #1
38: 9205 str r2, [sp, #20]
3a: 9903 ldr r1, [sp, #12]
3c: 9a00 ldr r2, [sp, #0]
3e: 428a cmp r2, r1
40: fa1f fb8b uxth.w fp, fp
44: da4b bge.n de <blend+0xde>
46: 4610 mov r0, r2
48: 4458 add r0, fp
4a: f100 4000 add.w r0, r0, #2147483648 ; 0x80000000
4e: 9a04 ldr r2, [sp, #16]
50: f8dd a014 ldr.w sl, [sp, #20]
54: 3801 subs r0, #1
56: eb02 0040 add.w r0, r2, r0, lsl #1
5a: 44a2 add sl, r4
5c: f834 2b02 ldrh.w r2, [r4], #2
60: 8841 ldrh r1, [r0, #2]
62: f3c2 07c4 ubfx r7, r2, #3, #5
66: f3c1 06c4 ubfx r6, r1, #3, #5
6a: ea46 4707 orr.w r7, r6, r7, lsl #16
6e: fb25 f707 smuad r7, r5, r7
72: f001 0907 and.w r9, r1, #7
76: ea4f 3c51 mov.w ip, r1, lsr #13
7a: f002 0607 and.w r6, r2, #7
7e: ea4f 3852 mov.w r8, r2, lsr #13
82: ea4c 0cc9 orr.w ip, ip, r9, lsl #3
86: ea48 06c6 orr.w r6, r8, r6, lsl #3
8a: ea4c 4606 orr.w r6, ip, r6, lsl #16
8e: fb25 f606 smuad r6, r5, r6
92: f3c1 2104 ubfx r1, r1, #8, #5
96: f3c2 2204 ubfx r2, r2, #8, #5
9a: ea41 4202 orr.w r2, r1, r2, lsl #16
9e: fb25 f202 smuad r2, r5, r2
a2: f3c6 260f ubfx r6, r6, #8, #16
a6: 097f lsrs r7, r7, #5
a8: f3c6 01c2 ubfx r1, r6, #3, #3
ac: f007 07f8 and.w r7, r7, #248 ; 0xf8
b0: 430f orrs r7, r1
b2: f402 52f8 and.w r2, r2, #7936 ; 0x1f00
b6: ea47 3646 orr.w r6, r7, r6, lsl #13
ba: 4316 orrs r6, r2
bc: 4554 cmp r4, sl
be: f820 6f02 strh.w r6, [r0, #2]!
c2: d1cb bne.n 5c <blend+0x5c>
c4: 9902 ldr r1, [sp, #8]
c6: 448b add fp, r1
c8: 9901 ldr r1, [sp, #4]
ca: 3301 adds r3, #1
cc: 428b cmp r3, r1
ce: fa1f fb8b uxth.w fp, fp
d2: d006 beq.n e2 <blend+0xe2>
d4: 9a00 ldr r2, [sp, #0]
d6: 9903 ldr r1, [sp, #12]
d8: 428a cmp r2, r1
da: 4654 mov r4, sl
dc: dbb3 blt.n 46 <blend+0x46>
de: 46a2 mov sl, r4
e0: e7f0 b.n c4 <blend+0xc4>
e2: b006 add sp, #24
e4: e8bd 0ff0 ldmia.w sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
e8: 4770 bx lr
ea: bf00 nop