I wrote the following structure for use in an Arduino software PWM library I'm making, to PWM up to 20 pins at once (on an Uno) or 70 pins at once (on a Mega).
As written, the ISR portion of the code (eRCaGuy_SoftwarePWMupdate()), processing an array of this structure, takes 133us to run. VERY strangely, however, if I uncomment the line "byte flags1;" (in the struct), though flags1 is NOT used anywhere yet, the ISR now takes 158us to run. Then, if I uncomment "byte flags2;" so that BOTH flags are now uncommented, the runtime drops back down to where it was before (133us).
Why is this happening!? And how do I fix it? (ie: I want to ensure consistently fast code, for this particular function, not code that is inexplicably fickle). Adding one byte dramatically slows down the code, yet adding two makes no change at all.
I am trying to optimize the code (and I needed to add another feature too, requiring a single byte for flags), but I don't understand why adding one unused byte slows the code down by 25us, yet adding two unused bytes doesn't change the run-time at all.
I need to understand this to ensure my optimizations consistently work.
In .h file (my original struct, using C-style typedef'ed struct):
typedef struct softPWMpin //global struct
{
//VOLATILE VARIBLES (WILL BE ACCESSED IN AND OUTSIDE OF ISRs)
//for pin write access:
volatile byte pinBitMask;
volatile byte* volatile p_PORT_out; //pointer to port output register; NB: the 1st "volatile" says the port itself (1 byte) is volatile, the 2nd "volatile" says the *pointer* itself (2 bytes, pointing to the port) is volatile.
//for PWM output:
volatile unsigned long resolution;
volatile unsigned long PWMvalue; //Note: duty cycle = PWMvalue/(resolution - 1) = PWMvalue/topValue;
//ex: if resolution is 256, topValue is 255
//if PWMvalue = 255, duty_cycle = PWMvalue/topValue = 255/255 = 1 = 100%
//if PWMvalue = 50, duty_cycle = PWMvalue/topValue = 50/255 = 0.196 = 19.6%
//byte flags1;
//byte flags2;
//NON-VOLATILE VARIABLES (WILL ONLY BE ACCESSED INSIDE AN ISR, OR OUTSIDE AN ISR, BUT NOT BOTH)
unsigned long counter; //incremented each time update() is called; goes back to zero after reaching topValue; does NOT need to be volatile, since only the update function updates this (it is read-to or written from nowhere else)
} softPWMpin_t;
In .h file (new, using C++ style struct....to see if it makes any difference, per the comments. It appears to make no difference in any way, including run-time and compiled size)
struct softPWMpin //global struct
{
//VOLATILE VARIBLES (WILL BE ACCESSED IN AND OUTSIDE OF ISRs)
//for pin write access:
volatile byte pinBitMask;
volatile byte* volatile p_PORT_out; //pointer to port output register; NB: the 1st "volatile" says the port itself (1 byte) is volatile, the 2nd "volatile" says the *pointer* itself (2 bytes, pointing to the port) is volatile.
//for PWM output:
volatile unsigned long resolution;
volatile unsigned long PWMvalue; //Note: duty cycle = PWMvalue/(resolution - 1) = PWMvalue/topValue;
//ex: if resolution is 256, topValue is 255
//if PWMvalue = 255, duty_cycle = PWMvalue/topValue = 255/255 = 1 = 100%
//if PWMvalue = 50, duty_cycle = PWMvalue/topValue = 50/255 = 0.196 = 19.6%
//byte flags1;
//byte flags2;
//NON-VOLATILE VARIABLES (WILL ONLY BE ACCESSED INSIDE AN ISR, OR OUTSIDE AN ISR, BUT NOT BOTH)
unsigned long counter; //incremented each time update() is called; goes back to zero after reaching topValue; does NOT need to be volatile, since only the update function updates this (it is read-to or written from nowhere else)
};
In .cpp file (here I am creating the array of structs, and here is the update function which is called at a fixed rate in an ISR, via timer interrupts):
//static softPWMpin_t PWMpins[MAX_NUMBER_SOFTWARE_PWM_PINS]; //C-style, old, MAX_NUMBER_SOFTWARE_PWM_PINS = 20; static to give it file scope only
static softPWMpin PWMpins[MAX_NUMBER_SOFTWARE_PWM_PINS]; //C++-style, old, MAX_NUMBER_SOFTWARE_PWM_PINS = 20; static to give it file scope only
//This function must be placed within an ISR, to be called at a fixed interval
void eRCaGuy_SoftwarePWMupdate()
{
//Forced nonatomic block (ie: interrupts *enabled*)
byte SREG_old = SREG; //[1 clock cycle]
interrupts(); //[1 clock cycle] turn interrupts ON to allow *nested interrupts* (ex: handling of time-sensitive timing, such as reading incoming PWM signals or counting Timer2 overflows)
{
//first, increment all counters of attached pins (ie: where the value != PIN_NOT_ATTACHED)
//pinMapArray
for (byte pin=0; pin<NUM_DIGITAL_PINS; pin++)
{
byte i = pinMapArray[pin]; //[2 clock cycles: 0.125us]; No need to turn off interrupts to read this volatile variable here since reading pinMapArray[pin] is an atomic operation (since it's a single byte)
if (i != PIN_NOT_ATTACHED) //if the pin IS attached, increment counter and decide what to do with pin...
{
//Read volatile variables ONE time, all at once, to optimize code (volatile variables take more time to read [I know] since their values can't be recalled from registers [I believe]).
noInterrupts(); //[1 clock cycle] turn off interrupts to read non-atomic volatile variables that could be updated simultaneously right now in another ISR, since nested interrupts are enabled here
unsigned long resolution = PWMpins[i].resolution;
unsigned long PWMvalue = PWMpins[i].PWMvalue;
volatile byte* p_PORT_out = PWMpins[i].p_PORT_out; //[0.44us raw: 5 clock cycles, 0.3125us]
interrupts(); //[1 clock cycle]
//handle edge cases FIRST (PWMvalue==0 and PMWvalue==topValue), since if an edge case exists we should NOT do the main case handling below
if (PWMvalue==0) //the PWM command is 0% duty cycle
{
fastDigitalWrite(p_PORT_out,PWMpins[i].pinBitMask,LOW); //write LOW [1.19us raw: 17 clock cycles, 1.0625us]
}
else if (PWMvalue==resolution-1) //the PWM command is 100% duty cycle
{
fastDigitalWrite(p_PORT_out,PWMpins[i].pinBitMask,HIGH); //write HIGH [0.88us raw; 12 clock cycles, 0.75us]
}
//THEN handle main cases (PWMvalue is > 0 and < topValue)
else //(0% < PWM command < 100%)
{
PWMpins[i].counter++; //not volatile
if (PWMpins[i].counter >= resolution)
{
PWMpins[i].counter = 0; //reset
fastDigitalWrite(p_PORT_out,PWMpins[i].pinBitMask,HIGH);
}
else if (PWMpins[i].counter>=PWMvalue)
{
fastDigitalWrite(p_PORT_out,PWMpins[i].pinBitMask,LOW); //write LOW [1.18us raw: 17 clock cycles, 1.0625us]
}
}
}
}
}
SREG = SREG_old; //restore interrupt enable status
}
Update (5/4/2015, 8:58pm):
I've tried changing the alignment via the aligned attribute. My compiler is gcc.
Here's how I modified the struct in the .h file to add the attribute (it's on the very last line). Note that I also changed the order of the struct members to be largest to smallest:
struct softPWMpin //C++ style
{
volatile unsigned long resolution;
volatile unsigned long PWMvalue; //Note: duty cycle = PWMvalue/(resolution - 1) = PWMvalue/topValue;
//ex: if resolution is 256, topValue is 255
//if PWMvalue = 255, duty_cycle = PWMvalue/topValue = 255/255 = 1 = 100%
//if PWMvalue = 50, duty_cycle = PWMvalue/topValue = 50/255 = 0.196 = 19.6%
unsigned long counter; //incremented each time update() is called; goes back to zero after reaching topValue; does NOT need to be volatile, since only the update function updates this (it is read-to or written from nowhere else)
volatile byte* volatile p_PORT_out; //pointer to port output register; NB: the 1st "volatile" says the port itself (1 byte) is volatile, the 2nd "volatile" says the *pointer* itself (2 bytes, pointing to the port) is volatile.
volatile byte pinBitMask;
// byte flags1;
// byte flags2;
} __attribute__ ((aligned));
Source: https://gcc.gnu.org/onlinedocs/gcc-3.1/gcc/Type-Attributes.html
Here's the results of what I've tried so far:
__attribute__ ((aligned));
__attribute__ ((aligned(1)));
__attribute__ ((aligned(2)));
__attribute__ ((aligned(4)));
__attribute__ ((aligned(8)));
None of them seem to fix the problem I see when I add one flag byte. When leaving the flag bytes commented out the 2-8 ones make the run-time longer than 133us, and the align 1 one makes no difference (run-time stays 133us), implying that it is what is already occurring with the attribute not added at all. Additionally, even when I use the align options of 2, 4, 8, the sizeof(PWMvalue) function still returns the exact number of bytes in the struct, with no additional padding.
...still don't know what's going on...
Update, 11:02pm:
(see comments below) Optimization levels definitely have an effect. When I changed the compiler optimization level from -Os to -O2, for instance, the base case remained at 133us (as before), uncommenting flags1 gave me 120us (vs 158us), and uncommenting flags1 and flags2 simultaneously gave me 132us (vs 133us). This still doesn't answer my question, but I've at least learned that optimization levels exist, and how to change them.
Summary of above paragraph:
Processing time of (of eRCaGuy_SoftwarePWMupdate() function)
Optimization No flags w/flags1 w/flags1+flags2
Os 133us 158us 133us
O2 132us 120us 132us
Memory Use (bytes: flash/global vars SRAM/sizeof(softPWMpin)/sizeof(PWMpins))
Optimization No flags w/flags1 w/flags1+flags2
Os 4020/591/15/300 3950/611/16/320 4020/631/17/340
O2 4154/591/15/300 4064/611/16/320 4154/631/17/340
Update (5/5/2015, 4:05pm):
- Just updated the tables above with more detailed information.
- Added resources below.
Resources:
Sources for gcc compiler optimization levels:
- https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html
- https://gcc.gnu.org/onlinedocs/gnat_ugn/Optimization-Levels.html
- http://www.rapidtables.com/code/linux/gcc/gcc-o.htm
How to change compiler settings in Arduino IDE:
- http://www.instructables.com/id/Arduino-IDE-16x-compiler-optimisations-faster-code/
Info on structure packing:
- http://www.catb.org/esr/structure-packing/
Data Alignment:
- http://www.songho.ca/misc/alignment/dataalign.html
Writing efficient C code for an 8-bit Atmel AVR Microcontroller
- AVR035 Efficient C Coding for AVR - doc1497 - http://www.atmel.com/images/doc1497.pdf
- AVR4027 Tips and Tricks to Optimize Your C Code for 8-bit AVR Microcontrollers - doc8453 - http://www.atmel.com/images/doc8453.pdf
Additional info that may prove useful to help you help me with my problem:
FOR NO FLAGS (flags1 and flags2 commented out), and Os Optimization
Build Preferences (from buildprefs.txt file where Arduino spits out compiled code):
For me: "C:\Users\Gabriel\AppData\Local\Temp\build8427371380606368699.tmp"
build.arch = AVR
build.board = AVR_UNO
build.core = arduino
build.core.path = C:\Program Files (x86)\Arduino\hardware\arduino\avr\cores\arduino
build.extra_flags =
build.f_cpu = 16000000L
build.mcu = atmega328p
build.path = C:\Users\Gabriel\AppData\Local\Temp\build8427371380606368699.tmp
build.project_name = software_PWM_fade13_speed_test2.cpp
build.system.path = C:\Program Files (x86)\Arduino\hardware\arduino\avr\system
build.usb_flags = -DUSB_VID={build.vid} -DUSB_PID={build.pid} '-DUSB_MANUFACTURER={build.usb_manufacturer}' '-DUSB_PRODUCT={build.usb_product}'
build.usb_manufacturer =
build.variant = standard
build.variant.path = C:\Program Files (x86)\Arduino\hardware\arduino\avr\variants\standard
build.verbose = true
build.warn_data_percentage = 75
compiler.S.extra_flags =
compiler.S.flags = -c -g -x assembler-with-cpp
compiler.ar.cmd = avr-ar
compiler.ar.extra_flags =
compiler.ar.flags = rcs
compiler.c.cmd = avr-gcc
compiler.c.elf.cmd = avr-gcc
compiler.c.elf.extra_flags =
compiler.c.elf.flags = -w -Os -Wl,--gc-sections
compiler.c.extra_flags =
compiler.c.flags = -c -g -Os -w -ffunction-sections -fdata-sections -MMD
compiler.cpp.cmd = avr-g++
compiler.cpp.extra_flags =
compiler.cpp.flags = -c -g -Os -w -fno-exceptions -ffunction-sections -fdata-sections -fno-threadsafe-statics -MMD
compiler.elf2hex.cmd = avr-objcopy
compiler.elf2hex.extra_flags =
compiler.elf2hex.flags = -O ihex -R .eeprom
compiler.ldflags =
compiler.objcopy.cmd = avr-objcopy
compiler.objcopy.eep.extra_flags =
compiler.objcopy.eep.flags = -O ihex -j .eeprom --set-section-flags=.eeprom=alloc,load --no-change-warnings --change-section-lma .eeprom=0
compiler.path = {runtime.ide.path}/hardware/tools/avr/bin/
compiler.size.cmd = avr-size
Some of the Assembly: (Os, no flags):
00000328 <_Z25eRCaGuy_SoftwarePWMupdatev>:
328: 8f 92 push r8
32a: 9f 92 push r9
32c: af 92 push r10
32e: bf 92 push r11
330: cf 92 push r12
332: df 92 push r13
334: ef 92 push r14
336: ff 92 push r15
338: 0f 93 push r16
33a: 1f 93 push r17
33c: cf 93 push r28
33e: df 93 push r29
340: 0f b7 in r16, 0x3f ; 63
342: 78 94 sei
344: 20 e0 ldi r18, 0x00 ; 0
346: 30 e0 ldi r19, 0x00 ; 0
348: 1f e0 ldi r17, 0x0F ; 15
34a: f9 01 movw r30, r18
34c: e8 5a subi r30, 0xA8 ; 168
34e: fe 4f sbci r31, 0xFE ; 254
350: 80 81 ld r24, Z
352: 8f 3f cpi r24, 0xFF ; 255
354: 09 f4 brne .+2 ; 0x358 <_Z25eRCaGuy_SoftwarePWMupdatev+0x30>
356: 67 c0 rjmp .+206 ; 0x426 <_Z25eRCaGuy_SoftwarePWMupdatev+0xfe>
358: f8 94 cli
35a: 90 e0 ldi r25, 0x00 ; 0
35c: 18 9f mul r17, r24
35e: f0 01 movw r30, r0
360: 19 9f mul r17, r25
362: f0 0d add r31, r0
364: 11 24 eor r1, r1
366: e4 59 subi r30, 0x94 ; 148
368: fe 4f sbci r31, 0xFE ; 254
36a: c0 80 ld r12, Z
36c: d1 80 ldd r13, Z+1 ; 0x01
36e: e2 80 ldd r14, Z+2 ; 0x02
370: f3 80 ldd r15, Z+3 ; 0x03
372: 44 81 ldd r20, Z+4 ; 0x04
374: 55 81 ldd r21, Z+5 ; 0x05
376: 66 81 ldd r22, Z+6 ; 0x06
378: 77 81 ldd r23, Z+7 ; 0x07
37a: 04 84 ldd r0, Z+12 ; 0x0c
37c: f5 85 ldd r31, Z+13 ; 0x0d
37e: e0 2d mov r30, r0
380: 78 94 sei
382: 41 15 cp r20, r1
384: 51 05 cpc r21, r1
386: 61 05 cpc r22, r1
388: 71 05 cpc r23, r1
38a: 51 f4 brne .+20 ; 0x3a0 <_Z25eRCaGuy_SoftwarePWMupdatev+0x78>
38c: 18 9f mul r17, r24
38e: d0 01 movw r26, r0
390: 19 9f mul r17, r25
392: b0 0d add r27, r0
394: 11 24 eor r1, r1
396: a4 59 subi r26, 0x94 ; 148
398: be 4f sbci r27, 0xFE ; 254
39a: 1e 96 adiw r26, 0x0e ; 14
39c: 4c 91 ld r20, X
39e: 3b c0 rjmp .+118 ; 0x416 <_Z25eRCaGuy_SoftwarePWMupdatev+0xee>
3a0: 46 01 movw r8, r12
3a2: 57 01 movw r10, r14
3a4: a1 e0 ldi r26, 0x01 ; 1
3a6: 8a 1a sub r8, r26
3a8: 91 08 sbc r9, r1
3aa: a1 08 sbc r10, r1
3ac: b1 08 sbc r11, r1
3ae: 48 15 cp r20, r8
3b0: 59 05 cpc r21, r9
3b2: 6a 05 cpc r22, r10
3b4: 7b 05 cpc r23, r11
3b6: 51 f4 brne .+20 ; 0x3cc <_Z25eRCaGuy_SoftwarePWMupdatev+0xa4>
3b8: 18 9f mul r17, r24
3ba: d0 01 movw r26, r0
3bc: 19 9f mul r17, r25
3be: b0 0d add r27, r0
3c0: 11 24 eor r1, r1
3c2: a4 59 subi r26, 0x94 ; 148
3c4: be 4f sbci r27, 0xFE ; 254
3c6: 1e 96 adiw r26, 0x0e ; 14
3c8: 9c 91 ld r25, X
3ca: 1c c0 rjmp .+56 ; 0x404 <_Z25eRCaGuy_SoftwarePWMupdatev+0xdc>
3cc: 18 9f mul r17, r24
3ce: e0 01 movw r28, r0
3d0: 19 9f mul r17, r25
3d2: d0 0d add r29, r0
3d4: 11 24 eor r1, r1
3d6: c4 59 subi r28, 0x94 ; 148
3d8: de 4f sbci r29, 0xFE ; 254
3da: 88 85 ldd r24, Y+8 ; 0x08
3dc: 99 85 ldd r25, Y+9 ; 0x09
3de: aa 85 ldd r26, Y+10 ; 0x0a
3e0: bb 85 ldd r27, Y+11 ; 0x0b
3e2: 01 96 adiw r24, 0x01 ; 1
3e4: a1 1d adc r26, r1
3e6: b1 1d adc r27, r1
3e8: 88 87 std Y+8, r24 ; 0x08
3ea: 99 87 std Y+9, r25 ; 0x09
3ec: aa 87 std Y+10, r26 ; 0x0a
3ee: bb 87 std Y+11, r27 ; 0x0b
3f0: 8c 15 cp r24, r12
3f2: 9d 05 cpc r25, r13
3f4: ae 05 cpc r26, r14
3f6: bf 05 cpc r27, r15
3f8: 40 f0 brcs .+16 ; 0x40a <_Z25eRCaGuy_SoftwarePWMupdatev+0xe2>
3fa: 18 86 std Y+8, r1 ; 0x08
3fc: 19 86 std Y+9, r1 ; 0x09
3fe: 1a 86 std Y+10, r1 ; 0x0a
400: 1b 86 std Y+11, r1 ; 0x0b
402: 9e 85 ldd r25, Y+14 ; 0x0e
404: 80 81 ld r24, Z
406: 89 2b or r24, r25
408: 0d c0 rjmp .+26 ; 0x424 <_Z25eRCaGuy_SoftwarePWMupdatev+0xfc>
40a: 84 17 cp r24, r20
40c: 95 07 cpc r25, r21
40e: a6 07 cpc r26, r22
410: b7 07 cpc r27, r23
412: 48 f0 brcs .+18 ; 0x426 <_Z25eRCaGuy_SoftwarePWMupdatev+0xfe>
414: 4e 85 ldd r20, Y+14 ; 0x0e
416: 80 81 ld r24, Z
418: 90 e0 ldi r25, 0x00 ; 0
41a: 50 e0 ldi r21, 0x00 ; 0
41c: 40 95 com r20
41e: 50 95 com r21
420: 84 23 and r24, r20
422: 95 23 and r25, r21
424: 80 83 st Z, r24
426: 2f 5f subi r18, 0xFF ; 255
428: 3f 4f sbci r19, 0xFF ; 255
42a: 24 31 cpi r18, 0x14 ; 20
42c: 31 05 cpc r19, r1
42e: 09 f0 breq .+2 ; 0x432 <_Z25eRCaGuy_SoftwarePWMupdatev+0x10a>
430: 8c cf rjmp .-232 ; 0x34a <_Z25eRCaGuy_SoftwarePWMupdatev+0x22>
432: 0f bf out 0x3f, r16 ; 63
434: df 91 pop r29
436: cf 91 pop r28
438: 1f 91 pop r17
43a: 0f 91 pop r16
43c: ff 90 pop r15
43e: ef 90 pop r14
440: df 90 pop r13
442: cf 90 pop r12
444: bf 90 pop r11
446: af 90 pop r10
448: 9f 90 pop r9
44a: 8f 90 pop r8
44c: 08 95 ret