0

So this is an ARM program called armkey reads a line of ASCII text with printable characters and control characters (00h-7Fh) from the file key.in into an input string. The read string ARM SWI will remove any end of line indication or characters and replace them with a single binary 0. If there are no more lines the read string ARM SWI will return a count of zero for the number of bytes read.

My code works but can someone please suggest ways to make this more efficient? for example I was thinking of using ORR and AND for the compares but i cannot seem to make it work and keep running into an infinite loop. any help is appreciated.

    ;------------------------------------------------;
    ; File:     armkey.s
    ;
    ; Function: This program reads a line of ASCII text
    ;           from a file and formats the output into 
    ;           key.out 
    ;
    ; Author:
    ;
    ; Changes:  Date        Reason
    ;           -------------------------------------
    ;           04/05/2018  Original Version
    ;----------------------------------------------------

    ;----------------------------------
    ; Software Interrupt values
    ;----------------------------------      
            .equ SWI_Open,  0x66     ;Open  a file
            .equ SWI_Close, 0x68     ;Close a file
           .equ SWI_PrStr, 0x69     ;Write a null-ending string
           .equ SWI_RdStr, 0x6a     ;Read a string and terminate with null char
           .equ SWI_Exit,  0x11     ;Stop execution
    ;----------------------------------

             .global   _start
             .text

    _start:
    ;----------------------------------
    ; open input file
    ; - r0 points to the file name
    ; - r1 0 for input
    ; - the open swi is 66h
    ; - after the open r0 will have the file handle
    ;----------------------------------
             ldr  r0, =InFileName     ;r0 points to the file name
             ldr  r1, =0              ;r1 = 0 specifies the file is input
             swi  SWI_Open            ;open the file ... r0 will be the file 
    handle
             ldr  r1, =InFileHandle   ;r1 points to handle location
             str  r0, [r1]            ;store the file handle
    ;----------------------------------


    ;----------------------------------
    ; open output file
    ; - r0 points to the file name
    ; - r1 1 for output
    ; - the open swi is 66h
    ; - after the open r0 will have the file handle
    ;---------------------------------- 
             ldr  r0, =OutFileName    ;r0 points to the file name
             ldr  r1, =1              ;r1 = 1 specifies the file is output
             swi  SWI_Open            ;open the file ... r0 will be the file 
    handle
             ldr  r1, =OutFileHandle  ;r1 points to handle location
             str  r0, [r1]            ;store the file handle
    ;----------------------------------


    ;----------------------------------
    ; read a string from the input file
    ; - r0 contains the file handle
    ; - r1 points to the input string buffer
    ; - r2 contains the max number of characters to read
    ; - the read swi is 6ah
    ; - the input string will be terminated with 0  
;----------------------------------                          ;
_read:       
         ldr  r0, =InFileHandle   ;r0 points to the input file handle
         ldr  r0, [r0]            ;r0 has the input file handle
         ldr  r1, =InString         ;r1 points to the input string
         ldr  r2, =80            ;r2 has the max size of the input string
         swi  SWI_RdStr           ;read a string from the input file
         cmp  r0,#0               ;no characters read means EOF
         beq  _exit               ;so close and exit
;----------------------------------
;// Implement key here

;----------------------------------
; Move the input string to the output string
; This code uses post increment of the input pointer,
; but not for the output pointer ... just to show both techniques
;----------------------------------
         ldr  r0, =InString       ;r0 points to the input  string
         ldr  r1, =OutString      ;r1 points to the output string
_loop:                            ;

        ldrb r2, [r0], #1        ;get the next input byte


        cmp   r2, #0x20   ; Was the character a space

        beq   _output             ; Print it
        cmp   r2, #0x00             ; Is the character a 0 
        beq  _output                ;Output it
_lower: 


        cmp   r2, #0x41           ; Check if the charactet is nor a letter
        blt     _loop             ; Throw out the character if nor a letter
        cmp   r2, #0x5a           ; Every letter will be less than Z
        ble   _output             ; If the character is valid output it
        sub   r2, r2, #0x20       ; Subtract ing 20 seitches an upper case letter to lower
        b     _lower              ; Check to see if lowercase letter



_output:        
         strb r2, [r1]            ;store it in the output buffer
         cmp  r2, #0x00             ;was it the null terminator
         beq  _finloop            ;yes ... exit
         add  r1, r1, #1          ;no  ... advance the output pointer
         b    _loop               ;loop
_finloop:                         ;
;----------------------------------


;----------------------------------
; Write the outputs string
; Then writes a CR LF pair
;----------------------------------

         ldr  r0, =OutFileHandle  ;r0 points to the output file handle
         ldr  r0, [r0]            ;r0 has the output file handle
         ldr  r1, =OutString      ;r1 points to the output string
         swi  SWI_PrStr           ;write the null terminated string

         ldr  r1, =CRLF           ;r1 points to the CRLF string
         swi  SWI_PrStr           ;write the null terminated string
         bal  _read               ;read the next line
;----------------------------------




;----------------------------------
; Close input and output files
; Terminate the program
;----------------------------------
_exit:                            ;
         ldr  r0, =InFileHandle   ;r0 points to the input  file handle
         ldr  r0, [r0]            ;r0 has the input file handle
         swi  SWI_Close           ;close the file
                                  ;
         ldr  r0, =OutFileHandle  ;r0 points to the output file handle
         ldr  r0, [r0]            ;r0 has the output file handle
         swi  SWI_Close           ;close the file
                                  ;
         swi  SWI_Exit            ;terminate the program
;----------------------------------


         .data
;----------------------------------
InFileHandle:  .skip 4            ;4 byte field to hold the input  file handle
OutFileHandle: .skip 4            ;4 byte field to hold the output file handle
                                  ;
InFileName:    .asciz "KEY.IN"   ;Input  file name, null terminated
                                  ;
InString:      .skip 128         ;reserve a 128 byte string for input
OutString:     .skip 128         ;reserve a 128 byte string for output
                                  ;
CRLF:          .byte 13, 10, 0    ;CR LF
                                  ;
OutFileName:   .asciz "KEY.OUT"  ;Output file name, null terminated
;----------------------------------


         .end

and the output is

ABCDEFGHIJKLMNOPQRSTUVWXYZ 
ABCDEFGHIJKLMNOPQRSTUVWXYZ

1 Answers1

1

You're going to spend most of your time on I/O in this code, but we can imagine the buffers were really large so the memory copying was non-trivial compared to the overhead of an SWI.

First of all, you might be able to do the left-packing (filtering an array based on a compare) with NEON SIMD instructions. But ARM is missing some nice x86 features that make left-packing efficient with SSSE3 or AVX2. (e.g. pmovmskb to turn a vector compare result into a bitmask in an integer register, where you can use it as a table index to look up a shuffle mask. And popcnt it to see how far to advance your output pointer.) I'm not sure how to even efficiently implement strchr with NEON :/

Still, major speedups could be possible this way, if you can process more than one byte at a time. Branchless is nice, too, to avoid branch mispredicts.


You don't need a separate output buffer; you can filter your array in-place. Search for the first ' ', then run src and dst pointers within the same buffer. The src always stays ahead of dst, getting farther ahead for every character you skip. But not so far apart that it's not still hot in cache when you store, so you avoid all the read-for-ownership traffic of storing to cold cache lines. And the total amount of memory you touch is about half, so you evict less data from cache.


Rearrange your loops so a conditional branch is at the bottom, and there's no b back to the top. Sometimes this requires skewing the loop so you have to peel part of the last iteration and repeat some of the loop body after the loop, and you have to do some setup before falling into the first iteration, or jump into the middle of the loop on entry.


ldr r0, =InFileHandle: Use a call-preserved register to keep a pointer to your data area in a register across swi instructions. (save/restore it with push/pop at the start/end of your function). Then instead of needing to construct each pointer separately, you can just use loads with different offsets for different data items. e.g. ldr r0, [r4, #InFileHandle-Base] if that's the right syntax.

Or for example, in your current code with separate input/output buffers, you have

     ldr  r0, =InString       ;r0 points to the input  string
     ldr  r1, =OutString      ;r1 points to the output string

You could replace the 2nd instruction with add r1, r0, #128, which is cheaper (an ALU instruction instead of a PC-relative load from a literal pool, or however the assembler decides to construct a constant for you.)

Or better, save/restore a couple registers so you can keep both file descriptors in registers instead of reserving any static storage space for them.

Peter Cordes
  • 328,167
  • 45
  • 605
  • 847