0

Down below is my code from a MIPS hw assignment where we have to multiply two matrixes. Our task was to implement the matrix_multiply function and matrix_print function

.data

matrix_a:    .word 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
matrix_b:    .word 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
result:      .word 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0

newline:     .asciiz "\n"
tab:         .asciiz "\t"


#############################################################################
#############################################################################
## Text segment
#############################################################################
#############################################################################

.text                  # this is program code
.align 2               # instructions must be on word boundaries
.globl main            # main is a global label
.globl multiply
.globl matrix_multiply
.globl matrix_print
.globl matrix_ask

#############################################################################
matrix_ask:
#############################################################################
# Ask the user for the current matrix residing in the $a0 register
    sub $sp, $sp, 4
    sw $ra, 0($sp)

    # init our counter
    li $t0, 0
    # t1 holds our the address of our matrix
    move $t1, $a0

ma_head:
# if counter less than 16, go to ma_body
# else go to exit
    li $t2, 16
    blt $t0, $t2, ma_body
    j ma_exit

ma_body:
    # read int
    li $v0, 5
    syscall
    li $t2, 4
    # ints are 4 bytes
    multu $t0, $t2
    mflo $t2
    add $t2, $t2, $t1
    sw $v0, 0($t2)
    j ma_latch

ma_latch:
    addi $t0, $t0, 1
    j ma_head

ma_exit:
    lw $ra, 0($sp)
    add $sp, $sp, 4
    jr $ra

#############################################################################
main:
#############################################################################
    # alloc stack and store $ra
    sub $sp, $sp, 4
    sw $ra, 0($sp)

    # load A, B, and result into arg regs
    la $a0, matrix_a
    jal matrix_ask
    la $a0, matrix_b
    jal matrix_ask

    la $a0, matrix_a
    la $a1, matrix_b
    la $a2, result
    jal matrix_multiply

    la $a0, result
    jal matrix_print

    # restore $ra, free stack and return
    lw $ra, 0($sp)
    add $sp, $sp, 4
    jr $ra

##############################################################################
multiply:
##############################################################################  
# mult subroutine $a0 times $a1 and returns in $v0

    # start with $t0 = 0
    add $t0,$zero,$zero
mult_loop:
    # loop on a1
    beq $a1,$zero,mult_eol

    add $t0,$t0,$a0
    sub $a1,$a1,1
    j mult_loop

mult_eol:
    # put the result in $v0
    add $v0,$t0,$zero

    jr $ra

##############################################################################
matrix_multiply: 
##############################################################################
# mult matrices A and B together of square size N and store in result.

    # alloc stack and store regs
    sub $sp, $sp, 24
    sw $ra, 0($sp)
    sw $a0, 4($sp)
    sw $a1, 8($sp)
    sw $s0, 12($sp)
    sw $s1, 16($sp)
    sw $s2, 20($sp)


    add $t5, $zero, $zero   #Set t5 to zero. i = 0
    add $t6, $zero, $zero   #Set t6 to zero. j = 0
    add $t7, $zero, $zero   #Set t7 to zero. k = 0


        #setup for i loop
    iLoop:
        beq $t5, 4, exit
        j jLoop

        #setup for j loop
    jLoop:
        beq $t6, 4, iEnd
        #setup for k loop

    kLoop:
        beq $t7, 4, jEnd

        # compute A[i][k] address and load into $t3
        # A[i][k] = A+4*((4*i) + k)
        sll $t3, $t5, 2         # Store 4*i in $t3
        addu $t3, $t3, $t7       # Adds $t3 to k
        sll $t8, $t3, 2         # Computes 4*($t3) = 4*(4*i+k) by temporarily storing the product in $t8
        move $t3, $t8           # Stores 4*($t3) into $t3
        addu $t3, $t3, $a0       # Adds A to $t3
        lw $t3 0($t3)

        # compute B[k][j] address and load into $t4
        # B[k][j] = B+4*((4*k) + j)
        sll $t4, $t7, 2         # Stores 4*k in $t3
        addu $t4, $t4, $t6       # Adds $t4 to j
        sll $t8, $t4, 2         # Computes 4*($t4) = 4*(4*k+j) by temporarily storing the product in $t8
        move $t4, $t8           # Stores 4*($t4) into $t4
        addu $t4, $t4, $a1       # Adds B to $t4
        lw $t4 0($t4)

        # call the multiply function

        multu $t3, $t4
        mflo $t9

        # compute RESULT[i][j] address and load into $t1
        # RESULT[i][j] = RESULT+4*((4*i) + j)
        sll $t1, $t5, 2         # Store 4*i in $t1
        addu $t1, $t1, $t6       # Adds $t1 to j
        sll $t8, $t1, 2         # Computes 4*($t1) = 4*(4*i+k) by temporarily storing the product in $t8
        move $t1, $t8           # Stores 4*($t1) into $t1
        addu $t1, $t1, $a2       # Adds A to $t3
        lw $t1 0($t1)

        sw $a2 0($t9)           # Store $t9 into its respective location in $a2

        # increment k and jump back or exit
        addi $t7, $t7, 1
        j kLoop

    jEnd:
        #increment j and jump back or exit
        addi $t6, $t6, 1
        li $t7, 0
        j jLoop


    iEnd:
        #increment i and jump back or exit
        addi $t5, $t5, 1
        li $t6, 0
        j iLoop

    exit:
        # retore saved regs from stack
        lw $s2, 20($sp)
        lw $s1, 16($sp)
        lw $s0, 12($sp)
        lw $a1, 8($sp)
        lw $a0, 4($sp)
        lw $ra, 0($sp)

        # free stack and return
        add $sp, $sp, 24
        jr $ra
##############################################################################
matrix_print:
##############################################################################

    # alloc stack and store regs.
    sub $sp, $sp, 16
    sw $ra, 0($sp)
    sw $s0, 4($sp)
    sw $s1, 8($sp)
    sw $a0, 12($sp)

    li $t0, 4 # size of array

    # do your two loops here

    # setup to jump back and return

    lw $ra, 0($sp)
    lw $s0, 4($sp)
    lw $s1, 8($sp)
    lw $a0, 12($sp)
    add $sp, $sp, 16
    jr $ra

I keep getting one of the two following errors - Either I get an error saying Exception occurred at PC=0x004000cc, Arithmetic overflow or "The Following Symbols are Undefined" and it lists iEnd, matrix_print, exit, and jEnd. I don't know why those symbols would be "undefined since they're clearly written in my code, and they don't have to be "global" symbols, otherwise matrix_print wouldn't have been included in that list.

I realize I haven't yet implemented my matrix_print, but I'll get to that once I figure out what's causing the more pressing issue, which is my Arithmetic Overflow error. I use sll knowing I need to multiply by 4, and sll is the best way to approach it. I don't believe I'm going out of bounds anywhere either, so if someone can please help me debug this I would greatly appreciate it. Thanks!

danielschnoll
  • 3,045
  • 5
  • 23
  • 34
  • Well, look at which instruction is at address `0x004000cc`. Also, use the debugger to single step the program from the start. – Jester Oct 13 '17 at 02:09
  • 1
    You're probably getting overflow from using `add` (trap on overflow) instead of `addu` (non-trapping). *use a multiply instruction to multiply integers instead of that horrible slow loop*. – Peter Cordes Oct 13 '17 at 02:23
  • @PeterCordes I changed my `add` to an `addu`, but I'm confused what you mean by `use a multiply instruction to multiply integers instead of that horrible slow loop`. I assume you're referring to the given `multiply` function. I was told by a classmate to ignore it because it doesn't work right. I have updated my post to reflect what code I currently have. – danielschnoll Oct 13 '17 at 02:47
  • @PeterCordes If the slow loop you're referring to is the O(n^3) MIPS equivalent of a for loop, its the structure we're expected to have. One i loop, one j loop, and one k loop. We're learning the introduction of MIPS, stack pointers, and other calls... It sucks but its what we have to work with – danielschnoll Oct 13 '17 at 02:48
  • 1
    I was talking about the `multiply:` / `mult_loop:` that I commented on on your other question. That's the loop that you can replace with a single multiply instruction. It's normal for matmul to be O(n^3) work, that's fine. The main optimization there is data access patterns to reuse data while it's hot in cache, and that's non-trivial. See http://wgropp.cs.illinois.edu/courses/cs598-s16/lectures/lecture11.pdf for example. Also https://stackoverflow.com/questions/1303182/how-does-blas-get-such-extreme-performance and https://stackoverflow.com/questions/46552143/ – Peter Cordes Oct 13 '17 at 02:56

0 Answers0