0

Im attempting to create an assembler in c which just reads the instructions from an input file and then translates them to their machine/hex code. So far so good until i hit the jump instruction, while i have it so that it translates the hex properly to in the case of jump 24 to c0 00 00 18 im unsure of how to check those 00's properly without hard coding it in. How can i do this? Should i be using shifts?

PR1.c

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

char *ltrim(char *s) {
    while (*s == ' ' || *s == '\t') s++;
    return s;
}

char getRegister(char *text) {
    if (*text == 'r' || *text=='R') text++;
    return atoi(text);
}

char getHex(char *text){
    int number = (int)strtol(text, NULL, 16);
    char reservednum[5];
    if(number <= 0xFFFF){
        sprintf(&reservednum[0], "%04x", number);
    }
    return atoi(reservednum);
}

int assembleLine(char *text, unsigned char* bytes) {
    text = ltrim(text);
    char *keyWord = strtok(text," ");
    if (strcmp("add",keyWord) == 0) {
        bytes[0] = 0x10;
        bytes[0] |= getRegister(strtok(NULL," "));
        bytes[1] = getRegister(strtok(NULL," ")) << 4 | getRegister(strtok(NULL," "));
        return 2;
    }
    else if(strcmp("subtract", keyWord) == 0){
        bytes[0] = 0x50;
        bytes[0] |= getRegister(strtok(NULL, " "));
        bytes[1] = getRegister(strtok(NULL, " ")) << 4 | getRegister(strtok(NULL, " "));
        return 2;
    }
    else if(strcmp("and", keyWord) == 0){
        bytes[0] = 0x20;
        bytes[0] |= getRegister(strtok(NULL, " "));
        bytes[1] = getRegister(strtok(NULL, " ")) << 4 | getRegister(strtok(NULL, " "));
        return 2;
    }
    else if(strcmp("divide", keyWord) == 0){
        bytes[0] = 0x30;
        bytes[0] |= getRegister(strtok(NULL, " "));
        bytes[1] = getRegister(strtok(NULL, " ")) << 4 | getRegister(strtok(NULL, " "));
        return 2;
    }
    else if(strcmp("multiply", keyWord) == 0){
        bytes[0] = 0x40;
        bytes[0] |= getRegister(strtok(NULL, " "));
        bytes[1] = getRegister(strtok(NULL, " ")) << 4 | getRegister(strtok(NULL, " "));
        return 2;
    }
    else if(strcmp("or", keyWord) == 0){
        bytes[0] = 0x60;
        bytes[0] |= getRegister(strtok(NULL, " "));
        bytes[1] = getRegister(strtok(NULL, " ")) << 4 | getRegister(strtok(NULL, " "));
        return 2;
    }
    else if(strcmp("halt", keyWord) == 0){
        bytes[0] = 0x00;
        bytes[1] = 0x00;
        return 2;
    }
    else if(strcmp("return", keyWord) == 0){
        bytes[0] = 0x70;
        bytes[1] = 0x00;
        return 2;
    }
    else if(strcmp("addimmediate", keyWord) == 0){
        bytes[0] = 0x90;
        bytes[0] |= getRegister(strtok(NULL, " "));
        bytes[1] = getHex(strtok(NULL, " "));
        return 2;
    }
    else if(strcmp("interrupt", keyWord) == 0){
        bytes[0] = 0x80;
        bytes[0] |= getHex(strtok(NULL, " "));
        bytes[1] = 0x00;
        return 2;
    }
    else if(strcmp("push", keyWord) == 0){
        bytes[0] = 0x70;
        bytes[0] |= getRegister(strtok(NULL, " "));
        bytes[1] = 0x40;
        return 2;
    }
    else if(strcmp("pop", keyWord) == 0){
        bytes[0] = 0x70;
        bytes[0] |= getRegister(strtok(NULL, " "));
        bytes[1] = 0x80;
        return 2;
    }
    else if(strcmp("load", keyWord) == 0){
        bytes[0] = 0xE0;
        bytes[0] |= getRegister(strtok(NULL, " "));
        bytes[1] = getRegister(strtok(NULL, " ")) << 4 | getHex(strtok(NULL, " "));
        return 2;
    }
    else if(strcmp("store", keyWord) == 0){
        bytes[0] = 0xF0;
        bytes[0] |= getRegister(strtok(NULL, " "));
        bytes[1] = getRegister(strtok(NULL, " ")) << 4 | getHex(strtok(NULL, " "));
        return 2;
    }
    else if(strcmp("jump", keyWord) == 0){
        bytes[0] = 0xC0;
        bytes[1] = 0x00;
        bytes[2] = 0x00;
        bytes[3] = getHex(strtok(NULL, " "));
        return 4;
    }
}

int main(int argc, char **argv) {
    FILE *src = fopen(argv[1],"r");
    FILE *dst = fopen(argv[2],"w");
    while (!feof(src)) {
        unsigned char bytes[4];
        char line[1000];
        if (NULL != fgets(line, 1000, src)) {
            printf ("read: %s\n",line);
            int byteCount = assembleLine(line,bytes);
            fwrite(bytes,byteCount,1,dst);
        }
    }
    fclose(src);
    fclose(dst);
    return 0;
}
Bret Hasel
  • 303
  • 1
  • 11
  • Perhaps because you told `strtol` to parse the input number as base 16? Your expected output would make sense if you'd parsed it as decimal and then looked at a hexdump of the binary output. Otherwise you'd expect `24` on input to be 0x24 on output, because it should parse as 2*16 + 4*1. Use a debugger to check values inside your program. – Peter Cordes Oct 20 '19 at 01:29
  • Also, if your ISA is variable-length, why are you bothering with 2 bytes of zeros in your `jump` encoding? Why not memcpy 3 bytes or something (to use host endian) or use shifts to extract the low 3 bytes of the jump offset? Or make `jump` a 1-byte displacement. – Peter Cordes Oct 20 '19 at 01:32
  • Without being a pain perhaps could i ask for an example of using shifts to extract the lower 3 bytes of the jump offset? Thats something that i think could also be useful for branches as well. If im correct, Jump is actually a 32 bit instruction, opcode first 4, then next 12 are top 12 of jump address and next 16 are lower 16 of the address. I inserted the double 0's in there mostly as placeholders but i know thats not right, im just unsure of how to be able to extract that jump address while maintaining the 32 bit structure for output of `C0 00 00 18` for this case – Bret Hasel Oct 20 '19 at 01:36
  • the variable length comes from the differing of the length of instruction, some are 16 bit some are 32 – Bret Hasel Oct 20 '19 at 01:36
  • Yeah I can see that some are 16. Unless your current implementation is just a placeholder (which apparently is the case, but there's not even a TODO comment on it), my point was that you should make `jump` a 16-bit instruction instead of wasting 2 bytes on zeros. re: shifts: [How to write endian agnostic C/C++ code?](//stackoverflow.com/q/13994674) – Peter Cordes Oct 20 '19 at 01:42
  • The jump instruction itself is a 32-bit instruction though by the nature of assembly, me reconstructing that to be a 16-bit isnt an option. Thats why i was wondering how i could do this in a way that checked the top 12 bits, clearly in this case setting them to 0's and then checked the bottom 16 bits which should in this case be `00 18`. I am just unsure how to produce this – Bret Hasel Oct 20 '19 at 01:45
  • This is a toy ISA you're inventing, isn't it? Anyway, not that it matters, clearly you *want* a large immediate for your jump instruction so long-range jumps are encodeable. Some variable-length ISAs (e.g. x86) have both a short (imm8) and large (imm32) jump. – Peter Cordes Oct 20 '19 at 02:07
  • Figured out my hex issues with some reworking but im a bit confused still on how to check the top 16 bits to ensure they are 0's or if perhaps they arent. How can i do this? – Bret Hasel Oct 20 '19 at 02:27
  • Right shift by 16 and look at what's left. – Peter Cordes Oct 20 '19 at 02:44
  • would i do this by per say `bytes[1] = getHex(strtok(NULL, " ")) >> 16` ? When i attempted this it gave me a segmentation fault dump – Bret Hasel Oct 20 '19 at 02:51
  • No, you'd read one `uint32_t` into a tmp variable and extract the bytes of that. Obviously you can't keep calling `strtok` multiple times. (Then use a debugger to look at your tmp vars.) – Peter Cordes Oct 20 '19 at 03:30
  • ahhh okay so i strtok once, get the 24. convert that to hex, then i store it into a temp var, extract the bytes that way and use the variables with the extracted bytes as the inputs for bytes 1-3? Is there anychance you can show me a mock example to go off of for this? – Bret Hasel Oct 20 '19 at 03:33
  • What i had just been trying is to store it all into byte 0 and do a shift right 28 but that didnt work as planned with `bytes[0] = 0xC0` `bytes0 |= getHex(strtok(NULL, " ")) >> 28` – Bret Hasel Oct 20 '19 at 03:34
  • [This answer](https://stackoverflow.com/questions/13994674/how-to-write-endian-agnostic-c-c-code/13995796#13995796) is what I told you to look at earlier. – Peter Cordes Oct 20 '19 at 03:54

0 Answers0