2

I am starting a Sudoku program and am experimenting with efficiently copying an unsigned int[] into a char[] while viewing the resulting AT&T x86-64 assembly objdump -d result. The point of what I am doing is to produce an output like this (which my code currently does) for learning purposes:

 ------- ------- ------- 
|       | 2   7 |       |
|   1 5 |       | 7 8   |
|   6   | 4   1 |   3   |
 ------- ------- ------- 
|     4 |       | 9     |
| 1 9   |   3   |   2 5 |
|       |       |       |
 ------- ------- ------- 
|   8   |   9   |   1   |
|     2 | 6   8 | 3     |
| 6     |       |     4 |
 ------- ------- ------- 

I am using an unsigned int[] to represent the board and have a static char[] within the printBoard function for the purpose of displaying the board when I call printBoard(const unsigned int* ac_board). What I was hoping for was having the static char[] be initialized with the formatting needed and have 81 copies from the unsigned int[] into the static char[] each time I call printBoard

At the moment I have not been able to achieve this, I have 81 * 2 copies occurring in my code following the pattern (I am compiling as g++ -std=c++14 -O3 "$1" -o "$2"):

...
   100000adc:   88 05 ba 05 00 00       mov    %al,0x5ba(%rip)        # 10000109c <__ZZ10printBoardPKjE10s_printBuf+0x1c>
   100000ae2:   8b 47 04                mov    0x4(%rdi),%eax
   100000ae5:   88 05 b3 05 00 00       mov    %al,0x5b3(%rip)        # 10000109e <__ZZ10printBoardPKjE10s_printBuf+0x1e>
   100000aeb:   8b 47 08                mov    0x8(%rdi),%eax
   100000aee:   88 05 ac 05 00 00       mov    %al,0x5ac(%rip)        # 1000010a0 <__ZZ10printBoardPKjE10s_printBuf+0x20>
   100000af4:   8b 47 0c                mov    0xc(%rdi),%eax
   100000af7:   88 05 a7 05 00 00       mov    %al,0x5a7(%rip)        # 1000010a4 <__ZZ10printBoardPKjE10s_printBuf+0x24>
   100000afd:   8b 47 10                mov    0x10(%rdi),%eax
   100000b00:   88 05 a0 05 00 00       mov    %al,0x5a0(%rip)        # 1000010a6 <__ZZ10printBoardPKjE10s_printBuf+0x26>
   100000b06:   8b 47 14                mov    0x14(%rdi),%eax
...

Is there a way to actually achieve 81 mov instructions rather than 81 * 2?

Below is my main.cpp:

#include <ctime>

#include "print.hpp"


// -----------------------------------------------------------------------------
unsigned int g_board[81] = {
    0x20, 0x20, 0x20, 0x32, 0x20, 0x37, 0x20, 0x20, 0x20,
    0x20, 0x31, 0x35, 0x20, 0x20, 0x20, 0x37, 0x38, 0x20,
    0x20, 0x36, 0x20, 0x34, 0x20, 0x31, 0x20, 0x33, 0x20,
    0x20, 0x20, 0x34, 0x20, 0x20, 0x20, 0x39, 0x20, 0x20,
    0x31, 0x39, 0x20, 0x20, 0x33, 0x20, 0x20, 0x32, 0x35,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x38, 0x20, 0x20, 0x39, 0x20, 0x20, 0x31, 0x20,
    0x20, 0x20, 0x32, 0x36, 0x20, 0x38, 0x33, 0x20, 0x20,
    0x36, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x34
};

// -----------------------------------------------------------------------------
int main() {
    ::std::clock_t begin = clock();
    printBoard(g_board);
    ::std::clock_t end = clock();
    ::std::cout << (end - begin) << ::std::endl;

    return 0;
}

Here is my current best implementation of print.hpp:

#include <iostream>


// -----------------------------------------------------------------------------
void printBoard(const unsigned int* ac_board) {
    static char s_printBuf[] = {
        ' ', '-', '-', '-', '-', '-', '-', '-', ' ', '-', '-', '-', '-',
        '-', '-', '-', ' ', '-', '-', '-', '-', '-', '-', '-', ' ', '\n',
        '|', ' ', 'x', ' ', 'x', ' ', 'x', ' ', '|', ' ', 'x', ' ', 'x',
        ' ', 'x', ' ', '|', ' ', 'x', ' ', 'x', ' ', 'x', ' ', '|', '\n',
        '|', ' ', 'x', ' ', 'x', ' ', 'x', ' ', '|', ' ', 'x', ' ', 'x',
        ' ', 'x', ' ', '|', ' ', 'x', ' ', 'x', ' ', 'x', ' ', '|', '\n',
        '|', ' ', 'x', ' ', 'x', ' ', 'x', ' ', '|', ' ', 'x', ' ', 'x',
        ' ', 'x', ' ', '|', ' ', 'x', ' ', 'x', ' ', 'x', ' ', '|', '\n',
        ' ', '-', '-', '-', '-', '-', '-', '-', ' ', '-', '-', '-', '-',
        '-', '-', '-', ' ', '-', '-', '-', '-', '-', '-', '-', ' ', '\n',
        '|', ' ', 'x', ' ', 'x', ' ', 'x', ' ', '|', ' ', 'x', ' ', 'x',
        ' ', 'x', ' ', '|', ' ', 'x', ' ', 'x', ' ', 'x', ' ', '|', '\n',
        '|', ' ', 'x', ' ', 'x', ' ', 'x', ' ', '|', ' ', 'x', ' ', 'x',
        ' ', 'x', ' ', '|', ' ', 'x', ' ', 'x', ' ', 'x', ' ', '|', '\n',
        '|', ' ', 'x', ' ', 'x', ' ', 'x', ' ', '|', ' ', 'x', ' ', 'x',
        ' ', 'x', ' ', '|', ' ', 'x', ' ', 'x', ' ', 'x', ' ', '|', '\n',
        ' ', '-', '-', '-', '-', '-', '-', '-', ' ', '-', '-', '-', '-',
        '-', '-', '-', ' ', '-', '-', '-', '-', '-', '-', '-', ' ', '\n',
        '|', ' ', 'x', ' ', 'x', ' ', 'x', ' ', '|', ' ', 'x', ' ', 'x',
        ' ', 'x', ' ', '|', ' ', 'x', ' ', 'x', ' ', 'x', ' ', '|', '\n',
        '|', ' ', 'x', ' ', 'x', ' ', 'x', ' ', '|', ' ', 'x', ' ', 'x',
        ' ', 'x', ' ', '|', ' ', 'x', ' ', 'x', ' ', 'x', ' ', '|', '\n',
        '|', ' ', 'x', ' ', 'x', ' ', 'x', ' ', '|', ' ', 'x', ' ', 'x',
        ' ', 'x', ' ', '|', ' ', 'x', ' ', 'x', ' ', 'x', ' ', '|', '\n',
        ' ', '-', '-', '-', '-', '-', '-', '-', ' ', '-', '-', '-', '-',
        '-', '-', '-', ' ', '-', '-', '-', '-', '-', '-', '-', ' ', '\n',
        0
    };

    s_printBuf[28]  = ac_board[0];
    s_printBuf[30]  = ac_board[1];
    s_printBuf[32]  = ac_board[2];
    s_printBuf[36]  = ac_board[3];
    s_printBuf[38]  = ac_board[4];
    s_printBuf[40]  = ac_board[5];
    s_printBuf[44]  = ac_board[6];
    s_printBuf[46]  = ac_board[7];
    s_printBuf[48]  = ac_board[8];

    s_printBuf[54]  = ac_board[9];
    s_printBuf[56]  = ac_board[10];
    s_printBuf[58]  = ac_board[11];
    s_printBuf[62]  = ac_board[12];
    s_printBuf[64]  = ac_board[13];
    s_printBuf[66]  = ac_board[14];
    s_printBuf[70]  = ac_board[15];
    s_printBuf[72]  = ac_board[16];
    s_printBuf[74]  = ac_board[17];

    s_printBuf[80]  = ac_board[18];
    s_printBuf[82]  = ac_board[19];
    s_printBuf[84]  = ac_board[20];
    s_printBuf[88]  = ac_board[21];
    s_printBuf[90]  = ac_board[22];
    s_printBuf[92]  = ac_board[23];
    s_printBuf[96]  = ac_board[24];
    s_printBuf[98]  = ac_board[25];
    s_printBuf[100] = ac_board[26];

    s_printBuf[132] = ac_board[27];
    s_printBuf[134] = ac_board[28];
    s_printBuf[136] = ac_board[29];
    s_printBuf[140] = ac_board[30];
    s_printBuf[142] = ac_board[31];
    s_printBuf[144] = ac_board[32];
    s_printBuf[148] = ac_board[33];
    s_printBuf[150] = ac_board[34];
    s_printBuf[152] = ac_board[35];

    s_printBuf[158] = ac_board[36];
    s_printBuf[160] = ac_board[37];
    s_printBuf[162] = ac_board[38];
    s_printBuf[166] = ac_board[39];
    s_printBuf[168] = ac_board[40];
    s_printBuf[170] = ac_board[41];
    s_printBuf[174] = ac_board[42];
    s_printBuf[176] = ac_board[43];
    s_printBuf[178] = ac_board[44];

    s_printBuf[184] = ac_board[45];
    s_printBuf[186] = ac_board[46];
    s_printBuf[188] = ac_board[47];
    s_printBuf[192] = ac_board[48];
    s_printBuf[194] = ac_board[49];
    s_printBuf[196] = ac_board[50];
    s_printBuf[200] = ac_board[51];
    s_printBuf[202] = ac_board[52];
    s_printBuf[204] = ac_board[53];

    s_printBuf[236] = ac_board[54];
    s_printBuf[238] = ac_board[55];
    s_printBuf[240] = ac_board[56];
    s_printBuf[244] = ac_board[57];
    s_printBuf[246] = ac_board[58];
    s_printBuf[248] = ac_board[59];
    s_printBuf[252] = ac_board[60];
    s_printBuf[254] = ac_board[61];
    s_printBuf[256] = ac_board[62];

    s_printBuf[262] = ac_board[63];
    s_printBuf[264] = ac_board[64];
    s_printBuf[266] = ac_board[65];
    s_printBuf[270] = ac_board[66];
    s_printBuf[272] = ac_board[67];
    s_printBuf[274] = ac_board[68];
    s_printBuf[278] = ac_board[69];
    s_printBuf[280] = ac_board[70];
    s_printBuf[282] = ac_board[71];

    s_printBuf[288] = ac_board[72];
    s_printBuf[290] = ac_board[73];
    s_printBuf[292] = ac_board[74];
    s_printBuf[296] = ac_board[75];
    s_printBuf[298] = ac_board[76];
    s_printBuf[300] = ac_board[77];
    s_printBuf[304] = ac_board[78];
    s_printBuf[306] = ac_board[79];
    s_printBuf[308] = ac_board[80];

    ::std::cout << s_printBuf;
}
asimes
  • 5,749
  • 5
  • 39
  • 76
  • If you are doing so, you should think from a point of assembly code. If there is no instruction, that copies value from one point to another, then I fear there is no way You can do this. There are also DMAs, that copy data from one place in memory to another (did you try memcpy?, maybe memcopy uses DMA) in some uProcessors, but as you are running OS you may not have direct access to them (it's up to OS if they should be used). Also note, that 03 optimizes mainly for speed and not number of instructions – DawidPi Oct 08 '15 at 06:39
  • @DawidPi, I just tried googling DMA but did not find a relevant result, what does that mean? – asimes Oct 08 '15 at 06:40
  • @DawidPi, Also I do not understand why the compiler is not able to realize that only two memory addresses are relevant (the addresses of the two arrays) and that constant offsets could be used for N mov instructions rather than N * 2 mov instructions – asimes Oct 08 '15 at 06:43
  • These are Direct Memory Access devices, what they do is copy data from one place to another. You set them by saying to them from where they should copy data, to where and how many, after this processor is freed from executing any copying instructions and can do it's further job, while DMA is copying. Usually DMA is used when copying data to HDD, flash memories and so on, but I think they can be used as well for mem to mem copying – DawidPi Oct 08 '15 at 06:44
  • 3
    Why do you use `unsigned int g_board[81]` instead of `char`? If you're prepared to copy directly a la `s_printBuf[28] = ac_board[0];` then the "board" is obviously ASCII and hence `char` is the natural type. – Tony Delroy Oct 08 '15 at 06:44
  • Also try to copy table with for loop. Compilers can do amazing job with loops, however I do not know if it will result in less instructions code (additional check will be required), but you may try. – DawidPi Oct 08 '15 at 06:50
  • @TonyD, The values I use for the `unsigned int` representation could have been `0`, `1`, `2`, etc. or whatever but because it does not matter I used ASCII values for the sake of printing quickly. I did not make the type `char` because the remainder of the program which does depth searching will operate on `unsigned int` types – asimes Oct 08 '15 at 06:51
  • @asimes It's not like it's not able to realize this, but according instruction set for x64 architecture, there is no instruction to copy data from one place to another directly (it's very common) instead you usually have to copy data from copy place to the register and from register to paste place, and this is why you have 2*81 instructions – DawidPi Oct 08 '15 at 06:54
  • @DawidPi, I did not realize that memory to memory mov instructions were not allowed, is that really true? If that is the case then maybe N * 2 is the best that can be done – asimes Oct 08 '15 at 07:01
  • Anyway, it's normal for CPUs to only support copy between memory via registers - see e.g. [here](http://stackoverflow.com/questions/3686333/moving-data-from-memory-to-memory-in-micro-controllers). If you care so much about efficiency, you could explore bit-shifting a la `ac_board[0] | (ac_board[1] << 16)` but you'd need to use a `union` of `char[]` and `unsigned[]` for `s_printBuf`. (Whole thing's a waste of time really - if you want to learn about this, just write assembly - whatever you do here's just catering to one compiler/versions/switches optimisation quirks.) – Tony Delroy Oct 08 '15 at 07:01
  • It generally depends on processor's architecture, but mainly it's just like Tony D said. Well known processors as far as I know does not have instruction, that would copy data without register instruction. – DawidPi Oct 08 '15 at 07:03
  • @TonyD, I guess it makes sense that a CPU would not be able to access two memory locations at the same time although I still would have thought that the address of say `ac_board` would be stored in a register and then N copies could be referenced by constant offsets based on the address loaded into the register. Oh well – asimes Oct 08 '15 at 07:05
  • How about trying to use unsigned chars to store values? then one register could copy more than one number. You are not using more space than uint8_t. You would have to use some hacks and nonclean code. Please try to use memcpy(), and try to use for loop (with values as uint8_t). Then please post an answer. – DawidPi Oct 08 '15 at 07:06
  • @asimes: *"address of say ac_board would be stored in a register"* - that is what's happening... consider `%rdi` in the assembly. `%rip` effectively does that too, but relative to the instruction pointer - kind of weird if you ask me, but you can read about it - e.g. [here](http://www.nynaeve.net/?p=192) - and come to your own conclusions. – Tony Delroy Oct 08 '15 at 07:09
  • 2
    @DawidPi: memcpy/loop are normally good, but do note that the `s_printBuf[]` indices being written are not contiguous, so the normal benefit doesn't exist here. asimes's effectively done manual loop unrolling, which doesn't hurt efficiency. – Tony Delroy Oct 08 '15 at 07:10
  • Oh yeah, you are right. Apologies for not noticing this – DawidPi Oct 08 '15 at 07:12

0 Answers0