I was thinking again about implementing the quadratic sieve for fun, which requires Guassian elimination over a binary field, that is the operations required are 1. swapping rows and 2. XORing rows.
My ideas were either to maintain a bit array using a vector of 64-bit ints and bit twiddling, or use vector<bool>
, which is probably space-optimized on my system. The bit array must be able to be dynamically sized, so std::bitset
won't work. The advantage of maintaining my own ints is that I can XOR 64 bits at a time which is a neat trick. I wanted to see what a compiler would do for a loop that XOR'd bool vectors: (I wasn't able to use ^=
, see operator |= on std::vector<bool>)
void xor_vector(std::vector<bool>& a, std::vector<bool>& b) {
for (std::size_t i=0; i<a.size(); ++i)
a[i] = a[i] ^ b[i];
}
I have a very basic understanding of x86 but it looks like the compiler isn't actually XORing words together? Is there a way to get the compiler to XOR entire words at a time?
https://godbolt.org/z/PbGdv3sKT
xor_vector(std::vector<bool, std::allocator<bool> >&, std::vector<bool, std::allocator<bool> >&):
mov r8, QWORD PTR [rdi]
mov rax, QWORD PTR [rdi+16]
mov edx, DWORD PTR [rdi+24]
sub rax, r8
lea rdi, [rdx+rax*8]
test rdi, rdi
je .L11
push rbp
mov r10d, 1
push rbx
mov r9, QWORD PTR [rsi]
xor esi, esi
jmp .L7
.L16:
mov rdx, r10
sal rdx, cl
mov rcx, QWORD PTR [r11]
mov rbp, rdx
test rdx, rcx
setne bl
and rbp, QWORD PTR [rax]
setne bpl
.L4:
mov rax, rdx
not rdx
or rax, rcx
and rdx, rcx
cmp bpl, bl
cmovne rdx, rax
add rsi, 1
mov QWORD PTR [r11], rdx
cmp rsi, rdi
je .L15
.L7:
test rsi, rsi
lea rax, [rsi+63]
mov rdx, rsi
cmovns rax, rsi
sar rdx, 63
shr rdx, 58
sar rax, 6
lea rcx, [rsi+rdx]
sal rax, 3
and ecx, 63
lea r11, [r8+rax]
add rax, r9
sub rcx, rdx
jns .L16
add rcx, 64
mov rdx, r10
sal rdx, cl
mov rcx, QWORD PTR [r11-8]
mov rbp, rdx
test rcx, rdx
setne bl
and rbp, QWORD PTR [rax-8]
setne bpl
sub r11, 8
jmp .L4
.L15:
pop rbx
pop rbp
ret
.L11:
ret
My question is similar to bitwise operations on vector<bool> but the answers are dated and don't seem to answer my question.
Update: I tested with a 256 bit sized bitset too. Still I don't see XORing whole machine words.
void xor_vector(std::bitset<256>& a, std::bitset<256>& b) {
for (std::size_t i=0; i<a.size(); ++i)
a[i] = a[i] ^ b[i];
}
https://godbolt.org/z/jKEf89E1j
xor_vector(std::bitset<256ul>&, std::bitset<256ul>&):
push rbx
mov r8, rdi
mov r11, rsi
xor edx, edx
mov ebx, 1
.L4:
mov rsi, rdx
mov rcx, rdx
mov rax, rbx
shr rsi, 6
and ecx, 63
sal rax, cl
mov rdi, QWORD PTR [r8+rsi*8]
mov rcx, rax
and rcx, QWORD PTR [r11+rsi*8]
mov rcx, rax
setne r10b
test rax, rdi
not rax
setne r9b
or rcx, rdi
and rax, rdi
cmp r10b, r9b
cmovne rax, rcx
add rdx, 1
mov QWORD PTR [r8+rsi*8], rax
cmp rdx, 256
jne .L4
pop rbx
ret