Having looked at the solutions, I took your concerns about performance to heart and decided to see if we could do better.
Interestingly my attempts to optimise with constexpr had varying results depending on the compiler.
I'll compare the output of gcc 5.3 and apple clang here:
Here was my solution:
#include <cstddef>
#include <iostream>
#include <memory>
#include <tuple>
#include <utility>
/// Returns the address of the element at position `Index` of tuple `t`,
/// erased to `void*`.
///
/// Every instantiation for a given `Tuple` has the same signature,
/// `void* (Tuple&)`, whatever the element's own type is, which is what lets
/// the instantiations sit side by side in one array of function pointers.
///
/// @tparam Tuple  type handed to `std::get`; a `std::tuple` in the example.
/// @tparam Index  compile-time position of the element.
/// @param  t      tuple whose element is located.
/// @return pointer to the element, as `void*`.
template<class Tuple, size_t Index>
void* get_address(Tuple& t)
{
    auto& element = std::get<Index>(t);
    return std::addressof(element);
}
/// Looks up the element of `t` at run-time position `idx` and returns its
/// address as `void*`.
///
/// The pack `Indexes...` is expanded into an array holding one
/// `get_address<Tuple, I>` function pointer per index, in pack order; the
/// run-time `idx` then selects the entry to call. In this version the array
/// is a `constexpr` local with automatic storage.
///
/// @param t    tuple to index into.
/// @param idx  position of the wanted element; it subscripts the array
///             without any range check.
/// @return whatever the selected `get_address` instantiation returns.
template <size_t ... Indexes, class Tuple>
constexpr void* get_element_pointer(Tuple & t,
                                    size_t idx,
                                    std::index_sequence<Indexes...>)
{
    using accessor = void* (*)(Tuple&);

    // The name `ptrs` is kept: it is the symbol the quoted assembly refers to.
    constexpr accessor ptrs[] = { &get_address<Tuple, Indexes>... };

    return (*ptrs[idx])(t);
}
/// Entry point: returns the address, as `void*`, of the element of `t` at
/// run-time position `index`.
///
/// Builds the index sequence `0 .. std::tuple_size<Tuple>::value - 1` and
/// forwards to the three-argument overload, which does the actual lookup.
/// `noinline` is presumably there so the function shows up as its own symbol
/// in the generated assembly.
///
/// @param t      tuple to index into.
/// @param index  position of the wanted element; passed on unchecked.
/// @return the pointer produced by the three-argument overload.
template<class Tuple>
__attribute__((noinline))
constexpr
void * get_element_pointer(Tuple& t, size_t index)
{
    using all_indexes = std::make_index_sequence<std::tuple_size<Tuple>::value>;
    return get_element_pointer(t, index, all_indexes{});
}
/// Demo driver: builds a ten-int tuple and prints the element found at
/// run-time index 1 through `get_element_pointer`.
int main()
{
    // Initialise the tuple with its values directly instead of
    // default-constructing it and assigning a second tuple afterwards.
    std::tuple<int, int, int, int, int, int, int, int, int, int> x{4, 5, 6, 7, 8, 9, 10, 11, 12, 13};

    // Converting `void*` back to the element's real type is a static_cast;
    // reinterpret_cast is not needed for it.
    std::cout << *static_cast<int*>(get_element_pointer(x, 1)) << '\n';
}
(compiled with -O2 -fomit-frame-pointer for clarity)
clang's solution was this:
__Z19get_element_pointerINSt3__15tupleIJiiiiiiiiiiEEEEPvRT_m:
.align 4, 0x90
leaq __ZZ19get_element_pointerIJLm0ELm1ELm2ELm3ELm4ELm5ELm6ELm7ELm8ELm9EENSt3__15tupleIJiiiiiiiiiiEEEEPvRT0_mNS0_16integer_sequenceImJXspT_EEEEE4ptrs(%rip), %rax
jmpq *(%rax,%rsi,8) ## TAILCALL
which as expected refers to a compile-time generated jump table:
__ZZ19get_element_pointerIJLm0ELm1ELm2ELm3ELm4ELm5ELm6ELm7ELm8ELm9EENSt3__15tupleIJiiiiiiiiiiEEEEPvRT0_mNS0_16integer_sequenceImJXspT_EEEEE4ptrs:
.quad __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm0EEPvRT_
.quad __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm1EEPvRT_
.quad __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm2EEPvRT_
.quad __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm3EEPvRT_
.quad __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm4EEPvRT_
.quad __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm5EEPvRT_
.quad __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm6EEPvRT_
.quad __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm7EEPvRT_
.quad __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm8EEPvRT_
.quad __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm9EEPvRT_
where each accessor function is trivial (example of one provided):
__Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm2EEPvRT_:
leaq 8(%rdi), %rax
retq
This is what I assumed the compiler would do, being "what I would do if I were writing machine code".
However gcc seems to miss an opportunity to optimise the jump table and builds it in memory before using it!
void* get_element_pointer<std::tuple<int, int, int, int, int, int, int, int, int, int> >(std::tuple<int, int, int, int, int, int, int, int, int, int>&, unsigned long):
movq void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 0ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -88(%rsp)
movq void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 1ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -80(%rsp)
movq void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 2ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -72(%rsp)
movq void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 3ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -64(%rsp)
movq void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 4ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -56(%rsp)
movq void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 5ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -48(%rsp)
movq void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 6ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -40(%rsp)
movq void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 7ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -32(%rsp)
movq void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 8ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -24(%rsp)
movq void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 9ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -16(%rsp)
movq -88(%rsp,%rsi,8), %rax
jmp *%rax
before calling a similar trivial accessor:
void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 3ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&):
leaq 24(%rdi), %rax
ret
So undeterred, I wondered whether constant folding in a non-constexpr implementation might do better:
/// Non-constexpr variant of the run-time lookup: returns the address, as
/// `void*`, of the element of `t` at position `idx`.
///
/// The table of `get_address<Tuple, I>` function pointers is a function-local
/// `static const` array here, so it has static storage duration rather than
/// being an automatic local.
///
/// @param t    tuple to index into.
/// @param idx  position of the wanted element; it subscripts the table
///             without any range check.
/// @return whatever the selected `get_address` instantiation returns.
template <size_t ... Indexes, class Tuple>
void* get_element_pointer(Tuple & t,
                          size_t idx,
                          std::index_sequence<Indexes...>)
{
    using accessor = void* (*)(Tuple&);

    // The name `ptrs` is kept: it is the symbol the quoted assembly refers to.
    static const accessor ptrs[] = { &get_address<Tuple, Indexes>... };

    return (*ptrs[idx])(t);
}
Turns out it did - I now get the same code on gcc as clang produced with the constexpr solution:
void* get_element_pointer<std::tuple<int, int, int, int, int, int, int, int, int, int> >(std::tuple<int, int, int, int, int, int, int, int, int, int>&, unsigned long):
movq void* get_element_pointer<0ul, 1ul, 2ul, 3ul, 4ul, 5ul, 6ul, 7ul, 8ul, 9ul, std::tuple<int, int, int, int, int, int, int, int, int, int> >(std::tuple<int, int, int, int, int, int, int, int, int, int>&, unsigned long, std::integer_sequence<unsigned long, 0ul, 1ul, 2ul, 3ul, 4ul, 5ul, 6ul, 7ul, 8ul, 9ul>)::ptrs(,%rsi,8), %rax
jmp *%rax
What did clang make of this?
__Z19get_element_pointerINSt3__15tupleIJiiiiiiiiiiEEEEPvRT_m:
movq __ZZ19get_element_pointerIJLm0ELm1ELm2ELm3ELm4ELm5ELm6ELm7ELm8ELm9EENSt3__15tupleIJiiiiiiiiiiEEEEPvRT0_mNS0_16integer_sequenceImJXspT_EEEEE4ptrs@GOTPCREL(%rip), %rax
jmpq *(%rax,%rsi,8) ## TAILCALL
Happily the same result.
So here's the final, provably optimal solution:
/// Returns the address of the element at position `Index` of tuple `t`,
/// erased to `void*`.
///
/// All instantiations for one `Tuple` share the signature `void* (Tuple&)`,
/// independent of the element's type, so they can be collected in a single
/// array of function pointers.
///
/// @tparam Tuple  type handed to `std::get`.
/// @tparam Index  compile-time position of the element.
/// @param  t      tuple whose element is located.
/// @return pointer to the element, as `void*`.
template<class Tuple, size_t Index>
void* get_address(Tuple& t)
{
    auto& element = std::get<Index>(t);
    return std::addressof(element);
}
/// Returns the address, as `void*`, of the element of `t` at run-time
/// position `idx`.
///
/// `Indexes...` is expanded into a function-local `static const` table with
/// one `get_address<Tuple, I>` function pointer per index, in pack order;
/// `idx` selects which entry is called.
///
/// @param t    tuple to index into.
/// @param idx  position of the wanted element; it subscripts the table
///             without any range check.
/// @return whatever the selected `get_address` instantiation returns.
template <size_t ... Indexes, class Tuple>
void* get_element_pointer(Tuple & t,
                          size_t idx,
                          std::index_sequence<Indexes...>)
{
    using accessor = void* (*)(Tuple&);

    // The name `ptrs` is kept: it is the symbol the quoted assembly refers to.
    static const accessor ptrs[] = { &get_address<Tuple, Indexes>... };

    return (*ptrs[idx])(t);
}
/// Returns the address, as `void*`, of the element of `t` at run-time
/// position `index`.
///
/// Forwards to the three-argument overload together with the index sequence
/// `0 .. std::tuple_size<Tuple>::value - 1`.
///
/// Deliberately not `constexpr`: the overload it forwards to is not
/// `constexpr` (it keeps its table in a function-local `static`), so a call
/// to this function could never appear in a constant expression anyway.
/// `noinline` is presumably there so the function shows up as its own symbol
/// in the generated assembly.
///
/// @param t      tuple to index into.
/// @param index  position of the wanted element; passed on unchecked.
/// @return the pointer produced by the three-argument overload.
template<class Tuple>
__attribute__((noinline))
void * get_element_pointer(Tuple& t, size_t index)
{
    return get_element_pointer(t,
                               index,
                               std::make_index_sequence<std::tuple_size<Tuple>::value>());
}