5

How do I thunk an arbitrary function with an arbitrary (fixed) number of arguments, on x86 and x64?

(I don't need floating-point, SSE, or the like. The arguments are all integers or pointers.)

user541686
  • 205,094
  • 128
  • 528
  • 886

2 Answers

5

Here's my generic implementation.

I initially made it with AsmJit, then modified it by hand to remove the dependency.

  • It works for both x86 and x64!

  • It works for both cdecl and stdcall!
    It should also work for "thiscall", both on VC++ and GCC, but I haven't tested it.
    (VC++ would probably not touch the 'this' pointer, whereas GCC would treat it as the first argument.)

  • It can bind an arbitrary number of arguments at any position in the parameter list!

Just beware:

  • It does not work for variadic functions, like printf.
    Doing so would either require you to provide the number of arguments dynamically (which is painful) or would require you to store the return-pointers somewhere other than the stack, which is complicated.

  • It was not designed for ultra-high performance, but it should still be fast enough.
    The speed is O(total parameter count), not O(bound parameter count).

Scroll to the right to see the assembly code.

#include <stddef.h>
#include <string.h>

/* Appends the `len` raw bytes found at `src` (an immediate operand) to the
 * code stream at `p` and returns the next free position.  memcpy is used so
 * multi-byte operands can be stored at unaligned addresses without punning
 * the write pointer to another pointer type. */
static unsigned char *vbind_emit(unsigned char *p, void const *src, size_t len)
{
    memcpy(p, src, len);
    return p + len;
}

/*
 * Writes into `buffer` the x86 / x64 machine code of a thunk that calls `f`
 * with `n` extra, pre-bound arguments spliced into the caller's argument list.
 *
 * The generated thunk builds a fresh argument area of `param_count` words:
 * the caller's first `i` arguments, then bound[0 .. n-1], then the caller's
 * remaining `param_count - i - n` arguments.  It then calls `f` and returns
 * whatever `f` left in the return register.
 *
 *   f            target function; all of its parameters must be word-sized.
 *   param_count  total number of parameters `f` takes (caller-supplied + bound).
 *   buffer       receives the code.  This function only writes bytes; making
 *                the memory executable is up to the caller.  The 64-bit path
 *                emits 144 + 17 * n bytes, the 32-bit path at most 68 + 11 * n.
 *   i            number of leading caller arguments placed before the bound ones.
 *   bound, n     the values to bind and how many there are.
 *   thiscall     32-bit path only: pop the first assembled argument into ecx
 *                right before the call.  Not used by the 64-bit path.
 *
 * Returns the number of bytes written to `buffer`.
 *
 * The thunk ends with a plain `ret`, so it does not pop its own arguments.
 *
 * NOTE(review): the 64-bit path reserves exactly param_count * 8 bytes, so rsp
 * at the call is 16-byte aligned only when param_count is even, and fewer than
 * four parameters leaves the callee without a full register home area --
 * confirm against the Windows x64 calling convention before relying on it.
 * No unwind information is registered for the generated code.
 */
size_t vbind(
    void *(/* cdecl, stdcall, or thiscall */ *f)(), size_t param_count,
    unsigned char buffer[/* >= 144 + n * (5 + sizeof(int) + sizeof(void*)) */],
    size_t const i, void *const bound[], unsigned int const n, bool const thiscall)
{
    unsigned char *p = buffer;
    unsigned char const s = sizeof(void *);
    unsigned char const b = sizeof(void *) == 8 ? 3 : 2;  // log2(sizeof(void *)); 3 selects the 64-bit encodings
    size_t const count = n;                               // widened so it is emitted as a word-sized immediate
    *p++ = 0x55;                                                                                  // push     rbp
    if (b > 2) { *p++ = 0x48; } *p++ = 0x8B; *p++ = 0xEC;                                         // mov      rbp, rsp
    if (b > 2)
    {
        // Spill the four register arguments into the slots just above the return address,
        // so all incoming arguments can be read as one contiguous array starting at [rbp + 2 * s].
        *p++ = 0x48; *p++ = 0x89; *p++ = 0x4C; *p++ = 0x24; *p++ = 2 * s;                         // mov      [rsp + 2 * s], rcx
        *p++ = 0x48; *p++ = 0x89; *p++ = 0x54; *p++ = 0x24; *p++ = 3 * s;                         // mov      [rsp + 3 * s], rdx
        *p++ = 0x4C; *p++ = 0x89; *p++ = 0x44; *p++ = 0x24; *p++ = 4 * s;                         // mov      [rsp + 4 * s], r8
        *p++ = 0x4C; *p++ = 0x89; *p++ = 0x4C; *p++ = 0x24; *p++ = 5 * s;                         // mov      [rsp + 5 * s], r9
    }
    if (b > 2) { *p++ = 0x48; } *p++ = 0xBA; p = vbind_emit(p, &param_count, sizeof(param_count)); // mov      rdx, <param_count>
    if (b > 2) { *p++ = 0x48; } *p++ = 0x8B; *p++ = 0xC2;                                         // mov      rax, rdx
    if (b > 2) { *p++ = 0x48; } *p++ = 0xC1; *p++ = 0xE0; *p++ = b;                               // shl      rax, log2(sizeof(void *))
    if (b > 2) { *p++ = 0x48; } *p++ = 0x2B; *p++ = 0xE0;                                         // sub      rsp, rax      ; reserve the new argument area
    *p++ = 0x57;                                                                                  // push     rdi
    *p++ = 0x56;                                                                                  // push     rsi
    *p++ = 0x51;                                                                                  // push     rcx
    *p++ = 0x9C;                                                                                  // pushfq                 ; saved because cld changes the direction flag
    if (b > 2) { *p++ = 0x48; } *p++ = 0xF7; *p++ = 0xD8;                                         // neg      rax
    if (b > 2) { *p++ = 0x48; } *p++ = 0x8D; *p++ = 0x7C; *p++ = 0x05; *p++ = 0x00;               // lea      rdi, [rbp + rax]   ; start of the new argument area
    if (b > 2) { *p++ = 0x48; } *p++ = 0x8D; *p++ = 0x75; *p++ = 2 * s;                           // lea      rsi, [rbp + 2 * s] ; first incoming argument
    if (b > 2) { *p++ = 0x48; } *p++ = 0xB9; p = vbind_emit(p, &i, sizeof(i));                    // mov      rcx, <i>
    if (b > 2) { *p++ = 0x48; } *p++ = 0x2B; *p++ = 0xD1;                                         // sub      rdx, rcx
    *p++ = 0xFC;                                                                                  // cld
    *p++ = 0xF3; if (b > 2) { *p++ = 0x48; } *p++ = 0xA5;                                         // rep movs [rdi], [rsi]  ; copy the i leading arguments
    for (unsigned int j = 0; j < n; j++)
    {
        int const o = (int)(j * sizeof(void *));  // displacement of the j-th bound slot from rdi
        if (b > 2) { *p++ = 0x48; } *p++ = 0xB8; p = vbind_emit(p, &bound[j], sizeof(bound[j]));  // mov      rax, <arg>
        if (b > 2) { *p++ = 0x48; } *p++ = 0x89; *p++ = 0x87; p = vbind_emit(p, &o, sizeof(o));   // mov      [rdi + <iArg>], rax
    }
    if (b > 2) { *p++ = 0x48; } *p++ = 0xB8; p = vbind_emit(p, &count, sizeof(count));            // mov      rax, <count>
    if (b > 2) { *p++ = 0x48; } *p++ = 0x2B; *p++ = 0xD0;                                         // sub      rdx, rax
    if (b > 2) { *p++ = 0x48; } *p++ = 0xC1; *p++ = 0xE0; *p++ = b;                               // shl      rax, log2(sizeof(void *))
    if (b > 2) { *p++ = 0x48; } *p++ = 0x03; *p++ = 0xF8;                                         // add      rdi, rax      ; skip over the bound slots
    if (b > 2) { *p++ = 0x48; } *p++ = 0x8B; *p++ = 0xCA;                                         // mov      rcx, rdx
    *p++ = 0xF3; if (b > 2) { *p++ = 0x48; } *p++ = 0xA5;                                         // rep movs [rdi], [rsi]  ; copy the trailing arguments
    *p++ = 0x9D;                                                                                  // popfq
    *p++ = 0x59;                                                                                  // pop      rcx
    *p++ = 0x5E;                                                                                  // pop      rsi
    *p++ = 0x5F;                                                                                  // pop      rdi
    if (b > 2)
    {
        // Reload the register arguments from the freshly assembled argument area.
        *p++ = 0x48; *p++ = 0x8B; *p++ = 0x4C; *p++ = 0x24; *p++ = 0 * s;                         // mov      rcx, [rsp + 0 * s]
        *p++ = 0x48; *p++ = 0x8B; *p++ = 0x54; *p++ = 0x24; *p++ = 1 * s;                         // mov      rdx, [rsp + 1 * s]
        *p++ = 0x4C; *p++ = 0x8B; *p++ = 0x44; *p++ = 0x24; *p++ = 2 * s;                         // mov      r8 , [rsp + 2 * s]
        *p++ = 0x4C; *p++ = 0x8B; *p++ = 0x4C; *p++ = 0x24; *p++ = 3 * s;                         // mov      r9 , [rsp + 3 * s]
        *p++ = 0x48; *p++ = 0xB8; p = vbind_emit(p, &f, sizeof(f));                               // mov      rax, <target_ptr>
        *p++ = 0xFF; *p++ = 0xD0;                                                                 // call     rax
    }
    else
    {
        ptrdiff_t rel;
        if (thiscall) { *p++ = 0x59; }                                                            // pop      rcx
        *p++ = 0xE8;                                                                              // call     <fn_rel>
        // The displacement is relative to the end of the call instruction, i.e. the byte
        // after the displacement field.  It is computed in its own statement so the result
        // does not depend on whether the compiler reads p before or after advancing it.
        rel = (unsigned char *)f - (p + sizeof(rel));
        p = vbind_emit(p, &rel, sizeof(rel));
    }
    if (b > 2) { *p++ = 0x48; } *p++ = 0x8B; *p++ = 0xE5;                                         // mov      rsp, rbp      ; drops the argument area whatever the callee popped
    *p++ = 0x5D;                                                                                  // pop      rbp
    *p++ = 0xC3;                                                                                  // ret
    return p - &buffer[0];
}

Example (for Windows):

#include <assert.h>
#include <stdio.h>
#include <Windows.h>
/* Example thunk target.  Calls itself once per unit of `u` (treated as a
 * counter stored in a pointer) before printing, so each invocation prints one
 * line with its six arguments; `u` runs from 0 upward in the output.
 * Returns `value` unchanged. */
void *__cdecl test(void *value, void *x, void *y, void *z, void *w, void *u)
{
    // Compare against NULL: an ordered comparison of a pointer with the integer 0 is not valid.
    if (u != NULL) { test(value, x, y, z, w, (void *)((size_t)u - 1)); }
    printf("Test called! %p %p %p %p %p %p\n", value, x, y, z, w, u);
    return value;
}
// Example member-function thunk target.  `local` is printed first on every
// line so the output shows that the thunk delivered the right `this` pointer.
struct Test
{
    void *local;
    // Calls itself once per unit of `u` (a counter stored in a pointer), then
    // prints `local` followed by the six arguments.  Returns `value` unchanged.
    void *operator()(void *value, void *x, void *y, void *z, void *w, void *u)
    {
        // Compare against NULL: an ordered comparison of a pointer with the integer 0 is not valid.
        if (u != NULL) { (*this)(value, x, y, z, w, (void *)((size_t)u - 1)); }
        printf("Test::operator() called! %p %p %p %p %p %p %p\n", local, value, x, y, z, w, u);
        return value;
    }
};
int main()
{
    unsigned char thunk[1024]; unsigned long old;
    VirtualProtect(&thunk, sizeof(thunk), PAGE_EXECUTE_READWRITE, &old);
    void *args[] = { (void *)0xBAADF00DBAADF001, (void *)0xBAADF00DBAADF002 };
    void *(Test::*f)(void *value, void *x, void *y, void *z, void *w, void *u) = &Test::operator();
    Test obj = { (void *)0x1234 };
    assert(sizeof(f) == sizeof(void (*)()));  // virtual function are too big, they're not supported :(
    vbind(*(void *(**)())&f, 1 + 6, thunk, 1 + 1, args, sizeof(args) / sizeof(*args), true);
    ((void *(*)(void *, int, int, int, int))&thunk)(&obj, 3, 4, 5, 6);
    vbind((void *(*)())test, 6, thunk, 1, args, sizeof(args) / sizeof(*args), false);
    ((void *(*)(int, int, int, int))&thunk)(3, 4, 5, 6);
}
user541686
  • 205,094
  • 128
  • 528
  • 886
  • `log2 sizeof(int)` is not guaranteed to be `2` – obataku Aug 27 '12 at 04:43
  • 1
    Changing memory protection just to thunk a function call is ugly at best, and changing protection of the stack itself is dangerous. – James McNellis Aug 27 '12 at 04:46
  • 1
    @JamesMcNellis: The example is just that: an **example**, to demonstrate how you can use it. I never claimed it follows the best coding practices (and I never intended it to). – user541686 Aug 27 '12 at 04:47
  • This violates all sorts of rules for 64-bit Windows. The epilog [contains illegal instructions](http://msdn.microsoft.com/en-us/library/vstudio/tawsa7cb.aspx). Calculations are not allowed in the prologue. `pushfq` is not allowed either. And it doesn't register any unwind codes with the operating system for exception handling. – Raymond Chen Nov 11 '13 at 02:18
  • @RaymondChen: I had no idea about most of those rules, thanks for letting me know. (Regarding exceptions though: I never intended this to be used for code that uses exceptions, so I'm ignoring everything related to exception-safety here.) I have a question though: what will go wrong if the prolog and epilog don't follow the convention? Is it only related to exception-safety, or does it cause problems even when no exceptions are ever thrown? – user541686 Nov 11 '13 at 02:29
  • You don't really control whether this code is used with exceptions because exceptions are not within your control. Deadlock warning, invalid handle warning, stack overflow warning, copy on write, all these things can happen and the kernel will handle the exception for you. (Also, I think you misaligned the stack.) – Raymond Chen Nov 11 '13 at 04:44
  • @RaymondChen: Hmm, when you say "warning", are you referring to guard pages, or to something else? (And thanks for the note... I won't have time to look into fixing the stack alignment right now but I'll try to fix it later.) – user541686 Nov 11 '13 at 04:59
  • 1
    Deadlock warning is when an EnterCriticalSection blocks for too long. Invalid handle warning is when you pass an invalid handle to CloseHandle. Resource copy on write raises an access violation. All of them are handled by the default unhandled exception filter. Those are just the ones I could think of off the top of my head. There are probably others too. If you don't register your function with a proper prolog, the kernel will terminate the process when any of these things happen because it can't walk the stack. – Raymond Chen Nov 11 '13 at 05:52
  • @RaymondChen: Ah interesting, I didn't know that, thanks a lot for pointing it out. When I get the chance again I'll fix it (but it might take another half a year or a year or so for me to get the time to look into it...). – user541686 Nov 11 '13 at 06:55
0

Here is a modification for thiscall functions

The vbind() stub generator above is meant to be used for C++ member functions as well, although it is not clear how to proceed. Here's what I've come up with:

// experimental x64 thiscall thunking
// Experimental x64 thiscall thunking: each TestHook owns a vbind()-generated
// thunk that has the plain three-argument hook signature and forwards to
// CBTHook_stub() with this object bound as the first argument.
class TestHook {
public:
    typedef void (TestHook::*TMFP)();

    // Stores `num` as the context value and generates the thunk in m_thunk.
    TestHook(DWORD num) 
    {
        m_context = num;

        // Reinterpret the member-function pointer as a plain code address.
        // NOTE(review): relies on the member pointer being a bare address,
        // which does not hold for every class layout -- confirm.
        union { void* (*func)(); TMFP method; } addr;
        addr.method = (TMFP)&TestHook::CBTHook_stub;

        // pass "this" as the first fixed argument
        void *args[] = { this };
        // 4 parameters in total (this + 3); the thiscall flag only affects
        // vbind's 32-bit path, so it is left off here.
        size_t thunk_size = vbind(addr.func, 4, m_thunk, 0, args, 1, false);
        ATLASSERT(thunk_size < sizeof(m_thunk));

        unsigned long old;
        if (!VirtualProtect(m_thunk, thunk_size, PAGE_EXECUTE_READWRITE, &old))
        {
            ATLTRACE(_T("VirtualProtect failed, error=%d\n"), (int)GetLastError());
        }
        FlushInstructionCache(GetCurrentProcess(), m_thunk, thunk_size);
    }

    // Returns the entry point of the generated thunk.
    FARPROC GetThunk() const {    return (FARPROC)(void*)m_thunk; }

protected:
    // test thiscall: one integer and two 8-byte arguments
    LRESULT CBTHook_stub(int nCode, WPARAM wParam, LPARAM lParam) 
    {
        // wParam / lParam are 64 bits wide here; narrow them explicitly to match %x.
        ATLTRACE(_T("this=%p, code=%d, wp=%x, lp=%x, context=%x\n"), this, nCode, (unsigned int)wParam, (unsigned int)lParam, m_context);
        return lParam;
    }

    DWORD m_context;
    unsigned char m_thunk[1024]; // fixed; don't know size required apriori!
};

#ifndef _WIN64
#error does not work for win32
#endif
// Builds a TestHook, fetches its thunk as a HOOKPROC, calls it with (1, 2, 3)
// and traces the value that comes back.
int main(void)
{
    TestHook tmp(0xDeadBeef);

    HOOKPROC proc = (HOOKPROC)tmp.GetThunk();
    // The hook returns a 64-bit LRESULT; narrow it explicitly to match %d.
    ATLTRACE(_T("object %p return value=%d\n"), &tmp, (int)proc(1, 2, 3));
    return 0;
}

I am not an assembly guru, but this code correctly stubs into the member function in 64-bit code. There are some implicit assumptions (I'm not 100% sure they are valid; please correct me if I'm wrong):

  1. in x64 (amd / microsoft VS) all function arguments are passed as 8 bytes long. So although vbind was just for pointer-type arguments, it is possible to thunk into other function prototypes (e.g. the HOOKPROC takes one integer and two __int64)

  2. "this" pointer is passed as the first stack argument in x64 instead of ECX. I used the bound argument to pass the "this" pointer and provide context to the C++ object

nikos
  • 532
  • 3
  • 10
  • There was a bug in the 32-bit `thiscall`, it should be fixed now. – user541686 Nov 10 '13 at 23:50
  • your modifications were just for the 32 bit version, for 64 bits it GPFs. Note that the 32 bit thunking is much better handled in CAuxThunk, part of the [ATL/AUX library](http://www.codeproject.com/Articles/423/ATL-AUX-Library) – nikos Nov 11 '13 at 09:59