0

I call a NASM dll from ctypes. In my NASM dll code, I create three arrays using malloc. Their pointers are assigned to final1_ptr, final2_ptr and final3_ptr. If I return any of the pointers individually, I get the correct results.

But I want to return an array of all three pointers. For that, I also have an array initialized in the .data section: Return_Pointer_Array: dq 0, 0, 0.

At the end I assign the pointers this way:

; Publish the three malloc'd buffer addresses through Return_Pointer_Array
; and return the address of that array in rax.
        lea     rdi, [rel Return_Pointer_Array] ; RIP-relative; no absolute address needed
        mov     rax, qword [final1_ptr]
        mov     qword [rdi], rax                ; slot 0 = final1 buffer
        mov     rax, qword [final2_ptr]
        mov     qword [rdi + 8], rax            ; slot 1 = final2 buffer
        mov     rax, qword [final3_ptr]
        mov     qword [rdi + 16], rax           ; slot 2 = final3 buffer
        mov     rax, rdi                        ; return value = &Return_Pointer_Array
        ret

However, what ctypes gets is an array of three extremely small decimals, not valid pointer addresses.

The actual program listing is very long, so I hope the code above will be enough to understand the problem.

Thanks very much for any help.

EDIT: if I return the Return_Pointer_Array without assigning any values, it returns double (floating point) values. The array is initialized as integer, so I don't understand why it defaults to floating point when it's initialized as dq 0, 0, 0.

EDIT #2: Per request, here is the Python code:

def Test_Data_Read():
    """Call the DLL wrapper with a small, fixed sample of input values."""
    # Note: in production, X is a list of 50,000 random floats read from file.
    # For simplicity, it is reduced here to a short simple list.
    X = [11.0,1.0,2.0,7.0,4.0,4.0,4.0,6.0,7.0,6.0,11.0,4.0,10.0,7.0,8.0,4.0,9.0,4.0,5.0,3.0,4.0,1.0,2.0,5.0,3.0,5.0,11.0,10.0,11.0,9.0,3.0,12.0]

    PyGram_Test_01_asm(X)

#__________

def PyGram_Test_01_asm(X):
    """Pass the floats in X to Main_Entry_fn in the NASM DLL and read its result.

    X is a list of floats. It is handed to the DLL as a C double array,
    together with a one-element double array holding the input size in bytes.
    """
    # The DLL reads the input length in bytes (8 per double) from this array.
    Input_Length_Array = []
    Input_Length_Array.append(len(X)*8)

    CA_X = (ctypes.c_double * len(X))(*X)
    length_array_out = (ctypes.c_double * len(Input_Length_Array))(*Input_Length_Array)

    hDLL = ctypes.WinDLL("C:/NASM_Test_Projects/Nested_For_Loops/Nested_For_Loops.dll")
    CallName = hDLL.Main_Entry_fn
    CallName.argtypes = [ctypes.POINTER(ctypes.c_double), ctypes.POINTER(ctypes.c_double)]
    # NOTE(review): this restype is the cause of the problem described in the
    # question. Main_Entry_fn returns the address of an array of three
    # pointers, so reading it as doubles reinterprets the pointer bits as
    # floating-point values. Left as posted so the question stays reproducible.
    CallName.restype = ctypes.POINTER(ctypes.c_double)

    start_time = timeit.default_timer()

    ret_ptr = CallName(CA_X,length_array_out)

    # First three elements of the returned array (here: pointer bits read as doubles).
    a = ret_ptr[:3]

Here is the entire NASM listing (sometimes minimal and complete are at odds with each other)

; Header Section
[BITS 64]

[default rel]

export Main_Entry_fn

extern malloc, realloc, free

section .data align=16
; All variables are 8 bytes wide (dq). Order and sizes are unchanged.
out_array_pointer: dq 0
call_var_length: dq 0
c_square: dq 0                  ; a*a + b*b for the current (a, b) pair
final1_ptr: dq 0                ; malloc'd output buffer receiving the a values
final1_ctr: dq 0                ; byte offset of the next free slot in final1
final2_ptr: dq 0                ; malloc'd output buffer receiving the b values
final2_ctr: dq 0                ; byte offset of the next free slot in final2
final3_ptr: dq 0                ; malloc'd output buffer receiving the c values
final3_ctr: dq 0                ; byte offset of the next free slot in final3
float_temp_var: dq 0.0
temp_math_var: dq 0             ; integer scratch used while squaring
Bool_0: dq 0.0
Bool_1: dq 1.0
data_master_ptr: dq 0           ; second argument: pointer to the lengths array
initial_dynamic_length: dq 0    ; size in bytes requested from malloc per buffer
X_ptr: dq 0                     ; first argument: pointer to the input doubles
X_ctr: dq 0                     ; byte offset of the next input element
X: dq 0
n: dq 0                         ; current input element, truncated to an integer
i: dq 0
a: dq 0                         ; integer loop counter (same bytes as the former 0.0)
range_loop_start_a: dq 0
range_loop_end_a: dq 0
b: dq 0                         ; integer loop counter (same bytes as the former 0.0)
range_loop_start_b: dq 0
range_loop_end_b: dq 0
c: dq 0                         ; truncated square root of c_square
X_length: dq 0                  ; input length in bytes
Input_Length_Array: dq 0        ; stray trailing comma removed (it left an empty operand)
Return_Pointer_Array: dq 0, 0, 0 ; returned to the caller: final1/2/3 buffer addresses

section .text

;-----------------------------------------------------------------------
; PyGram_Test_01_fn
;
; For every double in the input array, truncated to an integer n, visits
; each pair (a, b) with 1 <= a <= n and a <= b < n. When a*a + b*b equals
; c*c for c = trunc(sqrt(a*a + b*b)), appends a, b and c (converted to
; double) to the final1, final2 and final3 buffers respectively.
;
; Internal helper; the visible caller is Main_Entry_fn.
; In:    [X_ptr]                  address of the input doubles
;        [X_length]               input length in BYTES
;        [final1_ptr..final3_ptr] output buffers
;        [final1_ctr..final3_ctr] byte offset of the next free slot
;        [initial_dynamic_length] size in bytes of each output buffer
; Out:   rax = address of Return_Pointer_Array, which holds the three
;              buffer addresses
; Clobb: rcx, rdx, rdi, rbp, r8 (unused here, but not preserved), xmm0,
;        xmm1, flags. rdi and rbp are callee-saved under Win64; the caller
;        Main_Entry_fn saves and restores them.
; Note:  appending stops (and the function returns) once the output
;        buffers are full, instead of writing past their end.
;-----------------------------------------------------------------------
PyGram_Test_01_fn:
        xor     ecx, ecx
        mov     [X_ctr], rcx                    ; start at the first input element

.next_input:
        mov     rbp, [X_ptr]
        mov     rcx, [X_ctr]
        cmp     rcx, [X_length]
        jge     exit_label_for_PyGram_Test_01_fn ; all input consumed
        movsd   xmm0, [rbp + rcx]
        add     rcx, 8
        mov     [X_ctr], rcx
        cvttsd2si rax, xmm0                     ; n = int(X[i])
        mov     [n], rax
        mov     [range_loop_end_b], rax         ; b stops before n
        inc     rax
        mov     [range_loop_end_a], rax         ; a stops before n + 1
        xor     eax, eax
        mov     [range_loop_start_a], rax
        mov     [a], rax                        ; a is pre-incremented at .next_a

.next_a:
        mov     rcx, [a]
        inc     rcx
        cmp     rcx, [range_loop_end_a]
        jge     .next_input
        mov     [a], rcx
        dec     rcx
        mov     [range_loop_start_b], rcx
        mov     [b], rcx                        ; b is pre-incremented, so it starts at a

.next_b:
        mov     rcx, [b]
        inc     rcx
        cmp     rcx, [range_loop_end_b]
        jge     .next_a
        mov     [b], rcx

        mov     rax, [a]
        imul    rax, rax                        ; a*a
        imul    rcx, rcx                        ; b*b
        add     rax, rcx
        mov     [c_square], rax                 ; c_square = a*a + b*b

        cvtsi2sd xmm1, rax
        sqrtsd  xmm0, xmm1
        cvttsd2si rdx, xmm0                     ; c = int(sqrt(c_square))
        mov     [c], rdx

        imul    rdx, rdx                        ; c*c
        cmp     rax, rdx
        jne     .next_b                         ; not a perfect square: skip

        ; The three counters are only ever advanced together (below), so as
        ; long as they start equal, checking final1_ctr covers all buffers.
        mov     rcx, [final1_ctr]
        lea     rdx, [rcx + 8]
        cmp     rdx, [initial_dynamic_length]
        ja      exit_label_for_PyGram_Test_01_fn ; buffers full: stop appending

        mov     rax, [a]
        cvtsi2sd xmm0, rax
        mov     rdi, [final1_ptr]
        movsd   [rdi + rcx], xmm0               ; final1.append(a)
        add     rcx, 8
        mov     [final1_ctr], rcx

        mov     rax, [b]
        cvtsi2sd xmm0, rax
        mov     rdi, [final2_ptr]
        mov     rcx, [final2_ctr]
        movsd   [rdi + rcx], xmm0               ; final2.append(b)
        add     rcx, 8
        mov     [final2_ctr], rcx

        mov     rax, [c]
        cvtsi2sd xmm0, rax
        mov     rdi, [final3_ptr]
        mov     rcx, [final3_ctr]
        movsd   [rdi + rcx], xmm0               ; final3.append(c)
        add     rcx, 8
        mov     [final3_ctr], rcx
        jmp     .next_b

exit_label_for_PyGram_Test_01_fn:
        lea     rdi, [rel Return_Pointer_Array] ; RIP-relative; no absolute address needed
        mov     rax, qword [final1_ptr]
        mov     qword [rdi], rax
        mov     rax, qword [final2_ptr]
        mov     qword [rdi + 8], rax
        mov     rax, qword [final3_ptr]
        mov     qword [rdi + 16], rax
        mov     rax, rdi                        ; return &Return_Pointer_Array
        ret

; __________
; Main Entry

;-----------------------------------------------------------------------
; Main_Entry_fn  (exported)
;
; C view:  void **Main_Entry_fn(const double *x, const double *lengths)
; ABI:     Microsoft x64
; In:      rcx = address of the input doubles
;          rdx = address of a double array whose first element is the
;                input length in BYTES
; Out:     rax = address of Return_Pointer_Array (three buffer addresses),
;                or 0 if any allocation failed
; Clobb:   rcx, rdx, r8-r11, xmm0-xmm5, flags (volatile under Win64)
; Stack:   two pushes + 40 bytes keep rsp 16-byte aligned at every call;
;          the 40 bytes include the 32-byte shadow space callees require.
; Notes:   the three buffers come from malloc and are owned by the caller
;          after a successful return. The output counters are reset on
;          every call so repeated calls start writing at offset 0.
;-----------------------------------------------------------------------
OUTPUT_BUFFER_BYTES equ 3529984                 ; fixed size of each output buffer

Main_Entry_fn:
        push    rdi                             ; callee-saved; clobbered by PyGram_Test_01_fn
        push    rbp                             ; callee-saved; clobbered by PyGram_Test_01_fn
        sub     rsp, 40

        mov     [X_ptr], rcx
        mov     [data_master_ptr], rdx

        ; Input length in bytes arrives as a double.
        movsd   xmm0, qword [rdx]
        cvttsd2si rax, xmm0
        mov     [X_length], rax

        ; Fresh state for this call.
        xor     eax, eax
        mov     [final1_ctr], rax
        mov     [final2_ctr], rax
        mov     [final3_ctr], rax
        mov     [final1_ptr], rax
        mov     [final2_ptr], rax
        mov     [final3_ptr], rax
        mov     qword [initial_dynamic_length], OUTPUT_BUFFER_BYTES

        ; malloc for dynamic arrays
        mov     ecx, OUTPUT_BUFFER_BYTES
        call    malloc
        test    rax, rax
        jz      .alloc_failed
        mov     [final1_ptr], rax

        mov     ecx, OUTPUT_BUFFER_BYTES
        call    malloc
        test    rax, rax
        jz      .alloc_failed
        mov     [final2_ptr], rax

        mov     ecx, OUTPUT_BUFFER_BYTES
        call    malloc
        test    rax, rax
        jz      .alloc_failed
        mov     [final3_ptr], rax

        call    PyGram_Test_01_fn               ; rax = &Return_Pointer_Array
        jmp     exit_label_for_Main_Entry_fn

.alloc_failed:
        ; Release whatever was obtained in this call; free(NULL) is a no-op.
        mov     rcx, [final1_ptr]
        call    free
        mov     rcx, [final2_ptr]
        call    free
        xor     eax, eax                        ; return NULL
        mov     [final1_ptr], rax
        mov     [final2_ptr], rax

exit_label_for_Main_Entry_fn:
        add     rsp, 40
        pop     rbp
        pop     rdi
        ret

# Here's the Python source code for the assembler:
def PyGram_Test_01(X):
    """Reference implementation of the search done by PyGram_Test_01_fn.

    For each value n in X (truncated to int), collects every pair
    1 <= a <= b < n whose a**2 + b**2 is a perfect square c**2.
    Returns the three parallel lists (final1, final2, final3) of a, b, c.
    """
    final1, final2, final3 = [],[],[]

    for n in X:
        n = int(n)
        for a in range(1,n+1):
            for b in range(a,n):
                c_square = a**2 + b**2
                c = int(sqrt(c_square))

                if ((c_square - c**2) == 0):
                    final1.append(a)
                    final2.append(b)
                    final3.append(c)

    return final1, final2, final3
RTC222
  • 2,025
  • 1
  • 20
  • 53
  • Please provide your *complete* python code, and complete assembly code to make this an [mcve] – Michael Petch Mar 13 '18 at 23:02
  • @Michael Petch - thanks for the reply. I posted the code above. The NASM listing is quite long; the area at issue comes below the exit_label_for_PyGram_Test_01_fn. – RTC222 Mar 14 '18 at 00:14
  • 1
    Your python code isn't complete. Have no Idea what `X` is because you don't show us how you call PyGram_Test_01_asm – Michael Petch Mar 14 '18 at 03:27
  • @Michael Petch - I added the code that calls PyGram_Test_01_asm in the Python code above. If more is needed, please ask. Thanks very much for taking time on this. – RTC222 Mar 14 '18 at 13:10
  • The problem appears to be that Return_Pointer_Array is returned as float, not integer. If I return it without assigning any values to it, it returns an array of 0.0, 0.0, 0.0 instead of 0, 0, 0. Yet dq should return integer if defined as X: dq 0,0,0. I moved the variable to .bss and defined it as Return_Pointer_Array: resq 24 but it still returns double-precision float, not integer. Even if I define it as dd, it returns float, not integer. – RTC222 Mar 14 '18 at 15:01
  • 1
    Well you have said `CallName.restype = ctypes.POINTER(ctypes.c_double)` . This means the return value is a pointer to a double. The problem is when you return `Return_Pointer_Array` that is a pointer to an array of double pointers. So Python is attempting to convert the pointers to doubles because that is what you have told it. – Michael Petch Mar 14 '18 at 15:10
  • After a lot of trial and error, I just came to the same conclusion moments ago. I changed my return type to CallName.restype = ctypes.POINTER(ctypes.c_int64) I now get the correct answers. Thanks very much for your help on this. – RTC222 Mar 14 '18 at 15:15
  • You could try `ctypes.POINTER(ctypes.POINTER(ctypes.c_double))` This should be a pointer to a pointer to double – Michael Petch Mar 14 '18 at 15:24

1 Answer

1

I am posting this as an answer because others may need this information. With help from Michael Petch (above) I have resolved this.

To return an array of pointers from a dll to ctypes:

  1. Declare an array in the dll (for NASM, it's declared in the .data section, e.g. Return_Pointer_Array: dq 0, 0, 0).

  2. On exit, assign the pointers to the array (for assembler; in C, use C assignments):

    ; Store the three buffer addresses and return the array's address in rax.
    lea rdi,[rel Return_Pointer_Array]  ; RIP-relative; no absolute address needed
    mov rax,qword[final1_ptr]
    mov [rdi],rax                       ; slot 0
    mov rax,qword[final2_ptr]
    mov [rdi+8],rax                     ; slot 1
    mov rax,qword[final3_ptr]
    mov [rdi+16],rax                    ; slot 2
    mov rax,rdi                         ; return value = &Return_Pointer_Array
    
  3. Set the return type to CallName.restype = ctypes.POINTER(ctypes.c_int64).

  4. Cast each of the returned pointers and convert the data to a Python array (in this example, I know in advance how many elements there are in each array; for a dynamic array, the length can be passed back as an element of the return array):

    # With restype POINTER(c_int64), each returned element is a raw address.
    ret_ptr = CallName(CA_X, length_array_out)
    a = ret_ptr[:3]
    # Reinterpret each address as a pointer to double, then copy the data out.
    DoublePtr = ctypes.POINTER(ctypes.c_double)
    n1, n2, n3 = [ctypes.cast(address, DoublePtr) for address in a]
    x1 = n1[:50000]
    x2 = n2[:50000]
    x3 = n3[:50000]
    

Alternatively, set the return type to CallName.restype = ctypes.POINTER(ctypes.POINTER(ctypes.c_double)), as Michael Petch said above, and extract the values like this:

    # With restype POINTER(POINTER(c_double)), each element is already a
    # pointer to double, so no cast is needed.
    ret_ptr = CallName(CA_X,length_array_out)
    a = ret_ptr[:3]
    n1 = a[0]
    x1 = n1[:50000]
    n2 = a[1]
    x2 = n2[:50000]
    n3 = a[2]
    x3 = n3[:50000]
RTC222
  • 2,025
  • 1
  • 20
  • 53
  • Should make you aware of something. If you `malloc` something in _C_, Python has no way of recovering that memory. If you are going to do this I'd recommend adding some type of `free` function to your DLL that takes a pointer returned from `Main_Entry_fn` that can go down the main array and call free on all the malloc'ed pointers. You will have to call this free function manually from your python code when you are finished with structure. – Michael Petch Mar 14 '18 at 19:13
  • Is there a reason you are doing this in assembler? Is this just to learn and experiment. If I write an external DLL or shared object for Python I generally do it in _C_. If there is a performance reason I may use inline code or a secondary assembly file with functionality. But my main reason for using _C_ is because it makes the code easier to maintain and read, less error prone, and it is easier to interface with the Python lib to have my function create `PyObject`'s directly. – Michael Petch Mar 14 '18 at 19:15
  • Yes, I call free() from ctypes after the array returns and is extracted. The free code in NASM was omitted from the code above for brevity. It's called for each of the arrays. I write in assembler because I've written in 32-bit assembler as part of my work for a long time; I've just switched to 64-bit NASM. I love assembler and I really do not like C as much (although C is very inspired for an effort that dates from the 60s). Also, assembler is generally faster than C, although reasonable minds differ on that point, but the proof is in the metrics. – RTC222 Mar 14 '18 at 19:32
  • I have a love of assembly and _C_. So dealing with assembly is not an issue. In the old days many compilers (including MS) were not very good at code generation. Watcom started to change that. Early versions of GCC had bugs in the optimizers, but over the past decade GCC and Clang generally do a very good job of code generation with optimizations on. They can take advantage of the nuances of an architecture and generate suitable instructions for it to improve performance. With vectorization (SIMD: SSE, AVX, AVX2) the code generated can outperform hand-made assembly in a lot of cases. – Michael Petch Mar 14 '18 at 20:28
  • 1
    My general rule is this. Code it in _C_. Look at the generated instructions. If the code produced doesn't look good, try to amend the _C_ code to generate better code. This can involve reworking code, using `builtins` to use specific instruction features, or if need be inline assembly (I don't recommend this if you don't know what you are doing), and then if need be an assembly file for the cases where doing it manually is best. – Michael Petch Mar 14 '18 at 20:30
  • In the worst case I have both _C_ and assembly, in the best case entirely _C_. Not directly related to the question asked, do you have pseudo-code or a description of what `PyGram_Test_01_fn` is doing? – Michael Petch Mar 14 '18 at 20:32
  • I have an unusual attraction to assembler, because most people suggest C as a first choice. Vectorization does give some advantage to C but assembler can make use of AVX instructions too. I think I read recently that inline assembler is no longer supported in 64-bit Intel chips, and that's why MS dropped it from C++. For my purposes, if I need to tweak C to improve performance, I'd rather devote the time to NASM, but I know I'm a little strange that way. Assembler aficionados are few and far between, so it's a lonely world! – RTC222 Mar 14 '18 at 20:57
  • Here's the Python code for PyGram_Test_01_fn: Sorry, I can't put line breaks in a comment, so this runs on. I hope you can easily parse it out. def PyGram_Test_01(X): final1, final2, final3 = [],[],[] for i, n in enumerate(X): n = int(n) for a in range(1,n+1): for b in range(a,n): c_square = a**2 + b**2 c = int(sqrt(c_square)) if ((c_square - c**2) == 0): final1.append(a) final2.append(b) final3.append(c) – RTC222 Mar 14 '18 at 20:59
  • Inline assembly is no longer supported in 64-bit _C_ code in Microsoft C/C++ (it is still supported if targeting 32-bit). That is just for Microsoft. GCC, CLANG continue to have far better inline support (but also way too easy to get wrong). MSVC/GCC/CLANG all **try** to do vectorization if you turn on AVX/AVX2/SSE etc. Here is [an example](https://godbolt.org/g/Sfrh68) of a vectorized loop in GCC that adds 1 to every byte in an array of characters. You need to turn on with `-mavx` or `avx2` – Michael Petch Mar 14 '18 at 21:04
  • Most modern C compilers now have builtin intrinsics that wrap a lot of the SIMD (AVX/AVX2/SSE) instructions so you can generate those instructions more directly. Much of the time now intrinsics can be used to more directly generate specific instructions in the assembly code. You can use this if vectorization of the compiler doesn't do well and then turn around and code the functionality in _C_ yourself. Intrinsics for most up to date compilers support most of the AVX/AVX2 (and some AVX512) instructions. – Michael Petch Mar 14 '18 at 21:07
  • GCC has a list of their supported intrinsics here: https://gcc.gnu.org/onlinedocs/gcc-7.3.0/gcc/x86-Built-in-Functions.html#x86-Built-in-Functions . Microsoft has a list of theirs here: https://msdn.microsoft.com/en-us/library/26td21ds.aspx – Michael Petch Mar 14 '18 at 21:11
  • As you can tell, these intrinsics allow you to start generating pretty low level SIMD code directly if you find the code generated automatically may not be to your liking. I now favor intrinsics over assembly code in _C_ for most cases. – Michael Petch Mar 14 '18 at 21:12
  • Sorry, the Python code doesn't reproduce correctly. The line should read a-squared + b-squared. Currently I'm using the NASM compiler and GoLink, but for linux I like to use GCC. – RTC222 Mar 14 '18 at 21:57
  • I just added the Python source code to the question above. – RTC222 Mar 14 '18 at 22:08
  • Yes, I just saw that; now I understand what a2 and b2 were. The double asterisks were removed as formatting here. I understand your comment now. It makes sense. – Michael Petch Mar 14 '18 at 22:09
  • 1
    *inline assembler is no longer supported in 64-bit Intel chips, and that's why MS dropped it from C++*. How would that even be plausible? The hardware doesn't know what produced the instruction bytes it's executing!! Inline-asm ends up mixed with compiler-generated asm instructions, and those are all assembled to machine code. MSVC dropped inline asm because [the implementation in their compiler is a total hack, and it has trouble with register-arg calling conventions](https://stackoverflow.com/questions/3323445/what-is-the-difference-between-asm-and-asm/35959859#comment59576185_35959859) – Peter Cordes Mar 15 '18 at 04:03