How is BareMetalOS allocating memory in Assembly without malloc, brk, or mmap?

Question

As a learning experiment, I am interested in creating a hashtable in assembly (x86-64 in NASM on OSX). One of the requirements is to be able to dynamically allocate/manage memory.

After looking through many resources on how to allocate memory in assembly, most of them recommend either brk or mmap syscalls. I haven't learned exactly how these worked yet because I found another implementation of memory allocation in BareMetal-OS that doesn't use any system calls (copied their code below).

My question is, how are they doing this? Can you explain the relevant instructions in their assembly that perform the memory allocation, for someone without a systems programming background and who is new to assembly? The reason for wanting to understand how to implement memory allocation in assembly is to be able to implement a hashtable in assembly.

Being new to assembly (I mainly do JavaScript), and having not found any detailed resources yet on memory allocation in assembly, I don't know where to start. It may be obvious to you, but you have the background, which I don't. I have done some assembly the past week or two, so I understand the basics about mov on registers, and the jump commands, but don't yet understand the additional stuff they are doing to implement this memory stuff. My thinking is, if they can implement memory allocation in assembly without brk or mmap, then I want to do it that way because then I really am manipulating the memory directly without any system layers, and it seems like you can really fine-tune stuff.

Here is their code copied from GitHub:

https://github.com/ReturnInfinity/BareMetal-OS/blob/master/os/syscalls/memory.asm

# =============================================================================
# BareMetal -- a 64-bit OS written in Assembly for x86-64 systems
# Copyright (C) 2008-2014 Return Infinity -- see LICENSE.TXT
#
# Memory functions
# =============================================================================

# 16-byte ASCII tag emitted into the binary ahead of the memory routines. It is
# padded to a 16-byte boundary on both sides, so the text occupies exactly one
# aligned 16-byte row. Presumably a landmark for finding this section in a
# memory/hex dump -- TODO confirm against the rest of the project.
align 16
db 'DEBUG: MEMORY   '
align 16


# -----------------------------------------------------------------------------
# os_mem_allocate -- Allocates the requested number of contiguous 2 MiB pages
#  IN:  RCX = Number of pages to allocate
# OUT:  RAX = Starting address (Set to 0 on failure)
#       All other registers preserved, direction flag cleared on return
#
# os_MemoryMap is read as one byte per 2 MiB page: this routine treats the
# value 1 as "free" and writes 2 to mark a page as allocated. The map is
# searched backward, starting at the entry for the last page implied by
# os_MemAmount, so the run that is found is the highest one available.
# Page 0 is never handed out because its address (0) is the failure value.
os_mem_allocate:
  push rsi
  push rdx
  push rbx

  test rcx, rcx
  jz os_mem_allocate_fail   # At least 1 page must be allocated

  mov rdx, os_MemoryMap     # RDX = address of the map entry for page 0
  mov eax, [os_MemAmount]   # Total memory in MiB (32-bit load zero-extends into RAX)
  shr eax, 1                # MiB -> number of 2 MiB pages
  lea rsi, [rdx + rax - 1]  # RSI = map entry of the last page
  std                       # lodsb must walk the map backward

os_mem_allocate_start:      # (Re)start looking for a run of RCX free pages
  mov rbx, rcx              # RBX = pages still missing from the current run

os_mem_allocate_nextpage:
  # Bounds check BEFORE reading: stop once only page 0 (or nothing at all,
  # when the machine has less than 4 MiB) is left. Checking first means the
  # entry for page 1 is evaluated instead of being read and thrown away, and
  # the scan can never run off the front of the map.
  cmp rsi, rdx
  jbe os_mem_allocate_fail

  lodsb                     # AL = state of this page, RSI steps down one entry
  cmp al, 1
  jne os_mem_allocate_start # Page is taken, start counting from scratch

  dec rbx                   # One more free page in the run. Any left to find?
  jnz os_mem_allocate_nextpage

os_mem_allocate_mark:       # RSI is now one entry below the lowest page of the run
  cld                       # stosb must walk the map forward

  xchg rdi, rsi             # RDI = map cursor, RSI = caller's RDI (parked until restored)
  push rcx                  # 'rep stosb' consumes RCX
  inc rdi                   # Step back up to the first (lowest) page of the run
  mov rbx, rdi              # Remember where the run starts
  mov al, 2                 # 2 = page in use
  rep stosb                 # Mark all RCX pages of the run as allocated
  pop rcx                   # Restore RCX
  mov rdi, rsi              # Restore caller's RDI (RSI itself is restored by the pop below)

  sub rbx, rdx              # Map address -> page number
  shl rbx, 21               # Page number * 2 MiB = starting memory address
  mov rax, rbx              # Return the starting address in RAX
  jmp os_mem_allocate_end

os_mem_allocate_fail:
  cld                       # Leave the direction flag forward, as on the success path
  xor rax, rax              # Failure so set RAX to 0 (No pages allocated)

os_mem_allocate_end:
  pop rbx
  pop rdx
  pop rsi
  ret
# -----------------------------------------------------------------------------


# -----------------------------------------------------------------------------
# os_mem_release -- Frees the requested number of 2 MiB pages
#  IN:  RAX = Starting address
#       RCX = Number of pages to free
# OUT:  RCX = Number of pages freed (the value passed in; it is saved and restored)
#       All other registers preserved
#
# Writes the value 1 into RCX consecutive os_MemoryMap entries, starting at
# the entry for the page that contains RAX. No range checking is performed.
# NOTE(review): 'rep stosb' only walks upward when the direction flag is
# clear; this routine does not set it, so callers are presumably expected to
# enter with DF = 0 -- confirm.
os_mem_release:
  push rdi
  push rcx
  push rax

  mov rdi, rax              # Work on a copy of the starting address
  shr rdi, 21               # Address / 2 MiB = starting page number
  add rdi, os_MemoryMap     # RDI = map entry of the first page to free
  mov al, 1                 # Value stored for every released page
  rep stosb                 # Store AL into RCX consecutive map entries

  pop rax
  pop rcx
  pop rdi
  ret
# -----------------------------------------------------------------------------


# -----------------------------------------------------------------------------
# os_mem_get_free -- Returns the number of 2 MiB pages that are available
#  IN:  Nothing
# OUT:  RCX = Number of free 2 MiB pages
#       All other registers preserved
#
# Counts the os_MemoryMap entries whose value is 1. All 65536 entries are
# examined, including the last one: each entry is tested before the loop
# counter is checked, so the final entry is no longer skipped.
# The map is indexed explicitly, so the result does not depend on the
# direction flag.
os_mem_get_free:
  push rsi
  push rbx

  mov rsi, os_MemoryMap
  xor ecx, ecx              # RCX = index of the entry being examined
  xor ebx, ebx              # RBX = free pages found so far

os_mem_get_free_next:
  cmp byte [rsi + rcx], 1   # Does this entry hold 1?
  jne os_mem_get_free_skip
  inc rbx                   # Yes, count it

os_mem_get_free_skip:
  inc rcx
  cmp rcx, 65536            # 65536 = number of map entries scanned
  jne os_mem_get_free_next

os_mem_get_free_end:
  mov rcx, rbx              # Return the count in RCX

  pop rbx
  pop rsi
  ret
# -----------------------------------------------------------------------------


# -----------------------------------------------------------------------------
# os_mem_copy -- Copy a number of bytes
#  IN:  RSI = Source address
#       RDI = Destination address
#       RCX = Number of bytes to copy
# OUT:  Nothing, all registers preserved
#
# The copy is done one byte at a time with 'rep movsb'. The three registers
# that instruction advances are saved first and restored afterwards.
# NOTE(review): the copy direction follows the direction flag, which is not
# set here; callers are presumably expected to enter with DF = 0 -- confirm.
os_mem_copy:
  push rcx                  # Byte count
  push rsi                  # Source pointer
  push rdi                  # Destination pointer

  rep movsb                 # Byte-wise copy (marked upstream as a candidate for optimization)

  pop rdi
  pop rsi
  pop rcx
  ret
# -----------------------------------------------------------------------------


# =============================================================================
# EOF

Also note, I have read many resources on creating hashtables in C, one of which I have copied here (which has the C code, and corresponding assembly). However, pretty much all of the C examples use malloc, which I want to avoid. I am trying to learn assembly without depending on C at all.

Also, this resource from Quora was helpful in pointing to the places in the malloc.c source code where brk and mmap are used. However, I haven't studied that yet because of discovering the BareMetal-OS memory.asm code, which seems to allocate memory without even using those syscalls. Hence the question, how are they doing that? Can you explain the relevant instructions in their assembly that perform the memory allocation?

Update

This book helps explain pretty much everything about the internals of memory below mmap and brk, it's all in the area of implementing operating systems. http://www.amazon.com/Modern-Operating-Systems-4th-Edition/dp/013359162X

How do you think **any** operating system *implements* those system calls? — Elliott Frisch, Jan 01 '15 at 00:14
Interesting: an operating system named BareMetal is an oxymoron. "Bare metal" means running without an operating system. When you are running bare metal, or writing your own operating system, ALL of the RAM/memory is yours: you are the operating system, the manager of those resources, and you get to choose how you are going to allocate them. If you don't manage them, they simply sit there unused; nobody owns them... — old_timer, Jan 01 '15 at 00:20

user3386109 · Answer 1 · 2015-01-01T01:36:32.663

5

In order to manage memory, your code needs to "own" some memory. The problem is that on any machine that has an operating system, the operating system owns all of the memory. So your code has to ask the operating system for some memory, which it can do with brk, or mmap, or malloc.

So for example, if you want to write a memory manager in assembly, and you have a machine with 4GB of memory, it would not be unreasonable to request 1GB of memory from malloc at the start of the program, and then manage that memory any way you like.

The assembly code from BareMetal-OS really doesn't apply to your situation, because BareMetal is the operating system, and therefore doesn't need to ask anyone for memory. It already owns all of the memory, and can manage it any way it likes.

edited Jan 01 '15 at 01:36

answered Jan 01 '15 at 00:34

user3386109

34,287
7
49
68

This is helpful, basically I need to get a deeper understanding of how dynamic memory allocation is implemented ([this SO question helps](http://stackoverflow.com/questions/79923/what-and-where-are-the-stack-and-heap)). However, this doesn't really answer the question. Are you saying that the BareMetalOS code is nonsense and won't work? If not, then what is the gist of the instructions it is using to do the memory allocation/management (and how does that work, looking for a brief explanation of the more advanced instructions and the role they play in memory allocation/management)? – Lance Jan 01 '15 at 01:38
2

@LancePollard It's not nonsense, but it is not applicable to writing a memory allocator that runs in userspace on top of an operating system, as that is a very, very different environment. See e.g. [this](http://www.ibm.com/developerworks/library/l-memory/) on how to write a simple user-space memory allocator. (It's C code, which you can translate to assembly if you need to.) Regardless of you using assembly or C, you will need to ask the operating system for memory (e.g. by calling the sbrk() system call), and once you got that memory, you can allocate your own chunks from it. – nos Jan 01 '15 at 02:01

score 2 · Answer 2 · answered Jan 01 '15 at 04:19

Following on from other comments and answers, the reason BareMetal-OS can implement allocation in this manner is that it relies on several additional function calls that are not present in the code posted, nor provided by general-purpose assemblers such as NASM. Specifically, the calls relied on in the posted code are:

os_MemoryMap
os_MemAmount

They are either BareMetal-OS-specific calls or, likely, calls specific to some memory manager used by the person posting the code. Without some external library (e.g. libc or a memory-manager library), you are limited to the brk system call (syscall number 45 on 32-bit x86 Linux and 12 on x86_64 Linux). Hopefully this adds another piece to the puzzle. Good luck.

Yes, `os_MemoryMap` is defined here: [os/sysvar.asm#L52](https://github.com/ReturnInfinity/BareMetal-OS/blob/b9d6919962202466f71130f46e961c57b7325347/os/sysvar.asm#L52). It's just a number tho, so it still doesn't make sense to me yet _how_ this is allocating/managing memory. Hoping to hear more about how this works, thanks tho, this does add to the puzzle. Spending more time learning about memory management too. — Lance, Jan 01 '15 at 04:26

score 2 · Accepted Answer · answered Jan 01 '15 at 09:39

This post explains the assembly code for the os_mem_allocate function. The basic idea is that memory is allocated in 2MB chunks. There's an array of 65536 bytes (os_MemoryMap) that keeps track of which chunks are free and which are used. A value of 1 is a free chunk, a value of 2 is a used chunk. The total amount of memory that could be managed is 64K * 2MB = 128GB. Since most machines don't have that much memory there's another variable (os_MemAmount) that indicates the memory size of the machine (in MB).

The input to the os_mem_allocate function is a count, i.e. how many 2MB chunks to allocate. The function is designed to only allocate contiguous chunks. For example, if the input request is 3, then the function attempts to allocate 6MB of memory, and does this by searching the array for three 1's in a row. The return value from the function is a pointer to the allocated memory, or 0 if the request could not be satisfied.

The input count is passed in rcx. The code verifies that the request is for a non-zero number of chunks. An input of 0 results in a return value of 0.

os_mem_allocate:
    push rsi                  # save some registers 
    push rdx
    push rbx

    cmp rcx, 0                # Is the count 0?
    je os_mem_allocate_fail   # If YES, then return 0

The code does a roundabout calculation to point rsi to the last usable byte in the 65536 byte array. The last two lines of the following snippet are the most interesting. Setting the direction flag means that subsequent lodsb instructions will decrement rsi. And of course pointing rsi to the last usable byte in the array is the whole point of the calculation.

    xor rax, rax
    mov rsi, os_MemoryMap   # Get the address of the 65536 byte array into RSI
    mov eax, [os_MemAmount] # Get the memory size in MB into EAX
    mov rdx, rsi            # Keep os_MemoryMap in RDX for later use        
    shr eax, 1              # Divide by 2 because os_MemAmount is in MB, but chunks are 2MB

    sub rsi, 1              # in C syntax, we're calculating &array[amount/2-1], which is the address of the last usable byte in the array
    std                     # Set direction flag to backward
    add rsi, rax            # RSI now points to the last byte

Next the code has a loop that searches for N contiguous free chunks, where N is the count that was passed to the function in rcx. The loop scans backwards through the array looking for N 1's in a row. The loop succeeds if rbx reaches 0. Any time the loop finds a 2 in the array, it resets rbx back to N.

os_mem_allocate_start:       
    mov rbx, rcx                 # RBX is the number of contiguous free chunks we need to find

os_mem_allocate_nextpage:
    lodsb                        # read a byte into AL, and decrement RSI
    cmp rsi, rdx                 # if RSI has reached the beginning of the array
    je os_mem_allocate_fail      # then the loop has failed

    cmp al, 1                    # Is the chunk free?
    jne os_mem_allocate_start    # If NO, we need to restart the count

    dec rbx                      # If YES, decrement the count 
    jnz os_mem_allocate_nextpage # If the count reaches zero we've succeeded, otherwise continue looping

At this point the code has found enough contiguous chunks to satisfy the request, so now it marks all of the chunks as "used" by setting the bytes in the array to 2. The direction flag is set to forward so that subsequent stosb instructions will increment rdi.

os_mem_allocate_mark:      # We have a suitable free series of chunks, mark them as used
    cld                    # Set direction flag to forward

    xor rdi, rsi           # We swap RDI and RSI to keep RDI contents, but
    xor rsi, rdi           # more importantly we want RDI to point to the     
    xor rdi, rsi           # location in the array where we want to write 2's

    push rcx               # Save RCX since 'rep stosb' will modify it
    add rdi, 1             # the previous loop decremented RSI too many times
    mov al, 2              # the value 2 indicates a "used" chunk
    mov rbx, rdi           # RBX is going to be used to calculate the return value
    rep stosb              # store some 2's in the array, using the count in RCX
    mov rdi, rsi           # Restoring RDI

Finally, the function needs to come up with a pointer to return to the caller.

    sub rbx, rdx           # RBX is now an index into the 65536 byte array
    pop rcx                # Restore RCX
    shl rbx, 21            # Multiply by 2MB to convert the index to a pointer
    mov rax, rbx           # Return the pointer in RAX
    jmp os_mem_allocate_end

The next snippet handles errors by setting the return value to 0. Clearing the direction flag is important since by convention the direction is forward.

os_mem_allocate_fail:
    cld               # Set direction flag to forward
    xor rax, rax      # Failure so set RAX to 0 (No pages allocated)

Finally, restore the registers and return the pointer.

os_mem_allocate_end:
   pop rbx
   pop rdx
   pop rsi
   ret

Ah this is so great! Thank you! After reading your walkthrough I feel like I have skipped many months of banging my head against the wall :). Ok so you say "pointing `rsi` to the last usable byte in the array is the whole point of the calculation". So since BareMetalOS _is_ an OS, it already _owns_ all the memory (like you were saying in the other answer). Given that, the 128GB just says the total amount available, and all this is doing is flagging chunks as "used" with `2` (I guess in some sort of lookup table, don't see it totally), so he just iterates that to find the next "unused" chunk? — Lance, Jan 01 '15 at 17:06
So basically, given you allocated xGB of memory at the start of your program with `mmap` or `brk`, you could then "manage" the memory (i.e. search for unused chunks, and point `rsi` to the last usable byte) in pretty much the same way as here? Or am I still missing something? — Lance, Jan 01 '15 at 17:07
Following the trail of `mmap` in the linux code led to this: https://github.com/torvalds/linux/blob/9a3c4145af32125c5ee39c0272662b47307a8323/arch/x86/include/asm/paravirt_types.h#L581 Now wanting to know how the OS works, there's gotta be a way to go below `mmap` even if it requires building a new OS.. — Lance, Jan 01 '15 at 17:27

How is BareMetalOS allocating memory in Assembly without malloc, brk, or mmap?

3 Answers