Mapping non-contiguous blocks from a file into contiguous memory addresses

Question

I am interested in the prospect of using memory mapped IO, preferably exploiting the facilities in boost::interprocess for cross-platform support, to map non-contiguous system-page-size blocks in a file into a contiguous address space in memory.

A simplified concrete scenario:

I've a number of 'plain-old-data' structures, each of a fixed length (less than the system page size.) These structures are concatenated into a (very long) stream with the type & location of structures determined by the values of those structures that precede them in the stream. I'm aiming to minimize latency and maximize throughput in a demanding concurrent environment.

I can read this data very effectively by memory-mapping it in blocks of at least twice the system-page-size... and establishing a new mapping immediately having read a structure extending beyond the penultimate system-page-boundary. This allows the code that interacts with the plain-old-data structures to be blissfully unaware that these structures are memory mapped... and, for example, could compare two different structures using memcmp() directly without having to care about page boundaries.

Where things get interesting is with respect to updating these data streams... while they're being (concurrently) read. The strategy I'd like to use is inspired by 'Copy On Write' on a system-page-size granularity... essentially writing 'overlay-pages' - allowing one process to read the old data while another reads the updated data.

While managing which overlay pages to use, and when, isn't necessarily trivial... that's not my main concern. My main concern is that I may have a structure spanning pages 4 and 5, then update a structure wholly contained in page 5... writing the new page in location 6... leaving page 5 to be 'garbage collected' when it is determined to be no-longer reachable. This means that, if I map page 4 into location M, I need to map page 6 into memory location M+page_size... in order to be able to reliably process structures that cross page boundaries using existing (non-memory-mapping-aware) functions.

I'm trying to establish the best strategy, and I'm hampered by documentation I feel is incomplete. Essentially, I need to decouple allocation of address space from memory mapping into that address space. With mmap(), I'm aware that I can use MAP_FIXED - if I wish to explicitly control the mapping location... but I'm unclear how I should reserve address space in order to do this safely. Can I map /dev/zero for two pages without MAP_FIXED, then use MAP_FIXED twice to map two pages into that allocated space at explicit VM addresses? If so, should I call munmap() three times too? Will it leak resources and/or have any other untoward overhead? To make the issue even more complex, I'd like comparable behaviour on Windows... is there any way to do this? Are there neat solutions if I were to compromise my cross-platform ambitions?

--

Thanks for your answer, Mahmoud... I've read, and think I've understood that code... I've compiled it under Linux and it behaves as you suggest.

My main concerns are with line 62 - using MAP_FIXED. It makes some assumptions about mmap, which I've been unable to confirm when I read the documentation I can find. You're mapping the 'update' page into the same address space as mmap() returned initially - I assume that this is 'correct' - i.e. not something that just happens to work on Linux? I'd also need to assume that it works cross-platform for file-mappings as well as anonymous mappings.

The sample definitely moves me forwards... documenting that what I ultimately need is probably achievable with mmap() on Linux - at least. What I'd really like is a pointer to documentation that shows that the MAP_FIXED line will work as the sample demonstrates... and, ideally, a transformation from the Linux/Unix specific mmap() to a platform independent (Boost::interprocess) approach.

Mahmoud Al-Qudsi · Accepted Answer · 2012-05-13T17:11:07.133

Your question is a little confusing. From what I understood, this code will do what you need:

#define PAGESIZE 4096

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <errno.h>
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>
#include <assert.h>

//Fixed-size plain-old-data record used by the demo.
//Instances are copied byte-for-byte into the memory-mapped file, so the layout must not change.
struct StoredObject
{
    //Numeric payload
    int IntVal;
    //NUL-terminated text; the demo strings are 24 characters plus the terminator, filling the array exactly
    char StrVal[25];
};

//Demonstrates a page-granular, copy-on-write style overlay built from mmap():
//  1. map two pages of a file and store two records, the first straddling the page boundary;
//  2. create a second ("shadow") mapping of the same two pages;
//  3. replace the shadow's second page with an anonymous page via MAP_FIXED and copy the old data into it;
//  4. modify a record through the shadow without changing what readers of the first mapping see.
//Returns 0 on success and EXIT_FAILURE if any system call fails.
int main(int argc, char **argv)
{
    (void) argc;
    (void) argv;

    //Ask the OS for the real page size instead of assuming 4096:
    //MAP_FIXED needs a page-aligned address, so a wrong guess makes the overlay fail
    const long sysPageSize = sysconf(_SC_PAGESIZE);
    if (sysPageSize <= 0)
    {
        fprintf(stderr, "Could not determine the system page size.\n");
        return EXIT_FAILURE;
    }
    const size_t pageSize = (size_t) sysPageSize;
    const size_t mapSize = pageSize * 2;

    int fd = open("mmapfile", O_RDWR | O_CREAT | O_TRUNC, (mode_t) 0600);
    if (fd == -1)
    {
        perror("open");
        return EXIT_FAILURE;
    }

    //Set the file to the size of our data (2 pages); the new bytes read back as zeros
    if (ftruncate(fd, (off_t) mapSize) == -1)
    {
        perror("ftruncate");
        close(fd);
        return EXIT_FAILURE;
    }

    void *rawMap = mmap(0, mapSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (rawMap == MAP_FAILED)
    {
        perror("mmap (initial mapping)");
        close(fd);
        return EXIT_FAILURE;
    }
    unsigned char *mapPtr = (unsigned char *) rawMap;

    //Zero the whole object first so that padding bytes written to the file are deterministic
    struct StoredObject controlObject;
    memset(&controlObject, 0, sizeof(struct StoredObject));
    controlObject.IntVal = 12;
    strcpy(controlObject.StrVal, "Mary had a little lamb.\n");

    struct StoredObject *mary1;
    mary1 = (struct StoredObject *)(mapPtr + pageSize - 4); //Will fall on the boundary between first and second page
    memcpy(mary1, &controlObject, sizeof(struct StoredObject));

    printf("%d, %s", mary1->IntVal, mary1->StrVal);
    //Should print "12, Mary had a little lamb.\n"

    struct StoredObject *john1;
    john1 = mary1 + 1; //Comes immediately after mary1 in memory; will start and end in the second page
    memcpy(john1, &controlObject, sizeof(struct StoredObject));

    john1->IntVal = 42;
    strcpy(john1->StrVal, "John had a little lamb.\n");

    printf("%d, %s", john1->IntVal, john1->StrVal);
    //Should print "42, John had a little lamb.\n"

    //Make sure the data's on the disk, as this is the initial, "read-only" data
    if (msync(mapPtr, mapSize, MS_SYNC) == -1)
    {
        perror("msync");
        munmap(mapPtr, mapSize);
        close(fd);
        return EXIT_FAILURE;
    }

    //This is the initial data set, now in memory, loaded across two pages
    //At this point, someone could be reading from there. We don't know or care.
    //We want to modify john1, but don't want to write over the existing data

    //This is the shadow map. COW-like optimization will take place:
    //we'll map the entire address space from the shared source, then overlap with a new map to modify
    //This is mapped anywhere, letting the system decide what address we'll be using for the new data pointer
    void *rawShadow = mmap(0, mapSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (rawShadow == MAP_FAILED)
    {
        perror("mmap (shadow mapping)");
        munmap(mapPtr, mapSize);
        close(fd);
        return EXIT_FAILURE;
    }
    unsigned char *mapPtr2 = (unsigned char *) rawShadow;

    //Map an anonymous page on top of the second page of the shadow mapping; this is the one that we're modifying.
    //It is *not* backed by disk. Anonymous mappings take fd -1 for portability.
    //MAP_FIXED is safe here only because the target range lies inside a mapping we own.
    void *rawOverlay = mmap(mapPtr2 + pageSize, pageSize, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED | MAP_ANON, -1, 0);
    if (rawOverlay == MAP_FAILED)
    {
        fprintf(stderr, "Fixed map failed. %s\n", strerror(errno));
        munmap(mapPtr2, mapSize);
        munmap(mapPtr, mapSize);
        close(fd);
        return EXIT_FAILURE;
    }
    assert((unsigned char *) rawOverlay == mapPtr2 + pageSize);

    //Make a copy of the old data that will later be changed
    memcpy(mapPtr2 + pageSize, mapPtr + pageSize, pageSize);

    //The two address spaces should still be identical until this point
    assert(memcmp(mapPtr, mapPtr2, mapSize) == 0);

    //We can now make our changes to the second page as needed.
    //The shadow pointers sit at the same offsets as the originals, relative to the shadow base.
    struct StoredObject *mary2 = (struct StoredObject *)(((unsigned char *)mary1 - mapPtr) + mapPtr2);
    struct StoredObject *john2 = (struct StoredObject *)(((unsigned char *)john1 - mapPtr) + mapPtr2);

    john2->IntVal = 52;
    strcpy(john2->StrVal, "Mike had a little lamb.\n");

    //Test that everything worked OK
    assert(memcmp(mary1, mary2, sizeof(struct StoredObject)) == 0);
    (void) mary2; //Only read by the assert above, which is compiled out under NDEBUG
    printf("%d, %s", john2->IntVal, john2->StrVal);
    //Should print "52, Mike had a little lamb.\n"

    //Now assume our garbage collection routine has detected that no one is using the original copy of the data
    munmap(mapPtr, mapSize);

    //The shadow mapping becomes the current view
    mapPtr = mapPtr2;

    //Now we're done with all our work and want to completely clean up.
    //A single munmap over the whole range releases both the file-backed page and the anonymous overlay page.
    munmap(mapPtr, mapSize);

    close(fd);

    return 0;
}

My modified answer should address your safety concerns. Only use MAP_FIXED on the second mmap call (like I have above). The cool thing about MAP_FIXED is that it lets you overwrite an existing mmap address section. It'll unload the range you're overlapping and replace it with your new mapped content:

 MAP_FIXED
              [...] If the memory
              region specified by addr and len overlaps pages of any existing
              mapping(s), then the overlapped part of the existing mapping(s) will be
              discarded. [...]

This way, you let the OS take care of finding a contiguous memory block of hundreds of megs for you (never pass MAP_FIXED with an address range unless you know for sure it is yours to overwrite - anything already mapped there is silently discarded). Then you call mmap() with MAP_FIXED on a subsection of that now-mapped huge space with the data that you will be modifying. Tada.

On Windows, something like this should work (I'm on a Mac at the moment, so untested):

//Windows port of the overlay demo: reserve two pages of address space, then map file views into them.
//NOTE(review): the author marked this as untested, and later comments in this thread report that
//MapViewOfFileEx() rejects an address that is still reserved by VirtualAlloc() - treat as a sketch only.
//NOTE(review): relies on PAGESIZE, struct StoredObject and the Win32/CRT headers being declared elsewhere.
//NOTE(review): no API return value is checked before the fixed-address mapping step.
int main(int argc, char **argv)
{
    HANDLE hFile = CreateFile(L"mmapfile", GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
    //Set the file to the size of our data (2 pages) by writing a single byte at the last offset
    SetFilePointer(hFile, PAGESIZE*2 - 1, 0, FILE_BEGIN);
    DWORD bytesWritten = -1;
    WriteFile(hFile, "", 1, &bytesWritten, NULL);

    //First mapping: both pages of the file, at an address chosen by the system
    HANDLE hMap = CreateFileMapping(hFile, NULL, PAGE_READWRITE, 0, PAGESIZE * 2, NULL);
    unsigned char *mapPtr = (unsigned char *) MapViewOfFile(hMap, FILE_MAP_READ | FILE_MAP_WRITE, 0, 0, PAGESIZE * 2);

    struct StoredObject controlObject;
    controlObject.IntVal = 12;
    strcpy(controlObject.StrVal, "Mary had a little lamb.\n");

    struct StoredObject *mary1;
    mary1 = (struct StoredObject *)(mapPtr + PAGESIZE - 4); //Will fall on the boundary between first and second page
    //NOTE(review): sizeof(StoredObject) without the struct keyword only compiles as C++
    memcpy(mary1, &controlObject, sizeof(StoredObject));

    printf("%d, %s", mary1->IntVal, mary1->StrVal);
    //Should print "12, Mary had a little lamb.\n"

    struct StoredObject *john1;
    john1 = mary1 + 1; //Comes immediately after mary1 in memory; will start and end in the second page
    memcpy(john1, &controlObject, sizeof(StoredObject));

    john1->IntVal = 42;
    strcpy(john1->StrVal, "John had a little lamb.\n");

    printf("%d, %s", john1->IntVal, john1->StrVal);
    //Should print "42, John had a little lamb.\n"

    //Make sure the data's on the disk, as this is the initial, "read-only" data
    //NOTE(review): the POSIX call below was left commented out and nothing replaces it here;
    //FlushViewOfFile() is presumably the counterpart - confirm
    //msync(mapPtr, PAGESIZE * 2, MS_SYNC);

    //This is the initial data set, now in memory, loaded across two pages
    //At this point, someone could be reading from there. We don't know or care.
    //We want to modify john1, but don't want to write over the existing data

    //This is the shadow map: reserve two pages of contiguous address space (nothing is committed yet),
    //then try to place one single-page view at the start of the reservation
    unsigned char *reservedMem = (unsigned char *) VirtualAlloc(NULL, PAGESIZE * 2, MEM_RESERVE, PAGE_READWRITE);
    HANDLE hMap2 = CreateFileMapping(hFile, NULL, PAGE_READWRITE, 0, PAGESIZE, NULL);
    unsigned char *mapPtr2 = (unsigned char *) MapViewOfFileEx(hMap2, FILE_MAP_READ | FILE_MAP_WRITE, 0, 0, PAGESIZE, reservedMem);

    //Try to place a second single-page view directly after the first; this is the page that gets modified.
    //NOTE(review): this view also comes from hMap2 (created from hFile) at file offset 0, so unlike the
    //POSIX version it is file-backed and aliases the first page rather than being a private overlay.
    //NOTE(review): MapViewOfFileEx presumably requires a base address aligned to the allocation
    //granularity, which reservedMem + PAGESIZE is unlikely to satisfy - confirm
    unsigned char *temp = (unsigned char *) MapViewOfFileEx(hMap2, FILE_MAP_READ | FILE_MAP_WRITE, 0, 0, PAGESIZE, reservedMem + PAGESIZE);
    if (temp == NULL)
    {
        printf("Fixed map failed. 0x%x\n", GetLastError());
        return -1;
    }
    assert(temp == mapPtr2 + PAGESIZE);

    //Make a copy of the old data that will later be changed
    memcpy(mapPtr2 + PAGESIZE, mapPtr + PAGESIZE, PAGESIZE);

    //The two address spaces should still be identical until this point
    assert(memcmp(mapPtr, mapPtr2, PAGESIZE * 2) == 0);

    //We can now make our changes to the second page as needed.
    //The shadow pointers sit at the same offsets as the originals, relative to the shadow base.
    struct StoredObject *mary2 = (struct StoredObject *)(((unsigned char *)mary1 - mapPtr) + mapPtr2);
    struct StoredObject *john2 = (struct StoredObject *)(((unsigned char *)john1 - mapPtr) + mapPtr2);

    john2->IntVal = 52;
    strcpy(john2->StrVal, "Mike had a little lamb.\n");

    //Test that everything worked OK
    assert(memcmp(mary1, mary2, sizeof(struct StoredObject)) == 0);
    printf("%d, %s", john2->IntVal, john2->StrVal);
    //Should print "52, Mike had a little lamb.\n"

    //Now assume our garbage collection routine has detected that no one is using the original copy of the data
    //NOTE(review): cleanup was never ported - the POSIX calls below are commented out, so the views,
    //mapping handles, reservation and file handle are left for process exit to release
    //munmap(mapPtr, PAGESIZE * 2);

    mapPtr = mapPtr2;

    //Now we're done with all our work and want to completely clean up
    //munmap(mapPtr2, PAGESIZE * 2);

    //close(fd);

    return 0;
}

Thanks, this is helpful. Ideally, I'd like a pointer to documentation that suggests that it is safe/appropriate to use MAP_FIXED in this way - and also to establish if a cross-platform approach using boost is feasible. — aSteve, May 07 '12 at 18:40
That's what I didn't understand about your question. This is what `MAP_FIXED` is *made* for. Now the correct way of using it *if* you're talking about hundreds of MiB of address space would be to use fixed addressing for *all* mappings, otherwise letting the system choose a random location is good enough. — Mahmoud Al-Qudsi, May 07 '12 at 20:43
I knew MAP_FIXED was relevant (on Linux, MapViewOfFileEx on Windows) though I was unnerved by "Because requiring a fixed address for a mapping is less portable, the use of this option is discouraged." in my linux mmap man-page. I do expect to be mapping hundreds of MiB (at least) - but I'm unclear how best to reserve address space to allocate it myself - if not through mmap(). — aSteve, May 10 '12 at 20:11
This is C code; not C++. You should use RAII (e.g., scopeGuard) to ensure cleanup. — Jonathan Leonard, May 10 '12 at 20:59
Thanks for the update... For Unix/Linux I'm liking this approach... though it hints that I need to write my own memory mapping library - rather than use the one in Boost::interprocess. I can do that - if necessary. If I do have to write my own platform independent mapping library, I'd like it to take account of both Windows and Unix. As far as I can tell, this desirable mmap/MAP_FIXED behaviour is not mimicked by Windows' equivalent calls to MapViewOfFileEx()... though, perhaps, VirtualAlloc() with MEM_RESERVE might provide equivalent facilities? — aSteve, May 13 '12 at 12:58
Yes, what you'll do is replace the first mmap call in the second half of the code with a `VirtualAlloc(MEM_RESERVE)` which will tell the OS to find you a contiguous block of x bytes that it will set aside *but not physically allocate at this point*. Then you can `MapViewOfFileEx` on top of the *returned* address using the final `lpBaseAddress` parameter hint. While MS says this isn't guaranteed to work on its own, with `VirtualAlloc` it becomes guaranteed (they complement one another). — Mahmoud Al-Qudsi, May 13 '12 at 16:53
@MahmoudAl-Qudsi 'Sample code' can easily find its way copy/pasted into a 'library' or application code. The question is tagged as c++ so one would expect the answers to use best practices for that language. — Jonathan Leonard, May 13 '12 at 19:21
@Jonathan I'm not going to turn every answer here into a fully documented, tested, and perfectly written open source library. — Mahmoud Al-Qudsi, May 13 '12 at 20:14
@Mahmoud Libraries are meant to be called/consumed and so can be coded however you wish [assuming correctness]. Sample code, however, is meant to be read and mimicked by readers (and many of them will just copy/paste the entire block). Also, I would argue that writing in the style of C++ should be your first instinct; not second or lower. It is much easier to compose quality code the first time around rather than translating it from C. — Jonathan Leonard, May 13 '12 at 21:52
Actually, I answered it in C (and compiled it as such), as I was using Posix instead of boost to provide a solution. That said, I think requiring absolutely ship-shape answers will end up with no one attempting replying unless they have a couple of hours to spend drafting a response for each question. At the end of the day, it's not my problem if people copy and paste code blindly without looking. — Mahmoud Al-Qudsi, May 13 '12 at 22:01
While I'm still hoping to find a Boost-oriented solution, finding one seems increasingly unlikely - in the short-term, at least. Outside this domain, establishing suitable OS API calls has been the next best thing, from my perspective. — aSteve, May 14 '12 at 07:32
After a while spent looking at other problems, I returned to this one - and, realising that I need to 'roll my own' memory mapping... I tried the Windows sample above... and found it doesn't work... The address returned by VirtualAlloc() is not accepted by MapViewOfFileEx(). This MSDN page seems to suggest the approach is not valid. http://msdn.microsoft.com/en-us/library/windows/desktop/aa366763%28v=vs.85%29.aspx — aSteve, Aug 25 '12 at 11:51
@aSteve did you find a solution? At minimum you could use virtual alloc to reserve the address, free it immediately and attempt the mapping. There's a small window for a race condition there where another thread could grab that address space, in which case you can just try all the steps again until it eventually succeeds. My reading of the docs concurs with yours: "No other memory allocation can take place in the region that is used for mapping, including the use of the VirtualAlloc or VirtualAllocEx function to reserve memory." — Eloff, Jul 08 '15 at 18:27
I didn't find a satisfactory solution, with which I was happy... I came closer under Linux than Windows. What I learned from this research influenced my overall design strategy - making this issue one of only academic interest, to me, today. — aSteve, Jul 11 '15 at 15:13

score 2 · Answer 2 · answered May 13 '12 at 04:55

2

but I'm unclear how I should reserve address space in order to do this safely

That's going to vary by OS, but a little digging on msdn for mmap (I started with "xp mmap" on the msdn search) shows Microsoft have their usual VerboseAndHelpfullyCapitalizedNames for (the many) functions that implement pieces of mmap. Both the file- and anonymous- mappers can handle fixed-address requests just the same as any POSIX-2001 system can, i.e. if something else in your address space is talking to the kernel, you get to sort it out. No way I'm going to touch "safely", there's no such thing as "safely" with code you're wanting to port to unspecified platforms. You're going to have to build your own pool of pre-mapped anonymous memory that you can unmap and parcel out later under your own control.

answered May 13 '12 at 04:55

jthill

55,082
5
77
137

I've found VirtualAlloc() with MEM_RESERVE on Windows - though I've not experimented with it extensively. Any concrete documentation detailing how to reserve memory in order to map to explicit addresses would be helpful. I concur that the big problem here is safety and portability. This is what has drawn me to Boost::interprocess - which has an object that allows me to specify explicit addresses... but doesn't document how to establish valid explicit addresses... and the 'overlay mappings' strategy from Mahmoud doesn't work - either on Linux or Windows. – aSteve May 13 '12 at 09:22
(Clarification: Mahmoud's strategy works as per his sample - but the technique doesn't work when using only the portable boost::interprocess library.) – aSteve May 13 '12 at 10:22
@aSteve "but doesn't document how to establish valid explicit addresses" ... that's because it depends on what else is going on in your address space, whether there are any administrative limits on address space size (`ulimit`s), ... Start simple. mmap a huge swathe MAP_ANONYMOUS|MAP_NORESERVE and then when you want a piece of it, unmap the whole thing, MAP_FIXED the piece you want and re-M_A|M_N the rest. – jthill May 13 '12 at 16:59

score 0 · Answer 3 · answered Jul 14 '15 at 13:41

I tested the windows code from @Mahmoud, well actually I tested the following similar code, and it doesn't work (the Linux code works.) If you uncomment VirtualFree, it will work. As mentioned in my comment above, on windows you can reserve the address space with VirtualAlloc, but you can't use MapViewOfFileEx with an already mapped address, so you need to VirtualFree it first. Then there's a race condition where another thread can grab the memory address before you do, so you have to do everything in a loop, e.g. try up to 1000 times and then give up.

package main

import (
    "fmt"
    "os"
    "syscall"
)

// main reproduces the Windows behaviour discussed in the surrounding answer.
// It allocates a region with VirtualAlloc, creates a file mapping over
// foo.dat, and then asks MapViewOfFileEx to place a view at the allocated
// address. The virtualFree call is deliberately left commented out: per the
// accompanying text the mapping step fails until the region is released first.
//
// Every failure panics, since this is a throwaway experiment rather than
// library code.
func main() {
    const size = 1024 * 1024

    file, err := os.Create("foo.dat")
    if err != nil {
        panic(err)
    }
    defer file.Close()

    if err := file.Truncate(size); err != nil {
        panic(err)
    }

    const MEM_COMMIT = 0x1000

    addr, err := virtualAlloc(0, size, MEM_COMMIT, protReadWrite)
    if err != nil {
        panic(err)
    }

    fd, err := syscall.CreateFileMapping(
        syscall.Handle(file.Fd()),
        nil,
        uint32(protReadWrite),
        0,
        uint32(size),
        nil,
    )
    // Previously this error was overwritten by the next call and never seen.
    if err != nil {
        panic(err)
    }
    defer syscall.CloseHandle(fd)

    //if err := virtualFree(addr); err != nil {
    //  panic(err)
    //}

    base, err := mapViewOfFileEx(fd, syscall.FILE_MAP_READ|syscall.FILE_MAP_WRITE, 0, 0, size, addr)
    // Check err first so the underlying Windows error is reported when available.
    if err != nil {
        panic(err)
    }
    if base == 0 {
        panic("mapViewOfFileEx returned 0")
    }

    fmt.Println("success!")
}

// memProtect is a protection/access flag value handed to the raw kernel32
// wrappers in this file (virtualAlloc and mapViewOfFileEx).
type memProtect uint32

// Protection flag values; main passes protReadWrite to virtualAlloc and
// CreateFileMapping.
// NOTE(review): the values look like the Win32 PAGE_* constants
// (PAGE_READONLY 0x02, PAGE_READWRITE 0x04, PAGE_EXECUTE_READWRITE 0x40), but
// 0x20 would then be PAGE_EXECUTE_READ rather than PAGE_EXECUTE (0x10) -
// confirm the intent of protExecute before relying on it.
const (
    protReadOnly  memProtect = 0x02
    protReadWrite memProtect = 0x04
    protExecute   memProtect = 0x20
    protAll       memProtect = 0x40
)

// kernel32.dll and the procedures called through raw syscalls by the wrappers
// in this file. The Must* variants panic if the DLL or a symbol cannot be
// found, so any such failure surfaces during package initialisation.
// modkernel32 must be initialised before the proc* values, which are looked
// up through it.
// NOTE(review): procVirtualProtect is not referenced by any code visible here.
var (
    modkernel32         = syscall.MustLoadDLL("kernel32.dll")
    procMapViewOfFileEx = modkernel32.MustFindProc("MapViewOfFileEx")
    procVirtualAlloc    = modkernel32.MustFindProc("VirtualAlloc")
    procVirtualFree     = modkernel32.MustFindProc("VirtualFree")
    procVirtualProtect  = modkernel32.MustFindProc("VirtualProtect")
)

// mapViewOfFileEx calls kernel32's MapViewOfFileEx to map a view of the file
// mapping object handle at the requested base address.
//
// handle is the file-mapping handle, prot the desired access flags,
// offsetHigh/offsetLow the 64-bit file offset split into two halves, length
// the number of bytes to map, and target the requested base address.
//
// It returns the base address of the view. A zero result from the system call
// is the failure signal: the function then returns 0 together with the
// reported Windows error, or syscall.EINVAL when no error code was reported.
func mapViewOfFileEx(handle syscall.Handle, prot memProtect, offsetHigh uint32, offsetLow uint32, length uintptr, target uintptr) (addr uintptr, err error) {
    r0, _, e1 := syscall.Syscall6(procMapViewOfFileEx.Addr(), 6, uintptr(handle), uintptr(prot), uintptr(offsetHigh), uintptr(offsetLow), length, target)
    if r0 != 0 {
        return r0, nil
    }
    // The previous version computed the error and then returned nil anyway,
    // so callers could never see why the mapping failed.
    if e1 != 0 {
        return 0, e1
    }
    return 0, syscall.EINVAL
}

// virtualAlloc calls kernel32's VirtualAlloc to reserve and/or commit size
// bytes of address space.
//
// addr is the requested base address (0 lets the system choose), allocType
// the allocation type flags (for example MEM_COMMIT), and prot the page
// protection.
//
// It returns the base address of the allocated region. Success is judged by
// the returned address rather than by the last-error value, because a zero
// address is the documented failure signal. On failure it returns 0 with the
// reported Windows error, or syscall.EINVAL when no error code was reported.
func virtualAlloc(addr, size uintptr, allocType uint32, prot memProtect) (mem uintptr, err error) {
    r0, _, e1 := syscall.Syscall6(procVirtualAlloc.Addr(), 4, addr, size, uintptr(allocType), uintptr(prot), 0, 0)
    if r0 != 0 {
        return r0, nil
    }
    if e1 != 0 {
        return 0, e1
    }
    // Never hand back (0, nil): callers would treat address 0 as valid.
    return 0, syscall.EINVAL
}

// virtualFree calls kernel32's VirtualFree with MEM_RELEASE to release the
// whole region that starts at addr. The size argument is passed as 0, as
// MEM_RELEASE requires.
//
// VirtualFree returns a BOOL, so a zero return value is treated as failure.
// The function then returns the reported Windows error, or syscall.EINVAL
// when no error code was reported. It returns nil on success.
func virtualFree(addr uintptr) error {
    const MEM_RELEASE = 0x8000
    r0, _, e1 := syscall.Syscall(procVirtualFree.Addr(), 3, addr, 0, MEM_RELEASE)
    if r0 != 0 {
        return nil
    }
    if e1 != 0 {
        return e1
    }
    return syscall.EINVAL
}

Mapping non-contiguous blocks from a file into contiguous memory addresses

3 Answers

Linked