0

I'm trying to share a text file between forked processes on my Ubuntu x86_64: the file will not be absurdly large, since strings will be written only if there is not already another identical string in the file; strings will be hostnames of visited websites, so I'll assume no more than 255 bytes for each hostname.

When it is a process' turn to write in shared object, it is OK; once all the processes wrote in shared object, msync should make the writing effective on the disk, but the mapped.txt file created only contain one string from arrayString, i.e. the string the last process wrote in shared object.

Here's the code:

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdlib.h>
#include <semaphore.h>
#include <string.h>

// first forked process will write "first" in file, and so on
const char *arrayString[] = {
    "first", 
    "second",
    "third"
};

int main(void) {

    int index;
    int children = 3;
    const char *filepath = "mapped.txt";
    sem_t *sem;

    sem = sem_open("semaphore", O_CREAT | O_EXCL, 0644, 1);
    sem_unlink("semaphore");
    int fd;
    fd = open(filepath, O_RDWR | O_CREAT, 0644);
    if (fd < 0) {
        perror("open:");
        return EXIT_FAILURE;
    }

    char *data;
    data = (char *)mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (data == MAP_FAILED) {
        close(fd);
        perror("mmap:");
        return EXIT_FAILURE;
    }

    for (index=0; index<children; index++) {
        if (fork() == 0) {
            sem_wait(sem);

            size_t textsize = strlen(arrayString[index])+1;

            if (ftruncate(fd, sizeof(textsize)) == -1) {
                perror("ftruncate:");
                return EXIT_FAILURE;
            }

            for (size_t i = 0; i < textsize; i++) {
                printf("%d Writing character %c at %zu\n", getpid(), arrayString[index][i], i);
                data[i] = arrayString[index][i];
            }

            printf("%d wrote ", getpid());
            for (size_t i = 0; i < textsize; i++) {
                printf("%c", data[i]);
            }
            printf("\n");

            if (msync(data, textsize, MS_SYNC) == -1) {
                perror("Could not sync the file to disk");
            }

            sem_post(sem);
            _exit(EXIT_SUCCESS);
        }
    }
    close(fd);

    return EXIT_SUCCESS;
}     

This is one possible output of the code above for three child processes (this is fine):

20373 Writing character s at 0
20373 Writing character e at 1
20373 Writing character c at 2
20373 Writing character o at 3
20373 Writing character n at 4
20373 Writing character d at 5
20373 Writing character  at 6
20373 wrote second
20374 Writing character t at 0
20374 Writing character h at 1
20374 Writing character i at 2
20374 Writing character r at 3
20374 Writing character d at 4
20374 Writing character  at 5
20374 wrote third
20372 Writing character f at 0
20372 Writing character i at 1
20372 Writing character r at 2
20372 Writing character s at 3
20372 Writing character t at 4
20372 Writing character  at 5
20372 wrote first

And here's the content of mapped.txt (this is bad):

first^@^@^@

I expected:

second
third
first

but all I get is only the string of the last process, with those strange symbols. I'd like to keep this file persistent in memory, but because of the I/O slowness, I'm trying to use memory mapping. Any idea why my file only contains the string written by the last process accessing the shared file?

Edit: I think I get it, it seems to work now: I hope it will be of help to someone. Compiled with g++ -g -o mapthis mapthis.cpp -lrt -pthread. Beware that some error checking are missing, like for fsync, snprintf and lseek.

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdlib.h>
#include <semaphore.h>
#include <time.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>

const char *arrayString[] = {
    "www.facebook.com",
    "www.google.com",
    "www.cnn.com",
    "www.speechrepository.com",
    "www.youtube.com",
    "www.facebook.com",
    "www.google.com",
    "www.cnn.com",
    "www.speechrepository.com",
    "www.youtube.com",
    "www.facebook.com",
    "www.google.com",
    "www.cnn.com",
    "www.speechrepository.com",
    "www.youtube.com"
};

int main(void) {

    int index;
    int children = sizeof(arrayString) / sizeof(char*);;
    const char *filepath = "mapped.txt";
    sem_t *sem;
    char *data;
    struct stat filestats;

    sem = sem_open("semaphore", O_CREAT | O_EXCL, 0644, 1);
    sem_unlink("semaphore");
    int fd;
    fd = open(filepath, O_RDWR | O_CREAT, 0644);
    if (fd < 0) {
        perror("open:");
        return EXIT_FAILURE;
    }

    if (fstat(fd, &filestats) < 0) {
        close(fd);
        perror("fstat:");
        return EXIT_FAILURE;
    }

    data = (char *)mmap(NULL, filestats.st_size ? filestats.st_size : 1, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (data == MAP_FAILED) {
        close(fd);
        perror("first map:");
        return EXIT_FAILURE;
    }

    for (index=0; index<children; index++) {
        sleep(1);
        pid_t pid = fork();
        if (pid == 0) {
            int nw = 0;
            int hostnameSize = 0;
            const size_t origsize = filestats.st_size;
            char *hostPos = NULL;
            char *numPos = NULL;
            char *backslashPos = NULL;
            char tempBuff[64];
            memset((char *)tempBuff, 0, sizeof(tempBuff));
            sem_wait(sem);
            // remap to current file size if it changed
            fstat(fd, &filestats);
            // file empty, just insert
            if (filestats.st_size == 0) {
                nw = snprintf(tempBuff, sizeof(tempBuff), "%s %010lu\n", arrayString[index], (unsigned long)time(NULL));
                write(fd, tempBuff, nw);
                fsync(fd);
            }
            else {
                // file not empty, let's look for string
                hostPos = strstr(data, arrayString[index]);
                if (hostPos) {
                    // string is already inserted, search for offset of number of seconds
                    lseek(fd, hostPos-data, SEEK_SET);
                    numPos = strchr(hostPos, ' ')+1;
                    backslashPos = strchr(numPos, '\n');
                    long unsigned before = atoi(numPos);
                    long unsigned now = (unsigned long)time(NULL);
                    long unsigned difference = now - before;
                    printf("%s visited %ld seconds ago (%ld - %ld)\n", 
                        arrayString[index], difference, now, before);
                    nw = snprintf(tempBuff, backslashPos-hostPos+1, "%s %010lu", arrayString[index], now);
                    write(fd, tempBuff, nw);
                    write(fd, "\n", 1);
                    fsync(fd);
                }
                else {
                    data = (char *)mremap(data, origsize, filestats.st_size, MREMAP_MAYMOVE);
                    if (data == MAP_FAILED) {
                        close(fd);
                        sem_post(sem);
                        perror("mmap:");
                        _exit(EXIT_FAILURE);
                    }
                    lseek(fd, 0, SEEK_END);
                    nw = snprintf(tempBuff, sizeof(tempBuff), "%s %010lu\n", arrayString[index], (unsigned long)time(NULL));
                    write(fd, tempBuff, nw);
                    fsync(fd);
                }
            }
            munmap(data, filestats.st_size);
            close(fd);
            sem_post(sem);
            _exit(EXIT_SUCCESS);
        }
        else if (pid > 0) {
            wait(NULL);
        }
    }
    munmap(data, filestats.st_size);
    close(fd);

    return EXIT_SUCCESS;
}     
Alexis Wilke
  • 19,179
  • 10
  • 84
  • 156
elmazzun
  • 1,066
  • 2
  • 18
  • 44

2 Answers2

1

This line is problematic:

if (ftruncate(fd, sizeof(textsize)) == -1) {

textsize is a size_t, and taking its sizeof is just going to get 4 or 8 (on 32 and 64 bit systems). Looks like you're on a 64 bit system, so you're unconditionally truncating the file to 8 bytes in this case before every write. The "strange symbols" are just how your editor displays NUL/zero bytes. Even if you used ftruncate(fd, textsize), you'd still truncate down to just the string you're about to write, overwriting any data other children may have written; I doubt you want to ftruncate at all here.

For continual appends from separate processes (where they can't share information about the size or offset of the data they're adding), memory mapping just doesn't make sense; why aren't you just having each of them take the lock, lseek to end of file, then call write? You could still use memory mappings for the duplicate checking (some of it without locking), it would just be a bit different. Something like this:

int main(void) {
    struct stat filestats;
    int index;
    int children = 3;
    const char *filepath = "mapped.txt";
    sem_t *sem;
    char *data;

    sem = sem_open("semaphore", O_CREAT | O_EXCL, 0644, 1);
    sem_unlink("semaphore");
    int fd;
    fd = open(filepath, O_RDWR | O_CREAT, 0644);
    if (fd < 0) {
        perror("open:");
        return EXIT_FAILURE;
    }

    // Mostly just to ensure it's mappable, we map the current size of the file
    // If the file might already have values, and many child workers won't add
    // to it, this might save some mapping work in the children; you could
    // just map in the children when needed though
    if (fstat(fd, &filestats) != 0) {
        close(fd);
        perror("fstat:");
        return EXIT_FAILURE;
    }
    data = mmap(NULL, filestats.st_size, PROT_READ, MAP_SHARED, fd, 0);
    if (data == MAP_FAILED) {
        close(fd);
        perror("mmap:");
        return EXIT_FAILURE;
    }

    for (index=0; index<children; index++) {
        if (fork() == 0) {
            const size_t origsize = filestats.st_size;
            sem_wait(sem);

            // remap to current file size if it changed
            // If you're not on Linux, you'd just have to mmap from scratch
            // since mremap isn't standard
            fstat(fd, &filestats);
            if (origsize != filestats.st_size) {
                data = mremap(data, origsize, filestats.st_size, MREMAP_MAYMOVE);
                if (data == MAP_FAILED) {
                    close(fd);
                    sem_post(sem);
                    perror("mmap:");
                    _exit(EXIT_FAILURE);
                }
            }

            // Not safe to use strstr since mapping might not end with NUL byte
            // You'd need to workaround this, or implement a your own memstr-like function
            if (!memstr(data, arrayString[index])) {
                // Move fd to end of file, so we append new data
                lseek(fd, 0, SEEK_END);
                write(fd, arrayString[index], strlen(arrayString[index]));
                write(fd, "\n", 1);
                fsync(fd);
            }
            munmap(data, filestats.st_size);
            close(fd);
            sem_post(sem);
            _exit(EXIT_SUCCESS);
        }
    }
    munmap(data, filestats.st_size);
    close(fd);

    return EXIT_SUCCESS;
}

That memstr I referenced would need to be hand-implemented (or you'd need to do terrible things like ensure the file always had a NUL byte at the end so you could use strstr on it); you can get some tips on that here.

Community
  • 1
  • 1
ShadowRanger
  • 143,180
  • 12
  • 188
  • 271
  • Your answer surely deserves a test, let me try it on my code. – elmazzun Sep 01 '16 at 09:08
  • Turned out your answer is the best: your call to `mmap` had `invalid argument` because I passed it `0` as `size_t length`, so first time I passed it `getpagesize()`; I used `snprint` to write in a temporary buffer and then `write` in `data` the number of bytes written in this buffer; now I can see if a string was already inserted in shared file. Still having difficulties in updating string: the real structure is `hostname time_t` and even if I can get `time_t` seconds, I need to write the new `time_t` seconds of the last visit, but that is another matter. – elmazzun Sep 01 '16 at 17:10
  • 1
    @elmazzun: Ah, yeah, forgot about length `0` being verboten on some OSes; you could just as easily use a length of `filestats.st_size ? filestats.st_size : 1` (which will map a whole page anyway, since that's how `mmap` operates, and avoids needing to check the real page size). – ShadowRanger Sep 01 '16 at 17:13
  • Just one more thing: every row in my file is `string int`; if `string` is already inserted take `int`, compute the difference between `time_t now = time(NULL)` and `int` field, update the difference inserting `time_t now` in `int` field for future visits and differences; when I write the new `time_t` value in `int` field, do I need to `mremap` again just for updating the `int` field? – elmazzun Sep 01 '16 at 17:30
  • @elmazzun: Depends how it's stored. If it's a string timestamp padded to a fixed width, or a raw binary timestamp, then no remapping would be required; the size never changes. I'd strongly recommend one of those, because the alternative (non-fixed width timestamps) would require annoying and error prone work any time the length of the timestamp changes, to move all of the data following it appropriately. – ShadowRanger Sep 01 '16 at 18:15
  • My fields are `string ; long unsigned int`, because of the cast I made `(unsigned long)time(NULL)`. For example, a row may be `www.google.com 1472755105`: is this one of the two alternatives you are talking about? – elmazzun Sep 01 '16 at 18:39
  • 1
    @elmazzun: You'd need to make sure it was printed with a format code like `%010ld` if times might fall into spans before early Sept. 2001, but not after mid-November 2286 (and yes, times before Sept. 2001 should be handled properly; sometimes computer clocks go awry, and you don't want to corrupt the file if that happens). If you want to handle beyond 2286, use `%011ld`. – ShadowRanger Sep 01 '16 at 23:51
  • I'd like to share my code, if anyone in future will have my problem. I just hope it is correct, but it seems to work! – elmazzun Sep 02 '16 at 13:43
0

You're writing all the strings at offset 0 of the file, each over the top of the previous. The core of your loop should be something like

struct stat status;
fstat(fd, &status);
size_t cursize = status.st_size;
ftruncate(fd, cursize + textsize);
for (size_t i = 0; i < textsize; i++) {
    data[cursize + i] = arrayString[index][i];
}
Chris Dodd
  • 119,907
  • 13
  • 134
  • 226
  • This loop will write `first^@second^@third^@` in shared file; if I add `\n` at each string in `arrayString`, and calculate `size_t textsize` without `+1`, the file will be perfectly formatted, one string each row; string formatting is important to me, since I'll need to parse strings in this file. – elmazzun Aug 31 '16 at 16:58
  • @elmazzun: As implemented, the mapping won't be large enough if your file ever exceeds the page size (the amount you `mmap`ed). See [my answer](http://stackoverflow.com/a/39254749/364696) for handling this (it means using `mremap` if on Linux, or just deferring the `mmap` step until the child holds the semaphore on other OSes). – ShadowRanger Aug 31 '16 at 17:09