3

I was writing code to read unsigned integers from a binary file using C/C++ on a 32 bit Linux OS intended to run on an 8-core x86 system. The application takes an input file which contains unsigned integers in little-endian format one after another. So the input file size in bytes is a multiple of 4. The file could have a billion integers in it. What is the fastest way to read and add all the integers and return the sum with 64 bit precision?

Below is my implementation. Error checking for corrupt data is not the major concern here, and the input file is assumed to be free of any issues in this case.

#include <iostream>
#include <fstream>
#include <pthread.h>
#include <string>
#include <string.h>


using namespace std;

// Path of the input file; written once in main(), read by all worker threads.
string filepath;
// Bytes per read() call; main() scales this up 16x for very large files.
unsigned int READBLOCKSIZE = 1024*1024;
// Total input file size in bytes (set in main() before any thread starts).
unsigned long long nFileLength = 0;

unsigned long long accumulator = 0; // assuming 32 bit OS running on X86-64
// Starting byte offset for each of the 8 worker threads.
// NOTE(review): unsigned int limits offsets to < 4 GB — confirm input files
// never exceed that on this 32-bit target.
unsigned int seekIndex[8] = {};
unsigned int threadBlockSize = 0; 
// Per-thread partial sums; combined into accumulator after all joins,
// so no synchronization is needed while the threads run.
unsigned long long acc[8] = {};

pthread_t thread[8];
void* threadFunc(void* pThreadNum);

//time_t seconds1;
//time_t seconds2;

// Sums all little-endian unsigned 32-bit integers in the file named by
// argv[1] into a 64-bit total and prints it. Small files are processed by
// a single thread; files >= 16 * READBLOCKSIZE are split across 8 threads.
// Returns 0 on success, -1 on missing argument or unopenable file.
int main(int argc, char *argv[])
{
    if (argc < 2)
    {
        cout << "Please enter a file path\n";
        return -1;
    }

    filepath = argv[1];
    ifstream ifsReadFile(filepath.c_str(), ifstream::binary);  // Create FileStream for the file to be read
    if (!ifsReadFile.is_open())
    {
        cout << "Could not find/open input file\n";
        return -1;
    }

    // Determine the file size by seeking to the end, then rewind.
    ifsReadFile.seekg(0, ios::end);
    nFileLength = ifsReadFile.tellg();
    ifsReadFile.seekg(0, ios::beg);

    if (nFileLength < 16 * READBLOCKSIZE)
    {
        // Small file: one thread reads fixed-size blocks and sums them.
        // Note: operator new throws std::bad_alloc on failure rather than
        // returning null, so no null check is needed here.
        char* readBuf = new char[READBLOCKSIZE];

        unsigned long long startOffset = 0;
        while (startOffset + READBLOCKSIZE < nFileLength)
        {
            ifsReadFile.read(readBuf, READBLOCKSIZE);  // At this point ifsReadFile is open
            // The file holds UNSIGNED 32-bit values. Summing through int*
            // would sign-extend values >= 2^31 when widened to unsigned
            // long long and corrupt the 64-bit total, so cast to unsigned.
            const unsigned int* num = reinterpret_cast<const unsigned int*>(readBuf);
            for (unsigned int i = 0; i < READBLOCKSIZE / 4; ++i)
            {
                accumulator += num[i];
            }
            startOffset += READBLOCKSIZE;
        }

        // Remaining tail; the file size is a multiple of 4 by assumption.
        if (nFileLength - startOffset > 0)
        {
            ifsReadFile.read(readBuf, nFileLength - startOffset);
            const unsigned int* num = reinterpret_cast<const unsigned int*>(readBuf);
            for (unsigned int i = 0; i < (nFileLength - startOffset) / 4; ++i)
            {
                accumulator += num[i];
            }
        }
        delete[] readBuf; readBuf = 0;
    }
    else
    {
        // Large file: fan out to 8 threads over non-overlapping slices.
        unsigned int currthreadnum[8] = {0,1,2,3,4,5,6,7};
        if (nFileLength > 200000000) READBLOCKSIZE *= 16; // larger blocks amortize per-read overhead

        // Per-thread slice size. Dividing by 28 (= 7 * 4) and multiplying by
        // 4 keeps the slice a multiple of 4, so every thread starts on an
        // integer boundary; the 8th thread picks up the remainder via the
        // endOffset clamp in threadFunc.
        if (nFileLength % 28)
        {
            threadBlockSize = (nFileLength / 28);
            threadBlockSize *= 4;
        }
        else
        {
            threadBlockSize = (nFileLength / 7);
        }

        for (int i = 0; i < 8; ++i)
        {
            seekIndex[i] = i * threadBlockSize;
        }
        for (int i = 0; i < 8; ++i)
        {
            pthread_create(&thread[i], NULL, threadFunc, (void*)(currthreadnum + i));
        }
        for (int i = 0; i < 8; ++i)
        {
            pthread_join(thread[i], NULL);
        }

        // All threads have joined, so reading the partial sums is race-free.
        for (int i = 0; i < 8; ++i)
        {
            accumulator += acc[i];
        }
    }

    cout << accumulator << "\n";
    return 0;
}

// Worker thread: sums the unsigned 32-bit integers in this thread's slice
// [seekIndex[threadNum], seekIndex[threadNum] + threadBlockSize) of the
// input file into acc[threadNum]. Each thread opens its own stream, so the
// workers need no synchronization. pThreadNum points at this thread's index
// (0..7); the array it points into outlives the thread (main joins first).
void* threadFunc(void* pThreadNum)
{
    unsigned int threadNum = *reinterpret_cast<unsigned int*>(pThreadNum);
    char* localReadBuf = new char[READBLOCKSIZE];
    unsigned long long startOffset = seekIndex[threadNum];
    ifstream ifs(filepath.c_str(), ifstream::binary);  // Create FileStream for the file to be read
    if (!ifs.is_open())
    {
        cout << "Could not find/open input file\n";
        delete[] localReadBuf; // don't leak the buffer on the early exit
        return 0;
    }
    ifs.seekg(startOffset, ios::beg); // Seek to the correct offset for this thread
    acc[threadNum] = 0;
    unsigned long long endOffset = startOffset + threadBlockSize; // 64-bit: cannot wrap near 4 GB
    if (endOffset > nFileLength) endOffset = nFileLength; // last thread takes the remainder

    while (startOffset + READBLOCKSIZE < endOffset)
    {
        ifs.read(localReadBuf, READBLOCKSIZE);  // At this point ifs is open
        // The file holds UNSIGNED 32-bit values. Summing through int* would
        // sign-extend values >= 2^31 when widened to unsigned long long and
        // corrupt the 64-bit partial sum, so cast to unsigned.
        const unsigned int* num = reinterpret_cast<const unsigned int*>(localReadBuf);
        for (unsigned int i = 0; i < READBLOCKSIZE / 4; ++i)
        {
            acc[threadNum] += num[i];
        }
        startOffset += READBLOCKSIZE;
    }

    // Remaining tail of this slice (a multiple of 4 bytes by construction).
    if (endOffset - startOffset > 0)
    {
        ifs.read(localReadBuf, endOffset - startOffset);
        const unsigned int* num = reinterpret_cast<const unsigned int*>(localReadBuf);
        for (unsigned int i = 0; i < (endOffset - startOffset) / 4; ++i)
        {
            acc[threadNum] += num[i];
        }
    }

    delete[] localReadBuf; localReadBuf = 0;
    return 0;
}

I wrote a small C# program to generate the input binary file for testing.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;

namespace BinaryNumWriter
{
    // Generates "test.txt": 500,000,000 little-endian unsigned 32-bit
    // integers, and prints their 64-bit sum so the reader program's output
    // can be verified against a known expected value.
    class Program
    {
        // Running 64-bit sum of every value written to the file.
        static UInt64 total = 0;

        static void Main(string[] args)
        {
            // 'using' guarantees the writer is flushed and closed even if an
            // exception is thrown mid-write.
            using (BinaryWriter bw = new BinaryWriter(File.Open("test.txt", FileMode.Create)))
            {
                Random rn = new Random();
                for (UInt32 i = 1; i <= 500000000; ++i)
                {
                    UInt32 num = (UInt32)rn.Next(0, 0xffff);
                    bw.Write(num);
                    total += num;
                }
            }
            // Without printing the total, the generator gives no expected
            // value to compare the summing program's result against.
            Console.WriteLine(total);
        }
    }
}

Running the program on a Core i5 machine @ 3.33 Ghz (its quad-core but its what I got at the moment) with 2 GB RAM and Ubuntu 9.10 32 bit had the following performance numbers

100 integers ~ 0 seconds (I would really have to suck otherwise) 100000 integers < 0 seconds 100000000 integers ~ 7 seconds 500000000 integers ~ 29 seconds (1.86 GB input file)

I am not sure if the HDD is 5400RPM or 7200RPM. I tried different buffer sizes for reading and found reading 16 MB at a time for big input files was kinda the sweet spot.

Are there any better ways to read faster from the file to increase overall performance? Is there a smarter way to add large arrays of integers faster and folding repeatedly? Is there any major roadblocks to performance the way I have written the code / Am I doing something obviously wrong that's costing a lot of time?

What can I do to make this process of reading and adding data faster?

Thanks.

Chinmay

Chinmay Nerurkar
  • 495
  • 6
  • 22
  • 2
    your process might be IO-bound. Compare it with a single-threaded implementation that uses fread() to read blocks. – jfs Jun 08 '12 at 17:42
  • 1
    Split the input data into 8 different files and place each file in a different `physical` hard drive and you may see some improvement. But otherwise I think your CPU can read and add those numbers significantly faster than your hard drive can provide the data. But with so few numbers I would just do it in a single thread. – Martin York Jun 08 '12 at 19:12
  • @J.F.Sebastian - I forced my program to run through the 500000000 integers using the single threaded part of the loop and it completed calculating the sum in 27 seconds. That's 2 seconds faster than the 8 threaded implementation. I wasn't expecting that. Also isn't fstream.read() the C++ equivalent of fread()? Or is there a benefit to using fread()? – Chinmay Nerurkar Jun 08 '12 at 20:28
  • @LokiAstari - I use a single threaded implementation for small file sizes and 8 threads for larger sizes. But I guess having the whole 1.86GB file on one hard-drive seems to erode any benefit of multi-threading. Is that why the multi-threaded implementation took 29 seconds while the single threaded implementation finished in 27 seconds? – Chinmay Nerurkar Jun 08 '12 at 20:32
  • 1
    @ChinmayNerurkar: C++ IO requires special configuration to be as fast as stdio e.g., http://stackoverflow.com/questions/9371238/why-is-reading-lines-from-stdin-much-slower-in-c-than-python You could also try posix_fadvise() with POSIX_FADV_SEQUENTIAL, POSIX_FADV_WILLNEED – jfs Jun 08 '12 at 20:42
  • @J.F.Sebastian I replaced the 'fstream' code with C fread() operations and got some improvement. Running with 8 threads on 500000000 integers (1.86 GB) takes 23 or 29 seconds. What's weird is that every time I recompile the code I get the same timing consistently on repeated runs. So compiling the code and running multiple times gives me 29 seconds on all consecutive runs, and then recompiling and running any number of times gives 23 seconds. 8 threads in both cases. It's as if the compiler compiles to different code every time. How does that make sense? – Chinmay Nerurkar Jun 11 '12 at 18:21
  • @J.F.Sebastian - Running on the same 1.86GB file using 1 thread and fread() takes between 19-24 seconds. Even the single threaded case has this symptom of showing consistent run-times on consecutive runs between two compiles but different run-times between 2 compiles. I would be inclined to believe that compiler optimization is deterministic and produces the same assembly code for the same C file every time. Is it something to do with the way the HDD works? I also tried using 'ifstream.sync_with_stdio(false)' in the C++ version of the code and it didn't help. Did I miss something? – Chinmay Nerurkar Jun 11 '12 at 18:32
  • sync_with_stdio makes sence only for standard streams e.g., stdin. Clear the file system cache before running the tests e.g., `sync && echo 3 > /proc/sys/vm/drop_caches` – jfs Jun 11 '12 at 20:53

2 Answers2

3

Accessing a mechanical HDD from multiple threads the way you do is going to take some head movement (read slow it down). You're almost surely IO bound (65MBps for the 1.86GB file). Try to change your strategy by:

  • starting the 8 threads - we'll call them CONSUMERS
  • the 8 threads will wait for data to be made available
  • in the main thread start to read chunks (say 256KB) of the file thus being a PROVIDER for the CONSUMERS
  • main thread hits the EOF and signals the workers that there is no more avail data
  • main thread waits for the 8 workers to join.

You'll need quite a bit of synchronization to get it working flawlessly and I think it would totally max out your HDD / filesystem IO capabilities by doing sequential file access. YMMV on smallish files which can be cached and served from the cache at lightning speed.

Another thing you can try is to start only 7 threads, leave one free CPU for the main thread & the rest of the system.

.. or get an SSD :)

Edit:

For simplicity see how fast you can simply read the file (discarding the buffers) with no processing, single-threaded. That plus epsilon is your theoretical limit to how fast you can get this done.

  • I was avoiding the Provider-Consumer pattern as consumers would have to wait on the provider and this synchronization would cost time. Instead I split the file into non-overlapping parts so that the threads would not need synchronization. Strangely using the single-threaded approach gave me 2 seconds better performance than the 8 threaded approach. Is there something I am missing by not using the Provider-Consumer pattern? I agree though about the bottleneck due to the read speed of the HDD. I am going to try your suggestion about just reading and discarding buffers and timing the process. – Chinmay Nerurkar Jun 08 '12 at 20:50
  • 1
    When you're reading off the same HDD simultaneously from 8 threads, each read takes 14-11ms to realign heads (worst case). For 1.86GB that means 1.6s of wasted time (1.86GB/16MB*14ms) which is exactly what you're seeing. Provider-consumer would probably practically be worse than single-threaded since the number crunching is so light in your case but would be a pattern to follow if instead of addition you do factorial or tangent. Multiple threads are only useful when you have a CPU-bound process and a multi-core CPU. It's not the case here. – Cristian Niculescu Jun 09 '12 at 04:35
  • That's definitely a helpful thing to know about HDD head realigning time. I commented out the for loops that add the read numbers and it still took 29 seconds just to read through the 1.86GB file using 8 threads. That is unusually long just to read the file. Also I do have a quad core CPU. How is it 'not the case here'? Is it because each thread is reading sequentially as it needs to realign the heads? – Chinmay Nerurkar Jun 11 '12 at 17:16
3

If you want to read (or write) a lot of data fast, and you don't want to do much processing with that data, you need to avoid extra copies of the data between buffers. That means you want to avoid fstream or FILE abstractions (as they introduce an extra buffer that needs to be copied through), and avoid read/write type calls that copy stuff between kernel and user buffers.

Instead, on linux, you want to use mmap(2). On a 64-bit OS, just mmap the entire file into memory, use madvise(MADV_SEQUENTIAL) to tell the kernel you're going to be accessing it mostly sequentially, and have at it. For a 32-bit OS, you'll need to mmap in chunks, unmapping the previous chunk each time. Something much like your current structure, with each thread mmapping one fixed-size chunk at a time should work well.

Chris Dodd
  • 119,907
  • 13
  • 134
  • 226
  • That definitely sounds interesting and useful. I will try that out in the evening. How big can the chunks be on a 32 bit system to be able to map them? Thanks. – Chinmay Nerurkar Jun 08 '12 at 20:44
  • I tried mapping a small 400KB file using mmap(). I get 0 as the result of the program. I have pasted the code on [Pastebin link](http://pastebin.com/zrrpVdhd) as I could not post here. Could you tell me what I am doing wrong? Thanks. – Chinmay Nerurkar Jun 12 '12 at 18:15