What are the other options of faster IO in C

Question

I implemented merge sort and used it as a solution for this codechef problem. Here are the submissions. The code is placed below.

The problem that I think is causing slow execution is that my IO is slow in the main function. I know the number of elements that are inputted so there has to be some faster way to read input instead of the way that I am doing.

Are there faster IO methods instead of the ones that I am using in the main function? I have heard about using buffers, fgets and sscanf but I don't know if they are faster.

Any code examples would be helpful.

#include<stdio.h>
#include<stdlib.h>

void merge_parts(int arr[], int length)
{
    int *ans;
    int i, j, k;
    int temp = length/2;

    ans = malloc(sizeof(int) * length);

    //This while and next if-else puts the merged array into temporary array ans
    for (j = temp, i = k = 0; (i < temp && j < length); k++){
        ans[k] = (arr[i] < arr[j]) ? arr[i++] : arr[j++];
    }

    if(i >= temp){
        while(j < length){
            ans[k++] = arr[j++];
        }
    }
    else{
        while(i < temp){
            ans[k++] = arr[i++];
        }
    }

    //This while loops puts array ans into original array arr
    for(i = 0; i < length; i++){
        arr[i] = ans[i];
    }

    free(ans);
}

void merge_sort(int arr[], int length)
{
    if(length > 1)
    {
        merge_sort(&arr[0], (length/2));
        merge_sort(&arr[length/2], (length - length/2));
        merge_parts(arr, length);
    }
}

int main()
{
    int length;
    int *arr;
    scanf("%d", &length);
    arr = malloc(sizeof(int) * length);

    for(int i = 0; i < length; i++)
        scanf("%d", &arr[i]);

    merge_sort(arr, length);

    for(int i = 0; i < length; i++)
        printf("%d ", arr[i]);

    free(arr);
    return 0;
}

EDIT3:

[I deleted EDIT AND EDIT2 as they were no longer relevant]

merge_sort algorithm that I am using

void merge_parts(int arr[], int length)
{
    int ans[length];
    int i, j, k;
    int temp = length/2;
    //This while and next if-else puts the merged array into temporary array ans
    for (j = temp, i = k = 0; (i < temp && j < length); k++){
        ans[k] = (arr[i] < arr[j]) ? arr[i++] : arr[j++];
    }

    if(i >= temp){
        while(j < length){
            ans[k++] = arr[j++];
        }
    }
    else{
        while(i < temp){
            ans[k++] = arr[i++];
        }
    }

    //This while loops puts array ans into original array arr
    for(i = 0; i < length; i++){
        arr[i] = ans[i];
    }
}

void merge_sort(int arr[], int length)
{
    if(length > 1)
    {
        merge_sort(&arr[0], (length/2));
        merge_sort(&arr[length/2], (length - length/2));
        merge_parts(arr, length);
    }
}

merge1.c

#include<stdio.h>
#include<stdlib.h>
#include<ctype.h>
#include<time.h>

#define SORTING_ALGO_CALL merge_sort

char buffer[4096];
int bufcount;
int bufpos;

int get_next_char()
{
    if (!bufcount)
    {
        bufcount = fread(buffer, 1, 4096, stdin);
        bufpos = 0;
        if (!bufcount){
            return EOF;
        }
    }
    bufcount--;
    return buffer[bufpos++];
}

int readnum()
{
    int res = 0;
    char ch;
    do
    {
        ch = get_next_char();
    } while (!isdigit(ch) && ch != EOF);

    if (ch == EOF){
            return 0xbaadbeef;    // Don't expect this to happen.
    }

    do
    {
        res = (res * 10) + ch - '0';
        ch = get_next_char();
    } while(isdigit(ch));
    return res;
}


int main()
{
    clock_t time1, time2;
    double time_taken;

//FIRST READ
    time1 = clock();

    int length = readnum();
    while (length < 1)
    {
        printf("\nYou entered length = %d\n", length);
        printf("\nEnter a positive length: ");
        length = readnum();
    }

//SECOND READ, PRINT AND NEXT FIRST READ
    time2 = clock();
    time_taken = (double)(time2 - time1) / CLOCKS_PER_SEC;
    printf("\nReading length = %f\n", time_taken);
    time1 = clock();

    int *arr;
    if ((arr = malloc(sizeof(int) * length)) == NULL)
    {
        perror("The following error occurred");
        exit(-1);
    }

//SECOND READ, PRINT AND NEXT FIRST READ
    time2 = clock();
    time_taken = (double)(time2 - time1) / CLOCKS_PER_SEC;
    printf("\nAllocating array = %f\n", time_taken);
    time1 = clock();

    for (int i = 0; i < length; i++){
        arr[i] = readnum();
    }

//SECOND READ, PRINT AND NEXT FIRST READ
    time2 = clock();
    time_taken = (double)(time2 - time1) / CLOCKS_PER_SEC;
    printf("\nReading array = %f\n", time_taken);
    time1 = clock();

    SORTING_ALGO_CALL(arr, length);

//SECOND READ, PRINT AND NEXT FIRST READ
    time2 = clock();
    time_taken = (double)(time2 - time1) / CLOCKS_PER_SEC;
    printf("\nSorting array = %f\n", time_taken);
    time1 = clock();
/*
    for (int i = 0; i < length; i++){
        printf("%d ", arr[i]);
    }
*/
//SECOND READ, PRINT AND NEXT FIRST READ
    time2 = clock();
    time_taken = (double)(time2 - time1) / CLOCKS_PER_SEC;
    printf("\nPrinting Sorted array = %f\n", time_taken);
    time1 = clock();

    free(arr);

//SECOND READ, PRINT
    time2 = clock();
    time_taken = (double)(time2 - time1) / CLOCKS_PER_SEC;
    printf("\nFreeing array = %f\n", time_taken);

    return 0;
}

merge2.c

#include<stdio.h>
#include<stdlib.h>
#include<time.h>

#define SORTING_ALGO_CALL merge_sort

int main()
{
    clock_t time1, time2;
    double time_taken;

//FIRST READ
    time1 = clock();

    int length;
    scanf("%d", &length);
    while (length < 1)
    {
        printf("\nYou entered length = %d\n", length);
        printf("\nEnter a positive length: ");
        scanf("%d", &length);
    }

//SECOND READ, PRINT AND NEXT FIRST READ
    time2 = clock();
    time_taken = (double)(time2 - time1) / CLOCKS_PER_SEC;
    printf("\nReading length = %f\n", time_taken);
    time1 = clock();

    int *arr;
    if ((arr = malloc(sizeof(int) * length)) == NULL)
    {
        perror("The following error occurred");
        exit(-1);
    }

//SECOND READ, PRINT AND NEXT FIRST READ
    time2 = clock();
    time_taken = (double)(time2 - time1) / CLOCKS_PER_SEC;
    printf("\nAllocating array = %f\n", time_taken);
    time1 = clock();

    for (int i = 0; i < length; i++){
        scanf("%d", &arr[i]);
    }

//SECOND READ, PRINT AND NEXT FIRST READ
    time2 = clock();
    time_taken = (double)(time2 - time1) / CLOCKS_PER_SEC;
    printf("\nReading array = %f\n", time_taken);
    time1 = clock();

    SORTING_ALGO_CALL(arr, length);

//SECOND READ, PRINT AND NEXT FIRST READ
    time2 = clock();
    time_taken = (double)(time2 - time1) / CLOCKS_PER_SEC;
    printf("\nSorting array = %f\n", time_taken);
    time1 = clock();
/*
    for (int i = 0; i < length; i++){
        printf("%d ", arr[i]);
    }
*/
//SECOND READ, PRINT AND NEXT FIRST READ
    time2 = clock();
    time_taken = (double)(time2 - time1) / CLOCKS_PER_SEC;
    printf("\nPrinting Sorted array = %f\n", time_taken);
    time1 = clock();

    free(arr);

//SECOND READ, PRINT
    time2 = clock();
    time_taken = (double)(time2 - time1) / CLOCKS_PER_SEC;
    printf("\nFreeing array = %f\n", time_taken);

    return 0;
}

Both merge1.c and merge2.c contain the 2 functions of merge-sort.

The file that I am using for generating worst-case(decreasing order) input for the 2 files.

#include<stdio.h>

int main()
{
    int j = 100000;
    printf("%d\n", j);
    for(int i = j; i > 0; i--)
        printf("%d\n", i);

    return 0;
}

Timing result for merge1.c

Reading length = 23.055000

Allocating array = 0.000000

Reading array = 0.010000

Sorting array = 0.020000

Printing Sorted array = 0.000000

Freeing array = 0.000000

Timing result for merge2.c

Reading length = 22.763000

Allocating array = 0.000000

Reading array = 0.020000

Sorting array = 0.020000

Printing Sorted array = 0.000000

Freeing array = 0.000000

Mats Petersson · Accepted Answer · 2013-07-07T08:47:45.810

2

You can almost certainly beat scanf by writing your own little function to read numbers.

If all numbers are decimal and separated by something that isn't a digit, this would work:

 char buffer[4096]; 
 int bufcount;
 int bufpos;

 int get_next_char()
 {
     if (!bufcount)
     {
         bufcount = fread(buffer, 1, 4096, stdin);
         bufpos = 0;
         if (!bufcount){
            return EOF;
         }
     }
     bufcount--;
     return buffer[bufpos++]; 
 }


 int is_digit(int ch)
 {
     if (ch >= '0' && ch <= '9')
        return 1;
     return 0;
 }

 int readnum()
 {
     int res = 0;
     int ch;
     do
     {
         ch = get_next_char();
     } while(!is_digit(ch) && ch != EOF);
     if (ch == EOF)
     {
        return 0xbaadbeef;    // Don't expect this to happen. 
     }
     do
     {
         res = (res * 10) + (ch - '0');
         ch = get_next_char();
     } while(is_digit(ch));
     return res;
 }

The code in scanf is a lot more complicated than this, and is highly likely to call getc or fgetc, which is a bit less efficient than the above code would be. However, it's worth measuring exactly where you are spending time. Print the time for each function as part of your output.

edited Jul 07 '13 at 08:47

answered Jul 06 '13 at 23:58

Mats Petersson

126,704
14
140
227

Should I use [this method](http://stackoverflow.com/questions/5248915/execution-time-of-c-program) for measuring the time of execution? Also how can I print the time for each function? I mean in merge sort there are many function calls recursively. I am assuming that you are not asking for time of execution of all of them. That would be a huge mess. – Aseem Bansal Jul 07 '13 at 07:04
Yes, clock() works well as long as the time is more than a few tenths of a second. As a starter, I'd measure the time it takes in the input phase and the sort phase. – Mats Petersson Jul 07 '13 at 07:25
I'll try to use Windows Powershell to do the timing. I'll update as soon as I do that. – Aseem Bansal Jul 07 '13 at 07:32
I tried to use Windows Powershell for a simple test case but the code that I am using gave me an infinite loop. What did I do wrong in here? After I figure out the infinite loop I'll post timing results. – Aseem Bansal Jul 07 '13 at 08:42
I missed out a `get_next_char()` call in the second loop, so it just sits there and multiplies the same digit by 10 forever. – Mats Petersson Jul 07 '13 at 08:48
Added the timing results alongwith modified files. – Aseem Bansal Jul 07 '13 at 10:40
You probably want a slightly larger array, by the looks of things. And "reading length" seems wrong to me. 23 seconds? – Mats Petersson Jul 07 '13 at 10:44
I also thought that was weird but these are the results. Maybe due to the while loop there is branching due to which the time is high. These are the results I got by pipling the output of test_file's output to merge1.exe and merge2.exe by using Windows Powershell. – Aseem Bansal Jul 07 '13 at 11:06
1

I have run your code on my machine (I modified the "make numbers" to generate an arbitrary number based on argv), and it doesn't show that problem. Because your code uses a large tmporary on the stack I can't run larger than 2M numbers, but that takes, with my input, 0.13s to read and 0.12s to sort. – Mats Petersson Jul 07 '13 at 11:37
1

And with `scanf` instead of `readnum` it takes 0.4s to read the numbers. – Mats Petersson Jul 07 '13 at 11:38
I tried 256 and 4096 byte buffers. The numbers shown are with 4096 byte buffer size, but no major difference between the two variants. – Mats Petersson Jul 07 '13 at 11:45
1

Even taking the buffer size down to 16 bytes still makes `readnum` about 2x faster than the `scanf` variant, but it's creeping towards 0.2s to read 2M numbers. – Mats Petersson Jul 07 '13 at 11:51

Mike Dunlavey · Answer 2 · 2013-07-07T01:38:33.403

1

I would supplement Mats' answer by, rather than using stdin, having a file name as input. Then open the file (in binary format if on Windows). Get the file length, malloc a big-enough buffer, read the entire file into it, and close the file. Then I would parse using a character pointer into the buffer. That way, getting the next character doesn't require a function call. That's hard to beat for speed.

The code for parsing an integer is:

num = 0;
while(isdigit(*pc)){
  num = num*10 + (*pc++ - '0');
}

edited Jul 07 '13 at 01:38

answered Jul 07 '13 at 01:30

Mike Dunlavey

40,059
14
91
135

Snipet is positive integers only as it currently stands, but yes this is very fast overall. (and i see that the underlying competition is >= 0, so this is all good) – rlb Jul 07 '13 at 01:47
My code should, with a competent compiler, produce roughly the same code. However, it will read smaller blocks, which in my experience is better (as long as the blocks are large enough), since reading very large amounts of data takes time, and the OS will pre-fetch in the background, but it can't do that if you are reading it all as one chunk. And since it's `stdin` that is the input, it may not be possible to check the size of the input in one go anyway. – Mats Petersson Jul 07 '13 at 07:40
In my case I have to use `stdin`. That is a restriction that I **cannot** change. If there is a way to use `stdin` as you have said I would love to hear that. – Aseem Bansal Jul 07 '13 at 07:53
@AseemBansal: In that case, I would use Mats' method, processing it in buffer-size chunks, but I would use parsing code like I wrote, because he puts more faith in compilers than I do :) – Mike Dunlavey Jul 07 '13 at 17:24
Mat actually wrote that same code in his `readnum` function. Well not exactly same wording but the same functionality – Aseem Bansal Jul 07 '13 at 17:28
@AseemBansal: this code is getting down to cycle-counting, where the differences between calling a function and not, incrementing two variables or one, checking for overflow on every character or less often, etc., are starting to matter. If you want to get absolute max speed, try it both ways and single-step it at the assembly-language level. Then see if it it's doing anything that could be drilled/filed/sawed out. OTOH, if it's "fast enough", enjoy! – Mike Dunlavey Jul 07 '13 at 17:35
Are you talking about my question getting down to cycle-counting? I guess you are right about that. I want to know the fastest input method. I don't have the know how to single-step at the assembly level. I think your method should be faster as it does fewer things. But wouldn't removing the check for EOF be a bug? – Aseem Bansal Jul 07 '13 at 17:44
@AseemBansal: I suggest learning how to single-step at the assembly level. It's very much worth-while. As far as removing the check for EOF, you don't. You just check much less often. Like the kids in the back seat asking "Are we there yet?" all the time when you still have 1000 miles to go, it's tedious. Instead, tell them to only ask every 100 miles if the end is in the next 100 miles, etc. You can figure it out. – Mike Dunlavey Jul 07 '13 at 18:37

Galigator · Answer 3 · 2013-07-06T18:36:26.453

0

In optimization problem the rule of thumb is the best. Try to get numerics value of time spent in each step. Loading - sorting - etc... You can use a profiler for that (like gprof).
To faster your IO you should consider doing less call to scanf. As you have the number of scanf require you could design a better algorithm for this specific part.
Scanf do a lot of thing, parse the first arg, then read the byte and convert it to the format. If we want to go faster, we will use 'data problem' to skip some steps. First, we know that we are just using number define on N (math). Second, we know that every bytes are numbers or separators. We can use this.

So we use the read() system call that can read some byte from a file descriptor. The file descriptor for standard input change between operating system, but it is often 0.

The macro algorithm could be :

index = 0
buffer = new array[10000];
numberOfByteRead = 1
while there is byte that have been read at last call of read.
      numberOfByteRead = read said 10000 byte to buffer;
      parse the buffer
;;

parse(buffer,numberOfByteRead)
for all true byte in buffer :
   switch (buffer[0])
      case '0': { the mathematical operation on arr[index] that fit for '0'; break;  }
      case '1': { ... break;}
      case ' ': {index++; break;}
;;

Not a really fun part to code, but faster than scanf. Larger value that 10000, will reduce IO time, but increase memory. You have to balance.

edited Jul 06 '13 at 18:36

answered Jul 06 '13 at 17:40

Galigator

8,957
2
25
39

See [my top results here](http://www.codechef.com/status/TSORT,anshbansal?sort_by=Time&sorting_order=asc&language=All&status=15). The top with time `3.19` uses the `stdlib`'s `qsort` while my `merge sort` takes `3.29`(The 2nd one). So I would guess my implementation isn't the bottleneck on speed. I agree that less calls should be made to `scanf`. But I don't know any other method of reading input. That is why I asked about `other options of faster IO in C`. Any suggestions? – Aseem Bansal Jul 06 '13 at 17:45
Profiling is a good idea. But I usually work on Windows platform so using linux based profiler's may take some work. I'll try to use that. Any suggestions about the algorithm that I can use? – Aseem Bansal Jul 06 '13 at 17:49
This trick could help : if(n%2==1)scanfOneNumber;n--; while(n>0) scanfTwoNumbers. But Imho you should write your own scanf function, specialized to read numbers. – Galigator Jul 06 '13 at 17:52
1

I notice that in your implementation, you are using malloc/free at each merge_parts. I think it is possible to improve that by just mallocing a fresh array at beginning and working with index. It should cost a little less. – Galigator Jul 06 '13 at 17:56
I always thought that every array's size needs to be known at compile time or dynamic memory allocation needs to be done. Thanks for that. It helped to reduce time but it somehow increased the memory usage. Any ideas why? Also I am lost as to how to write my own version of `scanf` function. I have never done anything like that. The way I can think is to use the standard library but that would be the same unless I use some other function than `scanf` in the definition of the new `scanf`. Any pointers as what or where to look for? – Aseem Bansal Jul 06 '13 at 18:10
Also as my timing is better than `qsort` (even with higher memory usage)` does it mean I wrote a better implementation of sorting than the standard library? I know that isn't possible so what could be the reason for this behaviour? – Aseem Bansal Jul 06 '13 at 18:12
It use more memory because if you have n elements to sorts, it use 2*n memory cells, while the previous program use n+(n/2) at max. I will answer to the 'better scanf' in the main answer. – Galigator Jul 06 '13 at 18:22
I found the reason for more memory use. I forgot to take out the malloc from function. That was causing more memory usage. – Aseem Bansal Jul 07 '13 at 11:12

score 0 · Answer 4 · answered Jul 07 '13 at 10:34

static char buff[8*1000000];
int i, length, blen;
int *ap, *p;
int n = 0;
char ch, *cp = buff;

scanf("%d%*c", &length);
p = ap = malloc(sizeof(*ap) * length);

blen = fread(buff, 1, 8*1000000, stdin);
while(blen--){
    if(isdigit(ch=*cp++)){
        n = n * 10 + ch - '0';
    } else {
        *p++ = n;
        n = 0;
    }
}

What are the other options of faster IO in C

4 Answers4

Linked