0

I was recently studying about binary and linear search, and decided to check for real what is the difference in the actual time taken by both of these searching algorithms. Here is my code:

#include <bits/stdc++.h>

using namespace std;

void Print(int arr[], int size) {
    for (int i = 0; i < size; i++)
        cout << arr[i] << " ";
    cout << endl;
}

void Merge_sort(int arr[], int start, int end) {
    
    if (start >= end) {
        return;
    }
    
    int mid = (start + end) / 2;
    Merge_sort(arr, start, mid);
    Merge_sort(arr, mid + 1, end);
    
    int size = end - start + 1;
    
    int arr_new[size];
    
    int i = start, j = mid + 1;
    
    for (int k = 0; k < size; k++) {
        if (i > mid) {
            arr_new[k] = arr[j++];
        } else if (j > end) {
            arr_new[k] = arr[i++];
        } else if (arr[i] < arr[j]) {
            arr_new[k] = arr[i++];
        } else {
            arr_new[k] = arr[j++];
        }
    }
    int index = start;
    for (int k = 0; k < size; k++) {
        arr[index++] = arr_new[k];
    }
}

int binary_search(int arr[], int start, int end, int x) {
    if (start > end) {
        return -1;
    }
    
    int mid = (start + end) / 2;
    if (arr[mid] == x) {
        return mid;
    } else if (arr[mid] > x) {
        return binary_search(arr, start, mid - 1, x);
    } else {
        return binary_search(arr, mid + 1, end, x);
    }
}

int linear_search(int arr[], int size, int x) {
    for (int i = 0; i < size; i++) {
        if (arr[i] == x) {
            return i;
        }
    }
    return -1;
}

int main() {
    int size;
    
    clock_t start, start1, end, end1;
    double time1, time2;
    
    cin >> size;
    
    int *arr1 = new int[size];
    int *arr2 = new int[size];
    
    for (int i = 0; i < size; i++) {
        int j = rand() % 1000;
        arr1[i] = i;
        arr2[i] = i;
    }
    
    Merge_sort(arr1, 0, size - 1);
    start = clock();
    cout << binary_search(arr1, 0, size - 1, 15) << endl;
    end = clock();
    time1 = (double)(end - start) / (double)(CLOCKS_PER_SEC);
    cout << "Time taken by binary search  is : \t" << fixed
        << time1 << setprecision(8);
    cout << " sec " << endl;
    
    start1 = clock();
    cout << linear_search(arr2, size, 15) << endl;
    end1 = clock();
    time2 = (double)(end1 - start1) / (double)(CLOCKS_PER_SEC);
    cout << "Time taken by linear search  is : \t" << fixed
        << time2 << setprecision(8);
    cout << " sec " << endl;
    return 0;
}

I have used Merge sort to sort the array for the binary search part, but have excluded it while calculating the time taken. Kindly take a look and tell me where am I going wrong. As mentioned in a lot of comments, I am adding one result I got on my online IDE.

size = 10000, linear = 0.000005 sec, binary = 0.00005 sec
chqrlie
  • 131,814
  • 10
  • 121
  • 189
  • 1
    And what were your findings? ("Binary search taking more time than linear search" is very vague.) Did you use data of any meaningful size? Why are you measuring the time it takes to output the result? Are you absolutely sure that your sorting is correct? (Why didn't you use `std::sort`?) – molbdnilo Mar 15 '22 at 17:55
  • I don't believe this is [the actual code you used](https://godbolt.org/z/9nKnr5n46). Regardless, binary search _scales_ better than linear search, which is not the same as a guarantee that it is faster. With an array size of `1`, you should see that your binary search would do more math, more comparisons, and potentially recurse. – Drew Dormann Mar 15 '22 at 17:58
  • 2
    Note that your arrays are sorted to begin with and, if they have at least 16 elements, 15 is always the 16th element. Once you fix that bug, you will see that the binary search soon outpaces the linear search as the size grows. (You should have used the `Print` function you took the trouble to write...) – molbdnilo Mar 15 '22 at 18:12
  • The Binary Search has a overhead (calculating the next index), which takes time. Depending on the size of the data, a linear search may be faster (linear search has minimal overhead calculating the next index). The question is about whether the time is negligible or not. – Thomas Matthews Mar 15 '22 at 18:13
  • What C++ reference book are you using that says to use `#include `? This is not a standard and may be an extension on some compilers. – Thomas Matthews Mar 15 '22 at 18:17
  • Side note: Don't use `#include ` ([reason](https://stackoverflow.com/questions/31816095/why-should-i-not-include-bits-stdc-h)) and avoid using `using namespace std;` ([reason](https://stackoverflow.com/questions/1452721/why-is-using-namespace-std-considered-bad-practice)). Using the two together amplifies their worst effects and can result in some truly bizarre problems. – user4581301 Mar 15 '22 at 18:18
  • For more or less uniform data you should also include dictionary sort. And do draw some nice graphs showing sizes going from 1 to millions. – Goswin von Brederlow Mar 15 '22 at 18:56
  • @molbdnilo The time taken by binary search were almost 2 to 3 times more than linear search , obviously the time was in microseconds or even less , but still binary search was taking 3 to 3 times more time. I tested the code for array sizes of 10 , 100 , 1000 , 10000 , 100000 but still no significant change , binary search was taking way more time than it should have. Moreover I tried searching for the middle element as well , still binary search was taking more time than linear search and I just dont know how. – Prakhar Shankar Mar 16 '22 at 10:03
  • @molbdnilo Sorry for the Print function , i actually wrote that while I was trying to debug my code. – Prakhar Shankar Mar 16 '22 at 10:03
  • @DrewDormann I can assure you that this is the exact code that I used. I have simply copied it from my online IDE. getting to your next point , I even tried this with array size of 100000. The binary search was taking way more time than linear search. – Prakhar Shankar Mar 16 '22 at 10:05
  • @user4581301 Hey sir , I am a newbie in this . I read on geeksforgeeks that #include includes all the standard files present. so I did it. Your next point is actually a bit confusing. I am a newbie , can you tell me an alternative for that. Thankyou. – Prakhar Shankar Mar 16 '22 at 10:07
  • @ThomasMatthews I read it on geeksforgeeks to use #include it basically includes all the standard files. – Prakhar Shankar Mar 16 '22 at 10:07
  • @PrakharShankar It's nothing to be sorry for. You should have use it a bit more while debugging, to verify that your input really is what you think it is. – molbdnilo Mar 16 '22 at 12:21
  • Note: Geeks For Geeks has very limited curation. Any fool can post anything they want, and there are a lot of fools out there. Yes, it includes all of Standard library headers. That's a LOT of files. If you use it wrong it'll take around ten times as long to build your program because every single one of those files needs to be compiled. – user4581301 Mar 16 '22 at 15:17
  • It adds the tens of thousands of identifiers in the Standard Library to your program. And with `using namespace std;` all of those identifiers will conflict with your code in the global namespace, and a lot of those identifiers have very common names like `size`, `reverse`, `count`, `copy`, `remove`. You risk nigh-inscrutable compiler errors or strange runtime behaviour if you reuse these identifiers. Worse, you might not see these errors, but whoever is marking your code may have a different set-up and see them. – user4581301 Mar 16 '22 at 15:17

4 Answers4

3

I was just watching an interesting CPP Con video on YouTube about sorting, and part of it addressed linear versus binary searching.

Modern CPUs do some very interesting predictive work. They may guess ahead and can be a LOT more efficient if you follow the path they think your code is going to take. Binary searches make it impossible for that to work, so they can actually be slower on modern hardware where older hardware didn't do this, and thus binary searches were typically much faster as soon as the sample size grew to something over maybe as few as 20 values.

https://www.youtube.com/watch?v=FJJTYQYB1JQ&t=2272s

So to some extent, your sample size will matter, but this talk may also shed some light.

Joseph Larson
  • 8,530
  • 1
  • 19
  • 36
  • 1
    The topic about CPUs working much faster if the expected path is taken is called "Branch Prediction". You can also read about it at [Why is processing a sorted array faster than processing an unsorted array?](https://stackoverflow.com/questions/11227809/why-is-processing-a-sorted-array-faster-than-processing-an-unsorted-array) which I believe is by far single highest voted question with the c++ tag. The answer is very well written, in my opinion. c++20 introduces [attributes](https://en.cppreference.com/w/cpp/language/attributes/likely) to help with it called `[[likely]]` and `[[unlikely]]`. – François Andrieux Mar 15 '22 at 18:46
  • @FrançoisAndrieux The whole thing is pretty fascinating and makes my brain hurt when I think about it. – Joseph Larson Mar 16 '22 at 00:03
1

The problem is very simple and a good C++ compiler points to it immediately with an adequate warning level:

c++ -O3 -Weverything -o binsearch binsearch.cpp
binsearch.cpp:81:13: warning: unused variable 'j' [-Wunused-variable]
        int j = rand() % 1000;

This warning points to a major problem in this loop in the main() function:

    for (int i = 0; i < size; i++) {
        int j = rand() % 1000;
        arr1[i] = i;
        arr2[i] = i;
    }

rand() is not used and both arrays are initialized to the same monotonic sequence from 0 to size-1.

When you measure the time it takes to find the value 15, the result will likely favor linear search because 15 appears in 16th position in both arrays, very close to the beginning, hence it is found very quickly in a linear search and with approximately the same number of steps for binary search in a 10000 element sorted array.

There are more problems:

  • You include the time take to format the output in the measurement, which is incorrect as it may dominate the benchmark. It is recommended to perform all measurements in silent mode and display the results afterwards.

  • You should also perform multiple searches: for numbers appearing at the beginning, middle and end of the array and for numbers not appearing in the arrays.

  • You should repeat the searches to get more meaningful timings for functions that run quickly because clock() may have a limited granularity (1 microsecond on OS/X).

Here is a modified version with a benchmark for various sizes:

#include <bits/stdc++.h>

using namespace std;

static void Merge_sort(int arr[], int start, int end) {

    if (start >= end) {
        return;
    }

    int mid = start + (end - start) / 2;
    Merge_sort(arr, start, mid);
    Merge_sort(arr, mid + 1, end);

    int size = end - start + 1;
    int *arr_new = new int[size];
    int i = start, j = mid + 1;

    for (int k = 0; k < size; k++) {
        if (i > mid) {
            arr_new[k] = arr[j++];
        } else if (j > end) {
            arr_new[k] = arr[i++];
        } else if (arr[i] < arr[j]) {
            arr_new[k] = arr[i++];
        } else {
            arr_new[k] = arr[j++];
        }
    }
    int index = start;
    for (int k = 0; k < size; k++) {
        arr[index++] = arr_new[k];
    }
    delete[] arr_new;
}

static int binary_search(int arr[], int start, int end, int x) {
    if (start > end) {
        return -1;
    }
    int mid = start + (end - start) / 2;
    if (arr[mid] == x) {
        return mid;
    } else if (arr[mid] > x) {
        return binary_search(arr, start, mid - 1, x);
    } else {
        return binary_search(arr, mid + 1, end, x);
    }
}

static int linear_search(int arr[], int size, int x) {
    for (int i = 0; i < size; i++) {
        if (arr[i] == x) {
            return i;
        }
    }
    return -1;
}

static int test(int size) {
    int *arr1 = new int[size];
    int *arr2 = new int[size];

    for (int i = 0; i < size; i++) {
        int j = rand();
        arr1[i] = j;
        arr2[i] = j;
    }

    Merge_sort(arr1, 0, size - 1);

    clock_t start1 = clock();
    int index1[4] = {};
    for (int i = 0; i < 100; i++) {
        index1[0] = binary_search(arr1, 0, size - 1, arr1[0]);
        index1[1] = binary_search(arr1, 0, size - 1, arr1[size / 2]);
        index1[2] = binary_search(arr1, 0, size - 1, arr1[size - 1]);
        index1[3] = binary_search(arr1, 0, size - 1, -1);
    }
    clock_t end1 = clock();

    clock_t start2 = clock();
    int index2[4] = {};
    for (int i = 0; i < 100; i++) {
        index2[0] = linear_search(arr2, size, arr1[0]);
        index2[1] = linear_search(arr2, size, arr1[size / 2]);
        index2[2] = linear_search(arr2, size, arr1[size - 1]);
        index2[3] = linear_search(arr2, size, -1);
    }
    clock_t end2 = clock();

    double time1 = (end1 - start1) * 1000.0 / CLOCKS_PER_SEC;
    double time2 = (end2 - start2) * 1000.0 / CLOCKS_PER_SEC;

    printf("size=%8d: binary searches: %8.3fms, %d %d %d %d\n",
           size, time1, index1[0], index1[1], index1[2], index1[3]);
    printf("size=%8d: linear searches: %8.3fms, %d %d %d %d\n",
           size, time2, index2[0], index2[1], index2[2], index2[3]);

    delete[] arr1;
    delete[] arr2;
    return 0;
}

int main() {
    for (int size = 1; size <= 10000000; size *= 10) {
        test(size);
    }
    return 0;
}

Output:

size=       1: binary searches:    0.002ms, 0 0 0 -1
size=       1: linear searches:    0.000ms, 0 0 0 -1
size=       2: binary searches:    0.002ms, 0 1 1 -1
size=       2: linear searches:    0.002ms, 0 1 1 -1
size=       5: binary searches:    0.002ms, 0 2 4 -1
size=       5: linear searches:    0.002ms, 3 0 4 -1
size=      10: binary searches:    0.003ms, 0 5 9 -1
size=      10: linear searches:    0.003ms, 9 7 1 -1
size=      20: binary searches:    0.004ms, 0 10 19 -1
size=      20: linear searches:    0.004ms, 15 18 5 -1
size=      50: binary searches:    0.004ms, 0 25 49 -1
size=      50: linear searches:    0.005ms, 44 24 0 -1
size=     100: binary searches:    0.006ms, 0 50 99 -1
size=     100: linear searches:    0.012ms, 34 91 0 -1
size=     200: binary searches:    0.006ms, 0 100 199 -1
size=     200: linear searches:    0.022ms, 62 123 58 -1
size=     500: binary searches:    0.006ms, 0 250 499 -1
size=     500: linear searches:    0.050ms, 47 389 252 -1
size=    1000: binary searches:    0.006ms, 0 500 999 -1
size=    1000: linear searches:    0.121ms, 902 808 422 -1
size=    2000: binary searches:    0.008ms, 0 1000 1999 -1
size=    2000: linear searches:    0.369ms, 733 1992 1866 -1
size=    5000: binary searches:    0.010ms, 0 2500 4999 -1
size=    5000: linear searches:    0.656ms, 2635 4605 1819 -1
size=   10000: binary searches:    0.015ms, 0 5000 9999 -1
size=   10000: linear searches:    1.137ms, 6722 8419 5128 -1
size=   20000: binary searches:    0.012ms, 0 10000 19999 -1
size=   20000: linear searches:    2.265ms, 16893 6637 10738 -1
size=   50000: binary searches:    0.013ms, 0 25000 49999 -1
size=   50000: linear searches:    4.538ms, 19705 40371 9661 -1
size=  100000: binary searches:    0.014ms, 0 50000 99999 -1
size=  100000: linear searches:    8.565ms, 42378 57447 33679 -1
size=  200000: binary searches:    0.020ms, 0 100000 199999 -1
size=  200000: linear searches:   26.003ms, 14037 198103 158988 -1
size=  500000: binary searches:    0.016ms, 0 250000 499999 -1
size=  500000: linear searches:   33.657ms, 162357 162899 18194 -1

As you can see, binary search is much faster in the general case for large sizes, but linear search is much simpler and has faster or equivalent timings for up to 20 elements. A non recursive version of binary_search might lower the threshold a little bit. Note however that your implementation of binary_search does not always return the index of the first occurrence in case of duplicates.

chqrlie
  • 131,814
  • 10
  • 121
  • 189
  • amazingly explained sir , thanking you a ton. – Prakhar Shankar Mar 24 '22 at 19:32
  • SIr can i disturb you one more time? Can you please explain the meaning of the first point which goes like-: You include the time take to format the output in the measurement, which is incorrect as it may dominate the benchmark. It is recommended to perform all measurements in silent mode and display the results afterwards. Also what is the meaning of granularity in case of clock() and how is it getting affected? – Prakhar Shankar Mar 24 '22 at 19:38
  • Also one more thing , why you had a return type of static int? – Prakhar Shankar Mar 24 '22 at 19:40
0

If the input size is large enough, binary search is indeed faster than linear search. When computer scientists talk about one algorithm being faster than another one, they mean that the time complexity of one algorithm is better than the other one.

Binary search has worst time complexity O(lgn), where lgn means 2-based logarithm of n.

Linear search has worst time complexity O(n), where n.

In both of these cases, n is the size of the input, means the size of the set of numbers you are searching.

Formally, O(g(n)) is defined as, The set of functions f(n) such that there exists a positive integer n_0 such that for n >= n_0, there exists a constant 0 <= f(n) <= c1*g(n)

All the mathematics and formal definitions aside, in summary, binary search is better than linear search in long term. By long term I mean if your input size is big enough. I am assuming you ran the program using less numbers, like 10 or maybe a 100. But when you have a large amount of data, say, 10^5 numbers, then binary search is always going to be faster.

Also, an algorithm is better than other usually means in worst case. That is, if the number you are searching for is in the beginning of array, then on the first try you will get that number. But for binary search, it will take a few number of steps.

For linear search, the worst case is if the number that you are searching for is at the end of the list.

So in worst cases and average cases, binary search performs better.

In your program, you have used recursion for binary searching and a for loop for linear search. Recursion has more overhead than a loop. And the compiler does some smart preprocessing when you are using a loop as @JosephLarson mentioned. But if you run your code on large enough inputs, I am assuming that you will find that binary search is faster.

Also, you can compare binary search to finding a specific page number in a book. Binary search will always be faster than going one page at a time.

  • thankyou sir, but i will like to tell you , I ran the code for sample sizes of 100000 , still linear search was outperforming my binary search. – Prakhar Shankar Mar 16 '22 at 10:10
0

The code in the question includes the time for std::cout to output data. Try using a non-recursive binary search:

int BinSrc(int a[], int cnt, int x)
{
int lo = 0;
int hi = cnt;
int i = 0;
    while((hi - lo) > 1){
        i = (lo + hi)/2;
        if(x <  a[i]){
            hi = i;
            continue;
        }
        if(x >  a[i]){
            lo = i;
            continue;
        }
        break;
    }
    if(x == a[i])
        return i;
    return cnt;
}

Test program

#include <chrono>
#include <iostream>
#include <iomanip>

//----------------------------------------------------------------------//
//      BinSrc                                                          //
//----------------------------------------------------------------------//
int BinSrc(int a[], int cnt, int x)
{
int lo = 0;
int hi = cnt;
int i = 0;
    while((hi - lo) > 1){
        i = (lo + hi)/2;
        if(x <  a[i]){
            hi = i;
            continue;
        }
        if(x >  a[i]){
            lo = i;
            continue;
        }
        break;
    }
    if(x == a[i])
        return i;
    return cnt;
}

//----------------------------------------------------------------------//
//      LinSrc                                                          //
//----------------------------------------------------------------------//
int LinSrc(int a[], int cnt, int x)
{
    int i;
    for(i = 0; i < cnt; i++)
        if(x == a[i])
            return i;
    return cnt;
}

#define COUNT (10000)
#define GCC 0

int main()
{
    int * a = new int [COUNT];    // allocate data array
    std::chrono::high_resolution_clock::time_point binbeg, binend, linbeg, linend;
    int i, j;
    for(int i = 0; i < COUNT; i++)
        a[i] = i;
#if GCC
    linbeg = std::chrono::_V2::system_clock::now();
    i = LinSrc(a, COUNT, COUNT-1);
    linend = std::chrono::_V2::system_clock::now();
    binbeg = std::chrono::_V2::system_clock::now();
    j = BinSrc(a, COUNT, COUNT-1);
    binend = std::chrono::_V2::system_clock::now();
#else
    linbeg = std::chrono::steady_clock::now();
    i = LinSrc(a, COUNT, COUNT-1);
    linend = std::chrono::steady_clock::now();
    binbeg = std::chrono::steady_clock::now();
    j = BinSrc(a, COUNT, COUNT-1);
    binend = std::chrono::steady_clock::now();
#endif
    std::cout << "Time taken by linear search  is : \t" << std::fixed << std::setprecision(8)
        << (std::chrono::duration_cast<std::chrono::nanoseconds>(linend - linbeg).count())/1000000000.;
    std::cout << " sec " << std::endl;
    std::cout << "Time taken by binary search  is : \t" << std::fixed << std::setprecision(8)
        << (std::chrono::duration_cast<std::chrono::nanoseconds>(binend - binbeg).count())/1000000000.;
    std::cout << " sec " << std::endl;
    delete a;
    if(i != j)
        std::cout << "error" << std::endl;
    return(0);
}
rcgldr
  • 27,407
  • 3
  • 36
  • 61