Conversion of pseudo-code to C code
The question asks about a code fragment which is not a working implementation of a binary search function because it does not handle the case where the sought-for value is absent from the array.
The code can be converted to a working C code function like this:
int binary_search(int N, const int A[N], int val)
{
int i, step;
for (step = 1; step < N; step <<= 1)
;
for (i = 0; step; step >>= 1)
if (i + step < N && A[i + step] <= val)
i += step;
if (A[i] != val)
i = -1;
return i;
}
This has been tested in a rigorous test harness and it produces the equivalent answer to other variations of binary search tested in the same harness.
Comparison with alternative binary search algorithms
I happen to have 4 other implementations of binary search lurking in my folder of material related to SO questions. Given a program that can generate random numbers in a given range, and some supporting scripts, plus timing code that can report elapsed times to microseconds (it uses gettimeofday()
on Mac OS X, much to the disgust of some, but that's good enough in this context), I generated this timing program. It includes algorithms:
BinSearch_A
— find an arbitrary index P in array X[0..N-1] that matches T.
BinSearch_B
— find the smallest index P in array X[0..N-1] that matches T.
BinSearch_C
— find the largest index P in array X[0..N-1] that matches T.
BinSearch_D
— find both smallest (L) and largest (U) indexes in array X[0..N-1] that match T.
BinSearch_E
— find an arbitrary index P in array X[0..N-1] that matches T (using the algorithm in the question as amended above).
Note that algorithms B, C and D are solving strictly harder problems than A and E solve so it is to be expected that B, C, and D will be slower than A and E.
Outline code (binsearch-speed-1.c
)
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
typedef struct Pair
{
int lo;
int hi;
} Pair;
extern Pair BinSearch_D(int N, const int X[N], int T);
extern int BinSearch_A(int N, const int X[N], int T);
extern int BinSearch_B(int N, const int X[N], int T);
extern int BinSearch_C(int N, const int X[N], int T);
extern int BinSearch_E(int N, const int X[N], int T);
#ifndef lint
extern const char jlss_id_modbinsearch_c[];
const char jlss_id_modbinsearch_c[] = "@(#)$Id$";
#endif
int BinSearch_A(int N, const int X[N], int T)
{
int L = 0;
int U = N-1;
while (1)
{
if (L > U)
return -1;
int M = (L + U) / 2;
if (X[M] < T)
L = M + 1;
else if (X[M] > T)
U = M - 1;
else
return M;
}
assert(0);
}
int BinSearch_B(int N, const int X[N], int T)
{
int L = -1;
int U = N;
while (L + 1 != U)
{
int M = (L + U) / 2;
if (X[M] < T)
L = M;
else
U = M;
}
assert(L+1 == U && (L == -1 || X[L] < T) && (U >= N || X[U] >= T));
int P = U;
if (P >= N || X[P] != T)
P = -1;
return P;
}
int BinSearch_C(int N, const int X[N], int T)
{
int L = -1;
int U = N;
while (L + 1 != U)
{
int M = (L + U) / 2;
if (X[M] <= T)
L = M;
else
U = M;
}
assert(L+1 == U && (L == -1 || X[L] <= T) && (U >= N || X[U] > T));
int P = L;
if (P < 0 || X[P] != T)
P = -1;
return P;
}
Pair BinSearch_D(int N, const int X[N], int T)
{
int L_lo = -1;
int L_hi = N;
int U_lo = -1;
int U_hi = N;
while (L_lo + 1 != L_hi || U_lo + 1 != U_hi)
{
if (L_lo + 1 != L_hi)
{
int L_md = (L_lo + L_hi) / 2;
if (X[L_md] < T)
L_lo = L_md;
else
L_hi = L_md;
}
if (U_lo + 1 != U_hi)
{
int U_md = (U_lo + U_hi) / 2;
if (X[U_md] <= T)
U_lo = U_md;
else
U_hi = U_md;
}
}
assert(L_lo+1 == L_hi && (L_lo == -1 || X[L_lo] < T) && (L_hi >= N || X[L_hi] >= T));
int L = L_hi;
if (L >= N || X[L] != T)
L = -1;
assert(U_lo+1 == U_hi && (U_lo == -1 || X[U_lo] <= T) && (U_hi >= N || X[U_hi] > T));
int U = U_lo;
if (U < 0 || X[U] != T)
U = -1;
return (Pair) { .lo = L, .hi = U };
}
int BinSearch_E(int N, const int X[N], int T)
{
int i, step;
for (step = 1; step < N; step <<= 1)
;
for (i = 0; step; step >>= 1)
if (i + step < N && X[i + step] <= T)
i += step;
if (X[i] != T)
i = -1;
return i;
}
#include "timer.h"
static const int numbers[] =
{
10000, 10002, 10003, 10003, 10003, 10004, 10006, 10010, 10011, 10015,
10016, 10020, 10023, 10024, 10029, 10029, 10030, 10031, 10032, 10035,
10036, 10036, 10037, 10037, 10038, 10041, 10043, 10044, 10046, 10049,
10066, 10066, 10069, 10070, 10071, 10074, 10079, 10080, 10085, 10086,
10087, 10089, 10090, 10090, 10090, 10091, 10092, 10094, 10095, 10095,
…990 similar lines omitted…
29869, 29870, 29872, 29872, 29874, 29877, 29877, 29882, 29884, 29888,
29895, 29898, 29899, 29908, 29912, 29922, 29923, 29924, 29925, 29929,
29934, 29936, 29938, 29939, 29941, 29942, 29943, 29943, 29944, 29945,
29947, 29949, 29951, 29953, 29956, 29958, 29959, 29959, 29964, 29965,
29965, 29966, 29968, 29969, 29981, 29983, 29983, 29984, 29984, 29988,
};
enum { NUM_NUMBERS = sizeof(numbers) / sizeof(numbers[0]) };
static void check_sorted(const char *a_name, int size, const int array[size])
{
int ok = 1;
for (int i = 1; i < size; i++)
{
if (array[i-1] > array[i])
{
fprintf(stderr, "Out of order: %s[%d] = %d, %s[%d] = %d\n",
a_name, i-1, array[i-1], a_name, i, array[i]);
ok = 0;
}
}
if (!ok)
exit(1);
}
static int BinSearch_D1(int size, const int array[size], int value)
{
Pair p = BinSearch_D(size, array, value);
return p.lo;
}
typedef int (*BinSearch)(int size, const int data[size], int value);
static void time_search(const char *a_name, int size, const int array[size],
BinSearch function)
{
Clock clk;
clk_init(&clk);
int x0 = array[0] - 1;
int x1 = array[size-1] + 2;
long long vsum = 0;
clk_start(&clk);
for (int i = x0; i < x1; i++)
{
int index = (*function)(size, array, i);
vsum += (index == -1) ? index : array[index];
}
clk_stop(&clk);
char buffer[32];
printf("%s: (%d) %lld %s\n", a_name, size, vsum,
clk_elapsed_us(&clk, buffer, sizeof(buffer)));
}
int main(void)
{
check_sorted("numbers", NUM_NUMBERS, numbers);
for (int i = 0; i < 10; i++)
{
time_search("BinSearch_A", NUM_NUMBERS, numbers, BinSearch_A);
time_search("BinSearch_B", NUM_NUMBERS, numbers, BinSearch_B);
time_search("BinSearch_C", NUM_NUMBERS, numbers, BinSearch_C);
time_search("BinSearch_D", NUM_NUMBERS, numbers, BinSearch_D1);
time_search("BinSearch_E", NUM_NUMBERS, numbers, BinSearch_E);
}
return 0;
}
This code works with an array of 10,000 random numbers, with repeats, in the range 10,000 to 29,999. This means that approximately half the possible values in the range are present in the array. For each function, it computes the index of each value in the range from the smallest number in the array - 1 to the largest number in the array + 1. Because the algorithms legitimately return different indexes when there are multiple matches possible, the test code sums the found values (and subtracts 1 for each search failure). The output identifies the time taken in microseconds, and prints the array size and the computed value. One reason for the computation is to ensure the optimizer doesn't optimize too much.
I also generated a second program (binsearch-speed-2.c
) with the same code but with 1,000,000 numbers in the range 1,000,000 to 3,000,000. Since the times for binsearch-speed-1.c
were in the range 0.7 - 1.4 milliseconds, the amount of data was a bit smaller than I regard as comfortable, so I increased the problem size by 100 to generate correspondingly bigger times. However, the bigger scale problem changed the relative timings of the algorithms (which is the reason you get to see this).
The tests were run an oldish MacBook Pro (Early 2011) with 2.3 GHz Intel Core i7 CPU and 16 GiB of 1333 MHz DDR3 memory, running Mac OS X 10.11.4, and using GCC 5.3.0. Your mileage will vary!
Sample compilation command line:
$ gcc -O3 -g -I$HOME/inc -std=c11 -Wall -Wextra -Wmissing-prototypes -Wstrict-prototypes \
> -Wold-style-definition -Werror binsearch-speed-2.c -o binsearch-speed-2 \
> -L$HOME/lib/64 -ljl
$
The timing functions are in the library referenced.
Raw Results binsearch-speed-1
(Size 10,000)
BinSearch_A: (10000) 158341368 0.000817
BinSearch_B: (10000) 158341368 0.001076
BinSearch_C: (10000) 158341368 0.001006
BinSearch_D: (10000) 158341368 0.001337
BinSearch_E: (10000) 158341368 0.000787
BinSearch_A: (10000) 158341368 0.000771
BinSearch_B: (10000) 158341368 0.001540
BinSearch_C: (10000) 158341368 0.001003
BinSearch_D: (10000) 158341368 0.001344
BinSearch_E: (10000) 158341368 0.000791
BinSearch_A: (10000) 158341368 0.000799
BinSearch_B: (10000) 158341368 0.001078
BinSearch_C: (10000) 158341368 0.001008
BinSearch_D: (10000) 158341368 0.001386
BinSearch_E: (10000) 158341368 0.000802
BinSearch_A: (10000) 158341368 0.000774
BinSearch_B: (10000) 158341368 0.001083
BinSearch_C: (10000) 158341368 0.001176
BinSearch_D: (10000) 158341368 0.001495
BinSearch_E: (10000) 158341368 0.000907
BinSearch_A: (10000) 158341368 0.000817
BinSearch_B: (10000) 158341368 0.001080
BinSearch_C: (10000) 158341368 0.001007
BinSearch_D: (10000) 158341368 0.001357
BinSearch_E: (10000) 158341368 0.000786
BinSearch_A: (10000) 158341368 0.000756
BinSearch_B: (10000) 158341368 0.001080
BinSearch_C: (10000) 158341368 0.001899
BinSearch_D: (10000) 158341368 0.001644
BinSearch_E: (10000) 158341368 0.000791
BinSearch_A: (10000) 158341368 0.000770
BinSearch_B: (10000) 158341368 0.001087
BinSearch_C: (10000) 158341368 0.001014
BinSearch_D: (10000) 158341368 0.001378
BinSearch_E: (10000) 158341368 0.000793
BinSearch_A: (10000) 158341368 0.001415
BinSearch_B: (10000) 158341368 0.001160
BinSearch_C: (10000) 158341368 0.001006
BinSearch_D: (10000) 158341368 0.001336
BinSearch_E: (10000) 158341368 0.000786
BinSearch_A: (10000) 158341368 0.000763
BinSearch_B: (10000) 158341368 0.001079
BinSearch_C: (10000) 158341368 0.001012
BinSearch_D: (10000) 158341368 0.001309
BinSearch_E: (10000) 158341368 0.000796
BinSearch_A: (10000) 158341368 0.000769
BinSearch_B: (10000) 158341368 0.001094
BinSearch_C: (10000) 158341368 0.001029
BinSearch_D: (10000) 158341368 0.001397
BinSearch_E: (10000) 158341368 0.000800
Raw Results binsearch-speed-2
(Size 1,000,000)
BinSearch_A: (1000000) 1573140220897 0.081161
BinSearch_B: (1000000) 1573140220897 0.137057
BinSearch_C: (1000000) 1573140220897 0.132743
BinSearch_D: (1000000) 1573140220897 0.166290
BinSearch_E: (1000000) 1573140220897 0.189696
BinSearch_A: (1000000) 1573140220897 0.083374
BinSearch_B: (1000000) 1573140220897 0.136225
BinSearch_C: (1000000) 1573140220897 0.128654
BinSearch_D: (1000000) 1573140220897 0.168078
BinSearch_E: (1000000) 1573140220897 0.190977
BinSearch_A: (1000000) 1573140220897 0.083391
BinSearch_B: (1000000) 1573140220897 0.135630
BinSearch_C: (1000000) 1573140220897 0.131179
BinSearch_D: (1000000) 1573140220897 0.168578
BinSearch_E: (1000000) 1573140220897 0.188785
BinSearch_A: (1000000) 1573140220897 0.083069
BinSearch_B: (1000000) 1573140220897 0.135803
BinSearch_C: (1000000) 1573140220897 0.136248
BinSearch_D: (1000000) 1573140220897 0.170167
BinSearch_E: (1000000) 1573140220897 0.188973
BinSearch_A: (1000000) 1573140220897 0.084509
BinSearch_B: (1000000) 1573140220897 0.145219
BinSearch_C: (1000000) 1573140220897 0.129374
BinSearch_D: (1000000) 1573140220897 0.168213
BinSearch_E: (1000000) 1573140220897 0.186770
BinSearch_A: (1000000) 1573140220897 0.086911
BinSearch_B: (1000000) 1573140220897 0.141995
BinSearch_C: (1000000) 1573140220897 0.134353
BinSearch_D: (1000000) 1573140220897 0.169639
BinSearch_E: (1000000) 1573140220897 0.194442
BinSearch_A: (1000000) 1573140220897 0.082882
BinSearch_B: (1000000) 1573140220897 0.135095
BinSearch_C: (1000000) 1573140220897 0.129635
BinSearch_D: (1000000) 1573140220897 0.166059
BinSearch_E: (1000000) 1573140220897 0.186700
BinSearch_A: (1000000) 1573140220897 0.083190
BinSearch_B: (1000000) 1573140220897 0.134491
BinSearch_C: (1000000) 1573140220897 0.130103
BinSearch_D: (1000000) 1573140220897 0.169454
BinSearch_E: (1000000) 1573140220897 0.188583
BinSearch_A: (1000000) 1573140220897 0.083038
BinSearch_B: (1000000) 1573140220897 0.135738
BinSearch_C: (1000000) 1573140220897 0.129727
BinSearch_D: (1000000) 1573140220897 0.169101
BinSearch_E: (1000000) 1573140220897 0.188749
BinSearch_A: (1000000) 1573140220897 0.082099
BinSearch_B: (1000000) 1573140220897 0.135025
BinSearch_C: (1000000) 1573140220897 0.130743
BinSearch_D: (1000000) 1573140220897 0.168684
BinSearch_E: (1000000) 1573140220897 0.188640
Statistics on raw data
Program Algorithm Tests Avg Time Std Dev
binsearch-speed-1 BinSearch_A 10 0.0008451 0.0002014
BinSearch_B 10 0.0011357 0.0001442
BinSearch_C 10 0.0011160 0.0002801
BinSearch_D 10 0.0013983 0.0001003
BinSearch_E 10 0.0008039 0.0000366
binsearch-speed-2 BinSearch_A 10 0.0833624 0.0015203
BinSearch_B 10 0.1372278 0.0035168
BinSearch_C 10 0.1312759 0.0024403
BinSearch_D 10 0.1684263 0.0013514
BinSearch_E 10 0.1892315 0.0022148
Provisional conclusions
When the problem size was ten thousand, then the 'shift' algorithm (BinSearch_E
) seemed to perform a little better than the simple 'midpoint' algorithm (BinSearch_A
), but the difference was not obviously significant — I've not run a T-Test or similar on the data.
When the problem size was one million, the 'midpoint' algorithm was better than than the 'shift' algorithm by a significant margin, and indeed it performed worse than the three more complex algorithms.
This was unexpected (especially being worse than the more complex algorithms); I expected the two algorithms to essentially the same. I don't have a good explanation for why. This sort of result shows why benchmarking is hard. If I had to guess, I'd suspect that despite only need about 4 MiB of memory for the bigger array, the access pattern on the elements for the 'shift' algorithm means that the caches are less effective. Proving (or disproving) that would be hard — it would require a better performance testing engineer than I am.
Given the consistent performance of Algorithm A (simple midpoint), which scaled essentially linearly for the tests performed, I would use that rather than Algorithm E — unless I could demonstrate that Algorithm E (shift) gave better performance on the size of data sets that the code was working with.
And this shows why benchmarking is important if the utmost in performance is important.