3

I'm testing HotSpot JIT array bound checks elimination capabilities. Here are two versions of the same heapsort implementation, one use ordinary array indexing, another sun.misc.Unsafe API, free of bound checks:

public class HeapSort {
    // copied from http://en.wikibooks.org/wiki/Algorithm_Implementation/Sorting/Heapsort#C
    static int heapSortSimple(int[] arr) {
        int t;
        int n = arr.length, parent = n / 2, index, childI;
        while (true) {
            if (parent > 0) {
                t = arr[--parent]; // 1, i. e. first indexing
            } else {
                if (--n == 0) break;
                t = arr[n]; // 2
                arr[n] = arr[0]; // 3, 4
            }
            index = parent;
            childI = (index << 1) + 1;
            while (childI < n) {
                int childV = arr[childI]; // 5
                int right;
                if (childI + 1 < n && (right = arr[childI + 1]) > childV) { // 6
                    childI++;
                    childV = right;
                }
                if (childV > t) {
                    arr[index] = childV; // 7
                    index = childI;
                    childI = (index << 1) + 1;
                } else {
                    break;
                }
            }
            arr[index] = t; // 8
        }
        return arr[arr.length - 1];
    }

    static int heapSortUnsafe(int[] arr) {
        int t;
        long n = arr.length * INT_SCALE, parent = (arr.length / 2) * INT_SCALE, index, childI;
        while (true) {
            if (parent > 0) {
                t = U.getInt(arr, INT_BASE + (parent -= INT_SCALE));
            } else {
                if ((n -= INT_SCALE) == 0) break;
                t = U.getInt(arr, INT_BASE + n);
                U.putInt(arr, INT_BASE + n, U.getInt(arr, INT_BASE));
            }
            index = parent;
            childI = (index << 1) + INT_SCALE;
            while (childI < n) {
                int childV = U.getInt(arr, INT_BASE + childI);
                int right;
                if (childI + INT_SCALE < n &&
                        (right = U.getInt(arr, INT_BASE + (childI + INT_SCALE))) > childV) {
                    childI += INT_SCALE;
                    childV = right;
                }
                if (childV > t) {
                    U.putInt(arr, INT_BASE + index, childV);
                    index = childI;
                    childI = (index << 1) + INT_SCALE;
                } else {
                    break;
                }
            }
            U.putInt(arr, INT_BASE + index, t);
        }
        return arr[arr.length - 1];
    }

    @OutputTimeUnit(TimeUnit.MICROSECONDS)
    @BenchmarkMode(Mode.AverageTime)
    @Warmup(iterations = 5, time = 1)
    @Measurement(iterations = 10, time = 1)
    @State(Scope.Thread)
    @Threads(1)
    @Fork(1)
    public static class Benchmarks {
        static final int N = 1024;
        int[] a = new int[N];

        @Setup(Level.Invocation)
        public void fill() {
            Random r = ThreadLocalRandom.current();
            for (int i = 0; i < N; i++) {
                a[i] = r.nextInt();
            }
        }

        @GenerateMicroBenchmark
        public static int simple(Benchmarks st) {
            int[] arr = st.a;
            return heapSortSimple(arr);
        }

        @GenerateMicroBenchmark
        public static int unsafe(Benchmarks st) {
            int[] arr = st.a;
            return heapSortUnsafe(arr);
        }
    }

    public static void main(String[] args) {
        Benchmarks bs = new Benchmarks();

        // verify simple sort
        bs.fill();
        int[] a1 = bs.a;
        int[] a2 = a1.clone();
        Arrays.sort(a2);
        heapSortSimple(a1);
        if (!Arrays.equals(a2, a1))
            throw new AssertionError();

        // let JIT to generate optimized assembly
        for (int i = 0; i < 10000; i++) {
            bs.fill();
            heapSortSimple(bs.a);
        }

        // verify unsafe sort
        bs.fill();
        a1 = bs.a;
        a2 = a1.clone();
        Arrays.sort(a2);
        heapSortUnsafe(a1);
        if (!Arrays.equals(a2, a1))
            throw new AssertionError();

        for (int i = 0; i < 10000; i++) {
            bs.fill();
            heapSortUnsafe(bs.a);
        }
    }

    static final Unsafe U;
    static final long INT_BASE;
    static final long INT_SCALE = 4;

    static {
        try {
            Field f = Unsafe.class.getDeclaredField("theUnsafe");
            f.setAccessible(true);
            U = (Unsafe) f.get(null);
        } catch (Exception e) {
            throw new IllegalStateException(e);
        }
        INT_BASE = U.arrayBaseOffset(int[].class);
    }
}

Unsafe version is consistently 13% faster both on Intel SB and AMD K10 CPUs.

I've looked into the generated assembly:

  • lower bound check is eliminated for all indexing operations (1-8)
  • upper bound check is eliminated only for operation 5, checks for 2 and 3 are merged
  • yes, for operation 4 (arr[0]) on each iteration it is checked that arr.length != 0

Obviously all bound check branches are predicted perfecty, that's why heap sort with simple indexing is slower than unsafe only for 13%.

I thaught it definitely JIT's work to optimize bound checks at least for operations 1, 2 and 3, where the index is decremented steadily from some value below array's length to zero.

The question is in the title: why HotSpot JIT does so poor job of bound checks elimination in this case?

leventov
  • 14,760
  • 11
  • 69
  • 98
  • Did you warm up the JVM before running your performance tests? – Emil L Apr 29 '14 at 07:38
  • You found that hotspot eliminated a dozen of checks but did not eliminated one, and how is that "poor job"? – Oleg Estekhin Apr 29 '14 at 07:44
  • @OlegEstekhin it doesn't eliminate 7, 3 of them I thaught it surely should, because data pattern is very simple, like in a loop with decremented counter. Checking the array length is non zero on each iteration (instead of once before the loop) is really silly. – leventov Apr 29 '14 at 07:47
  • Once you get multithreading involved crazy things can happen. For example x==x can be false (even if we ignore NaN!=NaN) because [x can change between the 2 accesses](http://stackoverflow.com/a/20375252/2187042) – Richard Tingle Apr 29 '14 at 07:50
  • @OlegEstekhin right before operations 1, 2 and 3 the index is checked to be positive anyway I counted zero bound checks as eliminated, though it would be nonsense if JIT generate two equal checks one after another. – leventov Apr 29 '14 at 07:50
  • @RichardTingle it is not relevant to my case. 1) it is single threaded 2) array's lengths are immutable in java and should be accessible from any number of threads without "effects" – leventov Apr 29 '14 at 07:57
  • Array lengths are immutable but the reference `arr` isn't (it would be interesting to see what would happen if you declared arr as final) – Richard Tingle Apr 29 '14 at 07:59
  • 1
    @RichardTingle `arr` is local in the sort methods. Adding `final` to local doesn't affect runtime performance at all at least because it isn't reflected in bytecode. In assembly of sorting methods both `arr` address and `arr.lendth` are kept in registers and aren't updated during the sort loop. – leventov Apr 29 '14 at 08:06
  • @leventov Which Java version were you using? Which parameters you used for starting the JVM? – Gábor Bakos Jun 06 '14 at 17:38
  • @GáborBakos the latest Java 8 build at that moment. Parameters - don't remember – leventov Jun 06 '14 at 18:18

1 Answers1

1

I don't think all latices are restricted.

Passing zero length array would result in IOOB. Try having if (n==0) return prior to the loop.

bestsss
  • 11,796
  • 3
  • 53
  • 63