
I have a tensor of, say, a batch of grayscale images (so a BxHxW tensor).

I would like to slice the tensor along H and W into N and M (more or less) evenly sized windows, respectively, and apply tf.math.bincount to each window.

Below is what I currently do, but it is slow to build and creates quite a bloated TensorFlow graph. Is there a better way to do this?

import math
import tensorflow as tf

def makeHistBins(inPut, N, M=None):
    if M is None:
        M = N
    LEN = int(inPut.shape[0])
    ImgH = int(inPut.shape[1])
    ImgW = int(inPut.shape[2])
    HistH = math.ceil(ImgH/N)
    HistW = math.ceil(ImgW/M)
    for k in range(LEN):
        for i in range(N):
            for j in range(M):
                hist = tf.reshape(tf.math.bincount(inPut[k, math.floor(ImgH*i/N):math.ceil(ImgH*(i+1)/N), math.floor(ImgW*j/M):math.ceil(ImgW*(j+1)/M)], minlength=256), [256])
                if j == 0:
                    row = tf.expand_dims(hist, 0)
                else:
                    row = tf.concat([row, tf.expand_dims(hist, 0)], 0)
            if i == 0:
                col = tf.expand_dims(row, 0)
            else:
                col = tf.concat([col, tf.expand_dims(row, 0)], 0)
        if k == 0:
            out = tf.expand_dims(col, 0)
        else:
            out = tf.concat([out, tf.expand_dims(col, 0)], 0)
    return out

Example

# Images is 10x100x100
Out = makeHistBins(Images, 5)
# Out should be 10x5x5x256, where bincount was applied to 20x20 windows along each image.
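For reference, here is a minimal NumPy sketch of the output I am after (using the same floor/ceil window boundaries as above, so neighbouring windows may overlap by a pixel); it is only meant for checking results, not as a graph-friendly solution:

import numpy as np

def makeHistBinsRef(images, N, M=None):
    # Plain NumPy reference: one 256-bin histogram per (roughly even) window.
    if M is None:
        M = N
    B, H, W = images.shape
    out = np.zeros((B, N, M, 256), dtype=np.int64)
    for k in range(B):
        for i in range(N):
            for j in range(M):
                win = images[k,
                             int(np.floor(H * i / N)):int(np.ceil(H * (i + 1) / N)),
                             int(np.floor(W * j / M)):int(np.ceil(W * (j + 1) / M))]
                out[k, i, j] = np.bincount(win.ravel(), minlength=256)[:256]
    return out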
2 Answers


You should use tf.extract_image_patches. This op collects patches from the input image, as if applying a convolution, and it supports sliding and overlapping windows.
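For intuition, here is a minimal sketch (TF 1.x, with the shapes from the question's example) of how the op tiles a single-channel image into non-overlapping windows; the full function follows below.

import tensorflow as tf

# A 10x100x100 batch tiled into 5x5 non-overlapping 20x20 windows.
imgs = tf.placeholder(shape=(10, 100, 100), dtype=tf.int32)
patches = tf.extract_image_patches(images=tf.expand_dims(imgs, axis=-1),
                                   ksizes=[1, 20, 20, 1],
                                   strides=[1, 20, 20, 1],
                                   rates=[1, 1, 1, 1],
                                   padding='VALID')
print(patches.shape)  # (10, 5, 5, 400): each window is flattened to 20*20 values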

def imagemakeHistBins(inPut, N, M=None):
    if M is None:
        M = N
    B, H, W = inPut.get_shape().as_list()
    HistH = math.ceil(H / N)
    HistW = math.ceil(W / M)

    # Shift values by +1 so the zero padding can be discarded later via out[:, :, :, 1:],
    # and pad H/W (roughly symmetrically) up to multiples of the window size.
    inPut = tf.pad(inPut + 1,
                   [[0, 0],
                    [math.ceil((HistH*N - H) / 2), math.floor((HistH*N - H) / 2)],
                    [math.ceil((HistW*M - W) / 2), math.floor((HistW*M - W) / 2)]])

    # extract inPut to (-1,N,M,HistH*HistW)
    # the shape of inPut_new is (10,5,5,400) if inPut is (10,100,100) and M=N=5
    # the shape of inPut_new is (10,6,6,289) if inPut is (10,100,100) and M=N=6
    # ...
    inPut_new = tf.extract_image_patches(images=tf.expand_dims(inPut, axis=-1),
                               ksizes=[1, HistH, HistW, 1],
                               strides=[1, HistH, HistW, 1],
                               rates=[1, 1, 1, 1],
                               padding='VALID')

    # following codes are referenced from https://stackoverflow.com/a/50883668
    max_arr_plus = tf.reduce_max(inPut_new) + 1
    new_shape = tf.shape(inPut_new)
    group = max_arr_plus * tf.reshape(tf.range(new_shape[0] * new_shape[1]*new_shape[2]),
                                      [new_shape[0], new_shape[1],new_shape[2]])

    ids = inPut_new + group[:, :, :, None]
    out = tf.bincount(ids, minlength=max_arr_plus*new_shape[0]*new_shape[1]*new_shape[2])
    out = tf.reshape(out,[new_shape[0], new_shape[1],new_shape[2],max_arr_plus])
    return out[:,:,:,1:]

Experimental comparison

You can compare the time consumption of the three methods with the following script.

import tensorflow as tf
import math
import time
import numpy as np

def makeHistBins(inPut, N, M=None):
    if M is None:
        M = N
    LEN = int(inPut.shape[0])
    ImgH = int(inPut.shape[1])
    ImgW = int(inPut.shape[2])
    HistH = math.ceil(ImgH/N)
    HistW = math.ceil(ImgW/M)
    for k in range(LEN):
        for i in range(N):
            for j in range(M):
                hist = tf.reshape(tf.math.bincount(inPut[k, math.floor(ImgH*i/N):math.ceil(ImgH*(i+1)/N), math.floor(ImgW*j/M):math.ceil(ImgW*(j+1)/M)], minlength=256), [256])
                if j == 0:
                    row = tf.expand_dims(hist, 0)
                else:
                    row = tf.concat([row, tf.expand_dims(hist, 0)], 0)
            if i == 0:
                col = tf.expand_dims(row, 0)
            else:
                col = tf.concat([col, tf.expand_dims(row, 0)], 0)
        if k == 0:
            out = tf.expand_dims(col, 0)
        else:
            out = tf.concat([out, tf.expand_dims(col, 0)], 0)
    return out

def makeHistBins2(inPut, N, M=None):
    if M is None:
        M = N
    LEN = int(inPut.shape[0])
    ImgH = int(inPut.shape[1])
    ImgW = int(inPut.shape[2])
    HistH = math.ceil(ImgH/N)
    HistW = math.ceil(ImgW/M)

    splits_h = [int(np.floor((ImgH*(i+1))/N)- np.floor((ImgH*i)/N)) for i in range(N)]
    splits_w = [int(np.floor((ImgW*(i+1))/M)- np.floor((ImgW*i)/M)) for i in range(M)]

    # splits_h = np.array([ImgH // N for _ in range(N)])
    # if ImgH - sum(splits_h) > 0:
    #     splits_h[:(ImgH - sum(splits_h))] += 1
    # 
    # splits_w = np.array([ImgW // M for _ in range(M)])
    # if ImgW - sum(splits_w) > 0:
    #     splits_w[:ImgW - sum(splits_w)] += 1

    # print(splits_w)
    out = tf.map_fn(
        lambda x: tf.stack(
            [tf.math.bincount(tf.reshape(z, [-1]), minlength=256, maxlength=256) for y in tf.split(x, splits_h, axis=0) for z in tf.split(y, splits_w, axis=1) ]
            ), inPut)
    return tf.reshape(out, [-1, len(splits_h), len(splits_w), 256])

def imagemakeHistBins(inPut, N, M=None):
    if M is None:
        M = N
    B, H, W = inPut.get_shape().as_list()
    HistH = math.ceil(H / N)
    HistW = math.ceil(W / M)

    inPut = tf.pad(inPut + 1, [[0, 0], [0, HistH*N - H], [0, HistW*M - W]])

    # extract inPut to (-1,N,M,HistH*HistW)
    # the shape of inPut_new is (10,5,5,400) if inPut is (10,100,100) and M=N=5
    # the shape of inPut_new is (10,6,6,289) if inPut is (10,100,100) and M=N=6
    # ...
    inPut_new = tf.extract_image_patches(images=tf.expand_dims(inPut, axis=-1),
                               ksizes=[1, HistH, HistW, 1],
                               strides=[1, HistH, HistW, 1],
                               rates=[1, 1, 1, 1],
                               padding='VALID')

    # following codes are referenced from https://stackoverflow.com/a/50883668
    max_arr_plus = tf.reduce_max(inPut_new) + 1
    new_shape = tf.shape(inPut_new)
    group = max_arr_plus * tf.reshape(tf.range(new_shape[0] * new_shape[1]*new_shape[2]),
                                      [new_shape[0], new_shape[1],new_shape[2]])

    ids = inPut_new + group[:, :, :, None]
    out = tf.bincount(ids, minlength=max_arr_plus*new_shape[0]*new_shape[1]*new_shape[2])
    out = tf.reshape(out,[new_shape[0], new_shape[1],new_shape[2],max_arr_plus])
    return out[:,:,:,1:]

Images = tf.placeholder(shape=(10, 100, 100), dtype=tf.int32)
out1 = makeHistBins(Images, 5)
out2 = makeHistBins2(Images, 5)
out3 = imagemakeHistBins(Images, 5)

with tf.Session() as sess:
    example = np.random.randint(0, 256, size=(10, 100, 100))
    value1,value2,value3 = sess.run([out1,out2,out3], feed_dict={Images: example})
    print(np.all(value1 == value2))
    print(np.all(value1 == value3))

    time1 = time.clock()
    for _ in range(10):
        sess.run(out1,feed_dict={Images:example})
    time2 = time.clock()
    print('makeHistBins cost time(ms) : %.2f' % ((time2 - time1)*1000))

    time1 = time.clock()
    for _ in range(10):
        sess.run(out2, feed_dict={Images: example})
    time2 = time.clock()
    print('makeHistBins2 cost time(ms) : %.2f' % ((time2 - time1) * 1000))

    time1 = time.clock()
    for _ in range(10):
        sess.run(out3, feed_dict={Images: example})
    time2 = time.clock()
    print('imagemakeHistBins cost time(ms) : %.2f' % ((time2 - time1) * 1000))

The result:

# True
# True
# makeHistBins cost time(ms) : 1187.79
# makeHistBins2 cost time(ms) : 568.32
# imagemakeHistBins cost time(ms) : 74.57

The running time of imagemakeHistBins is reduced by an order of magnitude compared with makeHistBins and makeHistBins2.

Notes

Please also pay attention to your extraction strategy.

According to your logic, the blocking strategy is as follows:

ImgH = 100
N=6
start_h = [math.floor(ImgH*i/N) for i in range(N)] # [0, 16, 33, 50, 66, 83]
end_h = [math.ceil(ImgH*(i+1)/N) for i in range(N)] # [17, 34, 50, 67, 84, 100]
# [0:17, 16:34, 33:50, 50:67, 66:84, 83:100]

Such a split is supported by neither tf.split nor tf.extract_image_patches.

splits_h = np.array([ImgH // N for _ in range(N)])
if ImgH - sum(splits_h) > 0:
    splits_h[:(ImgH - sum(splits_h))] += 1
# [17 17 17 17 16 16]

So my strategy is to pad the last block with 0. Because the values are shifted by +1 and I only keep bins 1 to MAX(256) + 1 via out[:,:,:,1:], the padding does not affect the correctness of the final result.

# tf.extract_image_patches
# [17 17 17 17 17 15]
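To make the 0-padding/+1-shift trick concrete, here is a small NumPy sketch (made-up toy values, just to illustrate the idea) of how adding a distinct offset per window lets a single bincount produce per-window histograms, with bin 0 collecting only the padding:

import numpy as np

# Two shifted (value+1) windows; the second one is padded with zeros.
windows = np.array([[3, 3, 5],
                    [7, 0, 0]])              # shape (num_windows, window_size)
max_plus = windows.max() + 1                 # bins per window, bin 0 reserved for padding
offsets = max_plus * np.arange(windows.shape[0])[:, None]
ids = windows + offsets                      # each window occupies its own id range
flat = np.bincount(ids.ravel(), minlength=max_plus * windows.shape[0])
hists = flat.reshape(windows.shape[0], max_plus)
print(hists[:, 1:])                          # drop bin 0 (padding): per-window histograms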

Even solution

def imagemakeHistBins(inPut, N, M=None):
    if M is None:
        M = N
    B, H, W = inPut.get_shape().as_list()

    splits_h = np.array([H // N for _ in range(N)])
    if H - sum(splits_h) > 0:
        splits_h[:(H - sum(splits_h))] += 1
    splits_h = [0] + splits_h.tolist()
    splits_h = np.cumsum(splits_h)

    splits_w = np.array([W // M for _ in range(M)])
    if W - sum(splits_w) > 0:
        splits_w[:(W - sum(splits_w))] += 1
    splits_w = [0] + splits_w.tolist()
    splits_w = np.cumsum(splits_w)
    # splits_w=[  0  17  34  51  68  84 100] if inPut is (10,100,100) and M=N=6
    print(splits_w)

    inPut_new = []
    for i in range(N):
        for j in range(M):
            hist = inPut[:, splits_h[i]:splits_h[i + 1], splits_w[j]:splits_w[j + 1]]
            hist = tf.pad(hist+1, [[0, 0],
                                 [0, splits_h[1] - splits_h[i + 1] + splits_h[i]],
                                 [0, splits_w[1] - splits_w[j + 1] + splits_w[j]]])
            inPut_new.append(hist)

    inPut_new = tf.reshape(tf.transpose(tf.stack(inPut_new), [1, 0, 2, 3]), [-1, N, M, splits_h[1] * splits_w[1]])

    # following codes are referenced from https://stackoverflow.com/a/50883668
    max_arr_plus = tf.reduce_max(inPut_new) + 1
    new_shape = tf.shape(inPut_new)
    group = max_arr_plus * tf.reshape(tf.range(new_shape[0] * new_shape[1]*new_shape[2]),
                                      [new_shape[0], new_shape[1],new_shape[2]])

    ids = inPut_new + group[:, :, :, None]
    out = tf.bincount(ids, minlength=max_arr_plus*new_shape[0]*new_shape[1]*new_shape[2])
    out = tf.reshape(out,[new_shape[0], new_shape[1],new_shape[2],max_arr_plus])
    return out[:,:,:,1:]
  • Thanks for your reply. Your recommendation does not produce the expected output, starting with the shape requirement - trying several examples will demonstrate this (specifically, try ones where N/M don't evenly divide the shape, unlike in your example). I tried to fix the shape problem but could not. Furthermore, note that it is important that all output is of shape `[LEN, N, M, 256]`, which your proposed function does not ensure. I would definitely like to make use of this function due to the time cost, but I can't make it work. – AOK Jan 20 '20 at 15:36
  • @AOK I've modified my function to support output `[LEN, N, M, 256]` when N/M don't evenly divide the shape. Please also see what I added. – giser_yugang Jan 21 '20 at 03:09
  • In the notes you comment "Such a block cannot be supported...". Agreed, such overlapping is not supported by either method - which is not so bad. The only downside to using `tf.extract_image_patches` is that the split can be very uneven, with the bias falling on the last rows/columns of patches - this may not be too problematic for my purpose, but it could be very problematic in general when the division remainder is small (if N/M are large enough, this can be a real problem). There is, however, a significant speed boost. I will take a closer look at the results and circle back. – AOK Jan 21 '20 at 14:10
  • I offered an edit that corrects a typo and also helps with the uneven splitting by padding on both sides, instead of just one. – AOK Jan 21 '20 at 14:32
  • @AOK Please see even solution. You can test the run time. – giser_yugang Jan 21 '20 at 15:45

The following would work, but make sure that you specify the minlength argument as well, because if the returned bincount vectors have different lengths, you can't stack them. So basically we do two splits (one along the height and another along the width) and then use map_fn to iterate over the batch, which gives you a 10x5x5x256 tensor, one 256-bin histogram per 20x20 window (as @giser_yugang pointed out, the output has a 256 dimension at the end). It requires a bit of extra work to get this to work with uneven splits.

def makeHistBins(inPut, N, M=None):
    if M is None:
        M = N
    LEN = int(inPut.shape[0])
    ImgH = int(inPut.shape[1])
    ImgW = int(inPut.shape[2])
    #HistH = math.ceil(ImgH/N)
    #HistW = math.ceil(ImgW/M)

    splits_h = np.array([ImgH//N for _ in range(N)])
    if  ImgH - sum(splits_h) > 0:
      splits_h[:(ImgH - sum(splits_h))] += 1

    splits_w = np.array([ImgW//M for _ in range(M)])
    if ImgW - sum(splits_w) > 0:
      splits_w[:ImgW - sum(splits_w)] += 1

    print(splits_w)
    out = tf.map_fn(
        lambda x: tf.stack(
            [tf.math.bincount(tf.reshape(z, [-1]), minlength=256, maxlength=256) for y in tf.split(x, splits_h, axis=0) for z in tf.split(y, splits_w, axis=1) ]
            ), inPut)
    return tf.reshape(out, [LEN, N, M, 256])
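For completeness, here is a minimal usage sketch (TF 1.x style, mirroring the example from the question) that only checks the resulting shape:

import numpy as np
import tensorflow as tf

Images = tf.placeholder(shape=(10, 100, 100), dtype=tf.int32)
Out = makeHistBins(Images, 5)

with tf.Session() as sess:
    example = np.random.randint(0, 256, size=(10, 100, 100))
    print(sess.run(Out, feed_dict={Images: example}).shape)  # (10, 5, 5, 256)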

  • Thank you, I think the use of `map_fn`, `split`, and `stack` is the way to go - it speeds up the code significantly. But I edited your post to do what is actually requested (so that the output is BxNxMx256). One difference is that in my original question the windows can overlap, but this is a minor issue that can be disregarded. I will just check that the output is indeed what I want and then accept. – AOK Jan 13 '20 at 08:40