I created a simple passthrough compute kernel:
kernel void filter(texture2d<float, access::read> inTexture [[texture(0)]],
                   texture2d<float, access::write> outTexture [[texture(1)]],
                   uint2 gridPos [[thread_position_in_grid]]) {
    float4 color = inTexture.read(gridPos);
    outTexture.write(color, gridPos);
}
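For context, the kernel is compiled into a compute pipeline state in the usual way (this setup isn't shown above, so `device` here stands for whatever id<MTLDevice> is in use):

NSError *error = nil;
id<MTLLibrary> library = [device newDefaultLibrary];
id<MTLFunction> kernelFunction = [library newFunctionWithName:@"filter"];
id<MTLComputePipelineState> pipelineState =
    [device newComputePipelineStateWithFunction:kernelFunction error:&error];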
I measure the execution time like this:
[self.timer start];
[commandBuffer commit];
[commandBuffer waitUntilCompleted];
CGFloat ms = [self.timer elapse];
The Timer class works like this:
- (void)start {
    self.startMach = mach_absolute_time();
}

- (CGFloat)elapse {
    uint64_t end = mach_absolute_time();
    uint64_t elapsed = end - self.startMach;
    // self.info is a mach_timebase_info_data_t, filled once via mach_timebase_info().
    uint64_t nanosecs = elapsed * self.info.numer / self.info.denom;
    // Divide in floating point so sub-millisecond precision is not truncated.
    return (CGFloat)nanosecs / 1000000.0;
}
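Note that this measures CPU-side wall time from commit to waitUntilCompleted, which includes command-buffer scheduling and synchronization overhead on top of the kernel itself. If the deployment target allows, the command buffer can also report GPU timestamps directly (MTLCommandBuffer's GPUStartTime/GPUEndTime, available since iOS 10.3); a sketch for comparison:

[commandBuffer commit];
[commandBuffer waitUntilCompleted];
// GPU timestamps are in seconds and valid once the buffer has completed.
CFTimeInterval gpuSeconds = commandBuffer.GPUEndTime - commandBuffer.GPUStartTime;
NSLog(@"GPU execution: %.3f ms", gpuSeconds * 1000.0);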
Dispatch call:
static const NSUInteger kGroupSize = 16;

- (MTLSize)threadGroupSize {
    return MTLSizeMake(kGroupSize, kGroupSize, 1);
}

- (MTLSize)threadGroupsCount:(MTLSize)threadGroupSize {
    // Use the passed-in threadgroup size instead of repeating kGroupSize.
    return MTLSizeMake(self.provider.texture.width / threadGroupSize.width,
                       self.provider.texture.height / threadGroupSize.height, 1);
}

[commandEncoder dispatchThreadgroups:threadgroups
               threadsPerThreadgroup:threadgroupSize];
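An aside that doesn't affect this measurement (512 is a multiple of 16): the integer division in threadGroupsCount: silently drops edge pixels whenever the texture size isn't a multiple of the threadgroup size. A ceiling-division variant would cover them, at the cost of a gridPos bounds check in the kernel before writing:

- (MTLSize)threadGroupsCount:(MTLSize)threadGroupSize {
    // Round up so partially covered edge tiles still get a threadgroup.
    NSUInteger w = (self.provider.texture.width + threadGroupSize.width - 1) / threadGroupSize.width;
    NSUInteger h = (self.provider.texture.height + threadGroupSize.height - 1) / threadGroupSize.height;
    return MTLSizeMake(w, h, 1);
}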
This gives me 13 ms for a 512×512 RGBA image, and it grows linearly as I add more passes.
Is this correct? It seems like too much overhead for a real-time application.