I'm working with Cuda 5 on a Tesla C1060 GPU, Compute Capability 1.3, Ubuntu 12.04. In my kernel, each thread computes the values of a (private) local float array locArr[]
.
Then, the value of a float variable var
is computed by using locArr[]
.
__global__ void gpuKernel
(
float *src, float *out,
size_t memPitchAux, int w
)
{
float locArr[256];
float var=0.0f;
int tidx = blockIdx.x * blockDim.x + threadIdx.x;
int tidy = blockIdx.y * blockDim.y + threadIdx.y;
int idx = tidy * memPitchAux/4 + tidx;
for(int ic=0; ic<256; ic++)
{
locArr[ic] = 0.0f;
}
for(int is=0; is<255; is++)
{
int ic = fabs(src[tidy*w +tidx]);
locArr[ic] += 1.0f;
}
for(int ic=0; ic<255; ic++)
{
var += locArr[ic];
}
out[idx] = var;
}
There are not enough registers per thread, so locArr[]
is spilled out to global memory. The timing for executing this kernel is ~18 ms, but if I multiply locArr[]
by zero then the timing drops to 1.1 ms.
for(int ic=0; ic<255; ic++)
{
var += locArr[ic] * 0.0f;
}
I do not understand why, each thread should anyway read the needed locArr[]
value in global memory and then multiply it by zero. The timing should be the same. Instead it is as if the threads already know it is not necessary to read the data because the output will be anyway zero.
Could anyone explain to me what is going on?
EDIT : if instead I have
for(int ic=0; ic<255; ic++)
{
var += locArr[ic] * locArr2[ic];
}
where locArr2[]
is a local array (spilled to global memory) of zeros, can the optimization be done at runtime?
EDIT 2: my makefile
################################################################################
#
# Makefile project only supported on Mac OSX and Linux Platforms)
#
################################################################################
# OS Name (Linux or Darwin)
OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:])
# Flags to detect 32-bit or 64-bit OS platform
OS_SIZE = $(shell uname -m | sed -e "s/i.86/32/" -e "s/x86_64/64/")
OS_ARCH = $(shell uname -m | sed -e "s/i386/i686/")
# These flags will override any settings
ifeq ($(i386),1)
OS_SIZE = 32
OS_ARCH = i686
endif
ifeq ($(x86_64),1)
OS_SIZE = 64
OS_ARCH = x86_64
endif
# Flags to detect either a Linux system (linux) or Mac OSX (darwin)
DARWIN = $(strip $(findstring DARWIN, $(OSUPPER)))
# Location of the CUDA Toolkit binaries and libraries
CUDA_PATH ?= /usr/local/cuda-5.0
CUDA_INC_PATH ?= $(CUDA_PATH)/include
CUDA_BIN_PATH ?= $(CUDA_PATH)/bin
ifneq ($(DARWIN),)
CUDA_LIB_PATH ?= $(CUDA_PATH)/lib
else
ifeq ($(OS_SIZE),32)
CUDA_LIB_PATH ?= $(CUDA_PATH)/lib
else
CUDA_LIB_PATH ?= $(CUDA_PATH)/lib64
endif
endif
# Common binaries
NVCC ?= $(CUDA_BIN_PATH)/nvcc
GCC ?= g++
# Extra user flags
EXTRA_NVCCFLAGS ?=
EXTRA_LDFLAGS ?=
EXTRA_CCFLAGS ?=
# CUDA code generation flags
GENCODE_SM10 := -gencode arch=compute_10,code=sm_10
GENCODE_SM20 := -gencode arch=compute_20,code=sm_20
GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35
GENCODE_FLAGS := $(GENCODE_SM10) $(GENCODE_SM20) $(GENCODE_SM30)
# OS-specific build flags
ifneq ($(DARWIN),)
LDFLAGS := -Xlinker -rpath $(CUDA_LIB_PATH) -L$(CUDA_LIB_PATH) -lcudart
CCFLAGS := -arch $(OS_ARCH)
else
ifeq ($(OS_SIZE),32)
LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart
CCFLAGS := -m32
else
LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart
CCFLAGS := -m64
endif
endif
# OS-architecture specific flags
ifeq ($(OS_SIZE),32)
NVCCFLAGS := -m32
else
NVCCFLAGS := -m64
endif
# OpenGL specific libraries
ifneq ($(DARWIN),)
# Mac OSX specific libraries and paths to include
LIBPATH_OPENGL := -L../../common/lib/darwin -L/System/Library/Frameworks/OpenGL.framework/Libraries -framework GLUT -lGL -lGLU ../../common/lib/darwin/libGLEW.a
else
# Linux specific libraries and paths to include
LIBPATH_OPENGL := -L../../common/lib/linux/$(OS_ARCH) -L/usr/X11R6/lib -lGL -lGLU -lX11 -lXi -lXmu -lglut -lGLEW -lrt
endif
# Debug build flags
ifeq ($(dbg),1)
CCFLAGS += -g
NVCCFLAGS += -g -G
TARGET := debug
else
TARGET := release
endif
# Common includes and paths for CUDA
INCLUDES := -I$(CUDA_INC_PATH) -I. -I.. -I../../common/inc
LDFLAGS += $(LIBPATH_OPENGL)
# Target rules
all: build
build: stackOverflow
stackOverflow.o: stackOverflow.cu
$(NVCC) $(NVCCFLAGS) $(EXTRA_NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $<
stackOverflow: stackOverflow.o
$(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS) $(EXTRA_LDFLAGS)
mkdir -p ./bin/$(OSLOWER)/$(TARGET)
cp $@ ./bin/$(OSLOWER)/$(TARGET)
run: build
./stackOverflow
clean:
rm -f stackOverflow.o stackOverflow *.pgm