
EDIT
Here's a small program you can compile to see this kind of error for yourself...

//for printf
#include <stdio.h>

//for the CUDA runtime API (nvcc also includes this implicitly for .cu files)
#include <cuda_runtime.h>

inline __host__ void gpuAssert(cudaError_t code, const char *file, int line, 
                 bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code),
          file, line);
      //if (abort) exit(code);
   }
}

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

__global__ void myKernel1(int *dev_idx, int *dev_tID, const int offset)
{
   int myElement = threadIdx.x + blockDim.x * blockIdx.x;
   //
   int temp;
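   //NOTE (per the accepted answer below): + binds tighter than ?:, so the
   //statement below parses as (myElement + offset + (offset==0)) ? 0 : ...
   //The whole sum becomes the condition, which is always nonzero here, so
   //temp is always assigned the first branch: 0.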
   temp = myElement+
      offset +
      (offset==0)?0:(offset&0x01==0x0)?(offset-1)*(offset>>1):
      (offset)*(offset>>1);
   dev_idx[myElement+offset] = temp;
   dev_tID[myElement+offset] = myElement;

}

__global__ void myKernel2(int *dev_idx, int *dev_tID, const int offset)
{
   int myElement = threadIdx.x + blockDim.x * blockIdx.x;
   //
   int temp;
   temp = myElement+offset;
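   //NOTE (per the comment thread below): == and != bind tighter than &, so
   //the first test parses as offset & (0x01==0x0), i.e. offset & 0, and never
   //fires, while the second parses as offset & (0x01!=0x0), i.e. offset & 1,
   //which happens to be the intended odd test. With the odd offset (15) used
   //in main, this kernel therefore takes the intended branch.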
   if (offset != 0 && offset&0x01==0x0) temp+= (offset-1)*(offset>>1);
   if (offset != 0 && offset&0x01!=0x0) temp+= offset*( offset>>1);
   dev_idx[myElement+offset] = temp;
   dev_tID[myElement+offset] = myElement;

}

__host__ void PrintMethod1(int *h_idx, int * h_tID, const int offset, 
               const int total, const int h_set)
{
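   //NOTE (per the accepted answer below): < binds tighter than ?:, so the
   //loop condition parses as (c < (h_set==0)) ? offset : total, which is
   //always nonzero -- the loop never terminates and runs off the end of the
   //arrays, hence the host-side segfault reported below.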
   for (int c=(h_set==0)?0:offset;
    c < (h_set==0)?offset:total;
    c++)
      printf("Element #%d --> idx: %d   tID: %d\n",
         c,h_idx[c],h_tID[c]);
}

__host__ void PrintMethod2(int *h_idx, int * h_tID, const int offset, 
               const int total, const int h_set)
{
   int loopStart = (h_set==0)?0:offset;
   int loopEnd = (h_set==0)?offset:total;
   printf("Loop Start: %d, Loop End: %d\n",
      loopStart, loopEnd);
   for (int c=loopStart; c < loopEnd; c++)
      printf("Element #%d --> idx: %d   tID: %d\n",
         c,h_idx[c],h_tID[c]);
}

//Checks if there is a compatible device
bool IsCompatibleDeviceRunning()
{
   int deviceCount = 0;
   return cudaGetDeviceCount(&deviceCount) == cudaSuccess && deviceCount > 0;
}

int main()
{
   //Check for compatible device
   if (!IsCompatibleDeviceRunning())
   {
      printf("ERROR: No compatible CUDA devices found!\n");
      exit(1);
   }
   const int total = 30;
   const int offset = total/2;

   int * h_tID, * dev_tID, * h_idx, * dev_idx, h_set;
   h_tID = (int *) malloc(total*sizeof(int));
   h_idx = (int *) malloc(total*sizeof(int));
   gpuErrchk(cudaMalloc((void **) &dev_tID,total*sizeof(int)));
   gpuErrchk(cudaMalloc((void **) &dev_idx,total*sizeof(int)));
   myKernel1<<<1,offset>>>(dev_idx, dev_tID, 0);
   //myKernel2<<<1,offset>>>(dev_idx, dev_tID, 0);
   gpuErrchk(cudaPeekAtLastError());
   gpuErrchk(cudaDeviceSynchronize());
   myKernel1<<<1,offset>>>(dev_idx, dev_tID, offset);
   //myKernel2<<<1,offset>>>(dev_idx, dev_tID, offset);
   gpuErrchk(cudaPeekAtLastError());
   gpuErrchk(cudaDeviceSynchronize());
   gpuErrchk(cudaMemcpy(h_tID, dev_tID, total*sizeof(int),
            cudaMemcpyDeviceToHost));
   gpuErrchk(cudaMemcpy(h_idx, dev_idx, total*sizeof(int),
            cudaMemcpyDeviceToHost));
   h_set = 0;
   //PrintMethod1(h_idx, h_tID, offset, total, h_set);
   PrintMethod2(h_idx, h_tID, offset, total, h_set);
   h_set = 1;
   //PrintMethod1(h_idx, h_tID, offset, total, h_set);
   PrintMethod2(h_idx, h_tID, offset, total, h_set);
   return 0;
}

When MyKernel2 is run, the correct output is written to the array:

Loop Start: 0, Loop End: 15
Element #0 --> idx: 0   tID: 0
Element #1 --> idx: 1   tID: 1
Element #2 --> idx: 2   tID: 2
Element #3 --> idx: 3   tID: 3
Element #4 --> idx: 4   tID: 4
Element #5 --> idx: 5   tID: 5
Element #6 --> idx: 6   tID: 6
Element #7 --> idx: 7   tID: 7
Element #8 --> idx: 8   tID: 8
Element #9 --> idx: 9   tID: 9
Element #10 --> idx: 10   tID: 10
Element #11 --> idx: 11   tID: 11
Element #12 --> idx: 12   tID: 12
Element #13 --> idx: 13   tID: 13
Element #14 --> idx: 14   tID: 14
Loop Start: 15, Loop End: 30
Element #15 --> idx: 120   tID: 0
Element #16 --> idx: 121   tID: 1
Element #17 --> idx: 122   tID: 2
Element #18 --> idx: 123   tID: 3
Element #19 --> idx: 124   tID: 4
Element #20 --> idx: 125   tID: 5
Element #21 --> idx: 126   tID: 6
Element #22 --> idx: 127   tID: 7
Element #23 --> idx: 128   tID: 8
Element #24 --> idx: 129   tID: 9
Element #25 --> idx: 130   tID: 10
Element #26 --> idx: 131   tID: 11
Element #27 --> idx: 132   tID: 12
Element #28 --> idx: 133   tID: 13
Element #29 --> idx: 134   tID: 14

When MyKernel1 is run, with the ternary-based version of the same idx assignment, every idx comes out zero:

Loop Start: 0, Loop End: 15
Element #0 --> idx: 0   tID: 0
Element #1 --> idx: 0   tID: 1
Element #2 --> idx: 0   tID: 2
Element #3 --> idx: 0   tID: 3
Element #4 --> idx: 0   tID: 4
Element #5 --> idx: 0   tID: 5
Element #6 --> idx: 0   tID: 6
Element #7 --> idx: 0   tID: 7
Element #8 --> idx: 0   tID: 8
Element #9 --> idx: 0   tID: 9
Element #10 --> idx: 0   tID: 10
Element #11 --> idx: 0   tID: 11
Element #12 --> idx: 0   tID: 12
Element #13 --> idx: 0   tID: 13
Element #14 --> idx: 0   tID: 14
Loop Start: 15, Loop End: 30
Element #15 --> idx: 0   tID: 0
Element #16 --> idx: 0   tID: 1
Element #17 --> idx: 0   tID: 2
Element #18 --> idx: 0   tID: 3
Element #19 --> idx: 0   tID: 4
Element #20 --> idx: 0   tID: 5
Element #21 --> idx: 0   tID: 6
Element #22 --> idx: 0   tID: 7
Element #23 --> idx: 0   tID: 8
Element #24 --> idx: 0   tID: 9
Element #25 --> idx: 0   tID: 10
Element #26 --> idx: 0   tID: 11
Element #27 --> idx: 0   tID: 12
Element #28 --> idx: 0   tID: 13
Element #29 --> idx: 0   tID: 14

When PrintMethod1 (with the ternary loop bounds) is run, it effectively enters an infinite loop, reads past the end of the arrays, and segfaults. Note: this is on the host side!

When PrintMethod2 is run, the output prints normally, as shown above.

Here is my compile command:

nvcc --compiler-options -fno-strict-aliasing -DUNIX -m64 -O2 \
--compiler-bindir /usr/bin/g++ \
-gencode=arch=compute_20,code=\"sm_21,compute_20\" \
-I/usr/local/CUDA_SDK/C/common/inc -I/usr/local/CUDA_SDK/shared/inc \
-o TEST Test.cu

About the only clue I have is that the compiler complains that both kernels have an improper parameter, although the parameter lists look correct and MyKernel2 produces the correct results.

I think the above example is pretty much what commenters would have had to write themselves based on the description below, so it saves you the time and effort of writing code!

Let me know if there's anything else I can post, to help figure this out.

Original Question

Most C compilers, per the language standard, support the ternary operator.

e.g.

int myVar;
myVar=(testFlg==true)?-1:1;

However, surprisingly, CUDA's nvcc appears to strip away some ternary operators and replace them with zeroes when they are used within a kernel...

I discovered this by applying cuPrintf to check a problem block of code. For example, let's say I have two kernels sharing a global array for their output. The first kernel deals with the first chunk of elements. The second kernel gets an offset to indicate how far to jump ahead in the array so as not to overwrite the first kernel's elements. The offset calculation differs for even and odd offsets.

So I could write:

if (krnl!=0 && offset&0x01==0x0)
   idx+=(offset-1)*(offset>>1);
if (krnl!=0 && offset&0x01!=0x0)
   idx+=offset*(offset>>1);

But it would be more compact and readable (in my opinion) to write the near-equivalent shorthand syntax.

idx += (krnl==0)?0:(offset&0x01==0)?
   (offset-1)*(offset>>1):
   offset*(offset>>1);

The latter code, though, will always produce zero, as CUDA's compiler snips out the shorthand conditionals.

I realize this feature could be abused and could cause thread divergence, but in simple cases it does not seem any different from standard conditionals, if the compiler handles it properly.

Is this a bug in the compiler or is it intentionally not supported?

Does anyone know if this feature is coming to CUDA?

I was quite surprised to find out that was the source of my addressing failures and segfaults...

EDIT
This is a standard C feature; I misread and erroneously said it was non-standard.

EDIT 2
I had said "chokes and dies" for the compiler. "Dies" was definitely inappropriate terminology to use. Rather, nvcc completes the compilation, but apparently has stripped away the ternary operator-based assignment and replaced it with zero. This would later come back and bite me as stuff was not getting written to the proper spots, and those spots were in turn used as indices in a double-indexed scheme. The indices were used during the wrapup on the CPU side, hence the segfault occurred on the CPU side, but was driven by compiler snipping.

I'm using compiler v4.1 and have -O2 turned on. It appears that the optimizer may be optimizing out the variable that is used inside the ternary operation, which may be the source of this bug.

The error-prone ternary operation is near-identical to the example I gave above, but is involved in a large addition operation.

I plan on following the advice of the below commenter and filing a bug report with NVIDIA, but am leaving this post as a warning to others.

Edit 3

Here is a slightly sanitized version of the full statement that always yields zero:

__global__ void MyFunc
( const int offset,
  const CustomType * dev_P,
  ...
  const int box)
{
   int tidx = blockIdx.x * blockDim.x + threadIdx.x;
   int idx=0;
   ...
   idx = tidx +
      dev_P->B +
      (box == 0)?0:(offset&0x01!=0x0)?
      (offset-1)*(offset>>1):offset*(offset>>1);
   //NOTES:
   //I put the cuPrintf here.... from it I could see that tidx was different
   //ints (as you would expect), but that when added together the sum was always
   //"magically" becoming zero.  The culprit was the nested ternary operator.
   //Once I replaced it with the equivalent conditional, the assignment worked as
   //expected.
   //"offset" is constant on the level of this kernel, but it is not always 0.
   //Outside the kernel "offset" varies greatly over the course of the simulation,
   //meaning that each time the kernel is called, it likely has a different value.
   //"tidx" obviously varies.
   //but somehow the above sum gave 0, likely due to an unreported compiler bug.
   //box is either 0 or 1.  For a certain type of op in my simulation I call this 
   //kernel twice, once for box value 0 and a second time for box value 1
   ...
}
Jason R. Mick
  • `...?...:...` is GNU C, not C++, so I'm not following what `nvcc`'s C++ frontend has to do with its support or lack thereof of C features. However, the basis of the C-compiler would... http://www.gnu.org/software/gnu-c-manual/gnu-c-manual.html#Conditional-Expressions – Jason R. Mick Mar 15 '12 at 18:05
  • I'm not sure what you're talking about this being a GNU extension - `a ? b : c` is the C ternary operator (http://en.wikipedia.org/wiki/Ternary_operation) and is completely, totally, universally standard in C and C-derived languages. GNU does have a funny extension of this where you can omit `b` and it will return the expression `a` in its place (so, eg, you're returning a unless a is zero, in which case you return c) but as far as I can see you're not using that. – Jonathan Dursi Mar 15 '12 at 19:01
  • @Jonathan... Oh, I thought based on my reading that this was non-standard, must have mis-read. But yea, that indicates this is likely a compiler bug (I would assume)? – Jason R. Mick Mar 15 '12 at 19:23
  • @Jonathan, thanks, I've fixed my description of this problem. – Jason R. Mick Mar 15 '12 at 19:26
  • Why not post more complete code? In my experience the ternary operator works fine in CUDA C/C++. The compiler does perform dead code elimination. So, for example, if it determines that `krnl` is a constant the code would be simplified -- especially if `krnl` is assigned zero. What is your code to initialize `krnl`? – harrism Mar 16 '12 at 01:48
  • I can't post the full code as it is currently closed source, but I'll try to sanitize it and post as big an excerpt as I can later. – Jason R. Mick Mar 16 '12 at 03:44
  • The ternary operator you have shown is standard C and C++. OTOH a ternary operator with omitted operand is a GCC extension. Example: x = a ? : b; This may not be supported by CUDA. – Ashwin Nanjappa Mar 16 '12 at 04:16
  • @Ashwin Yes, I corrected this in the edited version, see above... I was surprised too, I'm 90 percent sure this is some sort of optimization bug, because when I put the exact same conditional below it fixed it... – Jason R. Mick Mar 16 '12 at 04:47
  • @harrism I put the code above, with the vars renamed, and the rest of the kernel snipped. Again, I could tell something odd was going on because `tidx` was *almost never* zero (it's only zero for the first thread of the kernel), but the sum of it being added to the ternary operator result was **ALWAYS** 0, as confirmed by both `cuPrintf` calls and by `cudaMemcpy`ing the result off the device... Once I replaced the ternary operator (by splitting into the add of the first two terms, then conditional adds to replace the ternary) this oddity disappeared, thankfully. – Jason R. Mick Mar 16 '12 at 05:04
  • Doesn't help, since you haven't specified the values of `Box` or `offset`... – harrism Mar 16 '12 at 05:28
  • @harrism... `Box` is a passed param to the kernel function, either 1 or 0, i'll add that to the params list, forgot that in the snipped description. `offset` is a passed param to the kernel, it varies throughout the run -- it can be anything from 128 to 150,000 depending on how I set the simulation up and how long it runs. It is only constant within that kernel. Hope that helps. – Jason R. Mick Mar 16 '12 at 05:34
  • Logic-wise, basically the length of the first set of indices (the ternary block based on `offset`) is applied only if you're in the second `box`, as the first set uses the first stretch of available indices. – Jason R. Mick Mar 16 '12 at 05:35
  • (Second being denoted by a `box` value of `1`, I should say... see revised comment for more details.) – Jason R. Mick Mar 16 '12 at 05:39
  • Can't you write a simple, self contained kernel which reproduces the problem? It is very difficult to diagnose a potential compiler problem without real code to compile. Also, have you tried looking at the emitted PTX from the compiler and disassembled assembler output to see what is winding up on the device. So far this question seems to be nothing more than poorly described supposition and a lot of edits and hand waving..... – talonmies Mar 16 '12 at 09:52
  • Thanks for the feedback talonmies. Will try to post a full code later. – Jason R. Mick Mar 16 '12 at 16:27
  • @talonmies Posted an example code that fails... check it out!! Thank you. – Jason R. Mick Mar 18 '12 at 23:17
  • @harrism, please see the newly posted example code, it may be helpful... – Jason R. Mick Mar 18 '12 at 23:17
  • Jason, C/C++ relational operators [have higher precedence than bitwise operators](http://en.cppreference.com/w/cpp/language/operator_precedence). Therefore your expression `offset&0x01==0x0` always evaluates to zero. I suspect that is not what you intend. – harrism Mar 18 '12 at 23:37
  • @harrism Good catch! Ahhhh I did not realize that... but that still doesn't explain why the full statement always at `0`... if that statement always evaluates for zero, it should still be adding an offset, as far as I can see (albeit an improper one)... e.g. `(offset==0)?0:...` should correctly enter the `...` clause, right? I still don't see how that statement evaluates to `0` consistently, even if it is incorrect in terms of order of op within the second clause. – Jason R. Mick Mar 18 '12 at 23:42
  • Seems like it should consistently evaluate to `myElement+offset+(offset)*(offset>>1)` (incorrect, but non-zero) if `offset!=0`, or else `myElement+offset` if `offset==0` (correct and non-zero)... how a consistent `0` pops out is baffling based on my current knowledge. – Jason R. Mick Mar 18 '12 at 23:45
  • I agree. I still think a simpler repro could be made... – harrism Mar 18 '12 at 23:51
  • @harrism Sorry it was long, I hear you. I basically just wanted to show both that it was failing on the `host` side (in the for loop) and on the `device`. I'm wondering if this error is endemic to all of GNU... will test a non-CUDA program. – Jason R. Mick Mar 18 '12 at 23:54
  • @harrism et al. Ahh, the answer is here... http://stackoverflow.com/questions/7499400/c-ternary-conditional-and-assignment-operator-precedence First to post gets a freebie. :) – Jason R. Mick Mar 19 '12 at 00:03
  • Wait... actually that's about ternaries on the LHS... a bit diff. – Jason R. Mick Mar 19 '12 at 00:06
  • I knew it was all down to precedence. That's why I gave you the hint about the obvious precedence error in your code (I just didn't have the time to find all of them). This is why the code you are writing is so dangerous: ternary operators (and especially nested ones) tend to obfuscate the meaning of the code. Write that code in multiple lines, and use if/else instead, and it will be much cleaner, less brittle, and more maintainable. – harrism Mar 19 '12 at 01:13
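
To see the precedence point harrism raises in the comments above in isolation, here is a minimal, host-only C sketch (the offset value is purely illustrative):

//precedence.c -- compile with, e.g., gcc precedence.c (or nvcc)
#include <stdio.h>

int main(void)
{
   int offset = 14;   //an arbitrary even value, purely for illustration

   //== binds tighter than &, so this parses as offset & (0x01 == 0x0),
   //i.e. offset & 0, which is always 0 -- the "is even" test never fires.
   printf("unparenthesized: %d\n", offset & 0x01 == 0x0);    //prints 0

   //Parenthesizing the AND yields the intended even/odd test.
   printf("parenthesized:   %d\n", (offset & 0x01) == 0x0);  //prints 1

   return 0;
}

As an aside, gcc's -Wparentheses (part of -Wall) flags this pattern with "suggest parentheses around comparison in operand of '&'"; with nvcc it can be forwarded to the host compiler via --compiler-options -Wall.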

1 Answer


I found the answer out... this is a general C issue, not CUDA-specific.

The ternary (conditional) operator has very low precedence, lower than the arithmetic and comparison operators (and, strangely, the grammar gives it slightly different binding on each side of the `:`). So `a + b ? x : y` parses as `(a + b) ? x : y`: the entire sum becomes the condition, and the result is just `x`. In my kernels the first branch was `0`, which is why the assignments always yielded zero.

However, the precedence can be overridden by encapsulating the entire ternary in parentheses, e.g. `a + ((...) ? ... : ...)`.
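
To make this concrete, here is a minimal host-only C sketch of the failing expression (myElement and offset are illustrative values taken from the test program above; the bitwise test is pre-parenthesized to isolate the ternary issue):

//ternary.c -- compile with, e.g., gcc ternary.c (or nvcc)
#include <stdio.h>

int main(void)
{
   //Illustrative values mirroring the test program (thread 3, second call).
   int myElement = 3, offset = 15;

   //+ binds tighter than ?:, so this parses as
   //   (myElement + offset + (offset==0)) ? 0 : ...
   //The whole sum becomes the condition; it is nonzero, so the result is the
   //first branch: 0.
   int bad = myElement + offset +
      (offset==0)?0:((offset&0x01)==0x0)?(offset-1)*(offset>>1):
      offset*(offset>>1);

   //Wrapping the entire ternary in parentheses restores the intended meaning.
   int good = myElement + offset +
      ((offset==0)?0:((offset&0x01)==0x0)?(offset-1)*(offset>>1):
      offset*(offset>>1));

   printf("bad: %d   good: %d\n", bad, good);   //prints "bad: 0   good: 123"
   return 0;
}

With the outer parentheses in place, the value matches the idx values in the MyKernel2 output above (e.g. element #18 --> idx: 123).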

I spun off a general question about the rationale for this aspect of the language standard here:
Unexpected Result, Ternary Operator in Gnu C

Jason R. Mick