/* Number of slices the input rows are divided into (one per work-item). */
#define workUnits 128
/* Index of the last slice; that slice also takes the leftover rows. */
#define workUnitsM1 (workUnits - 1)

/* One row of data: a single four-component float vector. */
typedef struct tag_my_struct {
    float4 v;
} Row;

/*
 * Scans this work-item's slice of `data` and copies every row whose
 * s1, s2 or s3 component equals 0.0f into `resultArray`.
 *
 * The rows are split into `workUnits` equal slices of
 * totalRows / workUnits rows each; the work-item with id workUnitsM1 also
 * processes the totalRows % workUnits leftover rows.
 *
 * Matching rows are written contiguously into `resultArray`, starting at
 * the same index at which the work-item's input slice starts.
 *
 * totalRows     - number of rows in `data`; values <= 0 are treated as
 *                 "no rows".
 * data          - input rows.
 * resultArray   - output rows, indexed in the same range as `data`.
 * roffsetResult - per-work-item output: roffsetResult[id] receives the
 *                 number of rows this work-item copied.
 *
 * NOTE(review): the slicing only stays inside [0, totalRows) when the
 * global id is in [0, workUnits) - confirm the host enqueues exactly
 * workUnits work-items in dimension 0.
 */
__kernel void x1_search_kernel(int totalRows,
                               __global Row *data,
                               __global Row *resultArray,
                               __global int *roffsetResult)
{
    int i = get_global_id(0);

    /* Clamp a non-positive row count so the slice arithmetic cannot go
     * negative and wrap around when converted to size_t. */
    const int rowsPerUnit = (totalRows > 0) ? (totalRows / workUnits) : 0;
    const int leftoverRows = (totalRows > 0) ? (totalRows % workUnits) : 0;

    /* Multiply in size_t so large inputs do not overflow int. */
    const size_t firstRow = (size_t)i * (size_t)rowsPerUnit;

    size_t rowCount = (size_t)rowsPerUnit;
    if (i == workUnitsM1) {
        rowCount += (size_t)leftoverRows;
    }
    const size_t lastRow = firstRow + rowCount;

    /* Next free output position inside this work-item's slice. */
    size_t outRow = firstRow;

    /* Pre-tested loop: an empty slice (totalRows < workUnits) must not
     * touch memory at all. A do/while here would read data[firstRow] and
     * then wrap the unsigned counter, running far out of bounds. */
    for (size_t row = firstRow; row < lastRow; row++) {
        const float4 value = data[row].v;

        if (value.s1 == 0.0f || value.s2 == 0.0f || value.s3 == 0.0f) {
            resultArray[outRow].v = value;
            outRow++;
        }
    }

    roffsetResult[i] = (int)(outRow - firstRow);
}