Using cl_float3 in the OpenCL parallel reduction example
I adapted the OpenCL parallel reduction example to work on a bunch of floats. I then wanted to expand the code to handle cl_float3: I want to find the minimum among an array of cl_float3 values. I thought it would be a straightforward expansion from float to float3 in the kernel, but I am receiving garbage values when I return from the kernel. Below is the kernel:
/*
 * pmin3 - first reduction pass: component-wise minimum of an array of float3.
 *
 * Each work-item folds its share of src into a private minimum, the
 * work-group folds those into lmin[0], and local work-item 0 writes the
 * group result to gmin[get_group_id(0)].
 *
 *   src    input array of nitems float3 elements
 *   gmin   output, one float3 per work-group
 *   lmin   local scratch, at least one float3
 *   dbg    output, four floats of launch diagnostics (see bottom of kernel)
 *   nitems number of float3 elements in src
 *   dev    0: each work-item reads a contiguous chunk of src;
 *          otherwise: work-items interleave with a stride of the global size
 *
 * NOTE(review): in OpenCL C a float3 has the size and alignment of a float4
 * (16 bytes).  This kernel therefore expects src/gmin elements to be 16 bytes
 * apart.  If the host buffer is a packed array of 3 floats per element, the
 * pointers must become `float *` and be accessed with vload3/vstore3 instead;
 * confirm against the host code.
 */
__kernel void pmin3(__global float3 *src,
                    __global float3 *gmin,
                    __local float3 *lmin,
                    __global float *dbg,
                    uint nitems,
                    uint dev)
{
    const uint gsize = (uint)get_global_size(0);
    const uint gid = (uint)get_global_id(0);
    const size_t lid = get_local_id(0);

    uint count = nitems / gsize;
    uint idx = (dev == 0) ? gid * count : gid;
    uint stride = (dev == 0) ? 1 : gsize;

    /* Private minimum for this work-item.  FLT_MAX is used as the starting
     * value so that any finite input replaces it (the previous 2^32-1
     * sentinel was built from double literals and hid larger inputs). */
    float3 pmin = (float3)(FLT_MAX);
    for (uint n = 0; n < count; n++, idx += stride) {
        pmin = min(pmin, src[idx]);
    }

    /* nitems need not be a multiple of the global size: the main loop covers
     * the first count * gsize elements in either access mode, so the
     * remaining (fewer than gsize) elements are handled one per work-item. */
    uint tail = count * gsize + gid;
    if (tail < nitems) {
        pmin = min(pmin, src[tail]);
    }

    /* Reduce the private minima within the work-group through local memory.
     * Work-items take turns, one per iteration, separated by barriers. */
    if (lid == 0) {
        lmin[0] = (float3)(FLT_MAX);
    }
    for (size_t n = 0; n < get_local_size(0); n++) {
        barrier(CLK_LOCAL_MEM_FENCE);
        if (lid == n) {
            lmin[0] = min(lmin[0], pmin);
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* gmin ends up holding one minimum per work-group. */
    if (lid == 0) {
        gmin[get_group_id(0)] = lmin[0];
    }

    /* Collect debug information. */
    if (gid == 0) {
        dbg[0] = get_num_groups(0);
        dbg[1] = get_global_size(0);
        dbg[2] = count;
        dbg[3] = stride;
    }
}

/*
 * min_reduce3 - second reduction pass: fold gmin[0 .. global_size-1] into
 * gmin[0], component-wise.
 *
 * The fold is done by global work-item 0 alone.  barrier() only synchronises
 * work-items of the same work-group, so the turn-taking scheme used in the
 * first pass would race here if the launch spans more than one work-group.
 *
 *   gmin   in/out: per-group minima from pmin3; gmin[0] receives the result
 */
__kernel void min_reduce3(__global float3 *gmin)
{
    if (get_global_id(0) != 0) {
        return;
    }

    float3 result = gmin[0];
    for (size_t n = 1; n < get_global_size(0); n++) {
        result = min(result, gmin[n]);
    }
    gmin[0] = result;
}
I think the problem is that get_global_id(0) and get_global_size(0) give the entire size instead of the number of rows given. Any suggestions?
As others have mentioned, float3 (and the other type3 types) behave like float4 (and the other type4 types) for the purposes of size and alignment. This can also be seen using the built-in vec_step function, which returns the number of elements in the input object's type, but returns 4 for type3 objects.
If your host code generates a packed float3 array - with each object taking the size and alignment of just 3 floats - then the proper way to use it from OpenCL is:
- Use a float* parameter instead of float3*
- Load the data using vload3
- Store the data using vstore3
Comments
Post a Comment