bit - Emulating std::bitset in CUDA -
I have an input array that is given to a kernel. Each thread works on one value of the array and either changes the value or leaves it unchanged, according to a given rule.
Afterwards, I would like to find out whether anything changed in the input memory and, if so, where the change occurred (the index in the input array).
I thought of using an array of bits. The total number of bits would equal the total number of threads. Each thread would manipulate exactly one bit: all bits are initially set to false, and if a thread changes its corresponding input value, its bit becomes true.
To make this clearer, let's suppose we have an input array called a:
1 9 3 9 4 5
The array of bits would initially be the following:
0 0 0 0 0 0
So I would have 6 threads working on the input array. Let's suppose the final input array is:
1 9 3 9 2 5
Then the final array of bits would be:
0 0 0 0 1 0
I don't want to use an array of bool, because each of its values would take 1 byte of memory, which is quite a lot since I only want to work with bits.
Is it possible to achieve something like this?
I thought of creating a char array, in which each element of the array holds 8 bits. However, what if two threads want to change different bits of the first character of the array? They would have to perform the operation atomically, even though the changes are to different bit locations. Using atomic operations would disrupt parallelism, and in this case atomic operations are not inherently needed; it doesn't make sense that they would have to be used only because of the constraint of using an array of chars instead of something more specialized like std::bitset.
Thank you in advance.
I'm providing a late answer to this question to remove it from the unanswered list.
To do what you want to achieve, you can define an array of unsigned ints of length n/32, where n is the length of the arrays you are comparing. Then you can use atomicAdd to write each bit of such an array, depending on whether the two corresponding elements of the arrays are equal or not.
Below, I'm providing a simple example:
#include <cstdio>
#include <iostream>

#include <thrust/device_vector.h>

/**
 * Per-thread stand-in for a warp ballot.
 *
 * Returns a 32-bit mask with only the bit at position (threadIdx.x % 32) set
 * when `predicate` is non-zero, and 0 otherwise. The code assumes 32 threads
 * per warp (see the matching `>> 5` in the kernel).
 *
 * @param predicate  Non-zero if the calling thread should raise its bit.
 * @return           The single-bit mask for this thread, or 0.
 */
__device__ unsigned int __ballot_non_atom(int predicate)
{
    // Use 1u rather than 1: the thread at position 31 shifts into the top bit,
    // which would overflow a signed int.
    return (predicate != 0) ? (1u << (threadIdx.x % 32)) : 0u;
}

/**
 * Compares two float arrays element by element and records the mismatches as
 * bits: one unsigned int per group of 32 consecutive threads, one bit per
 * thread.
 *
 * There is no bounds check, so the launch configuration must cover exactly
 * the length of the input arrays, and `d_result` must be zero-initialized
 * with one word per warp in the grid.
 *
 * @param d_vec1_ptr           First input array (device pointer).
 * @param d_vec2_ptr           Second input array (device pointer).
 * @param d_result             Output bit array (device pointer), zeroed by the caller.
 * @param num_warps_per_block  blockDim.x / 32, used to locate the block's words.
 */
__global__ void check_if_equal_elements(float* d_vec1_ptr,
                                        float* d_vec2_ptr,
                                        unsigned int* d_result,
                                        int num_warps_per_block)
{
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int warp_num = threadIdx.x >> 5;

    const unsigned int my_bit = __ballot_non_atom(d_vec1_ptr[tid] != d_vec2_ptr[tid]);

    // Each thread contributes a distinct bit of the word at most once, so the
    // atomic addition never carries and is equivalent to a bitwise OR here
    // (atomicOr would work equally well).
    atomicAdd(&d_result[warp_num + blockIdx.x * num_warps_per_block], my_bit);
}

/**
 * Prints the bits of `num`, most significant bit first, each followed by a
 * single space.
 *
 * @param num  Value whose binary representation is printed.
 */
void printbits(unsigned int num)
{
    const unsigned int bit_count = sizeof(unsigned int) * 8;
    const unsigned int top_bit = 1u << (bit_count - 1);

    for (unsigned int i = 0; i < bit_count; ++i) {
        // Print the current top bit, then shift the next one into its place.
        printf("%u ", (num & top_bit) ? 1u : 0u);
        num <<= 1;
    }
}

/**
 * Demo: builds two 64-element vectors that differ at indices 3 and 7, runs
 * the comparison kernel with one block of 64 threads, and prints the two
 * result words (word 1 first, then word 0).
 */
int main()
{
    const int n = 64;

    thrust::device_vector<float> d_vec1(n, 1.f);
    thrust::device_vector<float> d_vec2(n, 1.f);

    d_vec2[3] = 3.f;
    d_vec2[7] = 4.f;

    const unsigned int num_threads_per_block = 64;
    const unsigned int num_blocks_per_grid = 1;
    const unsigned int num_warps_per_block = num_threads_per_block / 32;
    const unsigned int num_warps_per_grid = (num_threads_per_block * num_blocks_per_grid) / 32;

    // One result word per warp, all bits initially cleared.
    thrust::device_vector<unsigned int> d_result(num_warps_per_grid, 0u);

    check_if_equal_elements<<<num_blocks_per_grid, num_threads_per_block>>>(
        thrust::raw_pointer_cast(d_vec1.data()),
        thrust::raw_pointer_cast(d_vec2.data()),
        thrust::raw_pointer_cast(d_result.data()),
        static_cast<int>(num_warps_per_block));

    // Reading an element of a device_vector copies it back to the host.
    unsigned int val = d_result[1];
    printbits(val);
    val = d_result[0];
    printbits(val);

    getchar();  // keep the console window open until a key is pressed
    return 0;
}
Post a Comment