// Per-block integer sum reduction carried out in place in global memory.
//
// Block b reduces the blockDim.x consecutive elements of g_idata that start
// at b * blockDim.x and stores their sum in g_odata[b]. The per-block sums
// still have to be added up by the caller.
//
// Launch requirements:
//   - 1D grid, 1D block; blockDim.x must be a power of two in [32, 1024].
//   - gridDim.x * blockDim.x should cover n. Elements at index >= n are never
//     touched, and a block lying entirely past n writes 0.
//   - g_odata must hold gridDim.x ints.
//   - Requires CUDA 9+ (__shfl_down_sync).
//
// Side effect: g_idata is used as scratch space and may be overwritten with
// partial sums.
//
// g_idata  input array of n ints in global memory (clobbered)
// g_odata  output array, one partial sum per block
// n        number of valid elements in g_idata
__global__ void reduceGmem(int * g_idata, int * g_odata, unsigned int n)
{
    // set thread ID
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;

    // Number of valid elements at or after this thread's own element.
    // "remaining > s" is the overflow-safe form of "idx + s < n".
    // There is deliberately no early return: every thread of the block must
    // reach each __syncthreads() below.
    const unsigned int remaining = (idx < n) ? (n - idx) : 0u;

    // convert global data pointer to the local pointer of this block
    int *idata = g_idata + blockIdx.x * blockDim.x;

    // in-place reduction in global memory, fully unrolled over the strides
    if (blockDim.x >= 1024 && tid < 512 && remaining > 512)
        idata[tid] += idata[tid + 512];
    __syncthreads();

    if (blockDim.x >= 512 && tid < 256 && remaining > 256)
        idata[tid] += idata[tid + 256];
    __syncthreads();

    if (blockDim.x >= 256 && tid < 128 && remaining > 128)
        idata[tid] += idata[tid + 128];
    __syncthreads();

    if (blockDim.x >= 128 && tid < 64 && remaining > 64)
        idata[tid] += idata[tid + 64];
    __syncthreads();

    // Final stage inside the first warp. Lanes exchange values with explicit
    // *_sync shuffles instead of relying on implicit warp-synchronous
    // execution through a volatile pointer, which is not guaranteed on
    // Volta and newer GPUs.
    if (tid < 32)
    {
        int sum = (remaining > 0) ? idata[tid] : 0;
        if (blockDim.x >= 64 && remaining > 32)
            sum += idata[tid + 32];

        #pragma unroll
        for (int offset = 16; offset > 0; offset >>= 1)
            sum += __shfl_down_sync(0xffffffffu, sum, offset);

        // write result for this block to global mem
        if (tid == 0)
            g_odata[blockIdx.x] = sum;
    }
}
// Per-block integer sum reduction that stages the data in shared memory.
//
// Block b copies the blockDim.x consecutive elements of g_idata that start at
// b * blockDim.x into shared memory, reduces them there, and stores the sum
// in g_odata[b]. g_idata itself is only read.
//
// Launch requirements:
//   - 1D grid, 1D block; blockDim.x must be a power of two in [32, 1024].
//   - DIM is defined elsewhere in the project; this kernel assumes
//     DIM >= blockDim.x -- TODO confirm against the launch configuration.
//   - gridDim.x * blockDim.x should cover n. Elements at index >= n are
//     treated as 0.
//   - g_odata must hold gridDim.x ints.
//   - Requires CUDA 9+ (__shfl_down_sync).
//
// g_idata  input array of n ints in global memory
// g_odata  output array, one partial sum per block
// n        number of valid elements in g_idata
__global__ void reduceSmem(int * g_idata, int * g_odata, unsigned int n)
{
    __shared__ int smem[DIM];

    // set thread ID
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;

    // Boundary check on the GLOBAL index. Out-of-range threads contribute 0
    // instead of returning, so every shared-memory slot is initialised and
    // every thread reaches the barriers below.
    smem[tid] = (idx < n) ? g_idata[idx] : 0;
    __syncthreads();

    // in-place reduction in shared memory, fully unrolled over the strides
    if (blockDim.x >= 1024 && tid < 512)
        smem[tid] += smem[tid + 512];
    __syncthreads();

    if (blockDim.x >= 512 && tid < 256)
        smem[tid] += smem[tid + 256];
    __syncthreads();

    if (blockDim.x >= 256 && tid < 128)
        smem[tid] += smem[tid + 128];
    __syncthreads();

    if (blockDim.x >= 128 && tid < 64)
        smem[tid] += smem[tid + 64];
    __syncthreads();

    // Final stage inside the first warp. Lanes exchange values with explicit
    // *_sync shuffles instead of relying on implicit warp-synchronous
    // execution through a volatile pointer, which is not guaranteed on
    // Volta and newer GPUs.
    if (tid < 32)
    {
        int sum = smem[tid];
        if (blockDim.x >= 64)
            sum += smem[tid + 32];

        #pragma unroll
        for (int offset = 16; offset > 0; offset >>= 1)
            sum += __shfl_down_sync(0xffffffffu, sum, offset);

        // write result for this block to global mem
        if (tid == 0)
            g_odata[blockIdx.x] = sum;
    }
}
// Per-block integer sum reduction in shared memory with a 4x unrolled load.
//
// Block b is responsible for the 4 * blockDim.x consecutive elements of
// g_idata that start at b * blockDim.x * 4. Each thread first adds up to four
// elements spaced blockDim.x apart, the block then reduces those per-thread
// sums in shared memory, and the total is stored in g_odata[b].
//
// Launch requirements:
//   - 1D grid, 1D block; blockDim.x must be a power of two in [32, 1024].
//   - DIM is defined elsewhere in the project; this kernel assumes
//     DIM >= blockDim.x -- TODO confirm against the launch configuration.
//   - gridDim.x * blockDim.x * 4 should cover n. Elements at index >= n are
//     treated as 0, so n need not be a multiple of 4 * blockDim.x.
//   - g_odata must hold gridDim.x ints.
//   - Requires CUDA 9+ (__shfl_down_sync).
//
// g_idata  input array of n ints in global memory
// g_odata  output array, one partial sum per block
// n        number of valid elements in g_idata
__global__ void reduceUnroll4Smem(int * g_idata, int * g_odata, unsigned int n)
{
    __shared__ int smem[DIM];

    // set thread ID
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockDim.x * blockIdx.x * 4 + threadIdx.x;

    // Unrolled load: each of the four elements is bounds-checked on its own,
    // so a partially filled tile still contributes its valid elements and
    // g_idata[n] is never read. There is no early return because every
    // thread must reach the barriers below.
    int tempSum = 0;
    #pragma unroll
    for (unsigned int k = 0; k < 4; ++k)
    {
        unsigned int i = idx + k * blockDim.x;
        if (i < n)
            tempSum += g_idata[i];
    }
    smem[tid] = tempSum;
    __syncthreads();

    // in-place reduction in shared memory, fully unrolled over the strides
    if (blockDim.x >= 1024 && tid < 512)
        smem[tid] += smem[tid + 512];
    __syncthreads();

    if (blockDim.x >= 512 && tid < 256)
        smem[tid] += smem[tid + 256];
    __syncthreads();

    if (blockDim.x >= 256 && tid < 128)
        smem[tid] += smem[tid + 128];
    __syncthreads();

    if (blockDim.x >= 128 && tid < 64)
        smem[tid] += smem[tid + 64];
    __syncthreads();

    // Final stage inside the first warp. Lanes exchange values with explicit
    // *_sync shuffles instead of relying on implicit warp-synchronous
    // execution through a volatile pointer, which is not guaranteed on
    // Volta and newer GPUs.
    if (tid < 32)
    {
        int sum = smem[tid];
        if (blockDim.x >= 64)
            sum += smem[tid + 32];

        #pragma unroll
        for (int offset = 16; offset > 0; offset >>= 1)
            sum += __shfl_down_sync(0xffffffffu, sum, offset);

        // write result for this block to global mem
        if (tid == 0)
            g_odata[blockIdx.x] = sum;
    }
}
与 reduceSmem 相比,这段代码只是在写入共享内存之前,让每个线程先把自己负责的元素与后面三个数据块中相同偏移位置的元素加在一起:
1 2 3 4 5 6 7 8 9 10 11 12 13
// Each block covers 4 * blockDim.x consecutive input elements.
unsigned int idx = blockDim.x * blockIdx.x * 4 + threadIdx.x;

// Every thread adds its own element and the elements at the same offset in
// the next three block-sized chunks. Each load is bounds-checked separately
// against n, so a partially filled tile keeps its valid elements and
// g_idata[n] is never read. No thread returns early here, because all
// threads of the block must still reach the __syncthreads() calls that
// follow the load.
int tempSum = 0;
for (unsigned int k = 0; k < 4; ++k)
{
    unsigned int i = idx + k * blockDim.x;
    if (i < n)
        tempSum += g_idata[i];
}