unroll
int H_out = H - K + 1;
int W_out = W - K + 1;
for (int b = 0; b < B; ++b)
for (int c = 0; c < C; ++c) { int w_base = c * (K*K); for (int p = 0; p < K; ++p)
for (int q = 0; q < K; ++q) { for (int h = 0; h < H_out; ++h)
for (int w = 0; w < W_out; ++w) {
int w_unroll = w_base + p * K + q;
int h_unroll = h * W_out + w;
X_unroll[b, h_unroll, w_unroll] = X[b, c, h + p, w + q];
|
divide by for replication
Unroll size C*K*K x H_out*W_out |
Input (/ for rep) C*(H_out+K-1)*(W_out+K-1) |
forward conv
int m = blockIdx.x;
int h = (blockIdx.y / W_grid) * TILE_WIDTH + threadIdx.y;
int w = (blockIdx.y % W_grid) * TILE_WIDTH + threadIdx.x;
float acc = 0.;
for (int c = 0; c < C; c++) {
for (int p = 0; p < K; p++)
for (int q = 0; q < K; q++)
acc += X[c, h + p, w + q] * W[m, c, p, q];
}
Y[m, h, w] = acc;
|
matrix
int X_tile_width = TILE_WIDTH + K-1;
extern __shared__ float shmem[];
float* X_shared = &shmem[0];
float* W_shared = &shmem[X_tile_width * X_tile_width];
m = blockIdx.x;
h_base = (blockIdx.z / W_grid) * TILE_SIZE; w_base = (blockIdx.z % W_grid) * TILE_SIZE; tx = threadIdx.x; ty = threadIdx.y; h = h_base + tx; w = w_base + ty;
float acc = 0.;
for (c = 0; c < C; c++)
if ((ty < K) && (tx < K)) W_shared[ty, tx] = W[m, c, ty, tx];
__syncthreads();
for (int i = h; i < h_base + X_tile_width; i += TILE_WIDTH) { for (int j = w; j < w_base + X_tile_width; j += TILE_WIDTH)
X_shared[i - h_base, j - w_base] = X[n, c, i, j]; } __syncthreads();
for (int p = 0; p < K; p++) for (int q = 0; q < K; q++) acc += X_shared[h - h_base + p, w - w_base + q] * W_shared[p, q];
__syncthreads(); } Y[n, m, h, w] = acc;
|
|
|
calculations
T/B 1024 |
warps 32 T |
SM 8 B 1536 T |
Histogram private
__shared__ unsigned int histo_private[256];
if (threadIdx.x < 256) histo_private[threadIdx.x] = 0;
__syncthreads();
int i = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
/* nonprivate: */ while (i < size) {
atomicAdd( &(histo[buffer[i]]), 1);
i += stride; } /* private: */ while (i < size) {
atomicAdd( &(histo_private[buffer[i]]), 1);
i += stride; }
__syncthreads();
if (threadIdx.x < 256)
atomicAdd( &(histo[threadIdx.x]), histo_private[threadIdx.x] );
|
CSR ELL COO JDS JDST
int row = blockIdx.x * blockDim.x + threadIdx.x; if (row < num_rows) {
int row_start = row_ptr[row];
int row_end = row_ptr[row+1];
for (int elem = row_start; elem < row_end; elem++) {
dot += data[elem] * x[col_index[elem]]; y[row] = dot;
for (int i = 0; i < num_elem; i++) {
dot += data[row + i*num_rows] * x[col_index[row + i*num_rows]]; y[row] = dot;
for (int i = 0; i < num_elem; i++)
y[row_index[i]] += data[i] * x[col_index[i]];
int row_start = jds_row_ptr[row];
int row_end = jds_row_ptr[row+1];
for (int elem = row_start; elem < row_end; elem++) {
dot += data[elem] * x[col_index[elem]]; y[jds_row_index[row]] = dot;
unsigned int sec = 0;
while (jds_t_col_ptr[sec+1]-jds_t_col_ptr[sec] > row){ dot += data[jds_t_col_ptr[sec]+row] *
x[col_index[jds_t_col_ptr[sec]+row]]; sec++;
y[jds_row_index[row]] = dot;
|
Index needed
CSR k,k,m+1 |
COO k,k,k |
ELL mN,mN |
JDS/T k,k m+1,m/2m,n |
|
|
Async
cudaMemcpyAsync(d_B0, h_B+i, SegSize*sizeof(float),.., stream0); |
cudaMemcpyAsync(h_C+i, d_C0, SegSize*sizeof(float),.., stream0); |
cudaMemcpyAsync(d_B1, h_B+i+SegSize, SegSize*sizeof(float),.., stream1); |
cudaMemcpyAsync(h_C+i+SegSize, d_C1, SegSize*sizeof(float),.., stream1); |
KG BK Scan
int i = bd * blockIdx.x + tx;
if (i < len) XY[tx] = input[i];
__syncthreads();
for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
__syncthreads();
if (threadIdx.x >= stride) XY[threadIdx.x] += XY[threadIdx.x-stride];
}
Y[i] = XY[threadIdx.x];
for (unsigned int stride = 1; stride <= BLOCK_SIZE; stride *= 2) {
int index = (tx + 1) * stride * 2 - 1;
if (index < 2 * BLOCK_SIZE)
XY[index] += XY[index - stride];
__syncthreads();
for (unsigned int stride = BLOCK_SIZE / 2; stride > 0; stride /= 2) {
__syncthreads();
int index = (tx + 1) * stride * 2 - 1;
if (index + stride < 2 * BLOCK_SIZE)
XY[index + stride] += XY[index];
output[i] = XY[tx];
if ((i + 1) % bd == 0)
blockSums[blockIdx.x] = output[i];
if (i < len && blockIdx.x > 0)
for (int bl = 0; bl < blockIdx.x; ++bl)
output[i] += blockSums[bl];
|
|