unroll
int H_out = H - K + 1;
int W_out = W - K + 1;
for (int b = 0; b < B; ++b)
for (int c = 0; c < C; ++c) { int w_base = c * (K*K); for (int p = 0; p < K; ++p)
for (int q = 0; q < K; ++q) { for (int h = 0; h < H_out; ++h)
for (int w = 0; w < W_out; ++w) {
int w_unroll = w_base + p * K + q;
int h_unroll = h * W_out + w;
X_unroll[b, h_unroll, w_unroll] = X[b, c, h + p, w + q];
|
divide by for replication
Unroll size C*K*K x H_out*W_out |
Input (/ for rep) C*(H_out+K-1)*(W_out+K-1) |
forward conv
int m = blockIdx.x;
int h = (blockIdx.y / W_grid) * TILE_WIDTH + threadIdx.y;
int w = (blockIdx.y % W_grid) * TILE_WIDTH + threadIdx.x;
float acc = 0.;
for (int c = 0; c < C; c++) {
for (int p = 0; p < K; p++)
for (int q = 0; q < K; q++)
acc += X[c, h + p, w + q] * W[m, c, p, q];
}
Y[m, h, w] = acc;
|
matrix
int X_tile_width = TILE_WIDTH + K-1;
extern __shared__ float shmem[];
float* X_shared = &shmem[0];
float* W_shared = &shmem[X_tile_width * X_tile_width];
m = blockIdx.x;
h_base = (blockIdx.z / W_grid) * TILE_SIZE; w_base = (blockIdx.z % W_grid) * TILE_SIZE; tx = threadIdx.x; ty = threadIdx.y; h = h_base + tx; w = w_base + ty;
float acc = 0.;
for (c = 0; c < C; c++)
if ((ty < K) && (tx < K)) W_shared[ty, tx] = W[m, c, ty, tx];
__syncthreads();
for (int i = h; i < h_base + X_tile_width; i += TILE_WIDTH) { for (int j = w; j < w_base + X_tile_width; j += TILE_WIDTH)
X_shared[i - h_base, j - w_base] = X[n, c, i, j]; } __syncthreads();
for (int p = 0; p < K; p++) for (int q = 0; q < K; q++) acc += X_shared[h - h_base + p, w - w_base + q] * W_shared[p, q];
__syncthreads(); } Y[n, m, h, w] = acc;
|
|
|
calculations
T/B 1024 |
warps 32 T |
SM 8 B 1536 T |
Histogram private
__shared__ unsigned int histo_private[256];
if (threadIdx.x < 256) histo_private[threadIdx.x] = 0;
__syncthreads();
int i = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
/* nonprivate: */ while (i < size) {
atomicAdd( &(histo[buffer[i]]), 1);
i += stride; } /* private: */ while (i < size) {
atomicAdd( &(histo_private[buffer[i]]), 1);
i += stride; }
__syncthreads();
if (threadIdx.x < 256)
atomicAdd( &(histo[threadIdx.x]), histo_private[threadIdx.x] );
|
CSR ELL COO JDS JDST
int row = blockIdx.x * blockDim.x + threadIdx.x; if (row < num_rows) {
int row_start = row_ptr[row];
int row_end = row_ptr[row+1];
for (int elem = row_start; elem < row_end; elem++) {
dot += data[elem] * x[col_index[elem]]; y[row] = dot;
for (int i = 0; i < num_elem; i++) {
dot += data[row + i*num_rows] * x[col_index[row + i*num_rows]]; y[row] = dot;
for (int i = 0; i < num_elem; i++)
y[row_index[i]] += data[i] * x[col_index[i]];
int row_start = jds_row_ptr[row];
int row_end = jds_row_ptr[row+1];
for (int elem = row_start; elem < row_end; elem++) {
dot += data[elem] * x[col_index[elem]]; y[jds_row_index[row]] = dot;
unsigned int sec = 0;
while (jds_t_col_ptr[sec+1]-jds_t_col_ptr[sec] > row){ dot += data[jds_t_col_ptr[sec]+row] *
x[col_index[jds_t_col_ptr[sec]+row]]; sec++;
y[jds_row_index[row]] = dot;
|
Index needed
CSR k,k,m+1 |
COO k,k,k |
ELL mN,mN |
JDS/T k,k m+1,m/2m,n |
|
|
Async
cudaMemcpyAsync(d_B0, h_B+i, SegSize*sizeof(float),.., stream0); |
cudaMemcpyAsync(h_C+i, d_C0, SegSize*sizeof(float),.., stream0); |
cudaMemcpyAsync(d_B1, h_B+i+SegSize, SegSize*sizeof(float),.., stream1); |
cudaMemcpyAsync(h_C+i+SegSize, d_C1, SegSize*sizeof(float),.., stream1); |
KG BK Scan
int i = bd * blockIdx.x + tx;
if (i < len) XY[tx] = input[i];
__syncthreads();
for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
__syncthreads();
if (threadIdx.x >= stride) XY[threadIdx.x] += XY[threadIdx.x-stride];
}
Y[i] = XY[threadIdx.x];
for (unsigned int stride = 1; stride <= BLOCK_SIZE; stride *= 2) {
int index = (tx + 1) * stride * 2 - 1;
if (index < 2 * BLOCK_SIZE)
XY[index] += XY[index - stride];
__syncthreads();
for (unsigned int stride = BLOCK_SIZE / 2; stride > 0; stride /= 2) {
__syncthreads();
int index = (tx + 1) * stride * 2 - 1;
if (index + stride < 2 * BLOCK_SIZE)
XY[index + stride] += XY[index];
output[i] = XY[tx];
if ((i + 1) % bd == 0)
blockSums[blockIdx.x] = output[i];
if (i < len && blockIdx.x > 0)
for (int bl = 0; bl < blockIdx.x; ++bl)
output[i] += blockSums[bl];
|
|