1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
| #include <stdio.h> #include <stdlib.h> #include <cuda_runtime.h>
// Problem dimensions: A is M x K, B is K x N, and the product C is M x N.
const int M = 200;
const int K = 400;
const int N = 500;

// Edge length of the square shared-memory tiles used by the kernel; the host
// also launches TILE_SIZE x TILE_SIZE thread blocks.
const int TILE_SIZE = 32;
// Fills a row-major `row` x `col` matrix so that element (r, c) holds r + c.
inline void init(int* mat, int row, int col){
    for (int r = 0; r < row; ++r) {
        int* line = mat + r * col;  // start of row r in the flat buffer
        for (int c = 0; c < col; ++c) {
            line[c] = r + c;
        }
    }
}

// Writes a row-major `row` x `col` matrix to stdout, one matrix row per text
// line (each value followed by a space), framed by two '=' separator lines.
inline void print(int* mat, int row, int col){
    printf("=========================================\n");
    int idx = 0;  // running flat index; equals r * col + c at every access
    for (int r = 0; r < row; ++r) {
        for (int c = 0; c < col; ++c, ++idx) {
            printf("%d ", mat[idx]);
        }
        printf("\n");
    }
    printf("=========================================\n");
}
// Tiled integer matrix multiply: c = a * b.
//
//   a : M x K, row-major (element (r, k) at a[r * K + k])
//   b : K x N, row-major (element (k, n) at b[k * N + n])
//   c : M x N, row-major (element (r, n) at c[r * N + n])
//
// Launch layout: a 2D grid of TILE_SIZE x TILE_SIZE thread blocks, where
// blockIdx.x / threadIdx.x select the output column and blockIdx.y /
// threadIdx.y select the output row. Each block uses two statically sized
// TILE_SIZE x TILE_SIZE shared-memory tiles of int.
//
// Sizes that are not multiples of TILE_SIZE are handled by padding the tiles
// with zeros and by skipping the final store for out-of-range threads.
__global__ void mul(int* a, int* b, int* c, int M, int K, int N){
    __shared__ int tile_a[TILE_SIZE][TILE_SIZE];
    __shared__ int tile_b[TILE_SIZE][TILE_SIZE];

    const int local_col = threadIdx.x;
    const int local_row = threadIdx.y;
    const int out_col = blockIdx.x * TILE_SIZE + local_col;
    const int out_row = blockIdx.y * TILE_SIZE + local_row;

    // These predicates do not change across tiles, so evaluate them once.
    const bool row_in_range = out_row < M;
    const bool col_in_range = out_col < N;

    const int tile_count = (K + TILE_SIZE - 1) / TILE_SIZE;  // ceil(K / TILE_SIZE)

    int acc = 0;
    for (int t = 0; t < tile_count; ++t) {
        const int k_base = t * TILE_SIZE;
        const int a_k = k_base + local_col;  // column of `a` this thread loads
        const int b_k = k_base + local_row;  // row of `b` this thread loads

        // Out-of-range positions are filled with 0 so they add nothing below.
        tile_a[local_row][local_col] =
            (row_in_range && a_k < K) ? a[out_row * K + a_k] : 0;
        tile_b[local_row][local_col] =
            (b_k < K && col_in_range) ? b[b_k * N + out_col] : 0;

        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        for (int k = 0; k < TILE_SIZE; ++k) {
            acc += tile_a[local_row][k] * tile_b[k][local_col];
        }

        // Nobody may overwrite the tiles while another thread still reads them.
        __syncthreads();
    }

    // Out-of-range threads took part in every barrier above; they only skip
    // the store.
    if (row_in_range && col_in_range) {
        c[out_row * N + out_col] = acc;
    }
}
// Evaluates a CUDA runtime call and terminates the program with a diagnostic
// (file, line, CUDA error string) if it does not return cudaSuccess.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cuda_status_ = (call);                                  \
        if (cuda_status_ != cudaSuccess) {                                  \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(cuda_status_));            \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Multiplies an M x K matrix by a K x N matrix on the GPU and prints the
// top-left corner (up to 5 x 5) of the M x N result.
//
// Returns 0 on success and EXIT_FAILURE if a host allocation fails; any CUDA
// failure terminates the process through CUDA_CHECK.
int main(){
    const size_t size_a = sizeof(int) * M * K;
    const size_t size_b = sizeof(int) * K * N;
    const size_t size_c = sizeof(int) * M * N;

    // Host buffers.
    int* a_h = (int*)malloc(size_a);
    int* b_h = (int*)malloc(size_b);
    int* c_h = (int*)malloc(size_c);
    if (a_h == NULL || b_h == NULL || c_h == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(a_h);
        free(b_h);
        free(c_h);
        return EXIT_FAILURE;
    }

    // Device buffers.
    int* a_d = NULL;
    int* b_d = NULL;
    int* c_d = NULL;
    CUDA_CHECK(cudaMalloc((void**)&a_d, size_a));
    CUDA_CHECK(cudaMalloc((void**)&b_d, size_b));
    CUDA_CHECK(cudaMalloc((void**)&c_d, size_c));

    init(a_h, M, K);
    init(b_h, K, N);

    CUDA_CHECK(cudaMemcpy(a_d, a_h, size_a, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(b_d, b_h, size_b, cudaMemcpyHostToDevice));

    // One thread per output element; ceil-divide so partial tiles at the
    // right/bottom edges are still covered.
    dim3 block_size(TILE_SIZE, TILE_SIZE);
    dim3 grid_size(
        (N + block_size.x - 1) / block_size.x,
        (M + block_size.y - 1) / block_size.y
    );

    mul<<<grid_size, block_size>>>(a_d, b_d, c_d, M, K, N);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    CUDA_CHECK(cudaMemcpy(c_h, c_d, size_c, cudaMemcpyDeviceToHost));

    // print() assumes the row stride equals the column count it is given, so
    // calling it directly on c_h with 5 columns would show the first 25
    // elements of row 0 rather than a 5 x 5 corner. Gather the real top-left
    // corner (row stride N) into a contiguous buffer first.
    const int PREVIEW = 5;
    const int show_rows = M < PREVIEW ? M : PREVIEW;
    const int show_cols = N < PREVIEW ? N : PREVIEW;
    int corner[PREVIEW * PREVIEW];
    for (int i = 0; i < show_rows; i++)
        for (int j = 0; j < show_cols; j++)
            corner[i * show_cols + j] = c_h[i * N + j];
    print(corner, show_rows, show_cols);

    free(a_h);
    free(b_h);
    free(c_h);
    CUDA_CHECK(cudaFree(a_d));
    CUDA_CHECK(cudaFree(b_d));
    CUDA_CHECK(cudaFree(c_d));

    return 0;
}
|