1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
| #include <stdio.h> #include <stdlib.h> #include <cuda_runtime.h>
// Edge length of the square matrices used throughout this file; each matrix
// is stored as a flat array of N * N ints.
static const int N = 1000;
// Fills an N x N row-major matrix so that the element at (row, col) holds
// row + col + 1.
//   mat: destination buffer with room for at least N * N ints.
inline void init(int* mat){
    int* cell = mat;  // walks the buffer in row-major order
    for(int row = 0; row < N; ++row){
        for(int col = 0; col < N; ++col){
            *cell++ = row + col + 1;
        }
    }
}

// Prints the top-left n x n corner of a matrix to stdout, one row per line,
// with each value followed by a single space.
//   mat: matrix stored row-major with a row stride of N ints.
//   n:   number of rows and columns to print.
inline void print(int* mat, int n){
    for(int row = 0; row < n; ++row){
        const int* line = mat + row * N;  // start of the current row
        for(int col = 0; col < n; ++col){
            printf("%d ", line[col]);
        }
        printf("\n");
    }
}
// Element-wise sum of two N x N row-major int matrices: C = A + B.
// Each thread handles at most one element: the x dimension of the launch
// selects the column and the y dimension selects the row. Threads whose
// (row, col) lies outside the N x N matrix do nothing.
//   A, B: input matrices.
//   C:    output matrix.
__global__ void add(int* A, int* B, int* C){
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    if(row < N && col < N){
        const int offset = row * N + col;  // flat row-major position
        C[offset] = A[offset] + B[offset];
    }
}
// Terminates the program with a diagnostic on stderr if a CUDA runtime call
// did not return cudaSuccess.
//   status: return value of the CUDA call being checked.
//   what:   short description of the call, included in the message.
static void checkCuda(cudaError_t status, const char* what){
    if(status != cudaSuccess){
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Allocates `bytes` bytes of host memory for a matrix, terminating the
// program with a diagnostic if the allocation fails.
static int* allocHostMatrix(size_t bytes){
    int* mem = (int*)malloc(bytes);
    if(mem == NULL){
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)bytes);
        exit(EXIT_FAILURE);
    }
    return mem;
}

// Adds two N x N matrices on the GPU and prints the top-left 6 x 6 corner of
// the result. Returns 0 on success; any host allocation or CUDA failure
// terminates the process with EXIT_FAILURE after reporting it on stderr.
int main(){
    const size_t size = sizeof(int) * N * N;

    // Host buffers: two inputs and one result.
    int* a_h = allocHostMatrix(size);
    int* b_h = allocHostMatrix(size);
    int* c_h = allocHostMatrix(size);

    // Device buffers mirroring the host ones.
    int* a_d = NULL;
    int* b_d = NULL;
    int* c_d = NULL;
    checkCuda(cudaMalloc((void**)&a_d, size), "cudaMalloc(a_d)");
    checkCuda(cudaMalloc((void**)&b_d, size), "cudaMalloc(b_d)");
    checkCuda(cudaMalloc((void**)&c_d, size), "cudaMalloc(c_d)");

    init(a_h);
    init(b_h);
    checkCuda(cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice), "copy of A to device");
    checkCuda(cudaMemcpy(b_d, b_h, size, cudaMemcpyHostToDevice), "copy of B to device");

    // One thread per element; the grid is rounded up so it covers all N
    // rows and columns even though N is not a multiple of the block edge.
    dim3 block_size(16, 16);
    dim3 grid_size(
        (N + block_size.x - 1) / block_size.x,
        (N + block_size.y - 1) / block_size.y
    );
    add<<<grid_size, block_size>>>(a_d, b_d, c_d);
    // A launch does not return a status itself: configuration errors are
    // reported by cudaGetLastError(), execution errors by the next sync.
    checkCuda(cudaGetLastError(), "launch of add kernel");
    checkCuda(cudaDeviceSynchronize(), "execution of add kernel");

    checkCuda(cudaMemcpy(c_h, c_d, size, cudaMemcpyDeviceToHost), "copy of C to host");

    print(c_h, 6);

    free(a_h);
    free(b_h);
    free(c_h);
    checkCuda(cudaFree(a_d), "cudaFree(a_d)");
    checkCuda(cudaFree(b_d), "cudaFree(b_d)");
    checkCuda(cudaFree(c_d), "cudaFree(c_d)");

    return 0;
}
|