matrix multiply(cuda)

Posted on 2026-02-03 In AI Infra , CUDA C

需要在掌握矩阵加法的基础上，学习共享内存(__shared__)以及tile分块策略

同一个block中的所有thread共享一份共享内存

先将经常需要使用的数据从内存中转移到共享内存，在加载数据的时候就直接从共享内存中加载而不是从内存中加载，从而加快速度

同时在具体实现的时候使用了一个trick：分块计算矩阵乘法。这样做可以完美地与共享内存的机制进行搭配使用。

代码实现如下（相关具体实现细节在代码注释说明）：

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

const int M = 200;
const int K = 400;
const int N = 500;

// 定义分块的大小
const int TILE_SIZE = 32;

inline void init(int* mat, int row, int col){
    for(int i = 0; i < row; i ++)
        for(int j = 0; j < col; j ++)
            mat[i * col + j] = i + j;
}
inline void print(int* mat, int row, int col){
    printf("=========================================\n");
    for(int i = 0; i < row; i ++){
        for(int j = 0; j < col; j ++)
            printf("%d ", mat[i * col + j]);
        printf("\n");
    }
    printf("=========================================\n");
}

__global__ void mul(int* a, int* b, int* c, int M, int K, int N){
    int bx = blockIdx.x, by = blockIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;

    int col = bx * TILE_SIZE + tx;
    int row = by * TILE_SIZE + ty;

	// 定义共享内存 ds_a存储A矩阵的部分特定数据， ds_b存储B矩阵部分特定数据
    __shared__ int ds_a[TILE_SIZE][TILE_SIZE];
    __shared__ int ds_b[TILE_SIZE][TILE_SIZE];

    //存储最终的计算得到的值
    int val = 0;

    // 这里一个很关键的思想，因为无法将整个矩阵都放进共享内存中（共享内存太小了），所以分阶段处理计算某个点的数据值，这并不违背矩阵乘法的计算法则
    for(int ph = 0; ph < (K + TILE_SIZE - 1) / TILE_SIZE; ph ++){
        // 将A矩阵的部分数值复制到共享内存ds_a中
        if(row < M && (ph * TILE_SIZE + tx) < K)//这里是为了防止超过边界，如果超过就赋值为0
            ds_a[ty][tx] = a[row * K + (ph * TILE_SIZE + tx)];
        else
            ds_a[ty][tx] = 0;

        // B矩阵的相关逻辑，完全同理
        if((ph * TILE_SIZE + ty) < K && col < N)
            ds_b[ty][tx] = b[(ph * TILE_SIZE + ty) * N + col];
        else
            ds_b[ty][tx] = 0;
		
        // 这个函数是指：让跑的快的线程等等跑的慢的线程，等同一个block的thread跑完之后再进行下一行
        // 这里的作用是为了防止数据还没搬运完就开始进行乘法运算了
        __syncthreads();

        // 将现有共享内存中储存的部分的数据进行矩阵乘法
        for(int i = 0; i < TILE_SIZE; i++)
            val += ds_a[ty][i] * ds_b[i][tx];

        //同样，这里也需要加一个，目的是为了防止跑得快的线程进入下一个循环将数据覆盖了
        __syncthreads();
    }

    // 超过最终矩阵的大小
    if(col >= N || row >= M)
        return;
    
    // 写入
    c[row * N + col] = val;
}

int main(){

    int *a_h, *b_h, *c_h;
    int *a_d, *b_d, *c_d;

    size_t size_a = sizeof(int) * M * K;
    size_t size_b = sizeof(int) * K * N;
    size_t size_c = sizeof(int) * M * N;

    a_h = (int*)malloc(size_a);
    b_h = (int*)malloc(size_b);
    cudaMalloc((void**)&a_d, size_a);
    cudaMalloc((void**)&b_d, size_b);
    cudaMalloc((void**)&c_d, size_c);

    init(a_h, M, K);
    init(b_h, K, N);

    cudaMemcpy(a_d, a_h, size_a, cudaMemcpyHostToDevice);
    cudaMemcpy(b_d, b_h, size_b, cudaMemcpyHostToDevice);

    dim3 block_size(TILE_SIZE, TILE_SIZE);
    dim3 grid_size(
        (N + block_size.x - 1)/block_size.x,
        (M + block_size.y - 1)/block_size.y
    );

    mul<<<grid_size, block_size>>>(a_d, b_d, c_d, M, K, N);


    c_h = (int*)malloc(size_c);
    cudaMemcpy(c_h, c_d, size_c, cudaMemcpyDeviceToHost);
    print(c_h, 5, 5);


    free(a_h);free(b_h);free(c_h);
    cudaFree(a_d);cudaFree(b_d);cudaFree(c_d);

    return 0;
}