最近在学cuda,发现自己数学方面的知识不太够,C语言的知识也有待加强。
这里记录一个笔记:用 CUDA 对矩阵做 3×3 卷积。
逻辑是这样的:
1. 先CUDA生成一个16*16的矩阵;
2. 将这16*16的矩阵,外面包一层0,也就变成18*18的矩阵。
3. 然后再开18*18个线程,进行矩阵的卷积
程序运行截图如下:
源码如下:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "curand.h"
#include "curand_kernel.h"
#include <stdio.h>
#include <iostream>
using namespace std;
#pragma comment(lib, "cudart.lib")
#pragma comment(lib, "curand.lib")
#define N 16
// Applies a 3x3 Laplacian filter (8-neighbour sum minus 8x centre) to the
// interior of a zero-padded (N+2)x(N+2) matrix.
//
// Expected launch: ONE 1-D block of (N+2)*(N+2) threads, one thread per
// padded cell (this is how main() launches it).
//   a: device pointer, (N+2)*(N+2) floats, zero-padded input
//   b: device pointer, N*N floats, convolution result (interior cells only)
__global__ void Matrix_convolution(float *a, float *b) {
	const int W = N + 2;  // width of the zero-padded matrix

	// Flatten the thread index explicitly. The original computed
	// threadIdx.y * (N+2) + threadIdx.x, which only happened to be correct
	// because the launch is 1-D (threadIdx.y == 0); this form is correct for
	// any block shape whose flat size covers the padded matrix.
	int pos = threadIdx.y * blockDim.x + threadIdx.x;

	// Stage the padded matrix in shared memory; every thread loads one cell.
	__shared__ float tile[W * W];
	if (pos < W * W) {
		tile[pos] = a[pos];
	}
	// Barrier is outside any divergent branch: all threads reach it.
	__syncthreads();
	if (pos >= W * W) return;

	int row = pos / W;
	int col = pos % W;
	// Padding cells (first/last row and column) produce no output.
	// (The original tested pos > (N+2)*(N+1) instead of >=, missing the first
	// cell of the last row; it was only rescued by the column check. The
	// row/col form has no such gap.)
	if (row == 0 || row == W - 1 || col == 0 || col == W - 1) return;

	// Laplacian filter weights, row-major. const float: the original used a
	// mutable per-thread int array mixed into float arithmetic.
	const float k[9] = { 1.0f, 1.0f, 1.0f, 1.0f, -8.0f, 1.0f, 1.0f, 1.0f, 1.0f };

	float ret =
		  k[0] * tile[pos - W - 1] + k[1] * tile[pos - W] + k[2] * tile[pos - W + 1]
		+ k[3] * tile[pos - 1]     + k[4] * tile[pos]     + k[5] * tile[pos + 1]
		+ k[6] * tile[pos + W - 1] + k[7] * tile[pos + W] + k[8] * tile[pos + W + 1];

	// Map padded coordinates (row, col) back into the NxN output. This is
	// algebraically identical to the original's posOffset arithmetic:
	// pos - ((N+2) + (row-1)*2 + 1) == (row-1)*N + (col-1).
	b[(row - 1) * N + (col - 1)] = ret;
}
// Fills the device buffer a (N*N floats) with uniform [0,1) random values
// using cuRAND's MRG32k3a generator. The seed is fixed (11) so every run
// produces the same matrix, which keeps the printed output reproducible.
void Matrix_init_gpu(float *a) {
	curandGenerator_t gen;
	curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MRG32K3A);
	curandSetPseudoRandomGeneratorSeed(gen, 11ULL);
	curandGenerateUniform(gen, a, N * N);
	// Fix: the original leaked the generator; release it once the buffer is filled.
	curandDestroyGenerator(gen);
}
// Demo driver: generate a random NxN matrix on the GPU, zero-pad it to
// (N+2)x(N+2) on the host, run the 3x3 convolution kernel, and print the
// input, the padded matrix, and the result.
int main() {
	float *p_original_gpu;  // device: raw N*N random matrix
	float *p_original_cpu;  // host copy of the random matrix
	float *p_final_cpu;     // host: (N+2)*(N+2) zero-padded matrix
	float *p_final_gpu;     // device copy of the padded matrix (kernel input)
	float *p_ret_gpu;       // device: N*N convolution result
	float *p_ret_cpu;       // host copy of the result

	p_original_cpu = (float*)malloc(N * N * sizeof(float));
	p_final_cpu = (float*)malloc((N + 2) * (N + 2) * sizeof(float));
	p_ret_cpu = (float*)malloc(N * N * sizeof(float));
	// Fix: the original never checked the host allocations.
	if (!p_original_cpu || !p_final_cpu || !p_ret_cpu) {
		fprintf(stderr, "host allocation failed\n");
		return 1;
	}
	cudaMalloc((void**)&p_original_gpu, N * N * sizeof(float));
	cudaMalloc((void**)&p_final_gpu, (N + 2) * (N + 2) * sizeof(float));
	cudaMalloc((void**)&p_ret_gpu, N * N * sizeof(float));

	Matrix_init_gpu(p_original_gpu);
	// Blocking copy: also synchronizes with the cuRAND generation above.
	cudaMemcpy(p_original_cpu, p_original_gpu, N * N * sizeof(float), cudaMemcpyDeviceToHost);
	for (int i = 0; i < N * N; i++) {
		if (i % N == 0) printf("\n");
		cout << p_original_cpu[i] << " ";
	}

	// Build the zero-padded matrix: clear everything, then copy the NxN data
	// into the interior cells (skip row 0, row N+1, column 0, column N+1).
	for (int i = 0; i < (N + 2) * (N + 2); i++) {
		p_final_cpu[i] = 0;
	}
	int pos = 0;
	for (int i = N + 2; i < (N + 2) * (N + 1); i++) {
		if (i % (N + 2) != 0 && i % (N + 2) != N + 1) {
			p_final_cpu[i] = p_original_cpu[pos++];
		}
	}
	cout << "\n\n填充数据:" << endl;
	for (int i = 0; i < (N + 2) * (N + 2); i++) {
		if (i % (N + 2) == 0) printf("\n");
		cout << p_final_cpu[i] << " ";
	}
	cout << "\n\n最后结果:" << endl;
	cudaMemcpy(p_final_gpu, p_final_cpu, (N + 2) * (N + 2) * sizeof(float), cudaMemcpyHostToDevice);

	// One 1-D block with one thread per padded cell — the kernel relies on
	// this launch shape.
	Matrix_convolution << <1, (N + 2) * (N + 2) >> >(p_final_gpu, p_ret_gpu);
	// Fix: kernel launches fail silently; surface bad-configuration errors here.
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
		return 1;
	}
	// Blocking copy: waits for the kernel to finish before reading the result.
	cudaMemcpy(p_ret_cpu, p_ret_gpu, N * N * sizeof(float), cudaMemcpyDeviceToHost);
	for (int i = 0; i < N * N; i++) {
		if (i % N == 0) printf("\n");
		cout << p_ret_cpu[i] << " ";
	}

	cudaFree(p_original_gpu);
	cudaFree(p_final_gpu);
	cudaFree(p_ret_gpu);
	free(p_original_cpu);
	free(p_final_cpu);
	free(p_ret_cpu);
	getchar();
	return 0;
}