cuda笔记-一个Block多线程求卷积

最近在学cuda,发现自己数学方面的知识不太够,C语言的知识也有待加强。

这里记录一个对矩阵求卷积的笔记。

逻辑是这样的:

1. 先CUDA生成一个16*16的矩阵;

2. 将这16*16的矩阵,外面包一层0,也就变成18*18的矩阵。

3. 然后再开18*18个线程,对这18*18的矩阵做3*3卷积(边缘的一圈0不参与),得到16*16的结果矩阵

 

程序运行截图如下:

源码如下:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "curand.h"
#include "curand_kernel.h"
#include <stdio.h>
#include <iostream>

using namespace std;

#pragma comment(lib, "cudart.lib")
#pragma comment(lib, "curand.lib")

#define N 16

// Applies a 3x3 Laplacian-style filter (8-neighbour sum minus 8x centre)
// to a zero-padded (N+2)x(N+2) matrix, writing the N*N interior result.
//
// Expected launch: one block of (N+2)*(N+2) threads, flat 1-D:
//   Matrix_convolution<<<1, (N + 2) * (N + 2)>>>(a, b);
// a: device pointer to the padded (N+2)x(N+2) input.
// b: device pointer to the N*N output.
__global__ void Matrix_convolution(float *a, float *b) {

	// Flat index into the padded matrix. The original combined
	// threadIdx.x and threadIdx.y, which was only correct because the
	// launch is 1-D (threadIdx.y == 0); a single flat index makes the
	// intent explicit and robust if the grid is ever enlarged.
	int pos = blockIdx.x * blockDim.x + threadIdx.x;

	__shared__ float shared[(N + 2) * (N + 2)];

	// Stage the padded input in shared memory, guarding against any
	// threads beyond the array size.
	if (pos < (N + 2) * (N + 2)) {
		shared[pos] = a[pos];
	}

	// The barrier must be reached by every thread in the block, so it
	// sits outside the guard above and before any early return.
	__syncthreads();

	if (pos >= (N + 2) * (N + 2)) return;

	int row = pos / (N + 2);
	int col = pos % (N + 2);

	// The zero-padding ring (first/last row, first/last column) is not
	// convolved; those threads only helped load shared memory.
	if (row == 0 || row == N + 1 || col == 0 || col == N + 1) return;

	// 3x3 filter weights, row-major.
	const float w[9] = { 1.0f, 1.0f, 1.0f, 1.0f, -8.0f, 1.0f, 1.0f, 1.0f, 1.0f };

	float ret =
		  w[0] * shared[pos - (N + 2) - 1]
		+ w[1] * shared[pos - (N + 2)]
		+ w[2] * shared[pos - (N + 2) + 1]
		+ w[3] * shared[pos - 1]
		+ w[4] * shared[pos]
		+ w[5] * shared[pos + 1]
		+ w[6] * shared[pos + (N + 2) - 1]
		+ w[7] * shared[pos + (N + 2)]
		+ w[8] * shared[pos + (N + 2) + 1];

	// Map padded (row, col) to the N*N output. This is algebraically
	// identical to the original's "pos - posOffset" bookkeeping.
	b[(row - 1) * N + (col - 1)] = ret;
}


// Fills the first N*N floats of device buffer `a` with uniform (0, 1]
// pseudo-random values. The seed is fixed (11) so runs are reproducible.
// a: device pointer with room for at least N*N floats.
void Matrix_init_gpu(float *a) {

	curandGenerator_t gen = NULL;

	// cuRAND failures are silent unless the status codes are checked.
	if (curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MRG32K3A) != CURAND_STATUS_SUCCESS) {
		fprintf(stderr, "curandCreateGenerator failed\n");
		return;
	}
	if (curandSetPseudoRandomGeneratorSeed(gen, 11ULL) != CURAND_STATUS_SUCCESS) {
		fprintf(stderr, "curandSetPseudoRandomGeneratorSeed failed\n");
	} else if (curandGenerateUniform(gen, a, N * N) != CURAND_STATUS_SUCCESS) {
		fprintf(stderr, "curandGenerateUniform failed\n");
	}

	// The original leaked the generator; always release it.
	curandDestroyGenerator(gen);
}

// Abort-on-error wrapper for CUDA runtime calls. Kernel launch errors are
// caught separately via cudaGetLastError() right after the launch.
#define CUDA_CHECK(call)                                                      \
	do {                                                                      \
		cudaError_t err_ = (call);                                            \
		if (err_ != cudaSuccess) {                                            \
			fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
					cudaGetErrorString(err_));                                \
			exit(EXIT_FAILURE);                                               \
		}                                                                     \
	} while (0)

// Demo driver:
//  1. generate a random N*N matrix on the GPU,
//  2. zero-pad it on the host into an (N+2)*(N+2) matrix,
//  3. convolve it on the GPU with one block of (N+2)*(N+2) threads,
//  4. print the input, the padded matrix, and the N*N result.
int main() {

	float *p_original_gpu;
	float *p_final_gpu;
	float *p_ret_gpu;

	// Host buffers; malloc can fail, so check before use.
	float *p_original_cpu = (float*)malloc(N * N * sizeof(float));
	float *p_final_cpu = (float*)malloc((N + 2) * (N + 2) * sizeof(float));
	float *p_ret_cpu = (float*)malloc(N * N * sizeof(float));
	if (p_original_cpu == NULL || p_final_cpu == NULL || p_ret_cpu == NULL) {
		fprintf(stderr, "host allocation failed\n");
		return EXIT_FAILURE;
	}

	CUDA_CHECK(cudaMalloc((void**)&p_original_gpu, N * N * sizeof(float)));
	CUDA_CHECK(cudaMalloc((void**)&p_final_gpu, (N + 2) * (N + 2) * sizeof(float)));
	CUDA_CHECK(cudaMalloc((void**)&p_ret_gpu, N * N * sizeof(float)));

	Matrix_init_gpu(p_original_gpu);
	CUDA_CHECK(cudaMemcpy(p_original_cpu, p_original_gpu, N * N * sizeof(float), cudaMemcpyDeviceToHost));

	for (int i = 0; i < N * N; i++) {

		if (i % N == 0) printf("\n");
		cout << p_original_cpu[i] << " ";
	}

	// Build the zero-padded matrix: clear everything, then copy the N*N
	// source into the interior, skipping the first/last row and column.
	for (int i = 0; i < (N + 2) * (N + 2); i++) {

		p_final_cpu[i] = 0;
	}

	int pos = 0;
	for (int i = N + 2; i < (N + 2) * (N + 1); i++) {

		if (i % (N + 2) != 0 && i % (N + 2) != N + 1) {

			p_final_cpu[i] = p_original_cpu[pos++];
		}
	}

	cout << "\n\n填充数据:" << endl;
	for (int i = 0; i < (N + 2) * (N + 2); i++) {

		if (i % (N + 2) == 0) printf("\n");
		cout << p_final_cpu[i] << " ";
	}

	cout << "\n\n最后结果:" << endl;
	CUDA_CHECK(cudaMemcpy(p_final_gpu, p_final_cpu, (N + 2) * (N + 2) * sizeof(float), cudaMemcpyHostToDevice));

	// One block of (N+2)*(N+2) = 324 threads; each interior thread
	// produces one element of the N*N result.
	Matrix_convolution << <1, (N + 2) * (N + 2) >> >(p_final_gpu, p_ret_gpu);
	CUDA_CHECK(cudaGetLastError());  // catch bad launch configuration

	// This blocking copy also synchronizes with the kernel.
	CUDA_CHECK(cudaMemcpy(p_ret_cpu, p_ret_gpu, N * N * sizeof(float), cudaMemcpyDeviceToHost));

	for (int i = 0; i < N * N; i++) {

		if (i % N == 0) printf("\n");
		cout << p_ret_cpu[i] << " ";
	}

	CUDA_CHECK(cudaFree(p_original_gpu));
	CUDA_CHECK(cudaFree(p_final_gpu));
	CUDA_CHECK(cudaFree(p_ret_gpu));
	free(p_original_cpu);
	free(p_final_cpu);
	free(p_ret_cpu);

	getchar();
	return 0;
}

 

相关推荐
©️2020 CSDN 皮肤主题: 数字20 设计师:CSDN官方博客 返回首页