cuda笔记-GPU多线程的奇偶排序

首先说明奇偶排序:

算法的思路是:每一轮先比较并交换所有以奇数下标开头的相邻元素对 (a[1],a[2])、(a[3],a[4])……,再比较并交换所有以偶数下标开头的相邻元素对 (a[0],a[1])、(a[2],a[3])……;如此反复,直到某一轮中没有发生任何交换,此时序列有序。代码如下:

#include<stdio.h>
#include<stdlib.h>
#include<stdbool.h>
#include <iostream>

using namespace std;


// Exchanges the two integers that `a` and `b` point to.
void swap(int *a, int *b)
{
	const int first = *a;  // remember *a before it is overwritten
	*a = *b;
	*b = first;
}
// Writes the first `count` elements of `a` to stdout on one line, each
// followed by a single space, then terminates the line.
void printArray(int a[], int count)
{
	int idx = 0;
	while (idx < count)
	{
		printf("%d ", a[idx]);
		++idx;
	}
	printf("\n");
}
// Sorts a[0..size-1] into ascending order in place using odd-even
// (brick) sort.
//
// Each pass runs two phases:
//   1. odd phase  - compare/swap the pairs (a[1],a[2]), (a[3],a[4]), ...
//   2. even phase - compare/swap the pairs (a[0],a[1]), (a[2],a[3]), ...
// Passes repeat until one completes without any swap.
//
// After every pass the 1-based pass number is printed as "count:<n>".
//
// a:    array to sort (modified in place)
// size: number of elements in a; sizes 0 and 1 need a single pass that
//       performs no comparisons
void Odd_even_sort(int a[], int size)
{
	bool sorted = false;
	int count = 1;
	while (!sorted)
	{
		// Assume sorted until a swap in this pass proves otherwise.
		sorted = true;
		for (int i = 1; i < size - 1; i += 2)
		{
			if (a[i] > a[i + 1])
			{
				swap(&a[i], &a[i + 1]);
				sorted = false;
			}
		}
		for (int i = 0; i < size - 1; i += 2)
		{
			if (a[i] > a[i + 1])
			{
				swap(&a[i], &a[i + 1]);
				sorted = false;
			}
		}

		cout << "count:" << count << endl;
		// Advance the pass counter; previously it was never updated, so
		// every line printed "count:1".
		++count;
	}
}
// Demo driver: sorts a fixed reversed sequence with Odd_even_sort, prints
// the result, then waits for a key press before exiting.
int main(void)
{
	int values[] = { 9, 8, 7, 6, 5, 4, 3, 2, 1 };
	int length = sizeof(values) / sizeof(values[0]);

	Odd_even_sort(values, length);
	printArray(values, length);

	// Block on stdin so the output can be read before the program exits.
	getchar();
	return 0;
}

程序运行截图如下:

这个奇偶排序还有个特点:

奇偶排序实际上在多处理器环境中很有用,处理器可以分别同时处理每一个奇数对,然后又同时处理偶数对。因为奇数对是彼此独立的,每一刻都可以用不同的处理器比较和交换。这样可以非常快速地排序。

 

下面就用cuda来实现下,其中有个要说明的地方:

cuda中__syncthreads()的作用:它是block内部的线程同步屏障(barrier),即同一block内的每个线程执行到__syncthreads()处都会停下等待,直到该block内的所有线程都到达这一位置后才继续往下执行。注意它只能同步同一个block内的线程,不能跨block同步,所以下面的排序核函数只使用一个block。

代码如下:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include "curand.h"
#include "curand_kernel.h"
#include <stdio.h>
#include <iostream>

using namespace std;

#pragma comment(lib, "cudart.lib")
#pragma comment(lib, "curand.lib")

// Fills the buffer `a` with N pseudo-random floats produced by cuRAND's
// uniform generator (MRG32k3a, fixed seed 11, so every run yields the same
// sequence).
//
// a: destination buffer. The generator is created with
//    curandCreateGenerator (the device-output host API), so `a` is expected
//    to be device memory; main() passes a cudaMalloc'd pointer.
// N: number of floats to generate. Nothing is done when N <= 0 or a is NULL.
//
// Failures are reported on stderr. The generator is always destroyed
// before returning; previously it was leaked on every call.
void Matrix_init(float *a, int N) {

	if (a == NULL || N <= 0) {
		return;
	}

	curandGenerator_t gen;
	curandStatus_t status = curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MRG32K3A);
	if (status != CURAND_STATUS_SUCCESS) {
		fprintf(stderr, "curandCreateGenerator failed (status %d)\n", (int)status);
		return;
	}

	status = curandSetPseudoRandomGeneratorSeed(gen, 11ULL);
	if (status == CURAND_STATUS_SUCCESS) {
		status = curandGenerateUniform(gen, a, N);
	}
	if (status != CURAND_STATUS_SUCCESS) {
		fprintf(stderr, "cuRAND generation failed (status %d)\n", (int)status);
	}

	// Release the generator's resources on both success and failure paths.
	curandDestroyGenerator(gen);
}




// Odd-even transposition sort of a[0..N-1] into DESCENDING order, performed
// by a single thread block in shared memory.
//
// Launch requirements:
//   - exactly one block (__syncthreads() only synchronizes within a block;
//     several blocks would all sort the same array and race on `a`);
//   - dynamic shared memory of at least N * sizeof(float) bytes;
//   - any block size >= 1. All loops stride by blockDim.x, so the kernel
//     stays in bounds when blockDim.x > N and still covers every element
//     when blockDim.x < N.
//
// a: device array of N floats, sorted in place.
// N: number of elements.
__global__ void sort(float *a, int N) {

	extern __shared__ float s_a[];

	const int tid = threadIdx.x;
	const int stride = blockDim.x;

	// Stage the input in shared memory.
	for (int k = tid; k < N; k += stride) {
		s_a[k] = a[k];
	}
	__syncthreads();

	// N alternating phases: even phases handle pairs starting at 0, 2, 4, ...
	// and odd phases handle pairs starting at 1, 3, 5, ...
	for (int phase = 0; phase < N; phase++)
	{
		const int offset = phase % 2;

		// Pairs within one phase are disjoint, so threads never touch the
		// same element between two barriers.
		for (int pair = tid; 2 * pair + offset + 1 < N; pair += stride) {
			const int idx = 2 * pair + offset;
			if (s_a[idx] < s_a[idx + 1]) {
				float tmp = s_a[idx];
				s_a[idx] = s_a[idx + 1];
				s_a[idx + 1] = tmp;
			}
		}

		// The barrier sits outside the data-dependent loop and the phase
		// count is the same for every thread, so all threads reach it.
		__syncthreads();
	}

	// Write the sorted data back; the barrier ending the last phase already
	// made every shared-memory write visible.
	for (int k = tid; k < N; k += stride) {
		a[k] = s_a[k];
	}
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what) {
	if (err != cudaSuccess) {
		fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
		return false;
	}
	return true;
}

// Prints n floats to stdout, starting a new line before every `cols`-th value.
static void printValues(const float *v, int n, int cols) {
	for (int i = 0; i < n; i++) {
		if (i % cols == 0) {
			printf("\n");
		}
		cout << " " << v[i] << " ";
	}
}

// Demo driver: fills a device array with random floats, prints it, sorts it
// on the GPU with the single-block `sort` kernel, and prints the result.
// Returns 0 on success and 1 if an allocation, copy or launch fails.
int main() {

	const int m = 16;    // values printed per row
	const int N = 256;   // element count; also the block size of the launch
	const size_t bytes = N * sizeof(float);

	float *p_d = NULL;
	float *p_h = (float*)malloc(bytes);
	if (p_h == NULL) {
		fprintf(stderr, "host allocation failed\n");
		return 1;
	}

	int rc = 1;
	if (cudaOk(cudaMalloc((void**)&p_d, bytes), "cudaMalloc")) {

		Matrix_init(p_d, N);

		if (cudaOk(cudaMemcpy(p_h, p_d, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy (unsorted)")) {

			printValues(p_h, N, m);

			cout << endl << "sort:" << endl;

			// One block of N threads with N floats of dynamic shared memory.
			sort<<<1, N, bytes>>>(p_d, N);

			// A kernel launch returns no status, so query the launch error
			// explicitly. The blocking cudaMemcpy that follows surfaces any
			// error raised while the kernel runs.
			if (cudaOk(cudaGetLastError(), "sort launch") &&
				cudaOk(cudaMemcpy(p_h, p_d, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy (sorted)")) {

				printValues(p_h, N, m);
				// End the last row and flush before blocking on getchar().
				cout << endl;
				rc = 0;
			}
		}
	}

	// p_d is still NULL here if cudaMalloc failed.
	cudaFree(p_d);
	free(p_h);

	// Block on stdin so the output can be read before the program exits.
	getchar();
	return rc;
}

程序运行截图如下:

 

相关推荐
©️2020 CSDN 皮肤主题: 数字20 设计师:CSDN官方博客 返回首页