當前位置:網站首頁>【GPU加速】開發低延遲代碼性能提昇76.33%——通過VS2017創建CUDA項目對比CPU代碼和GPU代碼的延遲(親測代碼可運行簡單可運行適合入手)

【GPU加速】開發低延遲代碼性能提昇76.33%——通過VS2017創建CUDA項目對比CPU代碼和GPU代碼的延遲(親測代碼可運行,簡單易懂,適合入門)

2022-05-13 13:04:18量化Mike

一、在CPU上創建數組相加的算法:

#include "stdio.h"
#include<iostream>

//定義array元素的個數
#define N 10000000

// Element-wise vector addition on the host.
//   h_a, h_b : input arrays, each holding at least N ints
//   h_c      : output array of at least N ints; h_c[i] receives h_a[i] + h_b[i]
// N is the compile-time element count defined at file scope.
void cpuAdd(int *h_a, int *h_b, int *h_c) {
	for (int idx = 0; idx < N; ++idx) {
		h_c[idx] = h_a[idx] + h_b[idx];
	}
}

// Host-only demo: fills two N-element vectors, adds them with cpuAdd and
// prints every element of the result. Always returns 0.
int main04(void) {
	// Three N-element int arrays do not fit on a default thread stack once N
	// is large (10,000,000 elements need about 120 MB), so they live on the
	// heap instead of being declared as local arrays.
	int *h_a = new int[N];
	int *h_b = new int[N];
	int *h_c = new int[N];

	// Initialise the two operands: h_a[i] = 2*i^2, h_b[i] = i.
	for (int i = 0; i < N; i++) {
		// 2*i*i no longer fits in an int once i >= 32768; evaluating it in
		// unsigned arithmetic makes the wrap-around well defined instead of
		// relying on signed overflow (undefined behaviour).
		const unsigned int u = static_cast<unsigned int>(i);
		h_a[i] = static_cast<int>(2u * u * u);
		h_b[i] = i;
	}

	// Run the CPU addition.
	cpuAdd(h_a, h_b, h_c);

	// Print the result, one line per element.
	printf("Vector addition on CPU\n");

	for (int i = 0; i < N; i++) {
		printf("The sum of %d element is %d + %d = %d\n", i, h_a[i], h_b[i], h_c[i]);
	}

	delete[] h_a;
	delete[] h_b;
	delete[] h_c;
	return 0;
}

在這裏插入圖片描述

二、在GPU上創建數組相加的算法:

#include <device_launch_parameters.h>
#include "stdio.h"
#include<iostream>
#include <cuda.h>
#include <cuda_runtime.h>

//定義array元素的個數
#define N 10000000

// Vector-addition kernel: d_c[i] = d_a[i] + d_b[i] for every i in [0, N).
//   d_a, d_b : device input arrays with at least N ints each
//   d_c      : device output array with at least N ints
// Launch layout: any 1-D grid/block configuration. Each thread starts at its
// global index and advances by the total number of launched threads
// (grid-stride loop), so the result does not depend on the launch shape.
// With the <<<N, 1>>> launch every block handles exactly the element whose
// index equals its block index. Uses no shared memory.
__global__ void gpuAdd05(int *d_a, int *d_b, int *d_c) {
	// Index math is done in size_t so gridDim.x * blockDim.x cannot overflow.
	const size_t totalThreads = (size_t)gridDim.x * blockDim.x;
	for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
	     i < (size_t)N;
	     i += totalThreads) {
		d_c[i] = d_a[i] + d_b[i];
	}
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk05(cudaError_t status, const char *what) {
	if (status == cudaSuccess) {
		return true;
	}
	fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
	return false;
}

// GPU demo: fills two N-element vectors on the host, adds them on the device
// with gpuAdd05 and prints every element of the result.
// Returns 0 on success and 1 if any CUDA call fails.
int main(void) {
	const size_t bytes = (size_t)N * sizeof(int);

	// Host arrays are heap-allocated: three N-element int arrays as locals
	// would exhaust the stack for large N (10,000,000 elements ~ 120 MB).
	int *h_a = new int[N];
	int *h_b = new int[N];
	int *h_c = new int[N];

	// Device pointers start as NULL so the cudaFree calls below are safe even
	// when an allocation failed.
	int *d_a = NULL, *d_b = NULL, *d_c = NULL;

	// Initialise the operands: h_a[i] = 2*i^2, h_b[i] = i.
	for (int i = 0; i < N; i++) {
		// 2*i*i does not fit in an int once i >= 32768; unsigned arithmetic
		// gives a well-defined wrap-around instead of signed overflow.
		const unsigned int u = static_cast<unsigned int>(i);
		h_a[i] = static_cast<int>(2u * u * u);
		h_b[i] = i;
	}

	// Allocate device memory; the && chain stops at the first failure.
	bool ok = cudaOk05(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)")
		&& cudaOk05(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)")
		&& cudaOk05(cudaMalloc((void**)&d_c, bytes), "cudaMalloc(d_c)");

	// Copy both operands from host memory to device memory.
	ok = ok
		&& cudaOk05(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)")
		&& cudaOk05(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_b)");

	if (ok) {
		// Launch N blocks with one thread each.
		gpuAdd05<<<N, 1>>>(d_a, d_b, d_c);

		// A kernel launch returns nothing: cudaGetLastError reports a bad
		// launch configuration, and the blocking copy back to the host
		// reports any error raised while the kernel was running.
		ok = cudaOk05(cudaGetLastError(), "gpuAdd05 launch")
			&& cudaOk05(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_c)");
	}

	if (ok) {
		printf("Vector addition on GPU \n");

		// Print the result. The format string has three conversions, so only
		// the two operands and their sum are passed (the original also passed
		// the index first, which shifted every printed value by one slot).
		for (int i = 0; i < N; i++) {
			printf("向量加法為: %d + %d = %d\n", h_a[i], h_b[i], h_c[i]);
		}
	}

	// Release device and host memory on every path.
	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_c);
	delete[] h_a;
	delete[] h_b;
	delete[] h_c;

	return ok ? 0 : 1;
}

在這裏插入圖片描述

三、對比上述CPU與GPU代碼的延遲

上述代碼中,我們設置的數組元素個數N為5;這裏為了讓CPU與GPU執行時間的差異更加顯著,將N設置為30000。

#include <device_launch_parameters.h>
#include "stdio.h"
#include<iostream>
#include <cuda.h>
#include <time.h>
#include <cuda_runtime.h>

//定義array元素的個數
#define N 30000


// Host reference implementation of the vector sum: for every index below N
// (the file-level element count) stores h_a[index] + h_b[index] in h_c[index].
//   h_a, h_b : input arrays of at least N ints
//   h_c      : output array of at least N ints
void cpuAdd06(int *h_a, int *h_b, int *h_c) {
	const int *lhs = h_a;
	const int *rhs = h_b;
	// Walk the three arrays in lock-step until N results have been written.
	for (int *out = h_c; out < h_c + N; ++out, ++lhs, ++rhs) {
		*out = *lhs + *rhs;
	}
}


// Device vector-addition kernel: writes d_a[i] + d_b[i] to d_c[i] for all
// i in [0, N).
//   d_a, d_b : device input arrays, at least N ints each
//   d_c      : device output array, at least N ints
// Launch layout: any 1-D grid and block size. The element index is derived
// from both the block index and the thread index, and a grid-stride loop
// covers the case where fewer than N threads are launched. Indexing by
// blockIdx.x alone made every thread of a multi-thread block recompute the
// same element. Uses no shared memory.
__global__ void gpuAdd06(int *d_a, int *d_b, int *d_c) {
	// size_t keeps gridDim.x * blockDim.x from overflowing 32-bit arithmetic.
	const size_t stride = (size_t)gridDim.x * blockDim.x;
	size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
	while (i < (size_t)N) {
		d_c[i] = d_a[i] + d_b[i];
		i += stride;
	}
}

void run_cpu_code(void)
{
    
	int h_a[N], h_b[N], h_c[N];

	//初始化兩個數組進行相加
	for (int i = 0; i < N; i++) {
    
		h_a[i] = 2 * i*i;
		h_b[i] = i;
	}

	//回調CPU上函數
	cpuAdd06(h_a, h_b, h_c);

	//Printing Answer
	printf("Vector addition on CPU\n");

	for (int i = 0; i < N; i++) {
    
		printf("The sum of %d element is %d + %d = %d\n", i, h_a[i], h_b[i], h_c[i]);
	}
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk06(cudaError_t status, const char *what) {
	if (status == cudaSuccess) {
		return true;
	}
	fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
	return false;
}

// Runs the GPU version of the demo: fills two N-element operands on the host,
// copies them to the device, adds them with gpuAdd06, copies the result back
// and prints one line per element. If any CUDA call fails, the error is
// written to stderr, the result is not printed, and device memory is still
// released.
void run_gpu_code(void)
{
	// Host arrays (three local arrays of N ints).
	int h_a[N], h_b[N], h_c[N];

	// Device pointers start as NULL so the cudaFree calls at the end are
	// safe even when an allocation failed.
	int *d_a = NULL, *d_b = NULL, *d_c = NULL;

	const size_t bytes = (size_t)N * sizeof(int);

	// Initialise the operands: h_a[i] = 2*i^2, h_b[i] = i.
	for (int i = 0; i < N; i++) {
		h_a[i] = 2 * i * i;
		h_b[i] = i;
	}

	// Allocate device memory; the && chain stops at the first failure.
	bool ok = cudaOk06(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)")
		&& cudaOk06(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)")
		&& cudaOk06(cudaMalloc((void**)&d_c, bytes), "cudaMalloc(d_c)");

	// Copy both operands from host memory to device memory.
	ok = ok
		&& cudaOk06(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)")
		&& cudaOk06(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_b)");

	if (ok) {
		// Launch N blocks with one thread each. The previous <<<N, 4>>>
		// launch started four threads per block although the intended layout
		// is one thread per block, and a kernel that indexes by block index
		// alone would make all four threads recompute the same element.
		gpuAdd06<<<N, 1>>>(d_a, d_b, d_c);

		// A kernel launch returns nothing: cudaGetLastError reports a bad
		// launch configuration, and the blocking copy back to the host
		// reports any error raised while the kernel was running.
		ok = cudaOk06(cudaGetLastError(), "gpuAdd06 launch")
			&& cudaOk06(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_c)");
	}

	if (ok) {
		printf("Vector addition on GPU \n");
		// Print the result, one line per element.
		for (int i = 0; i < N; i++) {
			printf("The sum of %d element is %d + %d = %d\n", i, h_a[i], h_b[i], h_c[i]);
		}
	}

	// Free device memory on every path.
	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_c);
}

// Times the CPU and the GPU variant of the vector addition with clock() and
// prints both durations in seconds.
// Both intervals cover everything run_cpu_code / run_gpu_code do (including
// their console output), not only the addition itself.
// Returns 0 on success and 1 if the final device synchronisation fails.
int main(void) {
	const clock_t start_h = clock();
	printf("執行CPU向量相加加法運算:\n");
	run_cpu_code();
	const clock_t end_h = clock();

	const clock_t start_d = clock();
	printf("執行GPU向量相加加法運算:\n");
	run_gpu_code();
	// cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
	// replacement. It waits for all outstanding device work before the end
	// timestamp is taken, and its status is checked instead of being dropped.
	const cudaError_t syncStatus = cudaDeviceSynchronize();
	const clock_t end_d = clock();

	if (syncStatus != cudaSuccess) {
		fprintf(stderr, "CUDA error in cudaDeviceSynchronize: %s\n", cudaGetErrorString(syncStatus));
		return 1;
	}

	const double time_d = (double)(end_d - start_d) / CLOCKS_PER_SEC;
	const double time_h = (double)(end_h - start_h) / CLOCKS_PER_SEC;
	// The format string originally ended in the typo "Secondsln"; the intended
	// trailing newline is restored.
	printf("數組中元素的個數為:%d \n GPU上運行的時間為: %f seconds \n CPU上運行的時間為: %f Seconds\n", N, time_d, time_h);

	return 0;
}

在這裏插入圖片描述
這裏對比可以發現,CPU運行耗時4.157s,GPU運行耗時0.984s(兩者均為整段函數的耗時,包含數組初始化與逐行打印結果的時間)。
由此可以計算出運行時間的縮短比例:
(4.157 − 0.984) / 4.157 × 100% ≈ 76.33%
即性能提昇約76.33%。

版權聲明
本文為[量化Mike]所創,轉載請帶上原文鏈接,感謝
https://cht.chowdera.com/2022/133/202205131259450083.html

隨機推薦