1、cuda的学习记录Chapter01
基本思想:学习一下CUDA编程,随手记录一下如何使用grid、block、thread(线程网格、线程块、线程),以及线程块和线程编号的概念和应用方法
一、使用工具clion+ubuntu
CMakeLists.txt
# Build script for the single-file CUDA demo: one executable built from main.cu.

# Minimum CMake version accepted for this project.
cmake_minimum_required(VERSION 3.0)
# Project name; the executable target below uses the same name.
project(untitled18)
# Request the C++14 standard for C++ sources.
set(CMAKE_CXX_STANDARD 14)
# Locate the CUDA toolkit; REQUIRED makes configuration fail if it is missing.
# NOTE(review): this goes through the legacy FindCUDA module, which newer CMake
# releases deprecate in favour of first-class CUDA language support — confirm
# against the CMake version actually in use.
find_package(CUDA REQUIRED)
# Put the CUDA toolkit headers on the include path.
include_directories(${CUDA_INCLUDE_DIRS})
# Build the executable from main.cu using the helper provided by find_package(CUDA).
cuda_add_executable(untitled18 main.cu)
main.cu
#include <iostream>
#include <stdio.h>
#include "cuda.h"
// Debug kernel: every thread prints its block index, its thread index within
// the block, and the flattened 1-D global index derived from the two.
// Only the .x components are used, so a 1-D launch is expected.
__global__ void hello_from_gpu()
{
    // Flattened index: threads of block b occupy [b * blockDim.x, (b + 1) * blockDim.x).
    const int globalId = threadIdx.x + blockIdx.x * blockDim.x;

    printf("blockIdx.x %d threadIdx.x %d x %d\n", blockIdx.x, threadIdx.x, globalId);
}
// Launches hello_from_gpu on a 1-D grid of 2 blocks x 9 threads and waits for
// the kernel to finish, so the device-side printf output is flushed before the
// process exits.
// Returns 0 on success, 1 if the launch or the kernel execution reports an error.
int main() {
    // <<<grid_dim, block_dim>>>: 2 blocks of 9 threads each, which gives
    // blockIdx.x in [0, 2) and threadIdx.x in [0, 9) inside the kernel.
    hello_from_gpu<<<2, 9>>>();

    // A kernel launch has no return value; launch/configuration errors have to
    // be fetched explicitly.
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // The launch is asynchronous: block until the kernel completes. Errors
    // raised while the kernel was running are reported by this call.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}
测试结果
/home/ubuntu/CLionProjects/untitled18/cmake-build-debug/untitled18
blockIdx.x 0 threadIdx.x 0 x 0
blockIdx.x 0 threadIdx.x 1 x 1
blockIdx.x 0 threadIdx.x 2 x 2
blockIdx.x 0 threadIdx.x 3 x 3
blockIdx.x 0 threadIdx.x 4 x 4
blockIdx.x 0 threadIdx.x 5 x 5
blockIdx.x 0 threadIdx.x 6 x 6
blockIdx.x 0 threadIdx.x 7 x 7
blockIdx.x 0 threadIdx.x 8 x 8
blockIdx.x 1 threadIdx.x 0 x 9
blockIdx.x 1 threadIdx.x 1 x 10
blockIdx.x 1 threadIdx.x 2 x 11
blockIdx.x 1 threadIdx.x 3 x 12
blockIdx.x 1 threadIdx.x 4 x 13
blockIdx.x 1 threadIdx.x 5 x 14
blockIdx.x 1 threadIdx.x 6 x 15
blockIdx.x 1 threadIdx.x 7 x 16
blockIdx.x 1 threadIdx.x 8 x 17
Process finished with exit code 0
二、测试程序
#include <iostream>
#include <stdio.h>
#include "cuda.h"
#include "algorithm"
#include "chrono"
// Element-wise integer vector addition: cc[i] = aa[i] + bb[i].
// Each thread handles the single element at its flattened 1-D index
//   index = blockDim.x * blockIdx.x + threadIdx.x
// and prints its block index, thread index and that flattened index.
//
// Parameters:
//   aa, bb  device pointers to the two input arrays
//   cc      device pointer to the output array
//   n       number of elements; threads whose index is >= n do nothing, so the
//           grid may be larger than the data. A negative value (the default)
//           disables the check, which keeps the original three-argument call
//           valid; in that case the launch must cover exactly the array length.
__global__ void hello_from_gpu(int *aa, int *bb, int *cc, int n = -1)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;

    // Guard the grid tail: surplus threads must not touch memory past the arrays.
    if (n >= 0 && index >= n) {
        return;
    }

    cc[index] = aa[index] + bb[index];
    printf("blockIdx.x %d threadIdx.x %d index %d \n", blockIdx.x, threadIdx.x, index);
}

// Returns true when `status` is cudaSuccess; otherwise reports the failed
// operation `what` together with the CUDA error string on stderr and returns false.
static bool cuda_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Adds two 10-element host arrays (a[i] = 2*i, b[i] = i) on the GPU, prints the
// wall-clock time of the kernel launch + synchronisation and then every sum.
// Returns 0 on success, 1 if a host allocation or any CUDA call fails.
int main() {
    const int n = 10;
    const int threadsPerBlock = 5;
    // Ceil-division so the grid always covers all n elements.
    const int blocks = (n + threadsPerBlock - 1) / threadsPerBlock;
    const size_t bytes = sizeof(int) * n;

    // Host buffers (a, b: inputs, c: result) and their device counterparts.
    int *a = (int *)malloc(bytes);
    int *b = (int *)malloc(bytes);
    int *c = (int *)malloc(bytes);
    int *aa = nullptr, *bb = nullptr, *cc = nullptr;
    int rc = 1;

    // Single-pass loop: `break` jumps to the shared cleanup code below.
    do {
        if (a == nullptr || b == nullptr || c == nullptr) {
            fprintf(stderr, "host allocation failed\n");
            break;
        }
        for (int i = 0; i < n; i++) {
            a[i] = 2 * i;
            b[i] = i;
        }

        if (!cuda_ok(cudaMalloc((void **)&aa, bytes), "cudaMalloc(aa)")) break;
        if (!cuda_ok(cudaMalloc((void **)&bb, bytes), "cudaMalloc(bb)")) break;
        if (!cuda_ok(cudaMalloc((void **)&cc, bytes), "cudaMalloc(cc)")) break;

        if (!cuda_ok(cudaMemcpy(aa, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> aa)")) break;
        if (!cuda_ok(cudaMemcpy(bb, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> bb)")) break;

        // steady_clock is monotonic, so the measured interval cannot be
        // distorted by wall-clock adjustments.
        auto t1 = std::chrono::steady_clock::now();
        // <<<grid_dim, block_dim>>> determine blockIdx / threadIdx inside the kernel.
        hello_from_gpu<<<blocks, threadsPerBlock>>>(aa, bb, cc, n);
        // Launch errors are not returned by the launch itself.
        if (!cuda_ok(cudaGetLastError(), "kernel launch")) break;
        // The launch is asynchronous; wait here so the timing covers the kernel
        // run and execution errors are reported.
        if (!cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) break;
        auto t2 = std::chrono::steady_clock::now();

        auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1);
        std::cout << "Time difference: " << elapsed.count() << " ns" << std::endl;

        if (!cuda_ok(cudaMemcpy(c, cc, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(cc -> c)")) break;

        for (int i = 0; i < n; i++) {
            printf("%d+%d=%d\n", a[i], b[i], c[i]);
        }
        rc = 0;
    } while (false);

    // Cleanup runs on every path; freeing a null pointer is a no-op for both
    // free() and cudaFree().
    cudaFree(aa);
    cudaFree(bb);
    cudaFree(cc);
    free(a);
    free(b);
    free(c);
    return rc;
}
测试结果
/home/ubuntu/CLionProjects/untitled18/cmake-build-debug/untitled18
blockIdx.x 0 threadIdx.x 0 index 0
blockIdx.x 0 threadIdx.x 1 index 1
blockIdx.x 0 threadIdx.x 2 index 2
blockIdx.x 0 threadIdx.x 3 index 3
blockIdx.x 0 threadIdx.x 4 index 4
blockIdx.x 1 threadIdx.x 0 index 5
blockIdx.x 1 threadIdx.x 1 index 6
blockIdx.x 1 threadIdx.x 2 index 7
blockIdx.x 1 threadIdx.x 3 index 8
blockIdx.x 1 threadIdx.x 4 index 9
Time difference: 79481 ns
0+0=0
2+1=3
4+2=6
6+3=9
8+4=12
10+5=15
12+6=18
14+7=21
16+8=24
18+9=27
Process finished with exit code 0