CUDA Parallel Computing Basics
Contents
- CUDA heterogeneous computing
- CUDA processing flow
- CUDA thread hierarchy: Grid, Block, Thread, blockIdx, threadIdx, gridDim, blockDim
- CUDA memory hierarchy
- CUDA Python
The parallel computing model
Parallel computing applies multiple compute resources simultaneously to solve a single computational problem:
- It involves multiple compute resources or processors
- The problem is decomposed into discrete parts that can be processed at the same time (in parallel)
- Each part is carried out by its own sequence of instructions
Steps for processing a task (a sketch follows this list):
- Copy the input data from CPU memory to GPU memory
- Cache the data on the chip, load the GPU program, and execute it
- Copy the results from GPU memory back to CPU memory
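A minimal Numba sketch of these three steps (the scale kernel, array sizes, and launch configuration here are illustrative assumptions, not taken from the notes):

import numpy as np
from numba import cuda

@cuda.jit
def scale(arr, factor):
    i = cuda.grid(1)                     # global thread index
    if i < arr.size:
        arr[i] *= factor

h_data = np.arange(1024, dtype=np.float32)
d_data = cuda.to_device(h_data)          # 1. copy input from CPU memory to GPU memory
threads = 256
blocks = (h_data.size + threads - 1) // threads
scale[blocks, threads](d_data, 2.0)      # 2. load the GPU program and execute it
result = d_data.copy_to_host()           # 3. copy the result back to CPU memory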
The CUDA thread hierarchy
Grid: a one- or multi-dimensional array of thread blocks (blocks)
Block: a group of threads
Every block in a grid contains the same number of threads. Threads within a block can (a sketch follows this list):
- synchronize
- access shared memory
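A minimal sketch of both facilities (an assumed example, not from the notes): each block cooperatively sums up to TPB elements of an input array into one partial result, using shared memory for the intermediate values and cuda.syncthreads() as the barrier.

import numpy as np
from numba import cuda, float32

TPB = 128  # threads per block, chosen for this sketch

@cuda.jit
def block_sum(x, partial_sums):
    sdata = cuda.shared.array(TPB, float32)          # shared memory, one slot per thread
    tid = cuda.threadIdx.x
    i = cuda.blockIdx.x * cuda.blockDim.x + tid
    sdata[tid] = x[i] if i < x.shape[0] else 0.0
    cuda.syncthreads()                               # wait until every thread has written
    stride = TPB // 2
    while stride > 0:                                # tree reduction within the block
        if tid < stride:
            sdata[tid] += sdata[tid + stride]
        cuda.syncthreads()
        stride //= 2
    if tid == 0:
        partial_sums[cuda.blockIdx.x] = sdata[0]

x = np.ones(1000, dtype=np.float32)
blocks = (x.size + TPB - 1) // TPB
partial = np.zeros(blocks, dtype=np.float32)
block_sum[blocks, TPB](x, partial)
print(partial.sum())   # ~1000.0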
# -*- coding: utf-8 -*-
"""Untitled2.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/11K0R66a7oBbLrVVaUDSJ4YTBUg_4-TL9
"""
!pip3 install numba
!nvcc -V
import cv2
import numpy as np
from numba import cuda
import time
import math
!wget https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/geforce-rtx-turing/overview/shop-2080-ti-1070@2x.jpg
!mv shop-2080-ti-1070@2x.jpg test.jpg
!ls
# GPU kernel: brighten the image (pixel*2 + 30), one thread per pixel
@cuda.jit
def process_gpu(img, channels):
    tx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    ty = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    if tx < img.shape[0] and ty < img.shape[1]:   # guard against out-of-bounds threads
        for c in range(channels):
            color = img[tx, ty][c] * 2.0 + 30
            if color > 255:
                img[tx, ty][c] = 255
            elif color < 0:
                img[tx, ty][c] = 0
            else:
                img[tx, ty][c] = color
# CPU reference implementation: same brightening, applied pixel by pixel
def process_cpu(img, dst):
    rows, cols, channels = img.shape
    for i in range(rows):
        for j in range(cols):
            for c in range(3):
                color = img[i, j][c] * 2.0 + 30
                if color > 255:
                    dst[i, j][c] = 255
                elif color < 0:
                    dst[i, j][c] = 0
                else:
                    dst[i, j][c] = color
if __name__ == "__main__":
    # load the test image
    img = cv2.imread('test.jpg')
    rows, cols, channels = img.shape
    dst_cpu = img.copy()
    dst_gpu = img.copy()

    # CPU version
    start_cpu = time.time()
    process_cpu(img, dst_cpu)
    end_cpu = time.time()
    time_cpu = end_cpu - start_cpu
    print("CPU process time: " + str(time_cpu))

    # GPU version: copy to device, launch the kernel, copy back
    dImg = cuda.to_device(img)
    threadsperblock = (16, 16)
    blockspergrid_x = int(math.ceil(rows / threadsperblock[0]))
    blockspergrid_y = int(math.ceil(cols / threadsperblock[1]))
    blockspergrid = (blockspergrid_x, blockspergrid_y)
    cuda.synchronize()
    start_gpu = time.time()
    # note: the first launch also includes JIT compilation of the kernel
    process_gpu[blockspergrid, threadsperblock](dImg, channels)
    cuda.synchronize()
    end_gpu = time.time()
    dst_gpu = dImg.copy_to_host()
    time_gpu = end_gpu - start_gpu
    print("GPU process time: " + str(time_gpu))

    # save results
    cv2.imwrite("result_cpu.jpg", dst_cpu)
    cv2.imwrite("result_gpu.jpg", dst_gpu)
    print("Done")
from google.colab.patches import cv2_imshow
cv2_imshow(dst_gpu)
cv2_imshow(img)
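Aside: Numba also exposes cuda.grid(2) as a shorthand for the blockIdx * blockDim + threadIdx arithmetic written out in process_gpu above. The kernel below is an alternative sketch of the same indexing (process_gpu_v2 is a hypothetical name, not part of the original notebook):

from numba import cuda

@cuda.jit
def process_gpu_v2(img, channels):
    # cuda.grid(2) returns the global (x, y) index of the current thread
    tx, ty = cuda.grid(2)
    if tx < img.shape[0] and ty < img.shape[1]:
        for c in range(channels):
            color = img[tx, ty][c] * 2.0 + 30
            img[tx, ty][c] = min(max(color, 0.0), 255.0)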