CUDA 显存IPC (Python版)

nxdong July 23, 2022 [python] #cuda

基于cuda的跨进程访问同一块显存的方案(显存IPC). c/c++的sample已经有很多啦,这里提供一个python版的.

前置准备

  1. 有个nvidia显卡
  2. 安装了显卡驱动
  3. 安装了cuda
  4. 安装了Python3

此功能需要显卡与cuda支持Unified Virtual Addressing(UVA).

安装依赖

pip3 install -i https://pypi.douban.com/simple/ cuda-python pip3 install -i https://pypi.douban.com/simple/ numpy

代码

from cuda import cuda, nvrtc, cudart import numpy as np import base64 import sys def print_cuda_info(): print("========== 设备信息 ==========") err, device_count = cudart.cudaGetDeviceCount() print("设备数量:", device_count) device_id = 0 err, prob = cudart.cudaGetDeviceProperties(device_id) print("设备信息:", prob) print("是否支持统一地址空间(UVA):", prob.unifiedAddressing) print("设备计算模式:", prob.computeMode) if device_count > 1: # 只有大于两张卡的时候才能判断是否支持peerAccess, 否则返回 cudaError_t.cudaErrorPeerAccessUnsupported: 217 err = cudart.cudaDeviceEnablePeerAccess(0,1) err, can = cudart.cudaDeviceCanAccessPeer(0,1) print("can access:", can) print("==============================") def ipc_main(memory_handle_file): BUFFER_SIZE = 1024 * np.uint8().itemsize print("BUFFER_SIZE:",BUFFER_SIZE) err, = cudart.cudaSetDevice(0) print("指定设备ID: ", err) host_buffer = np.full(BUFFER_SIZE, 1).astype(np.uint8) print("主机内存:", host_buffer) err, device_buffer_ptr = cudart.cudaMalloc(BUFFER_SIZE) print("申请显存:", device_buffer_ptr) err, ipc_mem_handle = cudart.cudaIpcGetMemHandle(device_buffer_ptr) memory_handle_str = base64.b64encode(ipc_mem_handle.reserved).decode('utf-8') print("获取IPC显存管理句柄:", memory_handle_str) with open(memory_handle_file, 'w') as f: f.write(memory_handle_str) print("显存IPC句柄写入文件:", memory_handle_file) err, = cudart.cudaMemcpy(device_buffer_ptr, host_buffer,BUFFER_SIZE, cudart.cudaMemcpyKind.cudaMemcpyDefault) print("上传内容到显存:", err) a = input("按任意键获取子线程上传的显存值...") print("开始获取子程序设置的显存值...") host_buffer = np.full(BUFFER_SIZE, 0).astype(np.uint8) print("初始化全零的主机内存:", host_buffer) err, = cudart.cudaMemcpy(host_buffer, device_buffer_ptr,BUFFER_SIZE, cudart.cudaMemcpyKind.cudaMemcpyDefault) print("下载显存值:", host_buffer, " 状态:", err) err, = cudart.cudaFree(device_buffer_ptr) print("释放显存:", err) err, = cudart.cudaDeviceReset() print("重置进程设备环境:", err) def ipc_side(cuda_memory_handle_b64: str): '''https://nvidia.github.io/cuda-python/module/cudart.html''' BUFFER_SIZE = 1024 * np.uint8().itemsize err, = cudart.cudaSetDevice(0) print("设置显卡:", err) new_mem_hdl = cudart.cudaIpcMemHandle_t() new_mem_hdl.reserved = base64.b64decode(cuda_memory_handle_b64) err, devPtr = cudart.cudaIpcOpenMemHandle(new_mem_hdl,cudart.cudaIpcMemLazyEnablePeerAccess) print("设置显存管理句柄:", err, devPtr) host_buffer = np.full(BUFFER_SIZE, 0).astype(np.uint8) print("初始化主机内存为[0*BUFFER_SIZE]:", host_buffer) err, = cudart.cudaMemcpy(host_buffer, devPtr,BUFFER_SIZE, cudart.cudaMemcpyKind.cudaMemcpyDefault) print("下载显存信息:",host_buffer) host_buffer = np.full(BUFFER_SIZE, 8).astype(np.uint8) err, = cudart.cudaMemcpy(devPtr, host_buffer,BUFFER_SIZE, cudart.cudaMemcpyKind.cudaMemcpyDefault) print("改变显存的值为[8*1024]:", err) err, = cudart.cudaIpcCloseMemHandle(devPtr) print("关闭IPC显存句柄:", err) err, = cudart.cudaDeviceReset() print("重置进程设备环境:", err) def ipc_side_RTC(cuda_memory_handle_b64: str): '''https://nvidia.github.io/cuda-python/module/cudart.html''' add_a_cu = """\ extern "C" __global__ void add_a(char a, char *out, size_t n) { size_t tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < n) { out[tid] = out[tid] + a; } } """ err, prog = nvrtc.nvrtcCreateProgram(str.encode(add_a_cu), b"add_a_cu.cu", 0, [], []) print("创建程序:",err, " Prog:", prog) # 这里的gpu架构需要与实际显卡的型号对应。(官方文档的示例是'compute_75) # 如果这里的配置不匹配,在cuModuleLoadData的时候会报错:CUDA_ERROR_INVALID_PTX # 硬件型号与架构的对应关系,可以参考文档: https://arnon.dk/category/gpus/ opts = [b"--fmad=false", b"--gpu-architecture=compute_61"] err, = nvrtc.nvrtcCompileProgram(prog, 2, opts) print("编译程序:",err) # Get PTX from compilation err, ptxSize = nvrtc.nvrtcGetPTXSize(prog) print("ptx 文件大小:", ptxSize) ptx = b" " * ptxSize err, = nvrtc.nvrtcGetPTX(prog, ptx) print("获取ptx文件:", err) # print("文件内容:", ptx) err, = cudart.cudaSetDevice(0) print("设置显卡:", err) ptx = np.char.array(ptx) err, module = cuda.cuModuleLoadData(ptx.ctypes.data) print("加载程序:", err, " module:", module) err, kernel = cuda.cuModuleGetFunction(module, b"add_a") print("获取核函数:", err , " Kernel:", kernel) BUFFER_SIZE = 1024 * np.uint8().itemsize new_mem_hdl = cudart.cudaIpcMemHandle_t() new_mem_hdl.reserved = base64.b64decode(cuda_memory_handle_b64) err, devPtr = cudart.cudaIpcOpenMemHandle(new_mem_hdl,cudart.cudaIpcMemLazyEnablePeerAccess) print("设置显存管理句柄:", err, devPtr) NUM_THREADS = 256 # Threads per block NUM_BLOCKS = 4 # Blocks per grid err, stream = cuda.cuStreamCreate(0) print("创建流:", err, " 流:", stream) dOut = np.array([int(devPtr)], dtype=np.uint64) a = 3 a = np.array(a, dtype=np.uint8) n = np.array(BUFFER_SIZE, dtype=np.uint64) args = [a, dOut, n] args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) err, = cuda.cuLaunchKernel( kernel, NUM_BLOCKS, # grid x dim 1, # grid y dim 1, # grid z dim NUM_THREADS, # block x dim 1, # block y dim 1, # block z dim 0, # dynamic shared memory stream, # stream args.ctypes.data, # kernel arguments 0, # extra (ignore) ) print("启动核函数执行:", err) host_buffer = np.full(BUFFER_SIZE, 0).astype(np.uint8) err, = cudart.cudaMemcpy(host_buffer, devPtr,BUFFER_SIZE, cudart.cudaMemcpyKind.cudaMemcpyDefault) print("核函数运行后显存的值:", err, host_buffer) err, = cuda.cuStreamSynchronize(stream) print("等待流同步:", err) err, = cudart.cudaIpcCloseMemHandle(devPtr) print("关闭IPC显存句柄:", err) err, = cudart.cudaDeviceReset() print("重置进程设备环境:", err) if __name__ == "__main__": argv = sys.argv memory_handle_file = "./memory_handle.txt" if len(argv) != 2: print_cuda_info() else: info = argv[1] if info == 'main': ipc_main(memory_handle_file) elif info == 'memory': with open(memory_handle_file, 'r') as f: cuda_memory_handle_b64 = f.read() ipc_side(cuda_memory_handle_b64) elif info == 'rtc': with open(memory_handle_file, 'r') as f: cuda_memory_handle_b64 = f.read() ipc_side_RTC(cuda_memory_handle_b64)

运行

需要先使用main参数启动主进程

python ipc_python.py main

此进程会申请显存并且把值赋1. 并且把IPC的显存句柄写入文件memory_handle.txt.

python ipc_python.py memory

此进程会通过文件memory_handle.txt获取显存句柄,并且将其值写为8. 运行此程序后,在主程序中按任意键可以打印显存信息,可以观察到显存的值是 8 .

python ipc_python.py rtc

此进程会通过文件memory_handle.txt获取显存句柄,并且通过rtc编译的核函数将值加3. 运行此程序重复运行会重复加3,在主程序中按任意键可以打印显存信息,可以观察到显存的值是计算结果.

如果运行子进程的时候主线程消失.子进程会初始化一个新的显存.

补充

可以通过 cudaEvent 来同步多个进程的信息。相关函数: cudaEventCreate cudaIpcGetEventHandle cudaEventRecord cudaEventDestroy 等。

此示例中由人工同步。

也可以通过其他的进程间通信方式同步运行信息。

参考资料

Nvidia 硬件与架构的查询: https://arnon.dk/category/gpus/

cuda-python 源码: https://github.com/NVIDIA/cuda-python

cuda-python文档: https://nvidia.github.io/cuda-python/index.html

CUDA-UVA: https://developer.download.nvidia.cn/CUDA/training/cuda_webinars_GPUDirect_uva.pdf