CUDA Device Memory IPC (Python Version)

nxdong July 23, 2022 [python] #cuda

A CUDA-based approach for accessing the same block of device memory from multiple processes (device memory IPC). There are already plenty of C/C++ samples, so here is a Python version.

Prerequisites

  1. An NVIDIA GPU
  2. The GPU driver installed
  3. CUDA installed
  4. Python 3 installed

This feature requires that the GPU and CUDA support Unified Virtual Addressing (UVA).
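The full sample below prints this as part of print_cuda_info, but a quick one-off check (a sketch using the cuda-python runtime bindings) looks like:

from cuda import cudart

# unifiedAddressing is 1 when the device supports UVA
err, prop = cudart.cudaGetDeviceProperties(0)
print("UVA supported:", bool(prop.unifiedAddressing))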

Install Dependencies

pip3 install -i https://pypi.douban.com/simple/ cuda-python
pip3 install -i https://pypi.douban.com/simple/ numpy
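
To confirm that cuda-python is installed and can talk to the driver, a small sanity check (a sketch, not part of the sample below) is:

from cuda import cuda

err, = cuda.cuInit(0)                      # initialize the CUDA driver API
err, version = cuda.cuDriverGetVersion()   # e.g. 11070 means CUDA 11.7
print("CUDA driver version:", version)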

Code

from cuda import cuda, nvrtc, cudart
import numpy as np
import base64
import sys

def print_cuda_info():
    print("========== Device info ==========")
    err, device_count = cudart.cudaGetDeviceCount()
    print("Device count:", device_count)
    
    device_id = 0
    err, prop = cudart.cudaGetDeviceProperties(device_id)
    print("Device properties:", prop)
    print("Unified Virtual Addressing (UVA) supported:", prop.unifiedAddressing)
    print("Compute mode:", prop.computeMode)
    if device_count > 1:
        # Peer access only makes sense with at least two GPUs; with a single GPU
        # the call fails (e.g. cudaError_t.cudaErrorPeerAccessUnsupported: 217).
        err, can = cudart.cudaDeviceCanAccessPeer(0, 1)
        print("Device 0 can access device 1:", can)
        if can:
            # The flags argument is reserved and must be 0; this enables peer
            # access from the current device (0) to device 1.
            err, = cudart.cudaDeviceEnablePeerAccess(1, 0)
    print("==============================")

def ipc_main(memory_handle_file):
    BUFFER_SIZE = 1024 * np.uint8().itemsize
    print("BUFFER_SIZE:", BUFFER_SIZE)
    
    err, = cudart.cudaSetDevice(0)
    print("Select device 0:", err)
    
    host_buffer = np.full(BUFFER_SIZE, 1).astype(np.uint8)
    print("Host buffer:", host_buffer)
    
    err, device_buffer_ptr = cudart.cudaMalloc(BUFFER_SIZE)
    print("Allocated device memory:", device_buffer_ptr)
    
    err, ipc_mem_handle = cudart.cudaIpcGetMemHandle(device_buffer_ptr)
    memory_handle_str = base64.b64encode(ipc_mem_handle.reserved).decode('utf-8')
    print("IPC memory handle:", memory_handle_str)
    with open(memory_handle_file, 'w') as f:
        f.write(memory_handle_str)
    print("IPC memory handle written to file:", memory_handle_file)
    
    err, = cudart.cudaMemcpy(device_buffer_ptr, host_buffer, BUFFER_SIZE, cudart.cudaMemcpyKind.cudaMemcpyDefault)
    print("Uploaded host buffer to device memory:", err)
    
    input("Press Enter to read the device memory written by the child process...")
    
    print("Reading the device memory set by the child process...")
    host_buffer = np.full(BUFFER_SIZE, 0).astype(np.uint8)
    print("Host buffer reset to zeros:", host_buffer)
    err, = cudart.cudaMemcpy(host_buffer, device_buffer_ptr, BUFFER_SIZE, cudart.cudaMemcpyKind.cudaMemcpyDefault)
    print("Downloaded device memory:", host_buffer, " status:", err)
    
    err, = cudart.cudaFree(device_buffer_ptr)
    print("Freed device memory:", err)
    
    err, = cudart.cudaDeviceReset()
    print("Reset the device context for this process:", err)
    
    
    
    
def ipc_side(cuda_memory_handle_b64: str):
    '''https://nvidia.github.io/cuda-python/module/cudart.html'''
    BUFFER_SIZE = 1024 * np.uint8().itemsize
    
    err, = cudart.cudaSetDevice(0)
    print("Select device 0:", err)
    
    new_mem_hdl = cudart.cudaIpcMemHandle_t()
    new_mem_hdl.reserved = base64.b64decode(cuda_memory_handle_b64)
    err, devPtr = cudart.cudaIpcOpenMemHandle(new_mem_hdl, cudart.cudaIpcMemLazyEnablePeerAccess)
    print("Opened IPC memory handle:", err, devPtr)
    
    host_buffer = np.full(BUFFER_SIZE, 0).astype(np.uint8)
    print("Host buffer initialized to [0] * BUFFER_SIZE:", host_buffer)
    
    err, = cudart.cudaMemcpy(host_buffer, devPtr, BUFFER_SIZE, cudart.cudaMemcpyKind.cudaMemcpyDefault)
    print("Downloaded device memory:", host_buffer)
    
    host_buffer = np.full(BUFFER_SIZE, 8).astype(np.uint8)
    err, = cudart.cudaMemcpy(devPtr, host_buffer, BUFFER_SIZE, cudart.cudaMemcpyKind.cudaMemcpyDefault)
    print("Set device memory to [8] * 1024:", err)
    
    err, = cudart.cudaIpcCloseMemHandle(devPtr)
    print("Closed IPC memory handle:", err)
    
    err, = cudart.cudaDeviceReset()
    print("Reset the device context for this process:", err)
    
   
def ipc_side_RTC(cuda_memory_handle_b64: str):
    '''https://nvidia.github.io/cuda-python/module/cudart.html'''
  
    
    add_a_cu = """\
extern "C" __global__
void add_a(char a, char *out, size_t n)
{
 size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
 if (tid < n) {
   out[tid] = out[tid] + a;
 }
}
"""
    err, prog = nvrtc.nvrtcCreateProgram(str.encode(add_a_cu), b"add_a_cu.cu", 0, [], [])
    print("Created NVRTC program:", err, " prog:", prog)
    
    # The GPU architecture here must match the actual GPU model
    # (the official documentation example uses 'compute_75').
    # If it does not match, cuModuleLoadData fails with CUDA_ERROR_INVALID_PTX.
    # For the mapping between GPU models and architectures see: https://arnon.dk/category/gpus/
    opts = [b"--fmad=false", b"--gpu-architecture=compute_61"]
    err, = nvrtc.nvrtcCompileProgram(prog, 2, opts)
    print("Compiled program:", err)
    

    # Get PTX from compilation
    err, ptxSize = nvrtc.nvrtcGetPTXSize(prog)
    print("PTX size:", ptxSize)
    ptx = b" " * ptxSize
    err, = nvrtc.nvrtcGetPTX(prog, ptx)
    print("Fetched PTX:", err)
    # print("PTX content:", ptx)
    
    err, = cudart.cudaSetDevice(0)
    print("Select device 0:", err)
    
    ptx = np.char.array(ptx)
    err, module = cuda.cuModuleLoadData(ptx.ctypes.data)
    print("Loaded module:", err, " module:", module)
    
    err, kernel = cuda.cuModuleGetFunction(module, b"add_a")
    print("Got kernel function:", err, " kernel:", kernel)
    
    BUFFER_SIZE = 1024 * np.uint8().itemsize
    
    new_mem_hdl = cudart.cudaIpcMemHandle_t()
    new_mem_hdl.reserved = base64.b64decode(cuda_memory_handle_b64)
    err, devPtr = cudart.cudaIpcOpenMemHandle(new_mem_hdl, cudart.cudaIpcMemLazyEnablePeerAccess)
    print("Opened IPC memory handle:", err, devPtr)
    
    NUM_THREADS = 256  # Threads per block
    NUM_BLOCKS = 4 # Blocks per grid
    err, stream = cuda.cuStreamCreate(0)
    print("Created stream:", err, " stream:", stream)
    
    dOut = np.array([int(devPtr)], dtype=np.uint64)
    a = 3
    a = np.array(a, dtype=np.uint8)
    n = np.array(BUFFER_SIZE, dtype=np.uint64)
    args = [a, dOut, n]
    args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
    
    err, = cuda.cuLaunchKernel(
        kernel,
        NUM_BLOCKS,  # grid x dim
        1,  # grid y dim
        1,  # grid z dim
        NUM_THREADS,  # block x dim
        1,  # block y dim
        1,  # block z dim
        0,  # dynamic shared memory
        stream,  # stream
        args.ctypes.data,  # kernel arguments
        0,  # extra (ignore)
        )
    
    print("Launched kernel:", err)
    
    # Wait for the kernel to finish before reading the result back.
    err, = cuda.cuStreamSynchronize(stream)
    print("Stream synchronized:", err)
    
    host_buffer = np.full(BUFFER_SIZE, 0).astype(np.uint8)
    err, = cudart.cudaMemcpy(host_buffer, devPtr, BUFFER_SIZE, cudart.cudaMemcpyKind.cudaMemcpyDefault)
    print("Device memory after the kernel ran:", err, host_buffer)
    
    err, = cudart.cudaIpcCloseMemHandle(devPtr)
    print("Closed IPC memory handle:", err)

    err, = cudart.cudaDeviceReset()
    print("Reset the device context for this process:", err)
    

if __name__ == "__main__":
    argv = sys.argv
    memory_handle_file = "./memory_handle.txt"
    
    if len(argv) != 2:
        print_cuda_info()
    else:
        info =  argv[1]
        if info == 'main':
            ipc_main(memory_handle_file)
        elif info == 'memory':
            with open(memory_handle_file, 'r') as f:
                cuda_memory_handle_b64 = f.read()
            ipc_side(cuda_memory_handle_b64)
        elif info == 'rtc': 
            with open(memory_handle_file, 'r') as f:
                cuda_memory_handle_b64 = f.read()
            ipc_side_RTC(cuda_memory_handle_b64)

Run

Start the main process first, using the main argument:

python ipc_python.py main

This process allocates device memory, fills it with the value 1, and writes the IPC memory handle to the file memory_handle.txt.

python ipc_python.py memory

This process reads the memory handle from the file memory_handle.txt and sets the memory to 8. After running it, press Enter in the main process to print the device memory; you will see that the value is now 8.

python ipc_python.py rtc

This process reads the memory handle from the file memory_handle.txt and adds 3 to every byte using a kernel compiled at runtime with NVRTC. Running it repeatedly adds 3 each time; press Enter in the main process to print the device memory and observe the computed result.

If the main process has already exited when a child process runs, the child process ends up initializing a fresh block of device memory instead of sharing the original one.

Notes

Multiple processes can also be synchronized through CUDA events. The relevant functions include cudaEventCreate, cudaIpcGetEventHandle, cudaEventRecord, cudaEventDestroy, and so on.

In this example, synchronization is done manually (by waiting for keyboard input).

Other inter-process communication mechanisms can also be used to coordinate the processes.
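
A minimal sketch of how those event functions could be wired up, assuming the event handle is shared between processes the same way the memory handle is in this sample (base64-encoded through a file); the producer/consumer split and the variable names here are illustrative only:

from cuda import cudart
import base64

# Producer process: create an interprocess event and export its IPC handle.
err, event = cudart.cudaEventCreateWithFlags(
    cudart.cudaEventInterprocess | cudart.cudaEventDisableTiming)
err, event_handle = cudart.cudaIpcGetEventHandle(event)
event_handle_b64 = base64.b64encode(event_handle.reserved).decode('utf-8')
# ... share event_handle_b64 with the other process (e.g. via a file) ...
err, = cudart.cudaEventRecord(event, 0)  # record once the work has been queued

# Consumer process: open the handle and block until the event is recorded.
new_evt_hdl = cudart.cudaIpcEventHandle_t()
new_evt_hdl.reserved = base64.b64decode(event_handle_b64)
err, remote_event = cudart.cudaIpcOpenEventHandle(new_evt_hdl)
err, = cudart.cudaEventSynchronize(remote_event)

# Cleanup in the producer once both sides are done.
err, = cudart.cudaEventDestroy(event)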

References

NVIDIA hardware and architecture lookup: https://arnon.dk/category/gpus/

cuda-python source code: https://github.com/NVIDIA/cuda-python

cuda-python documentation: https://nvidia.github.io/cuda-python/index.html

CUDA-UVA: https://developer.download.nvidia.cn/CUDA/training/cuda_webinars_GPUDirect_uva.pdf