nxdong July 23, 2022 [python] #cuda
A scheme for cross-process access to the same block of GPU memory with CUDA (GPU memory IPC).
There are already plenty of C/C++ samples for this, so here is a Python version.
Prerequisites
- An NVIDIA GPU
- The GPU driver installed
- CUDA installed
- Python 3 installed
This feature requires the GPU and CUDA to support Unified Virtual Addressing (UVA); a quick check is sketched after the dependency install below.
Install dependencies
pip3 install -i https://pypi.douban.com/simple/ cuda-python
pip3 install -i https://pypi.douban.com/simple/ numpy
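A minimal sketch to confirm the install works and the device reports UVA support (it reads the same unifiedAddressing field that print_cuda_info in the listing below prints):

from cuda import cudart

err, count = cudart.cudaGetDeviceCount()
assert err == cudart.cudaError_t.cudaSuccess and count > 0
err, prop = cudart.cudaGetDeviceProperties(0)
# unifiedAddressing == 1 means the device supports UVA, which the cudaIpc* calls rely on
print("UVA supported:", bool(prop.unifiedAddressing))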
Code
from cuda import cuda, nvrtc, cudart
import numpy as np
import base64
import sys
def print_cuda_info():
    print("========== Device info ==========")
    err, device_count = cudart.cudaGetDeviceCount()
    print("Device count:", device_count)
    device_id = 0
    err, prop = cudart.cudaGetDeviceProperties(device_id)
    print("Device properties:", prop)
    print("Unified Virtual Addressing (UVA) supported:", prop.unifiedAddressing)
    print("Compute mode:", prop.computeMode)
    if device_count > 1:
        # Check first, then enable peer access from device 0 to device 1 (flags must be 0)
        err, can = cudart.cudaDeviceCanAccessPeer(0, 1)
        print("Device 0 can access device 1:", can)
        if can:
            err, = cudart.cudaDeviceEnablePeerAccess(1, 0)
    print("==============================")
def ipc_main(memory_handle_file):
    BUFFER_SIZE = 1024 * np.uint8().itemsize
    print("BUFFER_SIZE:", BUFFER_SIZE)
    err, = cudart.cudaSetDevice(0)
    print("Select device 0:", err)
    host_buffer = np.full(BUFFER_SIZE, 1).astype(np.uint8)
    print("Host buffer:", host_buffer)
    err, device_buffer_ptr = cudart.cudaMalloc(BUFFER_SIZE)
    print("Allocated device memory:", device_buffer_ptr)
    err, ipc_mem_handle = cudart.cudaIpcGetMemHandle(device_buffer_ptr)
    memory_handle_str = base64.b64encode(ipc_mem_handle.reserved).decode('utf-8')
    print("IPC memory handle:", memory_handle_str)
    with open(memory_handle_file, 'w') as f:
        f.write(memory_handle_str)
    print("IPC memory handle written to file:", memory_handle_file)
    err, = cudart.cudaMemcpy(device_buffer_ptr, host_buffer, BUFFER_SIZE, cudart.cudaMemcpyKind.cudaMemcpyDefault)
    print("Uploaded host buffer to device memory:", err)
    input("Press Enter to read back the values written by the child process...")
    print("Reading the device memory set by the child process...")
    host_buffer = np.full(BUFFER_SIZE, 0).astype(np.uint8)
    print("Host buffer reset to all zeros:", host_buffer)
    err, = cudart.cudaMemcpy(host_buffer, device_buffer_ptr, BUFFER_SIZE, cudart.cudaMemcpyKind.cudaMemcpyDefault)
    print("Downloaded device memory:", host_buffer, " status:", err)
    err, = cudart.cudaFree(device_buffer_ptr)
    print("Freed device memory:", err)
    err, = cudart.cudaDeviceReset()
    print("Reset device state for this process:", err)
def ipc_side(cuda_memory_handle_b64: str):
    BUFFER_SIZE = 1024 * np.uint8().itemsize
    err, = cudart.cudaSetDevice(0)
    print("Select device 0:", err)
    new_mem_hdl = cudart.cudaIpcMemHandle_t()
    new_mem_hdl.reserved = base64.b64decode(cuda_memory_handle_b64)
    err, devPtr = cudart.cudaIpcOpenMemHandle(new_mem_hdl, cudart.cudaIpcMemLazyEnablePeerAccess)
    print("Opened IPC memory handle:", err, devPtr)
    host_buffer = np.full(BUFFER_SIZE, 0).astype(np.uint8)
    print("Host buffer initialized to [0] * BUFFER_SIZE:", host_buffer)
    err, = cudart.cudaMemcpy(host_buffer, devPtr, BUFFER_SIZE, cudart.cudaMemcpyKind.cudaMemcpyDefault)
    print("Downloaded device memory:", host_buffer)
    host_buffer = np.full(BUFFER_SIZE, 8).astype(np.uint8)
    err, = cudart.cudaMemcpy(devPtr, host_buffer, BUFFER_SIZE, cudart.cudaMemcpyKind.cudaMemcpyDefault)
    print("Set device memory to [8] * 1024:", err)
    err, = cudart.cudaIpcCloseMemHandle(devPtr)
    print("Closed IPC memory handle:", err)
    err, = cudart.cudaDeviceReset()
    print("Reset device state for this process:", err)
def ipc_side_RTC(cuda_memory_handle_b64: str):
    add_a_cu = """\
extern "C" __global__
void add_a(char a, char *out, size_t n)
{
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) {
        out[tid] = out[tid] + a;
    }
}
"""
    err, prog = nvrtc.nvrtcCreateProgram(str.encode(add_a_cu), b"add_a_cu.cu", 0, [], [])
    print("Created NVRTC program:", err, " prog:", prog)
    # --gpu-architecture should match your GPU (compute_61 here)
    opts = [b"--fmad=false", b"--gpu-architecture=compute_61"]
    err, = nvrtc.nvrtcCompileProgram(prog, 2, opts)
    print("Compiled program:", err)
    err, ptxSize = nvrtc.nvrtcGetPTXSize(prog)
    print("PTX size:", ptxSize)
    ptx = b" " * ptxSize
    err, = nvrtc.nvrtcGetPTX(prog, ptx)
    print("Fetched PTX:", err)
    err, = cudart.cudaSetDevice(0)
    print("Select device 0:", err)
    ptx = np.char.array(ptx)
    err, module = cuda.cuModuleLoadData(ptx.ctypes.data)
    print("Loaded module:", err, " module:", module)
    err, kernel = cuda.cuModuleGetFunction(module, b"add_a")
    print("Got kernel function:", err, " kernel:", kernel)
    BUFFER_SIZE = 1024 * np.uint8().itemsize
    new_mem_hdl = cudart.cudaIpcMemHandle_t()
    new_mem_hdl.reserved = base64.b64decode(cuda_memory_handle_b64)
    err, devPtr = cudart.cudaIpcOpenMemHandle(new_mem_hdl, cudart.cudaIpcMemLazyEnablePeerAccess)
    print("Opened IPC memory handle:", err, devPtr)
    NUM_THREADS = 256
    NUM_BLOCKS = 4
    err, stream = cuda.cuStreamCreate(0)
    print("Created stream:", err, " stream:", stream)
    # Pack kernel arguments as an array of pointers to the argument values
    dOut = np.array([int(devPtr)], dtype=np.uint64)
    a = np.array(3, dtype=np.uint8)
    n = np.array(BUFFER_SIZE, dtype=np.uint64)
    args = [a, dOut, n]
    args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
    err, = cuda.cuLaunchKernel(
        kernel,
        NUM_BLOCKS, 1, 1,   # grid dimensions
        NUM_THREADS, 1, 1,  # block dimensions
        0, stream,          # shared memory bytes, stream
        args.ctypes.data, 0,
    )
    print("Launched kernel:", err)
    err, = cuda.cuStreamSynchronize(stream)
    print("Stream synchronized:", err)
    host_buffer = np.full(BUFFER_SIZE, 0).astype(np.uint8)
    err, = cudart.cudaMemcpy(host_buffer, devPtr, BUFFER_SIZE, cudart.cudaMemcpyKind.cudaMemcpyDefault)
    print("Device memory after kernel run:", err, host_buffer)
    err, = cudart.cudaIpcCloseMemHandle(devPtr)
    print("Closed IPC memory handle:", err)
    err, = cudart.cudaDeviceReset()
    print("Reset device state for this process:", err)
if __name__ == "__main__":
    argv = sys.argv
    memory_handle_file = "./memory_handle.txt"
    if len(argv) != 2:
        print_cuda_info()
    else:
        info = argv[1]
        if info == 'main':
            ipc_main(memory_handle_file)
        elif info == 'memory':
            with open(memory_handle_file, 'r') as f:
                cuda_memory_handle_b64 = f.read()
            ipc_side(cuda_memory_handle_b64)
        elif info == 'rtc':
            with open(memory_handle_file, 'r') as f:
                cuda_memory_handle_b64 = f.read()
            ipc_side_RTC(cuda_memory_handle_b64)
Run
Start the main process first, with the main argument:
python ipc_python.py main
This process allocates GPU memory, fills it with the value 1, and writes the IPC memory handle to the file memory_handle.txt.
python ipc_python.py memory
This process reads the memory handle from memory_handle.txt and sets the memory contents to 8. After it has run, press Enter in the main process to print the memory contents; the value will be 8.
python ipc_python.py rtc
This process reads the memory handle from memory_handle.txt and adds 3 to every element with a kernel compiled via NVRTC. Running it repeatedly adds 3 each time; press Enter in the main process to print the memory contents and see the result.
If the main process has already exited when a child process runs, the child process ends up initializing a new block of GPU memory instead of using the shared buffer.
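To detect that case rather than silently working on a fresh allocation, the child process can check the status returned by cudaIpcOpenMemHandle before touching the pointer. A minimal sketch of that check (the ipc_side functions above do not do this):

from cuda import cudart
import base64

with open("./memory_handle.txt") as f:
    handle_b64 = f.read()
mem_hdl = cudart.cudaIpcMemHandle_t()
mem_hdl.reserved = base64.b64decode(handle_b64)
err, devPtr = cudart.cudaIpcOpenMemHandle(mem_hdl, cudart.cudaIpcMemLazyEnablePeerAccess)
if err != cudart.cudaError_t.cudaSuccess:
    # The exporting process has likely exited or the handle is stale; stop here
    # rather than continuing with an unusable pointer.
    raise RuntimeError(f"cudaIpcOpenMemHandle failed: {err}")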
Additional notes
Multiple processes can also be synchronized through cudaEvent. The relevant functions are cudaEventCreate, cudaIpcGetEventHandle, cudaEventRecord, cudaEventDestroy, and so on. In this example synchronization is done manually (by pressing Enter). Any other inter-process communication mechanism can also be used to coordinate the processes.
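A minimal sketch of the event-based approach, assuming the event handle is exchanged through a file the same way as the memory handle above (event_handle.txt is a made-up name); the event has to be created with the interprocess and timing-disabled flags:

from cuda import cudart
import base64

# Exporting process: create an interprocess event and publish its handle.
err, event = cudart.cudaEventCreateWithFlags(
    cudart.cudaEventInterprocess | cudart.cudaEventDisableTiming)
err, evt_handle = cudart.cudaIpcGetEventHandle(event)
with open("./event_handle.txt", "w") as f:
    f.write(base64.b64encode(evt_handle.reserved).decode("utf-8"))

# Importing process: open the handle and record the event once its work is queued.
new_evt_hdl = cudart.cudaIpcEventHandle_t()
with open("./event_handle.txt", "r") as f:
    new_evt_hdl.reserved = base64.b64decode(f.read())
err, shared_event = cudart.cudaIpcOpenEventHandle(new_evt_hdl)
err, = cudart.cudaEventRecord(shared_event, 0)

# Exporting process, later: wait on the recorded event instead of input().
err, = cudart.cudaEventSynchronize(event)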
References
NVIDIA hardware and architecture lookup: https://arnon.dk/category/gpus/
cuda-python source code: https://github.com/NVIDIA/cuda-python
cuda-python documentation: https://nvidia.github.io/cuda-python/index.html
CUDA UVA: https://developer.download.nvidia.cn/CUDA/training/cuda_webinars_GPUDirect_uva.pdf