Я использую PyCuda для передачи пар массивов в ядро cuda с помощью указателя. Массивы - это вывод другого ядра, поэтому данные уже находятся на графическом процессоре.PyCuda: элемент разыменования массива через указатель в ядре Cuda
В ядре я пытаюсь получить доступ к элементам в каждом из массивов, чтобы выполнить векторное вычитание. Значения, которые я получаю для элементов в массиве, неверны (h & p неверны в коде ниже).
Может ли кто-нибудь помочь мне увидеть, что я делаю неправильно?
Мой код:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
import time
import cv2
from pycuda.tools import DeviceMemoryPool as DMP
from scipy.spatial import distance
import os
import glob
def get_cuda_hist_kernel():
#Make the kernel
histogram_kernel = """
__global__ void kernel_getHist(unsigned int* array,unsigned int size, unsigned int* histo, float bucket_size, unsigned int num_bins, unsigned int* out_max)
{
unsigned int x = threadIdx.x + blockDim.x * blockIdx.x;
if(x<size){
unsigned int value = array[x];
unsigned int bin = floor(float(value) * bucket_size) - 1;
//Faster Modulo 3 for channel assignment
unsigned int offset = x;
offset = (offset >> 16) + (offset & 0xFFFF);
offset = (offset >> 8) + (offset & 0xFF);
offset = (offset >> 4) + (offset & 0xF);
offset = (offset >> 2) + (offset & 0x3);
offset = (offset >> 2) + (offset & 0x3);
offset = (offset >> 2) + (offset & 0x3);
if (offset > 2) offset = offset - 3;
offset = offset * num_bins;
bin += offset;
atomicAdd(&histo[bin + offset],1);
}
}
__global__ void kernel_chebyshev(unsigned int* histo, unsigned int* prev_histo, unsigned int number, int* output)
{
const unsigned int size = 12;
//Get all of the differences
__shared__ int temp_diffs[size];
unsigned int i = threadIdx.x + blockDim.x * blockIdx.x;
if (i < size){
unsigned int diff = 0;
unsigned int h = histo[i];
unsigned int p = prev_histo[i];
if (h > p)
{
diff = h - p;
}
else
{
diff = p - h;
}
temp_diffs[i] = (int)diff;
}
__syncthreads();
output[number] = 0;
atomicMax(&output[number], temp_diffs[i]);
}
"""
mod = SourceModule(histogram_kernel)
return mod
def cuda_histogram(ims, block_size, kernel):
start = time.time()
max_val = 4
num_bins = np.uint32(4)
num_channels = np.uint32(3)
bin_size = np.float32(1/np.uint32(max_val/num_bins))
#Memory Pool
pool = DMP()
print 'Pool Held Blocks: ', pool.held_blocks
#Compute block & Grid dimensions
bdim = (block_size, 1, 1)
cols = ims[0].size
rows = 1
channels = 1
dx, mx = divmod(cols, bdim[0])
dy, my = divmod(rows, bdim[1])
dz, mz = divmod(channels, bdim[2])
g_x = (dx + (mx>0)) * bdim[0]
g_y = (dy + (my>0)) * bdim[1]
g_z = (dz + (mz>0)) * bdim[2]
gdim = (g_x, g_y, g_z)
#get the function
func = kernel.get_function('kernel_getHist')
func2 = kernel.get_function('kernel_chebyshev')
#build list of histograms
#send the histogram to the gpu
hists = []
device_hists = []
for im in range(len(ims)):
hists.append(np.zeros([num_channels * num_bins]).astype(np.uint32))
end = time.time()
dur = end - start
print(' '.join(['Prep Time: ', str(dur)]))
start = time.time()
#Copy all of the image data to GPU
device_images = []
for im in range(len(ims)):
#print('Allocating data for image :', im)
#convert the image to 1D array of uint32s
a = ims[im].astype(np.uint32)
a = a.flatten('C')
a_size = np.uint32(a.size)
#allocate & send im data to gpu
device_images.append(pool.allocate(a.nbytes))
cuda.memcpy_htod(device_images[im], a)
d_hist = pool.allocate(hists[im].nbytes)
device_hists.append(d_hist)
cuda.memcpy_htod(d_hist, hists[im])
differences = np.zeros(len(ims)).astype(np.uint32)
device_diffs = pool.allocate(differences.nbytes)
cuda.memcpy_htod(device_diffs, differences)
for im in range(len(ims)):
#run histogram function
func(device_images[im], a_size, device_hists[im], bin_size, num_bins, block=(block_size,1,1), grid=gdim)
cuda.Context.synchronize()
device_hist_size = np.uint32(len(device_hists[im]))
for im in range(1, len(ims)):
number = np.uint32(im - 1)
func2(device_hists[im], device_hists[im - 1], number, device_diffs, block=(32,1,1))
cuda.memcpy_dtoh(differences, device_diffs)
print(differences)
for im in range(len(ims)):
#get histogram back
cuda.memcpy_dtoh(hists[im], device_hists[im])
device_hists[im] = 0
end = time.time()
dur = end - start
print(' '.join(['Load, Compute, & Gather Time: ', str(dur)]))
pool.free_held()
return differences
def get_all_files(directory):
pattern = os.path.join(directory, '*.jpg')
files = [f for f in glob.glob(pattern)]
return files
if __name__ == "__main__":
RESOURCES_PATH = "../data/ims/"
MAX_IMS = 1000
direc = os.path.join(RESOURCES_PATH, '21JumpStreet', 'source_video_frames')
files = get_all_files(direc)
a = cv2.imread('t.png')
ims = [cv2.imread(f) for f in files]
print 'Shape of my image: ', ims[0].shape
print 'Number of images to histogram: ', len(ims)
block_size = 128
kernel = get_cuda_hist_kernel()
start = time.time()
num_diffs = len(ims) // MAX_IMS + 1
cuda_diffs = []
for i in range(num_diffs):
first = i * MAX_IMS
last = (i + 1) * MAX_IMS
print(first)
small_set = ims[first:last]
print 'Small set size: ', str(len(small_set))
cuda_diffs.extend(cuda_histogram(small_set, block_size, kernel))
end = time.time()
dur = end - start
print(' '.join(['CUDA version took:', str(dur)]))
start = time.time()
cv_hists = []
for i in range(len(ims)):
im = ims[i % len(ims)]
h = []
for j in range(3):
hist = cv2.calcHist([im], [j], None, [4], [0, 100])
h.extend(hist)
cv_hists.append(h)
#run Chebyshev on CPU:
color_hist_diffs = np.array([distance.chebyshev(cv_hists[i-1], cv_hists[i]) \
for i in range(len(cv_hists)) if i != 0])
print(color_hist_diffs)
end = time.time()
dur = end - start
print(' '.join(['CPU & cv2 version took:', str(dur)]))
@JonnyHenly Добавил остальную часть моего кода ... может быть, слишком много деталей. Дай мне знать. –
Большое вам спасибо за это! Вы не поверите, сколько людей я попросил обновить их вопрос, поэтому я могу попытаться им помочь, и этого никогда не произойдет. Это много кода, хотя, я могу понять, почему вы не опубликовали его полностью:) –
Вы проверили, что массивы, используемые в 'kernel_chebyshev', которые являются выходом другого ядра, верны? –