PyOpenCl Ядро в Loop Crashes GPU

Я пишу соседи искать рутину, которая является грубой силой с использованием pypopencl. Позже он будет вписываться в мой сглаженный гидроцикл частиц. Грубая сила, конечно, не эффективна, но ее простая и ее отправная точка. Я тестировал свое ядро поиска, и обнаружил, что когда я запускаю его в цикле, он падает. Я не получаю никаких сообщений об ошибках в python, но экран мерцает, а затем возвращается с запиской о том, что графические драйверы не удались, но были восстановлены. Странно то, что если количество частиц, которые просматриваются, невелико (~ 1000 или меньше), то это отлично. Если я увеличиваю счет (~ 10k), он сработает. Я попытался добавить барьеры и команды ожидания, а команда завершения - безрезультатно. Я проверил, есть ли у меня массив переполнения, но я не могу его найти. Я включаю соответствующий код и извиняюсь заранее за его размер, но хотел дать ему все, чтобы люди могли смотреть на него. Я надеюсь, что кто-то сможет запустить это и воссоздать ошибку, или сказать мне, где я ошибаюсь. Моя настройка - python 3.5 с использованием spyder и установленного pyopencl 2016.1.PyOpenCl Ядро в Loop Crashes GPU

Спасибо,

Сет

Первый Основной файл

import numpy as np 
import gpuParameters as gpuParameters 
import pyopencl as cl 
import pyopencl.array as ar 
from BruteForceSearch import BruteForceSearch 
import time as time 

dim = 3 # dimensions of the problem 
n = 15000 # number of particles 
nbs = 50 # number of neighbors 
x = np.random.rand(n) # randomly choose some x 
y = np.random.rand(n) # randomly choose some y 
z = np.random.rand(n) # randomly choose some z 
h = np.ones(n) # smoothing parameter for the b spline 

# setup gpu context 
gpu = gpuParameters.gpuParameters() 
# neighbor list 
nlist = -1*np.ones(n*nbs, dtype=np.int32) 

# data to gpu 
xg = ar.to_device(gpu.queue, x) # x pos on gpu 
yg = ar.to_device(gpu.queue, y) # y pos on gpu 
zg = ar.to_device(gpu.queue, z) # z pos on gpu 
hg = ar.to_device(gpu.queue, h) # h pos on gpu 
num_p = ar.to_device(gpu.queue, np.array(n, dtype=np.int32)) # num of particles 
nb = ar.to_device(gpu.queue, np.array(nbs, dtype=np.int32)) # num of neighbors 
nlst = ar.to_device(gpu.queue, nlist) # neighbor list on gpu 
dg = ar.to_device(gpu.queue, np.array(dim, dtype=np.int32)) # dimension on gpu 
out = ar.zeros(gpu.queue, n, np.float64) # debug parameter 

# call the Brute force neighbor search and h parameter set class 
srch = BruteForceSearch(gpu) # instatiate 
s = time.time() # timer start 
for ii in range(100): 
    # set a marker I really didn't think this would be necessary 
    mark = cl.enqueue_marker(gpu.queue) # set a marker for kernel complete 
    srch.search.search(gpu.queue, x.shape, None, 
         num_p.data, nb.data, dg.data, xg.data, yg.data, zg.data, 
         hg.data, nlst.data, out.data) # run the kernel 
    cl.Event.wait(mark) # wait for complete run of kernel before next iteration 
    # gpu.queue.finish() 
    print('iteration: ', ii) # print iteration time to show me its running 
e = time.time() # end the timer 
cs = time.time() # clock the time it takes to return the array 
nlist = nlst.get() 
ce = time.time() 
# output the times 
print('time to calculate: ', e-s) 
print('time to copy back: ', ce - cs)

GPU Контекст класса

import pyopencl as cl 

class gpuParameters: 
    def __init__(self, dType = []): 
     #will setup the proper context based on given device preference 
     #if no device perference given will default to first value 
     if dType == []: 
      pltfrms = cl.get_platforms()[0] 
      devices = pltfrms.get_devices(cl.device_type.GPU) 
      context = cl.Context(devices) #create a device context 
     print(context) 
     print(devices) 
     self.cntxt = context#keep this context in motion 
     self.queue = cl.CommandQueue(self.cntxt) #create a command que for this context 
     self.mF = cl.mem_flags

Neighbor Loop до

import numpy as np 
import pyopencl as cl 
import gpu_sph_assistance_functions as gsaf 

class BruteForceSearch: 
    def __init__(self, gpu): 
     # instantiation of the search routine primarilly for pre compiling of 
     # the function 
     self.gpu = gpu # save the gpu context 

     # setup and compile the search 
     self.bruteSearch() 

def bruteSearch(self): 
    W = gsaf.gpu_sph_kernel() 
    self.search = cl.Program(
     self.gpu.cntxt, 
     W + '''__kernel void search(__global int *nP, __global int *nN, 
           __global int *dim, 
          __global double *x, __global double *y, 
          __global double *z, __global double *h, 
          __global int *nlist, __global double *out) 
     { 
      // indices 
      int gid = get_global_id(0); // current particle 
      int idv = 0; // unrolled array id 
      int count = 0; // count 
      int dm = *dim; // problem dimension 
      int itr = 0; // start iteration 
      int mxitr = 25; // max number of iterations 
      // calculate variables 
      double dms = 1.0/(*dim); // 1 over dimension for pow 
      double xi = x[gid]; // current x position 
      double yi = y[gid]; // current y position 
      double zi = z[gid]; // current z position 
      double dx = 0; // difference in x 
      double dy = 0; // difference in y 
      double dz = 0; // difference in z 
      double r = 0; // radius 
      double hg = h[gid]; // smoothing parametre 
      double Wsum = 0; // sum of weights 
      double W = 0; // current weight 
      double dwdx = 0; // derivative of weight in x direction 
      double dwdy = 0; // derivative of weight in y direction 
      double dwdz = 0; // derivative of weight in z direction 
      double dwdr = 0; // derivative of weight in r direction 
      double V = 0; // Volume of particle 
      double hn = 0; // holding value for comparison 
      double err = 10; // error 
      double tol = 1e-7; // tolerance 
      double diff = 0; // difference 

      // first clean the array of neighbors 
      for (int ii = 0; ii < *nN; ii++) // length of num of neighbors 
      { 
       idv = *nN*gid + ii; // unrolled index 
       nlist[idv] = -1; // this is a trigger for excluding values 
      } 
      // Next calculate the h parameter 
      while (err > tol) 
      { 
       Wsum = 0; // clean summation 
       for (int jj = 0; jj < *nP; jj++) // loop over all particles 
       { 
        dx = xi - x[jj]; 
        dy = yi - y[jj]; 
        dz = zi - z[jj]; 
        // spline for weights 
        quintic_spline(dm, hg, dx, dy, dz, &W, 
            &dwdx, &dwdy, &dwdz, &dwdr); 
        Wsum += W; // add to store 
       } 
       V = 1.0/Wsum; /// volume 
       hn = pow(V, dms); // new h parameter 
       diff = hn - hg; // difference 
       err = fabs(diff); // error 
       out[gid] = err; // store error for debug 
       hg = hn; // reset h 
       itr ++; // update iter 
       if (itr > mxitr) // break out 
       { break; } 
      } 
      h[gid] = hg; // store h 

      /* // get all neighbors in vicinity of particle not 
      // currently assessed 
      for(int ii = 0; ii < *nP; ii++) 
      { 
       dx = xi - x[ii]; 
       dy = yi - y[ii]; 
       dz = zi - z[ii]; 
       r = sqrt(dx*dx + dy*dy + dz*dz); 
       if (r < 3.25*hg & count < *nN) 
       { 
        idv = *nN*gid + count; 
        nlist[idv] = ii; 
        count++; 
       } 
      } 
      */ 

    } 
     ''').build()

Функция сплайн для взвешивания

W = '''void quintic_spline(
     int dim, double h, double dx, double dy, double dz, double *W, 
     double *dWdx, double *dWdy, double *dWdz, double *dWdrO) 
     { 
      double pi = 3.141592654; // pi 
      double m3q = 0; // prefix values 
      double m2q = 0; // prefix values 
      double m1q = 0; // prefix values 
      double T1 = 0; // prefix values 
      double T2 = 0; // prefix values 
      double T3 = 0; // prefix values 
      double D1 = 0; // prefix values 
      double D2 = 0; // prefix values 
      double D3 = 0; // prefix values 
      double Ch = 0; // normalizing parameter for kernel 
      double C = 0; // normalizing prior to h 
      double r = sqrt(dx*dx + dy*dy + dz*dz); 
      double q = r/h; // normalized radius 
      double dqdr = 1.0/h; // intermediate derivative 
      double dWdq = 0; // intermediate derivative 
      double dWdr = 0; // intermediate derivative 
      double drdx = dx/r; // intermediate derivative 
      double drdy = dy/r; // intermediate derivative 
      double drdz = dz/r; // intermediate derivative 
      if (dim == 1) 
      { 
       C = 1.0/120.0; 
      } 
      else if (dim == 2) 
      { 
       C = 7.0/(pi*478.0); 
      } 
      else if (dim == 3) 
      { 
       C = 1.0/(120.0*pi); 
      } 
      Ch = C/pow(h, dim); 
      if (r <= 0) 
      { 
       drdx = 0.0; 
       drdy = 0.0; 
       drdz = 0.0; 
      } 

      // local prefix constants 
      m1q = 1.0 - q; 
      m2q = 2.0 - q; 
      m3q = 3.0 - q; 

      // smoothing parameter constants 
      T1 = Ch*pow(m3q, 5); 
      T2 = -6.0*Ch*pow(m2q, 5); 
      T3 = 15.0*Ch*pow(m1q, 5); 

      //derivative of spline coefficients 
      D1 = -5.0*Ch*pow(m3q,4); 
      D2 = 30.0*Ch*pow(m2q,4); 
      D3 = -75.0*Ch*pow(m1q,4); 

      // W calculation 
      if (q < 1.0) 
      { 
       *W = T1 + T2 + T3; 
       dWdq = D1 + D2 + D3; 
      } 
      else if (q >= 1.0 && q < 2.0) 
      { 
       *W = T1 + T2; 
       dWdq = D1 + D2; 
      } 
      else if (q >= 2.0 && q < 3.0) 
      { 
       *W = T1; 
       dWdq = D1; 
      } 
      else 
      { 
       *W = 0.0; 
       dWdq = 0.0; 
      } 
      dWdr = dWdq*dqdr; 
      // assign the derivatives 
      *dWdx = dWdr*drdx; 
      *dWdy = dWdr*drdy; 
      *dWdz = dWdr*drdz; 
      *dWdrO = dWdr; 
     }'''

источник

2016-10-20 Dr Mobieus

Я тестировал код на процессоре Intel i7-4790K с AMD Accelerated Parallel Processing. Он не падает при n = 150000 (я запускаю только одну итерацию). Единственная странная вещь, которую я обнаружил при быстром просмотре кода, заключалась в том, что ядро читает и записывает в массив h. Это не должно быть проблемой, но я обычно стараюсь избегать этого.

источник

2016-10-27 13:26:10 serbap

Я вернусь и попытаюсь сделать это как два отдельных массива, один для ввода одного для вывода. Может решить проблему. Спасибо, что посмотрели на него. –

ответ

Смежные вопросы