NEURON
nrn_acc_manager.cpp
1 /*
2 # =============================================================================
3 # Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
4 #
5 # See top-level LICENSE file for details.
6 # =============================================================================
7 */
8 
9 #include <queue>
10 #include <utility>
11 
25 
26 #ifdef CRAYPAT
27 #include <pat_api.h>
28 #endif
29 
30 #if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
31 #include <cuda_runtime_api.h>
32 #endif
33 
34 #if __has_include(<cxxabi.h>)
35 #define USE_CXXABI
36 #include <cxxabi.h>
37 #include <memory>
38 #include <string>
39 #endif
40 
41 #ifdef CORENEURON_ENABLE_PRESENT_TABLE
42 #include <cassert>
43 #include <cstddef>
44 #include <iostream>
45 #include <map>
46 #include <shared_mutex>
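// The "present table" defined below maps each host block start address registered via
// cnrn_target_copyin to its device copy, together with a reference count and the block size.
// cnrn_target_deviceptr_impl uses it to translate a host pointer, possibly pointing into the
// interior of a registered block, into the matching device pointer.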
47 namespace {
48 struct present_table_value {
49  std::size_t ref_count{}, size{};
50  std::byte* dev_ptr{};
51 };
52 std::map<std::byte const*, present_table_value> present_table;
53 std::shared_mutex present_table_mutex;
54 } // namespace
55 #endif
56 
57 namespace {
58 /** @brief Try to demangle a type name, return the mangled name on failure.
59  */
60 std::string cxx_demangle(const char* mangled) {
61 #ifdef USE_CXXABI
62  int status{};
63  // Note that the third argument to abi::__cxa_demangle returns the length of
64  // the allocated buffer, which may be larger than strlen(demangled) + 1.
65  std::unique_ptr<char, decltype(free)*> demangled{
66  abi::__cxa_demangle(mangled, nullptr, nullptr, &status), free};
67  return status ? mangled : demangled.get();
68 #else
69  return mangled;
70 #endif
71 }
72 bool cnrn_target_debug_output_enabled() {
73  const char* env = std::getenv("CORENEURON_GPU_DEBUG");
74  if (!env) {
75  return false;
76  }
77  std::string env_s{env};
78  if (env_s == "1") {
79  return true;
80  } else if (env_s == "0") {
81  return false;
82  } else {
83  throw std::runtime_error("CORENEURON_GPU_DEBUG must be set to 0 or 1 (got " + env_s + ")");
84  }
85 }
86 bool cnrn_target_enable_debug{cnrn_target_debug_output_enabled()};
87 } // namespace
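// The cnrn_target_*_debug helpers below print one line per device data transfer when the
// CORENEURON_GPU_DEBUG environment variable is set to 1 (any value other than 0 or 1 raises an
// error), e.g. running with CORENEURON_GPU_DEBUG=1 traces every copyin/delete/memcpy together
// with the host and device addresses and the byte counts involved.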
88 
89 namespace coreneuron {
90 extern InterleaveInfo* interleave_info;
93 void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay);
94 void nrn_VecPlay_delete_from_device(NrnThread* nt);
95 
96 void cnrn_target_copyin_debug(std::string_view file,
97  int line,
98  std::size_t sizeof_T,
99  std::type_info const& typeid_T,
100  void const* h_ptr,
101  std::size_t len,
102  void* d_ptr) {
103  if (!cnrn_target_enable_debug) {
104  return;
105  }
106  std::cerr << file << ':' << line << ": cnrn_target_copyin<" << cxx_demangle(typeid_T.name())
107  << ">(" << h_ptr << ", " << len << " * " << sizeof_T << " = " << len * sizeof_T
108  << ") -> " << d_ptr << std::endl;
109 }
110 void cnrn_target_delete_debug(std::string_view file,
111  int line,
112  std::size_t sizeof_T,
113  std::type_info const& typeid_T,
114  void const* h_ptr,
115  std::size_t len) {
116  if (!cnrn_target_enable_debug) {
117  return;
118  }
119  std::cerr << file << ':' << line << ": cnrn_target_delete<" << cxx_demangle(typeid_T.name())
120  << ">(" << h_ptr << ", " << len << " * " << sizeof_T << " = " << len * sizeof_T << ')'
121  << std::endl;
122 }
123 void cnrn_target_deviceptr_debug(std::string_view file,
124  int line,
125  std::type_info const& typeid_T,
126  void const* h_ptr,
127  void* d_ptr) {
128  if (!cnrn_target_enable_debug) {
129  return;
130  }
131  std::cerr << file << ':' << line << ": cnrn_target_deviceptr<" << cxx_demangle(typeid_T.name())
132  << ">(" << h_ptr << ") -> " << d_ptr << std::endl;
133 }
134 void cnrn_target_is_present_debug(std::string_view file,
135  int line,
136  std::type_info const& typeid_T,
137  void const* h_ptr,
138  void* d_ptr) {
139  if (!cnrn_target_enable_debug) {
140  return;
141  }
142  std::cerr << file << ':' << line << ": cnrn_target_is_present<" << cxx_demangle(typeid_T.name())
143  << ">(" << h_ptr << ") -> " << d_ptr << std::endl;
144 }
145 void cnrn_target_memcpy_to_device_debug(std::string_view file,
146  int line,
147  std::size_t sizeof_T,
148  std::type_info const& typeid_T,
149  void const* h_ptr,
150  std::size_t len,
151  void* d_ptr) {
152  if (!cnrn_target_enable_debug) {
153  return;
154  }
155  std::cerr << file << ':' << line << ": cnrn_target_memcpy_to_device<"
156  << cxx_demangle(typeid_T.name()) << ">(" << d_ptr << ", " << h_ptr << ", " << len
157  << " * " << sizeof_T << " = " << len * sizeof_T << ')' << std::endl;
158 }
159 
160 #ifdef CORENEURON_ENABLE_PRESENT_TABLE
161 std::pair<void*, bool> cnrn_target_deviceptr_impl(bool must_be_present_or_null, void const* h_ptr) {
162  if (!h_ptr) {
163  return {nullptr, false};
164  }
165  // Concurrent calls to this method are safe, but they must be serialised
166  // w.r.t. calls to the cnrn_target_*_update_present_table methods.
167  std::shared_lock _{present_table_mutex};
168  if (present_table.empty()) {
169  return {nullptr, must_be_present_or_null};
170  }
171  // prev(first iterator greater than h_ptr or last if not found) gives the first iterator less
172  // than or equal to h_ptr
173  auto const iter = std::prev(std::upper_bound(
174  present_table.begin(), present_table.end(), h_ptr, [](void const* hp, auto const& entry) {
175  return hp < entry.first;
176  }));
177  if (iter == present_table.end()) {
178  return {nullptr, must_be_present_or_null};
179  }
180  std::byte const* const h_byte_ptr{static_cast<std::byte const*>(h_ptr)};
181  std::byte const* const h_start_of_block{iter->first};
182  std::size_t const block_size{iter->second.size};
183  std::byte* const d_start_of_block{iter->second.dev_ptr};
184  bool const is_present{h_byte_ptr < h_start_of_block + block_size};
185  if (!is_present) {
186  return {nullptr, must_be_present_or_null};
187  }
188  return {d_start_of_block + (h_byte_ptr - h_start_of_block), false};
189 }
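// Example of the lookup above: if a 100-byte block starting at host address H was registered
// with device address D, then a query for H + 40 returns D + 40; a pointer that falls outside
// every registered block yields {nullptr, must_be_present_or_null}.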
190 
191 void cnrn_target_copyin_update_present_table(void const* h_ptr, void* d_ptr, std::size_t len) {
192  if (!h_ptr) {
193  assert(!d_ptr);
194  return;
195  }
196  std::lock_guard _{present_table_mutex};
197  // TODO include more pedantic overlap checking?
198  present_table_value new_val{};
199  new_val.size = len;
200  new_val.ref_count = 1;
201  new_val.dev_ptr = static_cast<std::byte*>(d_ptr);
202  auto const [iter, inserted] = present_table.emplace(static_cast<std::byte const*>(h_ptr),
203  std::move(new_val));
204  if (!inserted) {
205  // Insertion didn't occur because h_ptr was already in the present table
206  assert(iter->second.size == len);
207  assert(iter->second.dev_ptr == new_val.dev_ptr);
208  ++(iter->second.ref_count);
209  }
210 }
211 void cnrn_target_delete_update_present_table(void const* h_ptr, std::size_t len) {
212  if (!h_ptr) {
213  return;
214  }
215  std::lock_guard _{present_table_mutex};
216  auto const iter = present_table.find(static_cast<std::byte const*>(h_ptr));
217  assert(iter != present_table.end());
218  assert(iter->second.size == len);
219  --(iter->second.ref_count);
220  if (iter->second.ref_count == 0) {
221  present_table.erase(iter);
222  }
223 }
224 #endif
225 
226 int cnrn_target_get_num_devices() {
227 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
228  defined(_OPENACC)
229  // choose nvidia GPU by default
230  acc_device_t device_type = acc_device_nvidia;
231  // check how many gpu devices available per node
232  return acc_get_num_devices(device_type);
233 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
234  defined(_OPENMP)
235  return omp_get_num_devices();
236 #else
237  throw std::runtime_error(
238  "cnrn_target_get_num_devices() not implemented without OpenACC/OpenMP and gpu build");
239 #endif
240 }
241 
242 void cnrn_target_set_default_device(int device_num) {
243 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
244  defined(_OPENACC)
245  acc_set_device_num(device_num, acc_device_nvidia);
246 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
247  defined(_OPENMP)
248  omp_set_default_device(device_num);
249  // It seems that with NVHPC 21.9 then only setting the default OpenMP device
250  // is not enough: there were errors on some nodes when not-the-0th GPU was
251  // used. These seemed to be related to the NMODL instance structs, which are
252  // allocated using cudaMallocManaged.
253  auto const cuda_code = cudaSetDevice(device_num);
254  assert(cuda_code == cudaSuccess);
255 #else
256  throw std::runtime_error(
257  "cnrn_target_set_default_device() not implemented without OpenACC/OpenMP and gpu build");
258 #endif
259 }
260 
261 #ifdef CORENEURON_ENABLE_GPU
262 #ifndef CORENEURON_UNIFIED_MEMORY
263 static Memb_list* copy_ml_to_device(const Memb_list* ml, int type) {
264  // We never run code for artificial cells on the GPU, so we don't copy them.
265  int is_art = corenrn.get_is_artificial()[type];
266  if (is_art) {
267  return nullptr;
268  }
269 
270  auto d_ml = cnrn_target_copyin(ml);
271 
272  if (ml->global_variables) {
273  assert(ml->global_variables_size);
274  void* d_inst = cnrn_target_copyin(static_cast<std::byte*>(ml->global_variables),
275  ml->global_variables_size);
276  cnrn_target_memcpy_to_device(&(d_ml->global_variables), &d_inst);
277  }
278 
279 
280  int n = ml->nodecount;
281  int szp = corenrn.get_prop_param_size()[type];
282  int szdp = corenrn.get_prop_dparam_size()[type];
283 
284  double* dptr = cnrn_target_deviceptr(ml->data);
285  cnrn_target_memcpy_to_device(&(d_ml->data), &(dptr));
286 
287 
288  int* d_nodeindices = cnrn_target_copyin(ml->nodeindices, n);
289  cnrn_target_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices);
290 
291  if (szdp) {
292  int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp;
293  int* d_pdata = cnrn_target_copyin(ml->pdata, pcnt);
294  cnrn_target_memcpy_to_device(&(d_ml->pdata), &d_pdata);
295  }
296 
297  int ts = corenrn.get_memb_funcs()[type].thread_size_;
298  if (ts) {
299  ThreadDatum* td = cnrn_target_copyin(ml->_thread, ts);
300  cnrn_target_memcpy_to_device(&(d_ml->_thread), &td);
301  }
302 
303  // net_receive buffer associated with mechanism
304  NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
305 
306  // if a net receive buffer exists for the mechanism
307  if (nrb) {
308  NetReceiveBuffer_t* d_nrb = cnrn_target_copyin(nrb);
309  cnrn_target_memcpy_to_device(&(d_ml->_net_receive_buffer), &d_nrb);
310 
311  int* d_pnt_index = cnrn_target_copyin(nrb->_pnt_index, nrb->_size);
312  cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index);
313 
314  int* d_weight_index = cnrn_target_copyin(nrb->_weight_index, nrb->_size);
315  cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index);
316 
317  double* d_nrb_t = cnrn_target_copyin(nrb->_nrb_t, nrb->_size);
318  cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t);
319 
320  double* d_nrb_flag = cnrn_target_copyin(nrb->_nrb_flag, nrb->_size);
321  cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag);
322 
323  int* d_displ = cnrn_target_copyin(nrb->_displ, nrb->_size + 1);
324  cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ);
325 
326  int* d_nrb_index = cnrn_target_copyin(nrb->_nrb_index, nrb->_size);
327  cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index);
328  }
329 
330  /* copy NetSendBuffer_t on to GPU */
331  NetSendBuffer_t* nsb = ml->_net_send_buffer;
332 
333  if (nsb) {
334  NetSendBuffer_t* d_nsb;
335  int* d_iptr;
336  double* d_dptr;
337 
338  d_nsb = cnrn_target_copyin(nsb);
339  cnrn_target_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb);
340 
341  d_iptr = cnrn_target_copyin(nsb->_sendtype, nsb->_size);
342  cnrn_target_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr);
343 
344  d_iptr = cnrn_target_copyin(nsb->_vdata_index, nsb->_size);
345  cnrn_target_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr);
346 
347  d_iptr = cnrn_target_copyin(nsb->_pnt_index, nsb->_size);
348  cnrn_target_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr);
349 
350  d_iptr = cnrn_target_copyin(nsb->_weight_index, nsb->_size);
351  cnrn_target_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr);
352 
353  d_dptr = cnrn_target_copyin(nsb->_nsb_t, nsb->_size);
354  cnrn_target_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr);
355 
356  d_dptr = cnrn_target_copyin(nsb->_nsb_flag, nsb->_size);
357  cnrn_target_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr);
358  }
359 
360  return d_ml;
361 }
362 #endif
363 
364 static void update_ml_on_host(const Memb_list* ml, int type) {
365  int is_art = corenrn.get_is_artificial()[type];
366  if (is_art) {
367  // Artificial mechanisms such as PatternStim and IntervalFire
368  // are not copied onto the GPU. They should not, therefore, be
369  // updated from the GPU.
370  return;
371  }
372 
373  int n = ml->nodecount;
374  int szp = corenrn.get_prop_param_size()[type];
375  int szdp = corenrn.get_prop_dparam_size()[type];
376 
377  int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp;
378 
379  nrn_pragma_acc(update self(ml->data[:pcnt], ml->nodeindices[:n]))
380  nrn_pragma_omp(target update from(ml->data[:pcnt], ml->nodeindices[:n]))
381 
382  int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp;
383  nrn_pragma_acc(update self(ml->pdata[:dpcnt]) if (szdp))
384  nrn_pragma_omp(target update from(ml->pdata[:dpcnt]) if (szdp))
385 
386  auto nrb = ml->_net_receive_buffer;
387 
388  // clang-format off
389  nrn_pragma_acc(update self(nrb->_cnt,
390  nrb->_size,
391  nrb->_pnt_offset,
392  nrb->_displ_cnt,
393  nrb->_pnt_index[:nrb->_size],
394  nrb->_weight_index[:nrb->_size],
395  nrb->_displ[:nrb->_size + 1],
396  nrb->_nrb_index[:nrb->_size])
397  if (nrb != nullptr))
398  nrn_pragma_omp(target update from(nrb->_cnt,
399  nrb->_size,
400  nrb->_pnt_offset,
401  nrb->_displ_cnt,
402  nrb->_pnt_index[:nrb->_size],
403  nrb->_weight_index[:nrb->_size],
404  nrb->_displ[:nrb->_size + 1],
405  nrb->_nrb_index[:nrb->_size])
406  if (nrb != nullptr))
407  // clang-format on
408 }
409 
410 static void delete_ml_from_device(Memb_list* ml, int type) {
411  int is_art = corenrn.get_is_artificial()[type];
412  if (is_art) {
413  return;
414  }
415  // Cleanup the net send buffer if it exists
416  {
417  NetSendBuffer_t* nsb{ml->_net_send_buffer};
418  if (nsb) {
419  cnrn_target_delete(nsb->_nsb_flag, nsb->_size);
420  cnrn_target_delete(nsb->_nsb_t, nsb->_size);
421  cnrn_target_delete(nsb->_weight_index, nsb->_size);
422  cnrn_target_delete(nsb->_pnt_index, nsb->_size);
423  cnrn_target_delete(nsb->_vdata_index, nsb->_size);
424  cnrn_target_delete(nsb->_sendtype, nsb->_size);
425  cnrn_target_delete(nsb);
426  }
427  }
428  // Cleanup the net receive buffer if it exists.
429  {
430  NetReceiveBuffer_t* nrb{ml->_net_receive_buffer};
431  if (nrb) {
432  cnrn_target_delete(nrb->_nrb_index, nrb->_size);
433  cnrn_target_delete(nrb->_displ, nrb->_size + 1);
434  cnrn_target_delete(nrb->_nrb_flag, nrb->_size);
435  cnrn_target_delete(nrb->_nrb_t, nrb->_size);
436  cnrn_target_delete(nrb->_weight_index, nrb->_size);
437  cnrn_target_delete(nrb->_pnt_index, nrb->_size);
438  cnrn_target_delete(nrb);
439  }
440  }
441  int n = ml->nodecount;
442  int szdp = corenrn.get_prop_dparam_size()[type];
443  int ts = corenrn.get_memb_funcs()[type].thread_size_;
444  if (ts) {
445  cnrn_target_delete(ml->_thread, ts);
446  }
447  if (szdp) {
448  int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp;
449  cnrn_target_delete(ml->pdata, pcnt);
450  }
451  cnrn_target_delete(ml->nodeindices, n);
452 
453  if (ml->global_variables) {
454  assert(ml->global_variables_size);
455  cnrn_target_delete(static_cast<std::byte*>(ml->global_variables),
456  ml->global_variables_size);
457  }
458 
459  cnrn_target_delete(ml);
460 }
461 
462 #endif
463 
464 /* note: the threads here correspond to the global nrn_threads array */
465 void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
466 #ifdef CORENEURON_ENABLE_GPU
467  // initialize NrnThreads for gpu execution
468  // empty thread or only artificial cells should be on cpu
469  for (int i = 0; i < nthreads; i++) {
470  NrnThread* nt = threads + i;
471  nt->compute_gpu = (nt->end > 0) ? 1 : 0;
472  nt->_dt = dt;
473  }
474 
476 
477 #ifdef CORENEURON_UNIFIED_MEMORY
478  for (int i = 0; i < nthreads; i++) {
479  NrnThread* nt = threads + i; // NrnThread on host
480 
481  if (nt->n_presyn) {
482  PreSyn* d_presyns = cnrn_target_copyin(nt->presyns, nt->n_presyn);
483  }
484 
485  if (nt->n_vecplay) {
486  /* copy VecPlayContinuous instances */
487  /** just empty containers */
488  void** d_vecplay = cnrn_target_copyin(nt->_vecplay, nt->n_vecplay);
489  // note: we are using unified memory for NrnThread. Once VecPlay is copied to the gpu,
490  // we don't want to update nt->_vecplay because that would also set the gpu pointer of
491  // vecplay inside nt on the cpu (due to unified memory).
492 
493  nrn_VecPlay_copyto_device(nt, d_vecplay);
494  }
495 
496  if (!nt->_permute && nt->end > 0) {
497  printf("\n WARNING: NrnThread %d not permuted, error for linear algebra?", i);
498  }
499  }
500 
501 #else
502  /* -- copy NrnThread to device. This needs to be a contiguous vector because an offset
503  * is used to find the corresponding NrnThread from a Point_process in the
504  * NET_RECEIVE block
505  */
506  NrnThread* d_threads = cnrn_target_copyin(threads, nthreads);
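 // Pattern used throughout this function: cnrn_target_copyin() allocates device memory and
 // copies a host block into it, then cnrn_target_memcpy_to_device() patches the corresponding
 // pointer member inside the struct that was already copied to the device.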
507 
508  if (interleave_info == nullptr) {
509  printf("\n Warning: No permutation data? Required for linear algebra!");
510  }
511 
512  /* pointers for data struct on device, starting with d_ */
513 
514  for (int i = 0; i < nthreads; i++) {
515  NrnThread* nt = threads + i; // NrnThread on host
516  NrnThread* d_nt = d_threads + i; // NrnThread on device
517  if (!nt->compute_gpu) {
518  continue;
519  }
520  double* d__data; // nrn_threads->_data on device
521 
522  /* -- copy _data to device -- */
523 
524  /*copy all double data for thread */
525  d__data = cnrn_target_copyin(nt->_data, nt->_ndata);
526 
527 
528  /* Here is the example of using OpenACC data enter/exit
529  * Remember that we are not allowed to use nt->_data but we have to use:
530  * double *dtmp = nt->_data; // now use dtmp!
531  #pragma acc enter data copyin(dtmp[0:nt->_ndata]) async(nt->stream_id)
532  #pragma acc wait(nt->stream_id)
533  */
534 
535  /*update d_nt._data to point to device copy */
536  cnrn_target_memcpy_to_device(&(d_nt->_data), &d__data);
537 
538  /* -- setup rhs, d, a, b, v, node_area to point to device copy -- */
539  double* dptr;
540 
541  /* for padding, we have to recompute ne */
542  int ne = nrn_soa_padded_size(nt->end, 0);
543 
544  dptr = d__data + 0 * ne;
545  cnrn_target_memcpy_to_device(&(d_nt->_actual_rhs), &(dptr));
546 
547  dptr = d__data + 1 * ne;
548  cnrn_target_memcpy_to_device(&(d_nt->_actual_d), &(dptr));
549 
550  dptr = d__data + 2 * ne;
551  cnrn_target_memcpy_to_device(&(d_nt->_actual_a), &(dptr));
552 
553  dptr = d__data + 3 * ne;
554  cnrn_target_memcpy_to_device(&(d_nt->_actual_b), &(dptr));
555 
556  dptr = d__data + 4 * ne;
557  cnrn_target_memcpy_to_device(&(d_nt->_actual_v), &(dptr));
558 
559  dptr = d__data + 5 * ne;
560  cnrn_target_memcpy_to_device(&(d_nt->_actual_area), &(dptr));
561 
562  if (nt->_actual_diam) {
563  dptr = d__data + 6 * ne;
564  cnrn_target_memcpy_to_device(&(d_nt->_actual_diam), &(dptr));
565  }
566 
567  int* d_v_parent_index = cnrn_target_copyin(nt->_v_parent_index, nt->end);
568  cnrn_target_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index));
569 
570  /* nt._ml_list is used in NET_RECEIVE block and should have valid membrane list id*/
571  Memb_list** d_ml_list = cnrn_target_copyin(nt->_ml_list, corenrn.get_memb_funcs().size());
572  cnrn_target_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list));
573 
574  /* -- copy NrnThreadMembList list ml to device -- */
575 
576  NrnThreadMembList* d_last_tml;
577 
578  bool first_tml = true;
579 
580  for (auto tml = nt->tml; tml; tml = tml->next) {
581  /*copy tml to device*/
582  /* QUESTION: does the last tml->next point to nullptr as on the host? Assumed so. */
583  auto d_tml = cnrn_target_copyin(tml);
584 
585  /*first tml is pointed by nt */
586  if (first_tml) {
587  cnrn_target_memcpy_to_device(&(d_nt->tml), &d_tml);
588  first_tml = false;
589  } else {
590  /*rest of tml forms linked list */
591  cnrn_target_memcpy_to_device(&(d_last_tml->next), &d_tml);
592  }
593 
594  // book keeping for linked-list
595  d_last_tml = d_tml;
596 
597  /* now for every tml, there is a ml. copy that and setup pointer */
598  Memb_list* d_ml = copy_ml_to_device(tml->ml, tml->index);
599  cnrn_target_memcpy_to_device(&(d_tml->ml), &d_ml);
600  /* setup nt._ml_list */
601  cnrn_target_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml);
602  }
603 
604  if (nt->shadow_rhs_cnt) {
605  double* d_shadow_ptr;
606 
607  int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0);
608 
609  /* copy shadow_rhs to device and fix-up the pointer */
610  d_shadow_ptr = cnrn_target_copyin(nt->_shadow_rhs, pcnt);
611  cnrn_target_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr);
612 
613  /* copy shadow_d to device and fix-up the pointer */
614  d_shadow_ptr = cnrn_target_copyin(nt->_shadow_d, pcnt);
615  cnrn_target_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr);
616  }
617 
618  /* Fast membrane current calculation struct */
619  if (nt->nrn_fast_imem) {
620  NrnFastImem* d_fast_imem = cnrn_target_copyin(nt->nrn_fast_imem);
621  cnrn_target_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem);
622  {
623  double* d_ptr = cnrn_target_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end);
624  cnrn_target_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr);
625  }
626  {
627  double* d_ptr = cnrn_target_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end);
628  cnrn_target_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr);
629  }
630  }
631 
632  if (nt->n_pntproc) {
633  /* copy the Point_process array and fix the pointer so that net_receive blocks can
634  * execute on the GPU */
635  Point_process* pntptr = cnrn_target_copyin(nt->pntprocs, nt->n_pntproc);
636  cnrn_target_memcpy_to_device(&(d_nt->pntprocs), &pntptr);
637  }
638 
639  if (nt->n_weight) {
640  /* copy weight vector used in NET_RECEIVE which is pointed by netcon.weight */
641  double* d_weights = cnrn_target_copyin(nt->weights, nt->n_weight);
642  cnrn_target_memcpy_to_device(&(d_nt->weights), &d_weights);
643  }
644 
645  if (nt->_nvdata) {
646  /* copy vdata which is setup in bbcore_read. This contains cuda allocated
647  * nrnran123_State * */
648  void** d_vdata = cnrn_target_copyin(nt->_vdata, nt->_nvdata);
649  cnrn_target_memcpy_to_device(&(d_nt->_vdata), &d_vdata);
650  }
651 
652  if (nt->n_presyn) {
653  /* copy the presyn vector used for spike exchange.
654  * Note: we added a new PreSynHelper because of an issue while updating PreSyn
655  * objects, which have a virtual base class; this may be due to the vtable
656  * and alignment.
657  */
658  PreSynHelper* d_presyns_helper = cnrn_target_copyin(nt->presyns_helper, nt->n_presyn);
659  cnrn_target_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper);
660  PreSyn* d_presyns = cnrn_target_copyin(nt->presyns, nt->n_presyn);
661  cnrn_target_memcpy_to_device(&(d_nt->presyns), &d_presyns);
662  }
663 
664  if (nt->_net_send_buffer_size) {
665  /* copy send_receive buffer */
666  int* d_net_send_buffer = cnrn_target_copyin(nt->_net_send_buffer,
667  nt->_net_send_buffer_size);
668  cnrn_target_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer);
669  }
670 
671  if (nt->n_vecplay) {
672  /* copy VecPlayContinuous instances */
673  /** just empty containers */
674  void** d_vecplay = cnrn_target_copyin(nt->_vecplay, nt->n_vecplay);
675  cnrn_target_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay);
676 
677  nrn_VecPlay_copyto_device(nt, d_vecplay);
678  }
679 
680  if (nt->_permute) {
681  if (interleave_permute_type == 1) {
682  /* todo: not necessary to setup pointers, just copy it */
683  InterleaveInfo* info = interleave_info + i;
684  int* d_ptr = nullptr;
685  InterleaveInfo* d_info = cnrn_target_copyin(info);
686 
687  d_ptr = cnrn_target_copyin(info->stride, info->nstride + 1);
688  cnrn_target_memcpy_to_device(&(d_info->stride), &d_ptr);
689 
690  d_ptr = cnrn_target_copyin(info->firstnode, nt->ncell);
691  cnrn_target_memcpy_to_device(&(d_info->firstnode), &d_ptr);
692 
693  d_ptr = cnrn_target_copyin(info->lastnode, nt->ncell);
694  cnrn_target_memcpy_to_device(&(d_info->lastnode), &d_ptr);
695 
696  d_ptr = cnrn_target_copyin(info->cellsize, nt->ncell);
697  cnrn_target_memcpy_to_device(&(d_info->cellsize), &d_ptr);
698 
699  } else if (interleave_permute_type == 2) {
700  /* todo: not necessary to setup pointers, just copy it */
701  InterleaveInfo* info = interleave_info + i;
702  InterleaveInfo* d_info = cnrn_target_copyin(info);
703  int* d_ptr = nullptr;
704 
705  d_ptr = cnrn_target_copyin(info->stride, info->nstride);
706  cnrn_target_memcpy_to_device(&(d_info->stride), &d_ptr);
707 
708  d_ptr = cnrn_target_copyin(info->firstnode, info->nwarp + 1);
709  cnrn_target_memcpy_to_device(&(d_info->firstnode), &d_ptr);
710 
711  d_ptr = cnrn_target_copyin(info->lastnode, info->nwarp + 1);
712  cnrn_target_memcpy_to_device(&(d_info->lastnode), &d_ptr);
713 
714  d_ptr = cnrn_target_copyin(info->stridedispl, info->nwarp + 1);
715  cnrn_target_memcpy_to_device(&(d_info->stridedispl), &d_ptr);
716 
717  d_ptr = cnrn_target_copyin(info->cellsize, info->nwarp);
718  cnrn_target_memcpy_to_device(&(d_info->cellsize), &d_ptr);
719  } else {
720  printf("\n ERROR: only --cell_permute = [12] implemented");
721  abort();
722  }
723  } else {
724  printf("\n WARNING: NrnThread %d not permuted, error for linear algebra?", i);
725  }
726 
727  {
728  TrajectoryRequests* tr = nt->trajec_requests;
729  if (tr) {
730  // Create a device-side copy of the `trajec_requests` struct and
731  // make sure the device-side NrnThread object knows about it.
732  TrajectoryRequests* d_trajec_requests = cnrn_target_copyin(tr);
733  cnrn_target_memcpy_to_device(&(d_nt->trajec_requests), &d_trajec_requests);
734  // Initialise the double** gather member of the struct.
735  double** d_tr_gather = cnrn_target_copyin(tr->gather, tr->n_trajec);
736  cnrn_target_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather);
737  // Initialise the double** varrays member of the struct if it's
738  // set.
739  double** d_tr_varrays{nullptr};
740  if (tr->varrays) {
741  d_tr_varrays = cnrn_target_copyin(tr->varrays, tr->n_trajec);
742  cnrn_target_memcpy_to_device(&(d_trajec_requests->varrays), &d_tr_varrays);
743  }
744  for (int i = 0; i < tr->n_trajec; ++i) {
745  if (tr->varrays) {
746  // tr->varrays[i] is a buffer of tr->bsize doubles on the host,
747  // make a device-side copy of it and store a pointer to it in
748  // the device-side version of tr->varrays.
749  double* d_buf_traj_i = cnrn_target_copyin(tr->varrays[i], tr->bsize);
750  cnrn_target_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i);
751  }
752  // tr->gather[i] is a double* referring to (host) data in the
753  // (host) _data block
754  auto* d_gather_i = cnrn_target_deviceptr(tr->gather[i]);
755  cnrn_target_memcpy_to_device(&(d_tr_gather[i]), &d_gather_i);
756  }
757  // TODO: other `double** scatter` and `void** vpr` members of
758  // the TrajectoryRequests struct are not copied to the device.
759  // The `int vsize` member is updated during the simulation but
760  // not kept up to date timestep-by-timestep on the device.
761  }
762  }
763  {
764  auto* d_fornetcon_perm_indices = cnrn_target_copyin(nt->_fornetcon_perm_indices,
765  nt->_fornetcon_perm_indices_size);
766  cnrn_target_memcpy_to_device(&(d_nt->_fornetcon_perm_indices),
767  &d_fornetcon_perm_indices);
768  }
769  {
770  auto* d_fornetcon_weight_perm = cnrn_target_copyin(nt->_fornetcon_weight_perm,
771  nt->_fornetcon_weight_perm_size);
772  cnrn_target_memcpy_to_device(&(d_nt->_fornetcon_weight_perm), &d_fornetcon_weight_perm);
773  }
774  }
775 
776 #endif
777 #else
778  (void) threads;
779  (void) nthreads;
780 #endif
781 }
782 
783 void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) {
784 #ifdef CORENEURON_ENABLE_GPU
785  /// by default `to` is the destination pointer on the device
786  IvocVect* d_iv = &to;
787 
788  size_t n = from.size();
789  if (n) {
790  double* d_data = cnrn_target_copyin(from.data(), n);
791  cnrn_target_memcpy_to_device(&(d_iv->data_), &d_data);
792  }
793 #else
794  (void) from;
795  (void) to;
796 #endif
797 }
798 
799 void delete_ivoc_vect_from_device(IvocVect& vec) {
800 #ifdef CORENEURON_ENABLE_GPU
801  auto const n = vec.size();
802  if (n) {
803  cnrn_target_delete(vec.data(), n);
804  }
805 #else
806  static_cast<void>(vec);
807 #endif
808 }
809 
810 void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) {
811  NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
812  if (!nrb) {
813  return;
814  }
815 
816 #ifdef CORENEURON_ENABLE_GPU
817  if (nt->compute_gpu) {
818  // free existing vectors in buffers on gpu
819  cnrn_target_delete(nrb->_pnt_index, nrb->_size);
820  cnrn_target_delete(nrb->_weight_index, nrb->_size);
821  cnrn_target_delete(nrb->_nrb_t, nrb->_size);
822  cnrn_target_delete(nrb->_nrb_flag, nrb->_size);
823  cnrn_target_delete(nrb->_displ, nrb->_size + 1);
824  cnrn_target_delete(nrb->_nrb_index, nrb->_size);
825  }
826 #endif
827  // Reallocate host buffers using ecalloc_align (as in phase2.cpp) and
828  // free_memory (as in nrn_setup.cpp)
829  auto const realloc = [old_size = nrb->_size, nrb](auto*& ptr, std::size_t extra_size = 0) {
830  using T = std::remove_pointer_t<std::remove_reference_t<decltype(ptr)>>;
831  static_assert(std::is_trivial<T>::value,
832  "Only trivially constructible and copiable types are supported.");
833  static_assert(std::is_same<decltype(ptr), T*&>::value,
834  "ptr should be reference-to-pointer");
835  auto* const new_data = static_cast<T*>(ecalloc_align((nrb->_size + extra_size), sizeof(T)));
836  std::memcpy(new_data, ptr, (old_size + extra_size) * sizeof(T));
837  free_memory(ptr);
838  ptr = new_data;
839  };
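 // Grow geometrically: _size is doubled below and each buffer is reallocated, preserving the
 // old contents (plus the one extra element used by _displ) in the larger allocation.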
840  nrb->_size *= 2;
841  realloc(nrb->_pnt_index);
842  realloc(nrb->_weight_index);
843  realloc(nrb->_nrb_t);
844  realloc(nrb->_nrb_flag);
845  realloc(nrb->_displ, 1);
846  realloc(nrb->_nrb_index);
847 #ifdef CORENEURON_ENABLE_GPU
848  if (nt->compute_gpu) {
849  // update device copy
850  nrn_pragma_acc(update device(nrb));
851  nrn_pragma_omp(target update to(nrb));
852 
853  NetReceiveBuffer_t* const d_nrb{cnrn_target_deviceptr(nrb)};
854  // recopy the vectors in the buffer
855  int* const d_pnt_index{cnrn_target_copyin(nrb->_pnt_index, nrb->_size)};
856  cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index);
857 
858  int* const d_weight_index{cnrn_target_copyin(nrb->_weight_index, nrb->_size)};
859  cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index);
860 
861  double* const d_nrb_t{cnrn_target_copyin(nrb->_nrb_t, nrb->_size)};
862  cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t);
863 
864  double* const d_nrb_flag{cnrn_target_copyin(nrb->_nrb_flag, nrb->_size)};
865  cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag);
866 
867  int* const d_displ{cnrn_target_copyin(nrb->_displ, nrb->_size + 1)};
868  cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ);
869 
870  int* const d_nrb_index{cnrn_target_copyin(nrb->_nrb_index, nrb->_size)};
871  cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index);
872  }
873 #endif
874 }
875 
876 using NRB_P = std::pair<int, int>;
877 
878 struct comp {
879  bool operator()(const NRB_P& a, const NRB_P& b) {
880  if (a.first == b.first) {
881  return a.second > b.second; // same instances in original net_receive order
882  }
883  return a.first > b.first;
884  }
885 };
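// The comparator above makes the priority queue in net_receive_buffer_order() pop the
// (pnt_index, i) pairs in ascending pnt_index order, with ties broken by the original
// buffering order, so all buffered events of one synapse instance come out grouped together.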
886 
887 static void net_receive_buffer_order(NetReceiveBuffer_t* nrb) {
888  Instrumentor::phase p_net_receive_buffer_order("net-receive-buf-order");
889  if (nrb->_cnt == 0) {
890  nrb->_displ_cnt = 0;
891  return;
892  }
893 
894  std::priority_queue<NRB_P, std::vector<NRB_P>, comp> nrbq;
895 
896  for (int i = 0; i < nrb->_cnt; ++i) {
897  nrbq.push(NRB_P(nrb->_pnt_index[i], i));
898  }
899 
900  int displ_cnt = 0;
901  int index_cnt = 0;
902  int last_instance_index = -1;
903  nrb->_displ[0] = 0;
904 
905  while (!nrbq.empty()) {
906  const NRB_P& p = nrbq.top();
907  nrb->_nrb_index[index_cnt++] = p.second;
908  if (p.first != last_instance_index) {
909  ++displ_cnt;
910  }
911  nrb->_displ[displ_cnt] = index_cnt;
912  last_instance_index = p.first;
913  nrbq.pop();
914  }
915  nrb->_displ_cnt = displ_cnt;
916 }
917 
918 /* When we execute a NET_RECEIVE block on the GPU, we provide the indices of the synapse
919  * instances that need to be executed during the current timestep. To do this we have to
920  * update the NetReceiveBuffer_t object on the GPU. When the size of the CPU buffer
921  * changes, we set reallocated to true and hence need to reallocate the buffer on the GPU
922  * and then copy the entire buffer. If reallocated is 0, the buffer size has not changed
923  * and we only need to copy _size elements to the GPU.
924  * Note: this is a very preliminary implementation; optimisations will be done after the
925  * first functional version.
926  */
927 void update_net_receive_buffer(NrnThread* nt) {
928  Instrumentor::phase p_update_net_receive_buffer("update-net-receive-buf");
929  for (auto tml = nt->tml; tml; tml = tml->next) {
930  int is_art = corenrn.get_is_artificial()[tml->index];
931  if (is_art) {
932  continue;
933  }
934  // net_receive buffer to copy
935  NetReceiveBuffer_t* nrb = tml->ml->_net_receive_buffer;
936 
937  // if net receive buffer exist for mechanism
938  if (nrb && nrb->_cnt) {
939  // instance order to avoid race. setup _displ and _nrb_index
940  net_receive_buffer_order(nrb);
941 
942  if (nt->compute_gpu) {
943  Instrumentor::phase p_net_receive_buffer_order("net-receive-buf-cpu2gpu");
944  // note: don't update nrb itself, otherwise we would lose the device pointers
945 
946  // clang-format off
947 
948  /* update scalar elements */
949  nrn_pragma_acc(update device(nrb->_cnt,
950  nrb->_displ_cnt,
951  nrb->_pnt_index[:nrb->_cnt],
952  nrb->_weight_index[:nrb->_cnt],
953  nrb->_nrb_t[:nrb->_cnt],
954  nrb->_nrb_flag[:nrb->_cnt],
955  nrb->_displ[:nrb->_displ_cnt + 1],
956  nrb->_nrb_index[:nrb->_cnt])
957  async(nt->stream_id))
958  nrn_pragma_omp(target update to(nrb->_cnt,
959  nrb->_displ_cnt,
960  nrb->_pnt_index[:nrb->_cnt],
961  nrb->_weight_index[:nrb->_cnt],
962  nrb->_nrb_t[:nrb->_cnt],
963  nrb->_nrb_flag[:nrb->_cnt],
964  nrb->_displ[:nrb->_displ_cnt + 1],
965  nrb->_nrb_index[:nrb->_cnt]))
966  // clang-format on
967  }
968  }
969  }
970  nrn_pragma_acc(wait(nt->stream_id))
971 }
972 
973 void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) {
974 #ifdef CORENEURON_ENABLE_GPU
975  if (!nt->compute_gpu)
976  return;
977 
978  // check if nsb->_cnt was exceeded on GPU: as the buffer can not be increased
979  // during gpu execution, we should just abort the execution.
980  // \todo: this needs to be fixed with different memory allocation strategy
981  if (nsb->_cnt > nsb->_size) {
982  printf("ERROR: NetSendBuffer exceeded during GPU execution (rank %d)\n", nrnmpi_myid);
983  nrn_abort(1);
984  }
985 
986  if (nsb->_cnt) {
987  Instrumentor::phase p_net_receive_buffer_order("net-send-buf-gpu2cpu");
988  }
989  // clang-format off
990  nrn_pragma_acc(update self(nsb->_sendtype[:nsb->_cnt],
991  nsb->_vdata_index[:nsb->_cnt],
992  nsb->_pnt_index[:nsb->_cnt],
993  nsb->_weight_index[:nsb->_cnt],
994  nsb->_nsb_t[:nsb->_cnt],
995  nsb->_nsb_flag[:nsb->_cnt])
996  if (nsb->_cnt))
997  nrn_pragma_omp(target update from(nsb->_sendtype[:nsb->_cnt],
998  nsb->_vdata_index[:nsb->_cnt],
999  nsb->_pnt_index[:nsb->_cnt],
1000  nsb->_weight_index[:nsb->_cnt],
1001  nsb->_nsb_t[:nsb->_cnt],
1002  nsb->_nsb_flag[:nsb->_cnt])
1003  if (nsb->_cnt))
1004  // clang-format on
1005 #else
1006  (void) nt;
1007  (void) nsb;
1008 #endif
1009 }
1010 
1011 void update_nrnthreads_on_host(NrnThread* threads, int nthreads) {
1012 #ifdef CORENEURON_ENABLE_GPU
1013 
1014  for (int i = 0; i < nthreads; i++) {
1015  NrnThread* nt = threads + i;
1016 
1017  if (nt->compute_gpu && (nt->end > 0)) {
1018  /* -- copy data to host -- */
1019 
1020  int ne = nrn_soa_padded_size(nt->end, 0);
1021 
1022  // clang-format off
1023  nrn_pragma_acc(update self(nt->_actual_rhs[:ne],
1024  nt->_actual_d[:ne],
1025  nt->_actual_a[:ne],
1026  nt->_actual_b[:ne],
1027  nt->_actual_v[:ne],
1028  nt->_actual_area[:ne]))
1029  nrn_pragma_omp(target update from(nt->_actual_rhs[:ne],
1030  nt->_actual_d[:ne],
1031  nt->_actual_a[:ne],
1032  nt->_actual_b[:ne],
1033  nt->_actual_v[:ne],
1034  nt->_actual_area[:ne]))
1035  // clang-format on
1036 
1037  nrn_pragma_acc(update self(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr))
1038  nrn_pragma_omp(
1039  target update from(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr))
1040 
1041  /* @todo: nt._ml_list[tml->index] = tml->ml; */
1042 
1043  /* -- copy NrnThreadMembList list ml to host -- */
1044  for (auto tml = nt->tml; tml; tml = tml->next) {
1045  if (!corenrn.get_is_artificial()[tml->index]) {
1046  nrn_pragma_acc(update self(tml->index, tml->ml->nodecount))
1047  nrn_pragma_omp(target update from(tml->index, tml->ml->nodecount))
1048  }
1049  update_ml_on_host(tml->ml, tml->index);
1050  }
1051 
1052  int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0);
1053  /* copy shadow_rhs to host */
1054  /* copy shadow_d to host */
1055  nrn_pragma_acc(
1056  update self(nt->_shadow_rhs[:pcnt], nt->_shadow_d[:pcnt]) if (nt->shadow_rhs_cnt))
1057  nrn_pragma_omp(target update from(
1058  nt->_shadow_rhs[:pcnt], nt->_shadow_d[:pcnt]) if (nt->shadow_rhs_cnt))
1059 
1060  // clang-format off
1061  nrn_pragma_acc(update self(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end],
1062  nt->nrn_fast_imem->nrn_sav_d[:nt->end])
1063  if (nt->nrn_fast_imem != nullptr))
1064  nrn_pragma_omp(target update from(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end],
1065  nt->nrn_fast_imem->nrn_sav_d[:nt->end])
1066  if (nt->nrn_fast_imem != nullptr))
1067  // clang-format on
1068 
1069  nrn_pragma_acc(update self(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc))
1070  nrn_pragma_omp(target update from(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc))
1071 
1072  nrn_pragma_acc(update self(nt->weights[:nt->n_weight]) if (nt->n_weight))
1073  nrn_pragma_omp(target update from(nt->weights[:nt->n_weight]) if (nt->n_weight))
1074 
1075  nrn_pragma_acc(update self(
1076  nt->presyns_helper[:nt->n_presyn], nt->presyns[:nt->n_presyn]) if (nt->n_presyn))
1077  nrn_pragma_omp(target update from(
1078  nt->presyns_helper[:nt->n_presyn], nt->presyns[:nt->n_presyn]) if (nt->n_presyn))
1079 
1080  {
1081  TrajectoryRequests* tr = nt->trajec_requests;
1082  if (tr && tr->varrays) {
1083  // The full buffers have `bsize` entries, but only `vsize`
1084  // of them are valid.
1085  for (int i = 0; i < tr->n_trajec; ++i) {
1086  nrn_pragma_acc(update self(tr->varrays[i][:tr->vsize]))
1087  nrn_pragma_omp(target update from(tr->varrays[i][:tr->vsize]))
1088  }
1089  }
1090  }
1091 
1092  /* dont update vdata, its pointer array
1093  nrn_pragma_acc(update self(nt->_vdata[:nt->_nvdata) if nt->_nvdata)
1094  nrn_pragma_omp(target update from(nt->_vdata[:nt->_nvdata) if (nt->_nvdata))
1095  */
1096  }
1097  }
1098 #else
1099  (void) threads;
1100  (void) nthreads;
1101 #endif
1102 }
1103 
1104 /**
1105  * Copy weights from GPU to CPU
1106  *
1107  * User may record NetCon weights at the end of simulation.
1108  * For this purpose update weights of all NrnThread objects
1109  * from GPU to CPU.
1110  */
1111 void update_weights_from_gpu(NrnThread* threads, int nthreads) {
1112 #ifdef CORENEURON_ENABLE_GPU
1113  for (int i = 0; i < nthreads; i++) {
1114  NrnThread* nt = threads + i;
1115  size_t n_weight = nt->n_weight;
1116  if (nt->compute_gpu && n_weight > 0) {
1117  double* weights = nt->weights;
1118  nrn_pragma_acc(update host(weights [0:n_weight]))
1119  nrn_pragma_omp(target update from(weights [0:n_weight]))
1120  }
1121  }
1122 #endif
1123 }
1124 
1125 /** Cleanup device memory that is being tracked by the OpenACC runtime.
1126  *
1127  * This function painstakingly calls `cnrn_target_delete` in reverse order on all
1128  * pointers that were passed to `cnrn_target_copyin` in `setup_nrnthreads_on_device`.
1129  * This cleanup ensures that if the GPU is initialised multiple times from the
1130  * same process then the OpenACC runtime will not be polluted with old
1131  * pointers, which can cause errors. In particular if we do:
1132  * @code
1133  * {
1134  * // ... some_ptr is dynamically allocated ...
1135  * cnrn_target_copyin(some_ptr, some_size);
1136  * // ... do some work ...
1137  * // cnrn_target_delete(some_ptr);
1138  * free(some_ptr);
1139  * }
1140  * {
1141  * // ... same_ptr_again is dynamically allocated at the same address ...
1142  * cnrn_target_copyin(same_ptr_again, some_other_size); // ERROR
1143  * }
1144  * @endcode
1145  * the application will/may abort with an error such as:
1146  * FATAL ERROR: variable in data clause is partially present on the device.
1147  * The pattern above is typical of calling CoreNEURON on GPU multiple times in
1148  * the same process.
1149  */
1150 void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) {
1151 #ifdef CORENEURON_ENABLE_GPU
1152  for (int i = 0; i < nthreads; i++) {
1153  NrnThread* nt = threads + i;
1154  if (!nt->compute_gpu) {
1155  continue;
1156  }
1159  {
1160  TrajectoryRequests* tr = nt->trajec_requests;
1161  if (tr) {
1162  if (tr->varrays) {
1163  for (int i = 0; i < tr->n_trajec; ++i) {
1164  cnrn_target_delete(tr->varrays[i], tr->bsize);
1165  }
1166  cnrn_target_delete(tr->varrays, tr->n_trajec);
1167  }
1168  cnrn_target_delete(tr->gather, tr->n_trajec);
1169  cnrn_target_delete(tr);
1170  }
1171  }
1172  if (nt->_permute) {
1173  if (interleave_permute_type == 1) {
1174  InterleaveInfo* info = interleave_info + i;
1175  cnrn_target_delete(info->cellsize, nt->ncell);
1176  cnrn_target_delete(info->lastnode, nt->ncell);
1177  cnrn_target_delete(info->firstnode, nt->ncell);
1178  cnrn_target_delete(info->stride, info->nstride + 1);
1179  cnrn_target_delete(info);
1180  } else if (interleave_permute_type == 2) {
1181  InterleaveInfo* info = interleave_info + i;
1182  cnrn_target_delete(info->cellsize, info->nwarp);
1183  cnrn_target_delete(info->stridedispl, info->nwarp + 1);
1184  cnrn_target_delete(info->lastnode, info->nwarp + 1);
1185  cnrn_target_delete(info->firstnode, info->nwarp + 1);
1186  cnrn_target_delete(info->stride, info->nstride);
1187  cnrn_target_delete(info);
1188  }
1189  }
1190 
1191  if (nt->n_vecplay) {
1192  nrn_VecPlay_delete_from_device(nt);
1193  cnrn_target_delete(nt->_vecplay, nt->n_vecplay);
1194  }
1195 
1196  // Cleanup send_receive buffer.
1197  if (nt->_net_send_buffer_size) {
1198  cnrn_target_delete(nt->_net_send_buffer, nt->_net_send_buffer_size);
1199  }
1200 
1201  if (nt->n_presyn) {
1202  cnrn_target_delete(nt->presyns, nt->n_presyn);
1203  cnrn_target_delete(nt->presyns_helper, nt->n_presyn);
1204  }
1205 
1206  // Cleanup data that's setup in bbcore_read.
1207  if (nt->_nvdata) {
1208  cnrn_target_delete(nt->_vdata, nt->_nvdata);
1209  }
1210 
1211  // Cleanup weight vector used in NET_RECEIVE
1212  if (nt->n_weight) {
1213  cnrn_target_delete(nt->weights, nt->n_weight);
1214  }
1215 
1216  // Cleanup point processes
1217  if (nt->n_pntproc) {
1218  cnrn_target_delete(nt->pntprocs, nt->n_pntproc);
1219  }
1220 
1221  if (nt->nrn_fast_imem) {
1222  cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_d, nt->end);
1223  cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_rhs, nt->end);
1224  cnrn_target_delete(nt->nrn_fast_imem);
1225  }
1226 
1227  if (nt->shadow_rhs_cnt) {
1228  int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0);
1229  cnrn_target_delete(nt->_shadow_d, pcnt);
1230  cnrn_target_delete(nt->_shadow_rhs, pcnt);
1231  }
1232 
1233  for (auto tml = nt->tml; tml; tml = tml->next) {
1234  delete_ml_from_device(tml->ml, tml->index);
1235  cnrn_target_delete(tml);
1236  }
1237  cnrn_target_delete(nt->_ml_list, corenrn.get_memb_funcs().size());
1238  cnrn_target_delete(nt->_v_parent_index, nt->end);
1239  cnrn_target_delete(nt->_data, nt->_ndata);
1240  }
1241  cnrn_target_delete(threads, nthreads);
1243 #endif
1244 }
1245 
1246 
1247 void nrn_newtonspace_copyto_device(NewtonSpace* ns) {
1248 #ifdef CORENEURON_ENABLE_GPU
1249  // FIXME this check needs to be tweaked if we ever want to run with a mix
1250  // of CPU and GPU threads.
1251  if (nrn_threads[0].compute_gpu == 0) {
1252  return;
1253  }
1254 
1255  int n = ns->n * ns->n_instance;
1256  // actually, the values of double do not matter, only the pointers.
1257  NewtonSpace* d_ns = cnrn_target_copyin(ns);
1258 
1259  double* pd;
1260 
1261  pd = cnrn_target_copyin(ns->delta_x, n);
1262  cnrn_target_memcpy_to_device(&(d_ns->delta_x), &pd);
1263 
1264  pd = cnrn_target_copyin(ns->high_value, n);
1265  cnrn_target_memcpy_to_device(&(d_ns->high_value), &pd);
1266 
1267  pd = cnrn_target_copyin(ns->low_value, n);
1268  cnrn_target_memcpy_to_device(&(d_ns->low_value), &pd);
1269 
1270  pd = cnrn_target_copyin(ns->rowmax, n);
1271  cnrn_target_memcpy_to_device(&(d_ns->rowmax), &pd);
1272 
1273  auto pint = cnrn_target_copyin(ns->perm, n);
1274  cnrn_target_memcpy_to_device(&(d_ns->perm), &pint);
1275 
1276  auto ppd = cnrn_target_copyin(ns->jacobian, ns->n);
1277  cnrn_target_memcpy_to_device(&(d_ns->jacobian), &ppd);
1278 
1279  // the actual jacobian doubles were allocated as a single array
1280  double* d_jacdat = cnrn_target_copyin(ns->jacobian[0], ns->n * n);
1281 
1282  for (int i = 0; i < ns->n; ++i) {
1283  pd = d_jacdat + i * n;
1284  cnrn_target_memcpy_to_device(&(ppd[i]), &pd);
1285  }
1286 #endif
1287 }
1288 
1289 void nrn_newtonspace_delete_from_device(NewtonSpace* ns) {
1290 #ifdef CORENEURON_ENABLE_GPU
1291  // FIXME this check needs to be tweaked if we ever want to run with a mix
1292  // of CPU and GPU threads.
1293  if (nrn_threads[0].compute_gpu == 0) {
1294  return;
1295  }
1296  int n = ns->n * ns->n_instance;
1297  cnrn_target_delete(ns->jacobian[0], ns->n * n);
1298  cnrn_target_delete(ns->jacobian, ns->n);
1299  cnrn_target_delete(ns->perm, n);
1300  cnrn_target_delete(ns->rowmax, n);
1301  cnrn_target_delete(ns->low_value, n);
1302  cnrn_target_delete(ns->high_value, n);
1303  cnrn_target_delete(ns->delta_x, n);
1304  cnrn_target_delete(ns);
1305 #endif
1306 }
1307 
1308 void nrn_sparseobj_copyto_device(SparseObj* so) {
1309 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY)
1310  // FIXME this check needs to be tweaked if we ever want to run with a mix
1311  // of CPU and GPU threads.
1312  if (nrn_threads[0].compute_gpu == 0) {
1313  return;
1314  }
1315 
1316  unsigned n1 = so->neqn + 1;
1317  SparseObj* d_so = cnrn_target_copyin(so);
1318  // only pointer fields in SparseObj that need setting up are
1319  // rowst, diag, rhs, ngetcall, coef_list
1320  // only pointer fields in Elm that need setting up are
1321  // r_down, c_right, value
1322  // do not care about the Elm* ptr value, just the space.
1323 
1324  Elm** d_rowst = cnrn_target_copyin(so->rowst, n1);
1325  cnrn_target_memcpy_to_device(&(d_so->rowst), &d_rowst);
1326 
1327  Elm** d_diag = cnrn_target_copyin(so->diag, n1);
1328  cnrn_target_memcpy_to_device(&(d_so->diag), &d_diag);
1329 
1330  unsigned* pu = cnrn_target_copyin(so->ngetcall, so->_cntml_padded);
1331  cnrn_target_memcpy_to_device(&(d_so->ngetcall), &pu);
1332 
1333  double* pd = cnrn_target_copyin(so->rhs, n1 * so->_cntml_padded);
1334  cnrn_target_memcpy_to_device(&(d_so->rhs), &pd);
1335 
1336  double** d_coef_list = cnrn_target_copyin(so->coef_list, so->coef_list_size);
1337  cnrn_target_memcpy_to_device(&(d_so->coef_list), &d_coef_list);
1338 
1339  // Fill in relevant Elm pointer values
1340 
1341  for (unsigned irow = 1; irow < n1; ++irow) {
1342  for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) {
1343  Elm* pelm = cnrn_target_copyin(elm);
1344 
1345  if (elm == so->rowst[irow]) {
1346  cnrn_target_memcpy_to_device(&(d_rowst[irow]), &pelm);
1347  } else {
1348  Elm* d_e = cnrn_target_deviceptr(elm->c_left);
1349  cnrn_target_memcpy_to_device(&(pelm->c_left), &d_e);
1350  }
1351 
1352  if (elm->col == elm->row) {
1353  cnrn_target_memcpy_to_device(&(d_diag[irow]), &pelm);
1354  }
1355 
1356  if (irow > 1) {
1357  if (elm->r_up) {
1358  Elm* d_e = cnrn_target_deviceptr(elm->r_up);
1359  cnrn_target_memcpy_to_device(&(pelm->r_up), &d_e);
1360  }
1361  }
1362 
1363  pd = cnrn_target_copyin(elm->value, so->_cntml_padded);
1364  cnrn_target_memcpy_to_device(&(pelm->value), &pd);
1365  }
1366  }
1367 
1368  // visit all the Elm again and fill in pelm->r_down and pelm->c_right
1369  for (unsigned irow = 1; irow < n1; ++irow) {
1370  for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) {
1371  auto pelm = cnrn_target_deviceptr(elm);
1372  if (elm->r_down) {
1373  auto d_e = cnrn_target_deviceptr(elm->r_down);
1374  cnrn_target_memcpy_to_device(&(pelm->r_down), &d_e);
1375  }
1376  if (elm->c_right) {
1377  auto d_e = cnrn_target_deviceptr(elm->c_right);
1378  cnrn_target_memcpy_to_device(&(pelm->c_right), &d_e);
1379  }
1380  }
1381  }
1382 
1383  // Fill in the d_so->coef_list
1384  for (unsigned i = 0; i < so->coef_list_size; ++i) {
1385  pd = cnrn_target_deviceptr(so->coef_list[i]);
1386  cnrn_target_memcpy_to_device(&(d_coef_list[i]), &pd);
1387  }
1388 #endif
1389 }
1390 
1391 void nrn_sparseobj_delete_from_device(SparseObj* so) {
1392 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY)
1393  // FIXME this check needs to be tweaked if we ever want to run with a mix
1394  // of CPU and GPU threads.
1395  if (nrn_threads[0].compute_gpu == 0) {
1396  return;
1397  }
1398  unsigned n1 = so->neqn + 1;
1399  for (unsigned irow = 1; irow < n1; ++irow) {
1400  for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) {
1401  cnrn_target_delete(elm->value, so->_cntml_padded);
1402  cnrn_target_delete(elm);
1403  }
1404  }
1405  cnrn_target_delete(so->coef_list, so->coef_list_size);
1406  cnrn_target_delete(so->rhs, n1 * so->_cntml_padded);
1407  cnrn_target_delete(so->ngetcall, so->_cntml_padded);
1408  cnrn_target_delete(so->diag, n1);
1409  cnrn_target_delete(so->rowst, n1);
1410  cnrn_target_delete(so);
1411 #endif
1412 }
1413 
1414 #ifdef CORENEURON_ENABLE_GPU
1415 
1416 void nrn_ion_global_map_copyto_device() {
1417  if (nrn_ion_global_map_size) {
1418  double** d_data = cnrn_target_copyin(nrn_ion_global_map, nrn_ion_global_map_size);
1419  for (int j = 0; j < nrn_ion_global_map_size; j++) {
1420  if (nrn_ion_global_map[j]) {
1421  double* d_mechmap = cnrn_target_copyin(nrn_ion_global_map[j],
1422  ion_global_map_member_size);
1423  cnrn_target_memcpy_to_device(&(d_data[j]), &d_mechmap);
1424  }
1425  }
1426  }
1427 }
1428 
1429 void nrn_ion_global_map_delete_from_device() {
1430  for (int j = 0; j < nrn_ion_global_map_size; j++) {
1431  if (nrn_ion_global_map[j]) {
1432  cnrn_target_delete(nrn_ion_global_map[j], ion_global_map_member_size);
1433  }
1434  }
1435  if (nrn_ion_global_map_size) {
1436  cnrn_target_delete(nrn_ion_global_map, nrn_ion_global_map_size);
1437  }
1438 }
1439 
1440 void init_gpu() {
1441  // check how many gpu devices available per node
1442  int num_devices_per_node = cnrn_target_get_num_devices();
1443 
1444  // if no gpu found, can't run on GPU
1445  if (num_devices_per_node == 0) {
1446  nrn_fatal_error("\n ERROR : Enabled GPU execution but couldn't find NVIDIA GPU!\n");
1447  }
1448 
1449  if (corenrn_param.num_gpus != 0) {
1450  if (corenrn_param.num_gpus > num_devices_per_node) {
1451  nrn_fatal_error("Fatal error: asking for '%d' GPUs per node but only '%d' available\n",
1452  corenrn_param.num_gpus,
1453  num_devices_per_node);
1454  } else {
1455  num_devices_per_node = corenrn_param.num_gpus;
1456  }
1457  }
1458 
1459  // get the local rank within a node and assign a specific gpu to this rank;
1460  // multiple ranks within the node will use the same device.
1461  int local_rank = 0;
1462  int local_size = 1;
1463 #if NRNMPI
1464  if (corenrn_param.mpi_enable) {
1465  local_rank = nrnmpi_local_rank();
1466  local_size = nrnmpi_local_size();
1467  }
1468 #endif
1469 
1470  cnrn_target_set_default_device(local_rank % num_devices_per_node);
1471 
1472  if (nrnmpi_myid == 0 && !corenrn_param.is_quiet()) {
1473  std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size
1474  << " ranks per node\n";
1475  }
1476 }
1477 
1478 void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) {
1479  for (int i = 0; i < nt->n_vecplay; i++) {
1480  VecPlayContinuous* vecplay_instance = (VecPlayContinuous*) nt->_vecplay[i];
1481 
1482  /** just VecPlayContinuous object */
1483  VecPlayContinuous* d_vecplay_instance = cnrn_target_copyin(vecplay_instance);
1484  cnrn_target_memcpy_to_device((VecPlayContinuous**) (&(d_vecplay[i])), &d_vecplay_instance);
1485 
1486  /** copy y_, t_ and discon_indices_ */
1487  copy_ivoc_vect_to_device(vecplay_instance->y_, d_vecplay_instance->y_);
1488  copy_ivoc_vect_to_device(vecplay_instance->t_, d_vecplay_instance->t_);
1489  // OL211213: beware, the test suite does not currently include anything
1490  // with a non-null discon_indices_.
1491  if (vecplay_instance->discon_indices_) {
1492  IvocVect* d_discon_indices = cnrn_target_copyin(vecplay_instance->discon_indices_);
1493  cnrn_target_memcpy_to_device(&(d_vecplay_instance->discon_indices_), &d_discon_indices);
1494  copy_ivoc_vect_to_device(*(vecplay_instance->discon_indices_),
1495  *(d_vecplay_instance->discon_indices_));
1496  }
1497 
1498  /** copy PlayRecordEvent : todo: verify this */
1499  PlayRecordEvent* d_e_ = cnrn_target_copyin(vecplay_instance->e_);
1500 
1501  cnrn_target_memcpy_to_device(&(d_e_->plr_), (PlayRecord**) (&d_vecplay_instance));
1502  cnrn_target_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_);
1503 
1504  /** copy pd_ : note that it's pointer inside ml->data and hence data itself is
1505  * already on GPU */
1506  double* d_pd_ = cnrn_target_deviceptr(vecplay_instance->pd_);
1507  cnrn_target_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_);
1508  }
1509 }
1510 
1511 void nrn_VecPlay_delete_from_device(NrnThread* nt) {
1512  for (int i = 0; i < nt->n_vecplay; i++) {
1513  auto* vecplay_instance = static_cast<VecPlayContinuous*>(nt->_vecplay[i]);
1514  cnrn_target_delete(vecplay_instance->e_);
1515  if (vecplay_instance->discon_indices_) {
1516  delete_ivoc_vect_from_device(*(vecplay_instance->discon_indices_));
1517  }
1518  delete_ivoc_vect_from_device(vecplay_instance->t_);
1519  delete_ivoc_vect_from_device(vecplay_instance->y_);
1520  cnrn_target_delete(vecplay_instance);
1521  }
1522 }
1523 
1524 #endif
1525 } // namespace coreneuron