#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
#include <cuda_runtime_api.h>
#endif
// ...
#if __has_include(<cxxabi.h>)
#include <cxxabi.h>
#endif
// ...
#ifdef CORENEURON_ENABLE_PRESENT_TABLE
#include <map>
#include <shared_mutex>
// Metadata for one host block registered in the present table
struct present_table_value {
    std::size_t ref_count{}, size{};
    std::byte* dev_ptr{};
};
// Host start address -> (reference count, block size, device pointer)
std::map<std::byte const*, present_table_value> present_table;
std::shared_mutex present_table_mutex;
#endif  // CORENEURON_ENABLE_PRESENT_TABLE
// ...
std::string cxx_demangle(const char* mangled) {
    int status{};
    // __cxa_demangle allocates the result with malloc; own it so it is freed
    std::unique_ptr<char, decltype(free)*> demangled{
        abi::__cxa_demangle(mangled, nullptr, nullptr, &status), free};
    return status ? mangled : demangled.get();
}
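// Illustrative note (not from the original listing): with the Itanium C++ ABI
// used by GCC/Clang, typeid(double*).name() yields "Pd" and cxx_demangle("Pd")
// returns "double*"; if demangling fails, the mangled name is returned as-is.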
bool cnrn_target_debug_output_enabled() {
    const char* env = std::getenv("CORENEURON_GPU_DEBUG");
    if (!env) {
        return false;
    }
    std::string env_s{env};
    if (env_s == "1") {
        return true;
    } else if (env_s == "0") {
        return false;
    } else {
        throw std::runtime_error("CORENEURON_GPU_DEBUG must be set to 0 or 1 (got " + env_s +
                                 ")");
    }
}
// Evaluated once, at static initialisation time
bool cnrn_target_enable_debug{cnrn_target_debug_output_enabled()};
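// Usage sketch (assumption: POSIX shell; the executable name is illustrative):
//   $ CORENEURON_GPU_DEBUG=1 ./coreneuron-binary ...   # trace device data management
//   $ CORENEURON_GPU_DEBUG=0 ./coreneuron-binary ...   # explicitly disabled
// Any other value aborts startup via the runtime_error thrown above.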
// ...
void cnrn_target_copyin_debug(std::string_view file,
                              int line,
                              std::size_t sizeof_T,
                              std::type_info const& typeid_T,
                              void const* h_ptr,
                              std::size_t len,
                              void* d_ptr) {
    if (!cnrn_target_enable_debug) {
        return;
    }
    std::cerr << file << ':' << line << ": cnrn_target_copyin<" << cxx_demangle(typeid_T.name())
              << ">(" << h_ptr << ", " << len << " * " << sizeof_T << " = " << len * sizeof_T
              << ") -> " << d_ptr << std::endl;
}
void cnrn_target_delete_debug(std::string_view file,
                              int line,
                              std::size_t sizeof_T,
                              std::type_info const& typeid_T,
                              void const* h_ptr,
                              std::size_t len) {
    if (!cnrn_target_enable_debug) {
        return;
    }
    std::cerr << file << ':' << line << ": cnrn_target_delete<" << cxx_demangle(typeid_T.name())
              << ">(" << h_ptr << ", " << len << " * " << sizeof_T << " = " << len * sizeof_T << ')'
              << std::endl;
}
void cnrn_target_deviceptr_debug(std::string_view file,
                                 int line,
                                 std::type_info const& typeid_T,
                                 void const* h_ptr,
                                 void* d_ptr) {
    if (!cnrn_target_enable_debug) {
        return;
    }
    std::cerr << file << ':' << line << ": cnrn_target_deviceptr<" << cxx_demangle(typeid_T.name())
              << ">(" << h_ptr << ") -> " << d_ptr << std::endl;
}
void cnrn_target_is_present_debug(std::string_view file,
                                  int line,
                                  std::type_info const& typeid_T,
                                  void const* h_ptr,
                                  void* d_ptr) {
    if (!cnrn_target_enable_debug) {
        return;
    }
    std::cerr << file << ':' << line << ": cnrn_target_is_present<" << cxx_demangle(typeid_T.name())
              << ">(" << h_ptr << ") -> " << d_ptr << std::endl;
}
void cnrn_target_memcpy_to_device_debug(std::string_view file,
                                        int line,
                                        std::size_t sizeof_T,
                                        std::type_info const& typeid_T,
                                        void const* h_ptr,
                                        std::size_t len,
                                        void* d_ptr) {
    if (!cnrn_target_enable_debug) {
        return;
    }
    std::cerr << file << ':' << line << ": cnrn_target_memcpy_to_device<"
              << cxx_demangle(typeid_T.name()) << ">(" << d_ptr << ", " << h_ptr << ", " << len
              << " * " << sizeof_T << " = " << len * sizeof_T << ')' << std::endl;
}
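// Example trace line from the helpers above (hypothetical file, line and
// addresses), emitted when CORENEURON_GPU_DEBUG=1:
//   nrn_acc_manager.cpp:42: cnrn_target_copyin<double>(0x55f0, 100 * 8 = 800) -> 0x7f30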
#ifdef CORENEURON_ENABLE_PRESENT_TABLE
std::pair<void*, bool> cnrn_target_deviceptr_impl(bool must_be_present_or_null, void const* h_ptr) {
    if (!h_ptr) {
        return {nullptr, false};
    }
    // Shared (read) lock: concurrent lookups are safe
    std::shared_lock _{present_table_mutex};
    if (present_table.empty()) {
        return {nullptr, must_be_present_or_null};
    }
    // Find the last registered block whose host start address is <= h_ptr
    auto const iter = std::prev(std::upper_bound(
        present_table.begin(), present_table.end(), h_ptr, [](void const* hp, auto const& entry) {
            return hp < entry.first;
        }));
    if (iter == present_table.end()) {
        return {nullptr, must_be_present_or_null};
    }
    std::byte const* const h_byte_ptr{static_cast<std::byte const*>(h_ptr)};
    std::byte const* const h_start_of_block{iter->first};
    std::size_t const block_size{iter->second.size};
    std::byte* const d_start_of_block{iter->second.dev_ptr};
    bool const is_present{h_byte_ptr < h_start_of_block + block_size};
    if (!is_present) {
        return {nullptr, must_be_present_or_null};
    }
    return {d_start_of_block + (h_byte_ptr - h_start_of_block), false};
}
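// Behaviour sketch (hypothetical values): if the host block [h, h + 80) is
// registered against the device block [d, d + 80), then
//   cnrn_target_deviceptr_impl(false, h + 24) -> {d + 24, false}
//   cnrn_target_deviceptr_impl(true, h + 96)  -> {nullptr, true}
// where a true second element reports "required to be present but not found".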
void cnrn_target_copyin_update_present_table(void const* h_ptr, void* d_ptr, std::size_t len) {
    if (!h_ptr) {
        return;
    }
    // Exclusive (write) lock: table modifications must be serialised
    std::lock_guard _{present_table_mutex};
    // Register the block, or bump the reference count of an existing entry
    present_table_value new_val{};
    new_val.size = len;
    new_val.ref_count = 1;
    new_val.dev_ptr = static_cast<std::byte*>(d_ptr);
    auto const [iter, inserted] = present_table.emplace(static_cast<std::byte const*>(h_ptr),
                                                        new_val);
    if (!inserted) {
        // The entry already existed; it must describe the same block
        assert(iter->second.size == len);
        assert(iter->second.dev_ptr == new_val.dev_ptr);
        ++(iter->second.ref_count);
    }
}
void cnrn_target_delete_update_present_table(void const* h_ptr, std::size_t len) {
    if (!h_ptr) {
        return;
    }
    std::lock_guard _{present_table_mutex};
    auto const iter = present_table.find(static_cast<std::byte const*>(h_ptr));
    assert(iter != present_table.end());
    assert(iter->second.size == len);
    --(iter->second.ref_count);
    if (iter->second.ref_count == 0) {
        present_table.erase(iter);
    }
}
#endif  // CORENEURON_ENABLE_PRESENT_TABLE
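// Reference-counting sketch using the two helpers above (hypothetical values):
//   cnrn_target_copyin_update_present_table(h, d, 800);  // insert, ref_count = 1
//   cnrn_target_copyin_update_present_table(h, d, 800);  // ref_count -> 2
//   cnrn_target_delete_update_present_table(h, 800);     // ref_count -> 1
//   cnrn_target_delete_update_present_table(h, 800);     // ref_count -> 0, erased
// Re-registering the same host pointer with a different size or device pointer
// trips the asserts above.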
int cnrn_target_get_num_devices() {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENACC)
    // Query how many NVIDIA GPUs the OpenACC runtime can see
    acc_device_t device_type = acc_device_nvidia;
    return acc_get_num_devices(device_type);
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENMP)
    return omp_get_num_devices();
#else
    throw std::runtime_error(
        "cnrn_target_get_num_devices() not implemented without OpenACC/OpenMP and gpu build");
#endif
}
void cnrn_target_set_default_device(int device_num) {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENACC)
    acc_set_device_num(device_num, acc_device_nvidia);
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENMP)
    omp_set_default_device(device_num);
    // Setting the default OpenMP device alone has proved insufficient with some
    // toolchains; pin the CUDA device explicitly as well
    auto const cuda_code = cudaSetDevice(device_num);
    assert(cuda_code == cudaSuccess);
#else
    throw std::runtime_error(
        "cnrn_target_set_default_device() not implemented without OpenACC/OpenMP and gpu build");
#endif
}
#ifdef CORENEURON_ENABLE_GPU
#ifndef CORENEURON_UNIFIED_MEMORY
// ... (copy_ml_to_device: mirror a Memb_list and its buffers on the device) ...
    if (ml->global_variables) {
        assert(ml->global_variables_size);
        d_ml->global_variables = cnrn_target_copyin(static_cast<std::byte*>(ml->global_variables),
                                                    ml->global_variables_size);
    }
    // ...
    NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
    // ...
    NetSendBuffer_t* nsb = ml->_net_send_buffer;
    NetSendBuffer_t* d_nsb;
    // ...
static void update_ml_on_host(const Memb_list* ml, int type) {
    // ...
    auto nrb = ml->_net_receive_buffer;
    // ...
    nrn_pragma_acc(update self(nrb->_pnt_index[:nrb->_size],
                               nrb->_weight_index[:nrb->_size],
                               nrb->_displ[:nrb->_size + 1],
                               nrb->_nrb_index[:nrb->_size]))
    nrn_pragma_omp(target update from(nrb->_pnt_index[:nrb->_size],
                                      nrb->_weight_index[:nrb->_size],
                                      nrb->_displ[:nrb->_size + 1],
                                      nrb->_nrb_index[:nrb->_size]))
    // ...
static void delete_ml_from_device(Memb_list* ml, int type) {
    // ...
    NetSendBuffer_t* nsb{ml->_net_send_buffer};
    // ...
    NetReceiveBuffer_t* nrb{ml->_net_receive_buffer};
    // ...
    int n = ml->nodecount;
    // ...
    if (ml->global_variables) {
        assert(ml->global_variables_size);
        cnrn_target_delete(static_cast<std::byte*>(ml->global_variables),
                           ml->global_variables_size);
    }
}
#endif  // not CORENEURON_UNIFIED_MEMORY
#endif  // CORENEURON_ENABLE_GPU
void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
#ifdef CORENEURON_ENABLE_GPU
    // Initialise NrnThreads for GPU execution
    for (int i = 0; i < nthreads; i++) {
        // ...
    }
#ifdef CORENEURON_UNIFIED_MEMORY
    for (int i = 0; i < nthreads; i++) {
        // ...
            printf("\n WARNING: NrnThread %d not permuted, error for linear algebra?", i);
        // ...
        printf("\n Warning: No permutation data? Required for linear algebra!");
        // ...
    }
#else
    for (int i = 0; i < nthreads; i++) {
        // ...
        dptr = d__data + 0 * ne;
        // ...
        dptr = d__data + 1 * ne;
        // ...
        dptr = d__data + 2 * ne;
        // ...
        dptr = d__data + 3 * ne;
        // ...
        dptr = d__data + 4 * ne;
        // ...
        dptr = d__data + 5 * ne;
        // ...
        dptr = d__data + 6 * ne;
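        // Layout note (assumption based on the offsets above): d__data is one
        // contiguous device allocation in which the per-node SoA arrays (rhs,
        // d, a, b, v, area, ...) are stored back-to-back, each padded to ne
        // elements, so array k starts at d__data + k * ne.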
        // ...
        bool first_tml = true;
        // ...
        for (auto tml = nt->tml; tml; tml = tml->next) {
            // ...
            Memb_list* d_ml = copy_ml_to_device(tml->ml, tml->index);
            // ...
            double* d_shadow_ptr;
        // ... (interleave permutation data, per --cell_permute mode) ...
        int* d_ptr = nullptr;
        // ...
        int* d_ptr = nullptr;
        // ...
            printf("\n ERROR: only --cell_permute = [12] implemented");
        // ...
            printf("\n WARNING: NrnThread %d not permuted, error for linear algebra?", i);
        // ... (trajectory requests) ...
        double** d_tr_varrays{nullptr};
        // ...
        cnrn_target_memcpy_to_device(&(d_nt->_fornetcon_perm_indices),
                                     &d_fornetcon_perm_indices);
    // ...
#endif  // CORENEURON_ENABLE_GPU
}

void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) {
#ifdef CORENEURON_ENABLE_GPU
    // ...
    size_t n = from.size();
    // ...
#endif
}

void delete_ivoc_vect_from_device(IvocVect& vec) {
#ifdef CORENEURON_ENABLE_GPU
    auto const n = vec.size();
    if (n) {
        cnrn_target_delete(vec.data(), n);
    }
#else
    static_cast<void>(vec);
#endif
}
void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) {
    NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
    // ...
#ifdef CORENEURON_ENABLE_GPU
    // ... (release the stale device copies of the buffer arrays) ...
#endif
    // Grow one host array of the buffer, preserving its current contents
    auto const realloc = [old_size = nrb->_size, nrb](auto*& ptr, std::size_t extra_size = 0) {
        using T = std::remove_pointer_t<std::remove_reference_t<decltype(ptr)>>;
        static_assert(std::is_trivial<T>::value,
                      "Only trivially constructible and copiable types are supported.");
        static_assert(std::is_same<decltype(ptr), T*&>::value, "ptr should be reference-to-pointer");
        auto* const new_data = static_cast<T*>(ecalloc_align((nrb->_size + extra_size), sizeof(T)));
        std::memcpy(new_data, ptr, (old_size + extra_size) * sizeof(T));
        free_memory(ptr);
        ptr = new_data;
    };
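    // Usage sketch (the exact call list is an assumption, following the buffer
    // members used elsewhere in this file); _displ carries one extra element,
    // hence the extra_size argument:
    //   realloc(nrb->_pnt_index);
    //   realloc(nrb->_weight_index);
    //   realloc(nrb->_displ, 1);
    //   realloc(nrb->_nrb_index);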
    // ...
}

#ifdef CORENEURON_ENABLE_GPU
// ...
// Comparator: order buffer entries by (instance index, original position)
struct comp {
    bool operator()(const NRB_P& a, const NRB_P& b) {
        if (a.first == b.first) {
            return a.second > b.second;  // same instance: keep original order
        }
        return a.first > b.first;
    }
};

static void net_receive_buffer_order(NetReceiveBuffer_t* nrb) {
    if (nrb->_cnt == 0) {
        return;
    }
    // ...
    std::priority_queue<NRB_P, std::vector<NRB_P>, comp> nrbq;
    for (int i = 0; i < nrb->_cnt; ++i) {
        nrbq.push(NRB_P(nrb->_pnt_index[i], i));
    }
    int displ_cnt = 0;
    int index_cnt = 0;
    int last_instance_index = -1;
    // ...
    while (!nrbq.empty()) {
        const NRB_P& p = nrbq.top();
        nrb->_nrb_index[index_cnt++] = p.second;
        if (p.first != last_instance_index) {
            ++displ_cnt;
        }
        nrb->_displ[displ_cnt] = index_cnt;
        last_instance_index = p.first;
        nrbq.pop();
    }
}
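// Ordering sketch (hypothetical input): pairs (pnt_index, i) of
// (3,0) (1,1) (3,2) (1,3) pop from the queue as (1,1) (1,3) (3,0) (3,2):
// entries are grouped by instance, the original order is kept within a group,
// and _displ records where each group ends in _nrb_index.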
void update_net_receive_buffer(NrnThread* nt) {
    // ...
    for (auto tml = nt->tml; tml; tml = tml->next) {
        // ...
        NetReceiveBuffer_t* nrb = tml->ml->_net_receive_buffer;
        // ...
        if (nrb && nrb->_cnt) {
// ... (end of update_net_receive_buffer) ...

#ifdef CORENEURON_ENABLE_GPU
// ... (in update_net_send_buffer_on_host) ...
        printf("ERROR: NetSendBuffer exceeded during GPU execution (rank %d)\n", nrnmpi_myid);
// ...

void update_nrnthreads_on_host(NrnThread* threads, int nthreads) {
#ifdef CORENEURON_ENABLE_GPU
    for (int i = 0; i < nthreads; i++) {
        // ...
        for (auto tml = nt->tml; tml; tml = tml->next) {
            // ...
            update_ml_on_host(tml->ml, tml->index);
// ...

// Copy weights from the GPU back to the CPU
void update_weights_from_gpu(NrnThread* threads, int nthreads) {
#ifdef CORENEURON_ENABLE_GPU
    for (int i = 0; i < nthreads; i++) {
        // ...
    }
#endif
}

// Clean up device memory that is tracked by the OpenACC/OpenMP runtime
void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) {
#ifdef CORENEURON_ENABLE_GPU
    for (int i = 0; i < nthreads; i++) {
        // ...
        for (auto tml = nt->tml; tml; tml = tml->next) {
            delete_ml_from_device(tml->ml, tml->index);
        }
// ...

void nrn_newtonspace_copyto_device(NewtonSpace* ns) {
#ifdef CORENEURON_ENABLE_GPU
    // ...
    for (int i = 0; i < ns->n; ++i) {
        pd = d_jacdat + i * n;
        // ...
// ...

void nrn_newtonspace_delete_from_device(NewtonSpace* ns) {
#ifdef CORENEURON_ENABLE_GPU
// ...

void nrn_sparseobj_copyto_device(SparseObj* so) {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY)
    // ...
    unsigned n1 = so->neqn + 1;
    // ...
    for (unsigned irow = 1; irow < n1; ++irow) {
        // ...
    }
    // ...
    for (unsigned irow = 1; irow < n1; ++irow) {
        // ...

void nrn_sparseobj_delete_from_device(SparseObj* so) {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY)
    // ...
    unsigned n1 = so->neqn + 1;
    for (unsigned irow = 1; irow < n1; ++irow) {
        // ...

#ifdef CORENEURON_ENABLE_GPU
// ... (GPU initialisation) ...
    if (num_devices_per_node == 0) {
        nrn_fatal_error("\n ERROR : Enabled GPU execution but couldn't find NVIDIA GPU!\n");
    }
    // ...
        nrn_fatal_error("Fatal error: asking for '%d' GPUs per node but only '%d' available\n",
                        corenrn_param.num_gpus,
                        num_devices_per_node);
    // ...
    local_rank = nrnmpi_local_rank();
    local_size = nrnmpi_local_size();
    // ...
    std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size
              << " ranks per node\n";
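    // Device assignment sketch (assumption: simple round-robin over local MPI
    // ranks): device = local_rank % num_devices_per_node, so with 4 GPUs and
    // 8 ranks per node, ranks 0..7 use devices 0,1,2,3,0,1,2,3.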
// ...

void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) {
    for (int i = 0; i < nt->n_vecplay; i++) {
        // ...
        copy_ivoc_vect_to_device(*(vecplay_instance->discon_indices_),
                                 *(d_vecplay_instance->discon_indices_));
        // ...
    }
}

void nrn_VecPlay_delete_from_device(NrnThread* nt) {
    for (int i = 0; i < nt->n_vecplay; i++) {