Skip to content
This repository was archived by the owner on Mar 20, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
bd18040
Must be able to turn off nrn_use_fast_imem as well as on.
nrnhines Mar 18, 2021
864ac5a
Direct mode has same behavior for ParallelContext.psolve
nrnhines Mar 19, 2021
506cb62
Event Queue NEURON->CoreNEURON NetCon, Presyn, most SelfEvent
nrnhines Apr 3, 2021
ca308bc
Event queue transfer working at level of test_datareturn.py.
nrnhines Apr 4, 2021
1c625a6
queue transfer: not yet handling movable if true.
nrnhines Apr 5, 2021
5d1a8c8
nrn2core WatchCondition transfer. Needs much testing.
nrnhines Apr 10, 2021
5df1dfd
Merge branch 'master' into psolve-direct
nrnhines Apr 17, 2021
08a58b8
nrn2core VecPlayContinuous transferred and activated.
nrnhines Apr 18, 2021
566eddb
Update clang-format
pramodk Apr 18, 2021
1813283
Merge branch 'master' into psolve-direct
nrnhines Apr 25, 2021
be2a5a7
Framework for copying event queue back to NEURON.
nrnhines Apr 25, 2021
b9e3eb3
Merge branch 'master' into psolve-direct
nrnhines May 8, 2021
d9829de
Send back SelfEvents on event queue to NEURON.
nrnhines May 10, 2021
8c90009
handle weight_index=-1, remove some debug printf
nrnhines May 12, 2021
c9e4771
Fix SelfEvent transfer errors for movable and Point_process.
nrnhines May 14, 2021
4f8c6d3
PreSyn.flag_ must be part of direct transfer.
nrnhines May 16, 2021
1361e62
Merge branch 'master' into psolve-direct
nrnhines May 22, 2021
dd8b9c6
PreSyn.flag_ transfer finds node index with inverse permutation.
nrnhines May 23, 2021
a6367b0
psolve transfer corenrn -> nrn calls bbcore_read in nrn.
nrnhines May 24, 2021
cb6637a
core2nrn for WATCH statements.
nrnhines May 29, 2021
24c561a
make clang-format
nrnhines May 29, 2021
f88b895
Merge branch 'master' into psolve-direct
nrnhines May 29, 2021
c89f5e8
core2nrn vecplay indices
nrnhines May 30, 2021
388ac9f
Update external/mod2c
nrnhines Jun 1, 2021
6002996
PatternStim for direct mode. No special treatment except share Info
nrnhines Jun 4, 2021
80d2985
Moved changes sim/finitialize.cpp to io/nrn2core_data_init.cpp
nrnhines Jun 4, 2021
ce32391
Apply suggestions from code review
nrnhines Jun 7, 2021
d353057
Merge branch 'master' into psolve-direct
nrnhines Jun 7, 2021
0f3b5ec
Merge branch 'master' into psolve-direct
alexsavulescu Jul 2, 2021
9e2e6b9
Merge branch 'master' into psolve-direct
iomaganaris Jul 19, 2021
783c33b
Fixed NEURON_BRANCH setting issue in the CI
iomaganaris Jul 19, 2021
23696aa
Small fix for jenkins
iomaganaris Jul 19, 2021
3b9d942
Update mod2c submodule after PR merge BlueBrain/mod2c/pull/63
pramodk Jul 23, 2021
9477c71
Fixed coreneuron_modtest::spikes_py
iomaganaris Jul 22, 2021
8742e82
Made clang-format happy and updated nmodl submodule
iomaganaris Jul 23, 2021
87f607c
Merge branch 'master' into psolve-direct
pramodk Jul 23, 2021
407333c
Properly initialize mechanisms after copying data to the GPU
iomaganaris Jul 28, 2021
e20052e
Removal of debug print
iomaganaris Jul 30, 2021
074c2e0
Corenrn2nrn WATCH transfer must include above_threshold flag.
nrnhines Jul 31, 2021
9141d70
update to nmodl branch hines/netmove-extra-t
nrnhines Aug 7, 2021
560367d
NMODL submodule -> master.
olupton Aug 9, 2021
9c68299
Merge branch 'master' into psolve-direct
olupton Aug 17, 2021
185a775
Improve GPU TrajectoryRequests support.
olupton Aug 18, 2021
5780019
Fixes for NVHPC 21.2, avoid duplicate copies.
olupton Aug 18, 2021
fd2a000
NVHPC 21.2 compatibility.
olupton Aug 18, 2021
d6b749e
Check the overflow of NetSendBuffer_t from GPU execution
pramodk Aug 18, 2021
0788939
update nmodl submodule to a PR branch temporarily
pramodk Aug 19, 2021
65310d2
updated nmodl to latest master
pramodk Aug 20, 2021
20b5073
Add extra #pragma acc wait.
olupton Aug 20, 2021
6378649
Incorporate code review comments from Alex
pramodk Aug 20, 2021
6318802
make clang-format happy
pramodk Aug 20, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 73 additions & 51 deletions coreneuron/apps/main1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,46 @@ char* prepare_args(int& argc, char**& argv, int use_mpi, const char* arg) {
namespace coreneuron {
void call_prcellstate_for_prcellgid(int prcellgid, int compute_gpu, int is_init);

// Ask NEURON (through the nrn2core_get_trajectory_requests_ callback) for the
// trajectory-record requests of each NrnThread and attach a freshly built
// TrajectoryRequests object to the thread.
// bsize = 0 then per step transfer
// bsize > 1 then full trajectory save into arrays.
void get_nrn_trajectory_requests(int bsize) {
// Callback pointer is null when not running embedded under NEURON.
if (nrn2core_get_trajectory_requests_) {
for (int tid = 0; tid < nrn_nthread; ++tid) {
NrnThread& nt = nrn_threads[tid];
// Out-parameters filled in by the NEURON-side callback below.
int n_pr;
int n_trajec;
int* types;
int* indices;
void** vpr;
double** varrays;
double** pvars;

// bsize is passed by reference, the return value will determine if
// per step return or entire trajectory return.
(*nrn2core_get_trajectory_requests_)(
tid, bsize, n_pr, vpr, n_trajec, types, indices, pvars, varrays);
// Drop any requests left over from a previous psolve call.
delete_trajectory_requests(nt);
if (n_trajec) {
TrajectoryRequests* tr = new TrajectoryRequests;
nt.trajec_requests = tr;
tr->bsize = bsize;
tr->n_pr = n_pr;
tr->n_trajec = n_trajec;
// vsize counts time steps accumulated so far; starts empty.
tr->vsize = 0;
tr->vpr = vpr;
tr->gather = new double*[n_trajec];
tr->varrays = varrays;
tr->scatter = pvars;
// Resolve each (type, index) pair to a pointer into this
// thread's data so values can be gathered each step.
for (int i = 0; i < n_trajec; ++i) {
tr->gather[i] = stdindex2ptr(types[i], indices[i], nt);
}
// types/indices were allocated by the callback and are only
// needed to build the gather pointers.
delete[] types;
delete[] indices;
}
}
}
}

void nrn_init_and_load_data(int argc,
char* argv[],
CheckPoints& checkPoints,
Expand Down Expand Up @@ -167,7 +207,9 @@ void nrn_init_and_load_data(int argc,
set_globals(corenrn_param.datpath.c_str(), (corenrn_param.seed >= 0), corenrn_param.seed);

// set global variables for start time, timestep and temperature
t = checkPoints.restore_time();
if (!corenrn_embedded) {
t = checkPoints.restore_time();
}

if (corenrn_param.dt != -1000.) { // command line arg highest precedence
dt = corenrn_param.dt;
Expand Down Expand Up @@ -267,10 +309,38 @@ void nrn_init_and_load_data(int argc,
report_mem_usage("After mk_spikevec_buffer");
}

// In direct mode there are likely trajectory record requests
// to allow processing in NEURON after simulation by CoreNEURON
if (corenrn_embedded) {
// arg is additional vector size required (how many items will be
// written to the double*) but NEURON can instead
// specify that returns will be on a per time step basis.
get_nrn_trajectory_requests(int((corenrn_param.tstop - t) / corenrn_param.dt) + 2);

// In direct mode, CoreNEURON has exactly the behavior of
// ParallelContext.psolve(tstop). Ie a sequence of such calls
// without an intervening h.finitialize() continues from the end
// of the previous call. I.e., all initial state, including
// the event queue has been set up in NEURON. And, at the end
// all final state, including the event queue will be sent back
// to NEURON. Here there is some first time only
// initialization and queue transfer.
direct_mode_initialize();
(*nrn2core_part2_clean_)();
}

if (corenrn_param.gpu) {
// Copy nrnthreads to device only after all the data are passed from NEURON and the
// nrnthreads on CPU are properly set up
setup_nrnthreads_on_device(nrn_threads, nrn_nthread);
}

if (corenrn_embedded) {
// Run nrn_init of mechanisms only to allocate any extra data needed on the GPU after
// nrnthreads are properly set up on the GPU
allocate_data_in_mechanism_nrn_init();
}

if (nrn_have_gaps) {
nrn_partrans::gap_update_indices();
}
Expand Down Expand Up @@ -332,53 +402,14 @@ std::string cnrn_version() {
return version::to_string();
}

// Ask NEURON (through the nrn2core_get_trajectory_requests_ callback) for the
// trajectory-record requests of each NrnThread and attach a freshly built
// TrajectoryRequests object to the thread.
// bsize = 0 then per step transfer
// bsize > 1 then full trajectory save into arrays.
void get_nrn_trajectory_requests(int bsize) {
// Callback pointer is null when not running embedded under NEURON.
if (nrn2core_get_trajectory_requests_) {
for (int tid = 0; tid < nrn_nthread; ++tid) {
NrnThread& nt = nrn_threads[tid];
// Out-parameters filled in by the NEURON-side callback below.
int n_pr;
int n_trajec;
int* types;
int* indices;
void** vpr;
double** varrays;
double** pvars;

// bsize is passed by reference, the return value will determine if
// per step return or entire trajectory return.
(*nrn2core_get_trajectory_requests_)(
tid, bsize, n_pr, vpr, n_trajec, types, indices, pvars, varrays);
// Drop any requests left over from a previous psolve call.
delete_trajectory_requests(nt);
if (n_trajec) {
TrajectoryRequests* tr = new TrajectoryRequests;
nt.trajec_requests = tr;
tr->bsize = bsize;
tr->n_pr = n_pr;
tr->n_trajec = n_trajec;
// vsize counts time steps accumulated so far; starts empty.
tr->vsize = 0;
tr->vpr = vpr;
tr->gather = new double*[n_trajec];
tr->varrays = varrays;
tr->scatter = pvars;
// Resolve each (type, index) pair to a pointer into this
// thread's data so values can be gathered each step.
for (int i = 0; i < n_trajec; ++i) {
tr->gather[i] = stdindex2ptr(types[i], indices[i], nt);
}
// types/indices were allocated by the callback and are only
// needed to build the gather pointers.
delete[] types;
delete[] indices;
}
}
}
}

static void trajectory_return() {
if (nrn2core_trajectory_return_) {
for (int tid = 0; tid < nrn_nthread; ++tid) {
NrnThread& nt = nrn_threads[tid];
TrajectoryRequests* tr = nt.trajec_requests;
if (tr && tr->varrays) {
(*nrn2core_trajectory_return_)(tid, tr->n_pr, tr->vsize, tr->vpr, nt._t);
(*nrn2core_trajectory_return_)(tid, tr->n_pr, tr->bsize, tr->vsize, tr->vpr, nt._t);
}
}
}
Expand Down Expand Up @@ -499,19 +530,10 @@ extern "C" int run_solve_core(int argc, char** argv) {
abort();
}

// In direct mode there are likely trajectory record requests
// to allow processing in NEURON after simulation by CoreNEURON
if (corenrn_embedded) {
// arg is vector size required but NEURON can instead
// specify that returns will be on a per time step basis.
get_nrn_trajectory_requests(int(tstop / dt) + 2);
(*nrn2core_part2_clean_)();
}

// TODO : if some ranks are empty then restore will go in deadlock
// phase (as some ranks won't have restored anything and hence return
// false in checkpoint_initialize
if (!checkPoints.initialize()) {
if (!corenrn_embedded && !checkPoints.initialize()) {
nrn_finitialize(v != 1000., v);
}

Expand Down
49 changes: 18 additions & 31 deletions coreneuron/gpu/nrn_acc_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -581,6 +581,14 @@ void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) {
if (!nt->compute_gpu)
return;

// check if nsb->_cnt was exceeded on GPU: as the buffer can not be increased
// during gpu execution, we should just abort the execution.
// \todo: this needs to be fixed with different memory allocation strategy
if (nsb->_cnt > nsb->_size) {
printf("ERROR: NetSendBuffer exceeded during GPU execution (rank %d)\n", nrnmpi_myid);
nrn_abort(1);
}

if (nsb->_cnt) {
acc_update_self(nsb->_sendtype, sizeof(int) * nsb->_cnt);
acc_update_self(nsb->_vdata_index, sizeof(int) * nsb->_cnt);
Expand Down Expand Up @@ -685,6 +693,11 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) {
acc_update_self(nt->weights, sizeof(double) * nt->n_weight);
}

if (nt->n_presyn) {
acc_update_self(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn);
acc_update_self(nt->presyns, sizeof(PreSyn) * nt->n_presyn);
}

/* dont update vdata, its pointer array
if(nt->_nvdata) {
acc_update_self(nt->_vdata, sizeof(double)*nt->_nvdata);
Expand Down Expand Up @@ -779,6 +792,11 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) {
acc_update_device(nt->weights, sizeof(double) * nt->n_weight);
}

if (nt->n_presyn) {
acc_update_device(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn);
acc_update_device(nt->presyns, sizeof(PreSyn) * nt->n_presyn);
}

/* don't and don't update vdata, its pointer array
if(nt->_nvdata) {
acc_update_device(nt->_vdata, sizeof(double)*nt->_nvdata);
Expand All @@ -792,37 +810,6 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) {
#endif
}

/**
 * \brief Transfer the membrane voltage array of one thread from GPU to CPU.
 *
 * \todo The entire (padded) voltage vector is transferred every time. A
 * fine-grained implementation should copy only the requested portion.
 */
void update_voltage_from_gpu(NrnThread* nt) {
    // Nothing to do for CPU-only threads or threads without compartments.
    if (!nt->compute_gpu || nt->end <= 0) {
        return;
    }
    double* v_host = nt->_actual_v;
    int n_padded = nrn_soa_padded_size(nt->end, 0);
    // clang-format off

    #pragma acc update host(v_host [0:n_padded])
    // clang-format on
}

/**
 * @brief Transfer the fast_imem d and rhs vectors of one thread from GPU to
 * CPU, when fast_imem recording is enabled for that thread.
 */
void update_fast_imem_from_gpu(NrnThread* nt) {
    // Skip CPU-only threads, empty threads, and threads without fast_imem.
    if (!(nt->compute_gpu && nt->end > 0 && nt->nrn_fast_imem)) {
        return;
    }
    int n = nt->end;
    double* d_vec = nt->nrn_fast_imem->nrn_sav_d;
    double* rhs_vec = nt->nrn_fast_imem->nrn_sav_rhs;
#pragma acc update host(d_vec [0:n], rhs_vec [0:n])
}

/**
* Copy weights from GPU to CPU
*
Expand Down
2 changes: 0 additions & 2 deletions coreneuron/gpu/nrn_acc_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ void update_matrix_to_gpu(NrnThread* _nt);
void update_net_receive_buffer(NrnThread* _nt);
void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml);
void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb);
void update_voltage_from_gpu(NrnThread* nt);
void update_fast_imem_from_gpu(NrnThread* nt);
void update_weights_from_gpu(NrnThread* threads, int nthreads);
void init_gpu();

Expand Down
Loading