Skip to content
This repository was archived by the owner on Mar 20, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
bd18040
Must be able to turn off nrn_use_fast_imem as well as on.
nrnhines Mar 18, 2021
864ac5a
Direct mode has same behavior for ParallelContext.psolve
nrnhines Mar 19, 2021
506cb62
Event Queue NEURON->CoreNEURON NetCon, Presyn, most SelfEvent
nrnhines Apr 3, 2021
ca308bc
Event queue transfer working at level of test_datareturn.py.
nrnhines Apr 4, 2021
1c625a6
queue transfer: not yet handling movable if true.
nrnhines Apr 5, 2021
5d1a8c8
nrn2core WatchCondition transfer. Needs much testing.
nrnhines Apr 10, 2021
5df1dfd
Merge branch 'master' into psolve-direct
nrnhines Apr 17, 2021
08a58b8
nrn2core VecPlayContinuous transferred and activated.
nrnhines Apr 18, 2021
566eddb
Update clang-format
pramodk Apr 18, 2021
1813283
Merge branch 'master' into psolve-direct
nrnhines Apr 25, 2021
be2a5a7
Framework for copying event queue back to NEURON.
nrnhines Apr 25, 2021
b9e3eb3
Merge branch 'master' into psolve-direct
nrnhines May 8, 2021
d9829de
Send back SelfEvents on event queue to NEURON.
nrnhines May 10, 2021
8c90009
handle weight_index=-1, remove some debug printf
nrnhines May 12, 2021
c9e4771
Fix SelfEvent transfer errors for movable and Point_process.
nrnhines May 14, 2021
4f8c6d3
PreSyn.flag_ must be part of direct transfer.
nrnhines May 16, 2021
1361e62
Merge branch 'master' into psolve-direct
nrnhines May 22, 2021
dd8b9c6
PreSyn.flag_ transfer finds node index with inverse permutation.
nrnhines May 23, 2021
a6367b0
psolve transfer corenrn -> nrn calls bbcore_read in nrn.
nrnhines May 24, 2021
cb6637a
core2nrn for WATCH statements.
nrnhines May 29, 2021
24c561a
make clang-format
nrnhines May 29, 2021
f88b895
Merge branch 'master' into psolve-direct
nrnhines May 29, 2021
c89f5e8
core2nrn vecplay indices
nrnhines May 30, 2021
388ac9f
Update external/mod2c
nrnhines Jun 1, 2021
6002996
PatternStim for direct mode. No special treatment except share Info
nrnhines Jun 4, 2021
80d2985
Moved changes sim/finitialize.cpp to io/nrn2core_data_init.cpp
nrnhines Jun 4, 2021
ce32391
Apply suggestions from code review
nrnhines Jun 7, 2021
d353057
Merge branch 'master' into psolve-direct
nrnhines Jun 7, 2021
0f3b5ec
Merge branch 'master' into psolve-direct
alexsavulescu Jul 2, 2021
9e2e6b9
Merge branch 'master' into psolve-direct
iomaganaris Jul 19, 2021
783c33b
Fixed NEURON_BRANCH setting issue in the CI
iomaganaris Jul 19, 2021
23696aa
Small fix for jenkins
iomaganaris Jul 19, 2021
3b9d942
Update mod2c submodule after PR merge BlueBrain/mod2c/pull/63
pramodk Jul 23, 2021
9477c71
Fixed coreneuron_modtest::spikes_py
iomaganaris Jul 22, 2021
8742e82
Made clang-format happy and updated nmodl submodule
iomaganaris Jul 23, 2021
87f607c
Merge branch 'master' into psolve-direct
pramodk Jul 23, 2021
407333c
Properly initialize mechanisms after copying data to the GPU
iomaganaris Jul 28, 2021
e20052e
Removal of debug print
iomaganaris Jul 30, 2021
074c2e0
Corenrn2nrn WATCH transfer must include above_threshold flag.
nrnhines Jul 31, 2021
9141d70
update to nmodl branch hines/netmove-extra-t
nrnhines Aug 7, 2021
560367d
NMODL submodule -> master.
olupton Aug 9, 2021
9c68299
Merge branch 'master' into psolve-direct
olupton Aug 17, 2021
185a775
Improve GPU TrajectoryRequests support.
olupton Aug 18, 2021
5780019
Fixes for NVHPC 21.2, avoid duplicate copies.
olupton Aug 18, 2021
fd2a000
NVHPC 21.2 compatibility.
olupton Aug 18, 2021
d6b749e
Check the overflow of NetSendBuffer_t from GPU execution
pramodk Aug 18, 2021
0788939
update nmodl submodule to a PR branch temporarily
pramodk Aug 19, 2021
65310d2
updated nmodl to latest master
pramodk Aug 20, 2021
20b5073
Add extra #pragma acc wait.
olupton Aug 20, 2021
6378649
Incorporate code review comments from Alex
pramodk Aug 20, 2021
6318802
make clang-format happy
pramodk Aug 20, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 73 additions & 51 deletions coreneuron/apps/main1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,46 @@ char* prepare_args(int& argc, char**& argv, int use_mpi, const char* arg) {
namespace coreneuron {
void call_prcellstate_for_prcellgid(int prcellgid, int compute_gpu, int is_init);

// Ask NEURON (through the nrn2core_get_trajectory_requests_ callback) for the
// trajectory-record requests of each NrnThread and attach a freshly built
// TrajectoryRequests object to the thread.
// bsize = 0 then per step transfer
// bsize > 1 then full trajectory save into arrays.
void get_nrn_trajectory_requests(int bsize) {
// Callback pointer is null when not running embedded under NEURON.
if (nrn2core_get_trajectory_requests_) {
for (int tid = 0; tid < nrn_nthread; ++tid) {
NrnThread& nt = nrn_threads[tid];
// Out-parameters filled in by the NEURON-side callback below.
int n_pr;
int n_trajec;
int* types;
int* indices;
void** vpr;
double** varrays;
double** pvars;

// bsize is passed by reference, the return value will determine if
// per step return or entire trajectory return.
(*nrn2core_get_trajectory_requests_)(
tid, bsize, n_pr, vpr, n_trajec, types, indices, pvars, varrays);
// Drop any requests left over from a previous psolve call.
delete_trajectory_requests(nt);
if (n_trajec) {
TrajectoryRequests* tr = new TrajectoryRequests;
nt.trajec_requests = tr;
tr->bsize = bsize;
tr->n_pr = n_pr;
tr->n_trajec = n_trajec;
// vsize counts time steps accumulated so far; starts empty.
tr->vsize = 0;
tr->vpr = vpr;
tr->gather = new double*[n_trajec];
tr->varrays = varrays;
tr->scatter = pvars;
// Resolve each (type, index) pair to a pointer into this
// thread's data so values can be gathered each step.
for (int i = 0; i < n_trajec; ++i) {
tr->gather[i] = stdindex2ptr(types[i], indices[i], nt);
}
// types/indices were allocated by the callback and are only
// needed to build the gather pointers.
delete[] types;
delete[] indices;
}
}
}
}

void nrn_init_and_load_data(int argc,
char* argv[],
CheckPoints& checkPoints,
Expand Down Expand Up @@ -167,7 +207,9 @@ void nrn_init_and_load_data(int argc,
set_globals(corenrn_param.datpath.c_str(), (corenrn_param.seed >= 0), corenrn_param.seed);

// set global variables for start time, timestep and temperature
t = checkPoints.restore_time();
if (!corenrn_embedded) {
t = checkPoints.restore_time();
}

if (corenrn_param.dt != -1000.) { // command line arg highest precedence
dt = corenrn_param.dt;
Expand Down Expand Up @@ -267,10 +309,38 @@ void nrn_init_and_load_data(int argc,
report_mem_usage("After mk_spikevec_buffer");
}

// In direct mode there are likely trajectory record requests
// to allow processing in NEURON after simulation by CoreNEURON
if (corenrn_embedded) {
// arg is additional vector size required (how many items will be
// written to the double*) but NEURON can instead
// specify that returns will be on a per time step basis.
get_nrn_trajectory_requests(int((corenrn_param.tstop - t) / corenrn_param.dt) + 2);

// In direct mode, CoreNEURON has exactly the behavior of
// ParallelContext.psolve(tstop). Ie a sequence of such calls
// without an intervening h.finitialize() continues from the end
// of the previous call. I.e., all initial state, including
// the event queue has been set up in NEURON. And, at the end
// all final state, including the event queue will be sent back
// to NEURON. Here there is some first time only
// initialization and queue transfer.
direct_mode_initialize();
(*nrn2core_part2_clean_)();
}

if (corenrn_param.gpu) {
// Copy nrnthreads to device only after all the data are passed from NEURON and the
// nrnthreads on CPU are properly set up
setup_nrnthreads_on_device(nrn_threads, nrn_nthread);
}

if (corenrn_embedded) {
// Run nrn_init of mechanisms only to allocate any extra data needed on the GPU after
// nrnthreads are properly set up on the GPU
allocate_data_in_mechanism_nrn_init();
}

if (nrn_have_gaps) {
nrn_partrans::gap_update_indices();
}
Expand Down Expand Up @@ -332,53 +402,14 @@ std::string cnrn_version() {
return version::to_string();
}

// Ask NEURON (through the nrn2core_get_trajectory_requests_ callback) for the
// trajectory-record requests of each NrnThread and attach a freshly built
// TrajectoryRequests object to the thread.
// bsize = 0 then per step transfer
// bsize > 1 then full trajectory save into arrays.
void get_nrn_trajectory_requests(int bsize) {
// Callback pointer is null when not running embedded under NEURON.
if (nrn2core_get_trajectory_requests_) {
for (int tid = 0; tid < nrn_nthread; ++tid) {
NrnThread& nt = nrn_threads[tid];
// Out-parameters filled in by the NEURON-side callback below.
int n_pr;
int n_trajec;
int* types;
int* indices;
void** vpr;
double** varrays;
double** pvars;

// bsize is passed by reference, the return value will determine if
// per step return or entire trajectory return.
(*nrn2core_get_trajectory_requests_)(
tid, bsize, n_pr, vpr, n_trajec, types, indices, pvars, varrays);
// Drop any requests left over from a previous psolve call.
delete_trajectory_requests(nt);
if (n_trajec) {
TrajectoryRequests* tr = new TrajectoryRequests;
nt.trajec_requests = tr;
tr->bsize = bsize;
tr->n_pr = n_pr;
tr->n_trajec = n_trajec;
// vsize counts time steps accumulated so far; starts empty.
tr->vsize = 0;
tr->vpr = vpr;
tr->gather = new double*[n_trajec];
tr->varrays = varrays;
tr->scatter = pvars;
// Resolve each (type, index) pair to a pointer into this
// thread's data so values can be gathered each step.
for (int i = 0; i < n_trajec; ++i) {
tr->gather[i] = stdindex2ptr(types[i], indices[i], nt);
}
// types/indices were allocated by the callback and are only
// needed to build the gather pointers.
delete[] types;
delete[] indices;
}
}
}
}

static void trajectory_return() {
if (nrn2core_trajectory_return_) {
for (int tid = 0; tid < nrn_nthread; ++tid) {
NrnThread& nt = nrn_threads[tid];
TrajectoryRequests* tr = nt.trajec_requests;
if (tr && tr->varrays) {
(*nrn2core_trajectory_return_)(tid, tr->n_pr, tr->vsize, tr->vpr, nt._t);
(*nrn2core_trajectory_return_)(tid, tr->n_pr, tr->bsize, tr->vsize, tr->vpr, nt._t);
}
}
}
Expand Down Expand Up @@ -499,19 +530,10 @@ extern "C" int run_solve_core(int argc, char** argv) {
abort();
}

// In direct mode there are likely trajectory record requests
// to allow processing in NEURON after simulation by CoreNEURON
if (corenrn_embedded) {
// arg is vector size required but NEURON can instead
// specify that returns will be on a per time step basis.
get_nrn_trajectory_requests(int(tstop / dt) + 2);
(*nrn2core_part2_clean_)();
}

// TODO : if some ranks are empty then restore will go in deadlock
// phase (as some ranks won't have restored anything and hence return
// false in checkpoint_initialize
if (!checkPoints.initialize()) {
if (!corenrn_embedded && !checkPoints.initialize()) {
nrn_finitialize(v != 1000., v);
}

Expand Down
49 changes: 18 additions & 31 deletions coreneuron/gpu/nrn_acc_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -581,6 +581,14 @@ void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) {
if (!nt->compute_gpu)
return;

// check if nsb->_cnt was exceeded on GPU: as the buffer can not be increased
// during gpu execution, we should just abort the execution.
// \todo: this needs to be fixed with different memory allocation strategy
if (nsb->_cnt > nsb->_size) {
printf("ERROR: NetSendBuffer exceeded during GPU execution (rank %d)\n", nrnmpi_myid);
nrn_abort(1);
}

if (nsb->_cnt) {
acc_update_self(nsb->_sendtype, sizeof(int) * nsb->_cnt);
acc_update_self(nsb->_vdata_index, sizeof(int) * nsb->_cnt);
Expand Down Expand Up @@ -685,6 +693,11 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) {
acc_update_self(nt->weights, sizeof(double) * nt->n_weight);
}

if (nt->n_presyn) {
acc_update_self(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn);
acc_update_self(nt->presyns, sizeof(PreSyn) * nt->n_presyn);
}

/* dont update vdata, its pointer array
if(nt->_nvdata) {
acc_update_self(nt->_vdata, sizeof(double)*nt->_nvdata);
Expand Down Expand Up @@ -779,6 +792,11 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) {
acc_update_device(nt->weights, sizeof(double) * nt->n_weight);
}

if (nt->n_presyn) {
acc_update_device(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn);
acc_update_device(nt->presyns, sizeof(PreSyn) * nt->n_presyn);
}

/* don't and don't update vdata, its pointer array
if(nt->_nvdata) {
acc_update_device(nt->_vdata, sizeof(double)*nt->_nvdata);
Expand All @@ -792,37 +810,6 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) {
#endif
}

/**
 * \brief Transfer the membrane voltage array of one thread from GPU to CPU.
 *
 * \todo The entire (padded) voltage vector is transferred every time. A
 * fine-grained implementation should copy only the requested portion.
 */
void update_voltage_from_gpu(NrnThread* nt) {
    // Nothing to do for CPU-only threads or threads without compartments.
    if (!nt->compute_gpu || nt->end <= 0) {
        return;
    }
    double* v_host = nt->_actual_v;
    int n_padded = nrn_soa_padded_size(nt->end, 0);
    // clang-format off

    #pragma acc update host(v_host [0:n_padded])
    // clang-format on
}

/**
 * @brief Transfer the fast_imem d and rhs vectors of one thread from GPU to
 * CPU, when fast_imem recording is enabled for that thread.
 */
void update_fast_imem_from_gpu(NrnThread* nt) {
    // Skip CPU-only threads, empty threads, and threads without fast_imem.
    if (!(nt->compute_gpu && nt->end > 0 && nt->nrn_fast_imem)) {
        return;
    }
    int n = nt->end;
    double* d_vec = nt->nrn_fast_imem->nrn_sav_d;
    double* rhs_vec = nt->nrn_fast_imem->nrn_sav_rhs;
#pragma acc update host(d_vec [0:n], rhs_vec [0:n])
}

/**
* Copy weights from GPU to CPU
*
Expand Down
2 changes: 0 additions & 2 deletions coreneuron/gpu/nrn_acc_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ void update_matrix_to_gpu(NrnThread* _nt);
void update_net_receive_buffer(NrnThread* _nt);
void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml);
void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb);
void update_voltage_from_gpu(NrnThread* nt);
void update_fast_imem_from_gpu(NrnThread* nt);
void update_weights_from_gpu(NrnThread* threads, int nthreads);
void init_gpu();

Expand Down
Loading