Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
953a35d
make thread-specific AD tape pointer much faster (on Mac)
wds15 Jan 14, 2019
9179928
add makefile bits
weberse2 Jan 15, 2019
b4c59c0
Merge branch 'feature/faster-ad-tls' of https://github.com/stan-dev/m…
wds15 Jan 15, 2019
27e9ee9
switch to new thread_local scheme
wds15 Jan 15, 2019
9cc04b0
[Jenkins] auto-formatting by clang-format version 5.0.0-3~16.04.1 (ta…
stan-buildbot Jan 15, 2019
d462e9b
Merge branch 'feature/faster-ad-tls' of https://github.com/stan-dev/m…
wds15 Jan 19, 2019
fef78b5
address PR comments from Sean: introduce init method, remove mutex th…
wds15 Jan 19, 2019
7a3fffe
some more comments on test & makefile alignment
wds15 Jan 19, 2019
710d387
update doc on AutodiffStackSingleton
wds15 Jan 19, 2019
a81d754
add init to all map_rect_reduce flavors
wds15 Jan 19, 2019
a3ee346
add more notes
wds15 Jan 19, 2019
63cc01d
move all map_rect_reduce specialisations into rev in order to always …
wds15 Jan 20, 2019
2757f80
ensure that 4 threads are used during map_rect_concurrent tests
wds15 Jan 20, 2019
8b27125
move map_rect_concurrent definition to rev branch and place init() ca…
wds15 Jan 21, 2019
4a521b9
add missing include
wds15 Jan 21, 2019
223df95
address PR review comments
weberse2 Jan 23, 2019
61e937c
[Jenkins] auto-formatting by clang-format version 5.0.0-3~16.04.1 (ta…
stan-buildbot Jan 23, 2019
198ce4d
Merge branch 'feature/faster-ad-tls' of https://github.com/stan-dev/m…
wds15 Jan 26, 2019
fed0097
make AD tape initialized without an init call for non-threaded case
wds15 Jan 26, 2019
9e15996
align test with new init scheme
wds15 Jan 26, 2019
52fffe9
initialize AD tape within unnamed namespace
wds15 Feb 15, 2019
00c99a0
remove cout which is unexpected for cmdstan and fully disable init calls
wds15 Feb 16, 2019
1178f5f
add unit test which can serve as microbenchmark
wds15 Feb 23, 2019
f8dd3b8
Merge branch 'develop' into feature/faster-ad-tls
syclik Feb 26, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion make/libraries
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,15 @@ clean-mpi:

endif

STACK_INSTANTIATION_CPP := $(MATH)stan/math/rev/core/chainablestack_inst.cpp
STACK_INSTANTIATION := $(STACK_INSTANTIATION_CPP:%.cpp=%.o)

$(STACK_INSTANTIATION) : CXXFLAGS += -fPIC

clean-math:
@echo ' cleaning stan-math targets'
$(RM) $(STACK_INSTANTIATION)

############################################################
# Google Test:
# Build the google test library.
Expand All @@ -130,4 +139,4 @@ $(GTEST)/src/gtest-all.o: INC += $(INC_GTEST)
# Clean all libraries

.PHONY: clean-libraries clean-sundials clean-mpi
clean-libraries: clean-sundials clean-mpi
clean-libraries: clean-sundials clean-mpi clean-math
2 changes: 1 addition & 1 deletion make/tests
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ test/% : CXXFLAGS += $(CXXFLAGS_GTEST)
test/% : CPPFLAGS += $(CPPFLAGS_GTEST)
test/% : INC += $(INC_GTEST)

test/%$(EXE) : test/%.o $(GTEST)/src/gtest_main.cc $(GTEST)/src/gtest-all.o $(MPI_TARGETS)
test/%$(EXE) : test/%.o $(GTEST)/src/gtest_main.cc $(GTEST)/src/gtest-all.o $(MPI_TARGETS) $(STACK_INSTANTIATION)
$(LINK.cpp) $^ $(LDLIBS) $(OUTPUT_OPTION)

##
Expand Down
68 changes: 1 addition & 67 deletions stan/math/prim/mat/functor/map_rect_concurrent.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

#include <vector>
#include <thread>
#include <future>
#include <cstdlib>

namespace stan {
Expand Down Expand Up @@ -77,72 +76,7 @@ map_rect_concurrent(
const std::vector<Eigen::Matrix<T_job_param, Eigen::Dynamic, 1>>&
job_params,
const std::vector<std::vector<double>>& x_r,
const std::vector<std::vector<int>>& x_i, std::ostream* msgs = nullptr) {
typedef map_rect_reduce<F, T_shared_param, T_job_param> ReduceF;
typedef map_rect_combine<F, T_shared_param, T_job_param> CombineF;

const int num_jobs = job_params.size();
const vector_d shared_params_dbl = value_of(shared_params);
std::vector<std::future<std::vector<matrix_d>>> futures;

auto execute_chunk = [&](int start, int size) -> std::vector<matrix_d> {
const int end = start + size;
std::vector<matrix_d> chunk_f_out;
chunk_f_out.reserve(size);
for (int i = start; i != end; i++)
chunk_f_out.push_back(ReduceF()(
shared_params_dbl, value_of(job_params[i]), x_r[i], x_i[i], msgs));
return chunk_f_out;
};

int num_threads = get_num_threads(num_jobs);
int num_jobs_per_thread = num_jobs / num_threads;
futures.emplace_back(
std::async(std::launch::deferred, execute_chunk, 0, num_jobs_per_thread));

#ifdef STAN_THREADS
if (num_threads > 1) {
const int num_big_threads = num_jobs % num_threads;
const int first_big_thread = num_threads - num_big_threads;
for (int i = 1, job_start = num_jobs_per_thread, job_size = 0;
i < num_threads; ++i, job_start += job_size) {
job_size = i >= first_big_thread ? num_jobs_per_thread + 1
: num_jobs_per_thread;
futures.emplace_back(
std::async(std::launch::async, execute_chunk, job_start, job_size));
}
}
#endif

// collect results
std::vector<int> world_f_out;
world_f_out.reserve(num_jobs);
matrix_d world_output(0, 0);

int offset = 0;
for (std::size_t i = 0; i < futures.size(); ++i) {
const std::vector<matrix_d>& chunk_result = futures[i].get();
if (i == 0)
world_output.resize(chunk_result[0].rows(),
num_jobs * chunk_result[0].cols());

for (const auto& job_result : chunk_result) {
const int num_job_outputs = job_result.cols();
world_f_out.push_back(num_job_outputs);

if (world_output.cols() < offset + num_job_outputs)
world_output.conservativeResize(Eigen::NoChange,
2 * (offset + num_job_outputs));

world_output.block(0, offset, world_output.rows(), num_job_outputs)
= job_result;

offset += num_job_outputs;
}
}
CombineF combine(shared_params, job_params);
return combine(world_output, world_f_out);
}
const std::vector<std::vector<int>>& x_i, std::ostream* msgs = nullptr);

} // namespace internal
} // namespace math
Expand Down
7 changes: 7 additions & 0 deletions stan/math/prim/mat/functor/map_rect_reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@ namespace internal {
*
* No higher order output format is defined yet.
*
 * In the prim folder we only keep the basic definition while all
 * specialisations are in rev. This is to ensure that the AD tape of
 * all threads can be initialized. This includes the variant which has
* no var arguments involved since nested AD operations can be
* performed (as for example is the case when calling the CVODES
* integrator which needs the Jacobian).
*
* @tparam F user functor
* @tparam T_shared_param type of shared parameters
* @tparam T_job_param type of job specific parameters
Expand Down
68 changes: 49 additions & 19 deletions stan/math/rev/core/autodiffstackstorage.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,41 @@ namespace stan {
namespace math {

/**
* Provides a thread_local singleton if needed. Read warnings below!
* For performance reasons the singleton is a global static for the
* case of no threading which is returned by a function. This design
* should allow the compiler to apply necessary inlining to get
* Provides a thread_local storage (TLS) singleton if needed. Read
* warnings below! For performance reasons the singleton is a global
* static pointer which is initialized to a constant expression (the
 * nullptr). Doing so allows for faster access to the TLS pointer
 * instance. If one were to directly initialize the pointer with a
 * call to new, the compiler would insert additional code for each
 * reference in the code to the TLS. The additional code checks the
 * initialization status every time one accesses the TLS; see [4]
* for details and an example which demonstrates this. However, this
* design requires that the pointer must be initialized with the
* init() function before any var's are
* instantiated. Otherwise a segfault will occur. The init() function
* must be called for any thread which wants to use reverse mode
* autodiff facilites.
*
* Furthermore, the declaration as a global (possibly thread local)
* pointer allows the compiler to apply necessary inlining to get
* maximal performance. However, this design suffers from "the static
* init order fiasco"[0]. Anywhere this is used, we must be
* absolutely positive that it doesn't matter when the singleton will
* get initialized relative to other static variables. In exchange,
* we get a more performant singleton pattern for the non-threading
* case. In the threading case we use the defacto standard C++11
* we get a more performant singleton pattern.
*
 * Formerly, in the threading case stan-math used the de facto standard C++11
* singleton pattern relying on a function wrapping a static local
* variable. This standard pattern is expected to be well supported
* by the major compilers (as its standard), but it does incur some
* performance penalty. There has been some discussion on this; see
* [1] and [2] and the discussions those PRs link to as well.
 * [1] and [2] and the discussions those PRs link to as well. The
 * current design has a much reduced (almost no) performance penalty
 * since the access to the TLS is not wrapped in extra function
 * calls. Moreover, the thread_local declaration below can be changed
 * to the __thread keyword, which is a compiler-specific extension of
 * g++, clang++ and Intel that has been available for much longer than
 * C++11, so these compilers should support this design just as robustly.
*
* These are thread_local only if the user asks for it with
* -DSTAN_THREADS. This is primarily because Apple clang compilers
Expand All @@ -37,6 +57,7 @@ namespace math {
* [2] https://github.com/stan-dev/math/pull/826
* [3]
* http://discourse.mc-stan.org/t/potentially-dropping-support-for-older-versions-of-apples-version-of-clang/3780/
* [4] https://discourse.mc-stan.org/t/thread-performance-penalty/7306/8
*/
template <typename ChainableT, typename ChainableAllocT>
struct AutodiffStackSingleton {
Expand All @@ -61,25 +82,34 @@ struct AutodiffStackSingleton {
explicit AutodiffStackSingleton(AutodiffStackSingleton_t const &) = delete;
AutodiffStackSingleton &operator=(const AutodiffStackSingleton_t &) = delete;

static inline AutodiffStackStorage &instance() {
#ifdef STAN_THREADS
thread_local static AutodiffStackStorage instance_;
#endif
return instance_;
constexpr static inline AutodiffStackStorage &instance() {
return *instance_;
}

#ifndef STAN_THREADS
static AutodiffStackStorage *init() {
if (instance_ == nullptr)
instance_ = new AutodiffStackStorage();
return instance_;
}

private:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this pointer still be private?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmpf... in principle it can be private right now, yes. In my follow-up on this I do need to access it. We seem to be relatively liberal with access to class members which is why I took out the private. I think it also makes testing easier.

I would opt to leave it as is for developer convenience. In case you think it is important to make it private we can probably do that. How about a comment in the code saying that one should not mess with this?

static AutodiffStackStorage instance_;
static
#ifdef STAN_THREADS
thread_local
#endif
AutodiffStackStorage *instance_;
};

#ifndef STAN_THREADS
template <typename ChainableT, typename ChainableAllocT>
typename AutodiffStackSingleton<ChainableT,
ChainableAllocT>::AutodiffStackStorage
AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_;
#ifdef STAN_THREADS
thread_local
#endif
typename AutodiffStackSingleton<ChainableT,
ChainableAllocT>::AutodiffStackStorage
*AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_
#ifdef STAN_THREADS
= nullptr;
#else
= AutodiffStackSingleton<ChainableT, ChainableAllocT>::init();
#endif

} // namespace math
Expand Down
41 changes: 41 additions & 0 deletions stan/math/rev/core/chainablestack.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#include <stan/math/rev/core/autodiffstackstorage.hpp>

#include <iostream>

namespace stan {
namespace math {

Expand All @@ -11,6 +13,45 @@ class chainable_alloc;

typedef AutodiffStackSingleton<vari, chainable_alloc> ChainableStack;

/**
* Instantiates an instance of the ChainableStack if not already
 * initialized. This function must be called before any autodiff
 * variable gets instantiated within any thread which performs
 * reverse mode autodiff operations.
*/
/*
static inline ChainableStack::AutodiffStackStorage* init() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we give this a slightly longer name indicating when you need to call it? Maybe something like init_thread_ad()?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm... my thoughts:

  • It shoould not have "thread" in it's name. After all you must call this in a non-threaded application as well (imagine someone does not rely on our makefiles then the call it init is not automatically included in the code).
  • We never refer to "ad" in our function names at all. So this is not a convention we use.

If you want it more verbose, then how about init_stack() or init_tape()? I prefer stack as this is the naming used in stan-math.

#ifdef STAN_THREADS
return ChainableStack::init();
if (ChainableStack::instance_ == nullptr)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this if statement actually get called if the line before it is a return?

ChainableStack::instance_ = new ChainableStack::AutodiffStackStorage();
#endif
return ChainableStack::instance_;
}
*/

namespace {

// RAII-style helper whose construction guarantees the AD tape is
// initialized via ChainableStack::init() before any reverse-mode
// autodiff variable is created in this translation unit.
// NOTE(review): this anonymous namespace lives in a header
// (chainablestack.hpp), so every translation unit that includes it
// gets its own copy of these objects — presumably intentional here so
// each TU triggers tape initialization, but worth confirming.
struct ad_tape_initializer {
  // Constructor (re-)runs ChainableStack::init(), which allocates the
  // AD tape storage for the calling thread if not already allocated,
  // and caches the resulting pointer in the thread-local tape_.
  ad_tape_initializer() {
    // std::cout << "Initializer created" << std::endl;
    tape_ = stan::math::ChainableStack::init();
  }
  // Destructor intentionally does nothing (debug output left
  // commented out); the tape storage itself is not released here.
  ~ad_tape_initializer() {
    // std::lock_guard<std::mutex> cout_lock(cout_mutex_init);
    // std::cout << "Initializer destructed" << std::endl;
  }

  // Pointer type to the per-thread AD tape storage.
  typedef ChainableStack::AutodiffStackStorage* tape_ptr_t;
  // One tape pointer per thread; defined below with a dynamic
  // initializer so the main thread's tape exists before main() runs.
  thread_local static tape_ptr_t tape_;
};

// Dynamic initialization of the thread-local member: calling
// ChainableStack::init() here ensures each thread that touches this
// TU has its tape allocated on first use of the thread_local.
thread_local ad_tape_initializer::tape_ptr_t ad_tape_initializer::tape_
    = ChainableStack::init();
// Namespace-scope instance forces the constructor (and thus tape
// initialization for the main thread) to run during static
// initialization of this translation unit.
ad_tape_initializer global_initializer;

}  // namespace

} // namespace math
} // namespace stan
#endif
18 changes: 18 additions & 0 deletions stan/math/rev/core/chainablestack_inst.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#ifndef STAN_MATH_REV_CORE_CHAINABLESTACK_INST_CPP
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we get rid of this file if we have a stan::math::init() function or something like that?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, we still need the file in place so that the respective code gets executed to initialize the AD stack of the main thread. The alternative is to require that any client code must call init before doing any AD. I think we want to keep the behavior such that there is no change to client code other than using our updated makefiles.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, fair enough.

#define STAN_MATH_REV_CORE_CHAINABLESTACK_INST_CPP

#include <stan/math/rev/core/chainablestack.hpp>

namespace stan {
namespace math {
namespace internal {

/*
const ChainableStack::AutodiffStackStorage* __chainable_stack
= ChainableStack::init();
*/

} // namespace internal
} // namespace math
} // namespace stan
#endif
1 change: 1 addition & 0 deletions stan/math/rev/mat.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
#include <stan/math/rev/mat/functor/integrate_ode_adams.hpp>
#include <stan/math/rev/mat/functor/integrate_ode_bdf.hpp>
#include <stan/math/rev/mat/functor/integrate_dae.hpp>
#include <stan/math/rev/mat/functor/map_rect_concurrent.hpp>
#include <stan/math/rev/mat/functor/map_rect_reduce.hpp>

#endif
Loading