Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
e72d17b
b f/i-125 first cut at cholesky derivative structure
rtrangucci Jul 22, 2015
5bb01ab
b f/i-125 analytic cholesky derivative works
rtrangucci Jul 23, 2015
019d64b
b f/i-125 wall-clock about as fast as Eigen, with 1/10th
rtrangucci Jul 25, 2015
eb2cab4
b f/i-125 sparse Eigen version, much slower than scalars
rtrangucci Jul 25, 2015
22146e9
b f/i-125 fastest implementation currently
rtrangucci Jul 27, 2015
e59c4cc
b f/i-125 cholesky-derivative nearly complete for _test.hpp
rtrangucci Jul 28, 2015
3bcbdb6
b f/i-125 indexing fixed
rtrangucci Jul 28, 2015
7851547
b f/i-125 cleaned up file
rtrangucci Jul 28, 2015
0d9e659
b f/i-125 separate gradient rev tests from higher-order tests
rtrangucci Jul 30, 2015
fc5d674
b f/i-125 documentation and speedup, tests
rtrangucci Jul 30, 2015
25b093c
b f/i-125 more doc
rtrangucci Jul 30, 2015
38f5a82
Merge branch 'develop' into feature/issue-125-cholesky-derivative
rtrangucci Jul 30, 2015
f173b77
b f/i-125 fixed cpplint errors
rtrangucci Jul 31, 2015
ef9c518
b f/i-125 cholesky_decompose improvements
rtrangucci Aug 2, 2015
c6e4822
Merge branch 'develop' into feature/issue-125-cholesky-derivative
rtrangucci Aug 2, 2015
41c35ee
Merge branch 'develop' into feature/issue-125-cholesky-derivative
syclik Aug 3, 2015
be37bb7
Merge branch 'develop' into feature/issue-125-cholesky-derivative
syclik Aug 3, 2015
b1749c4
b f/i-125 following changes:
rtrangucci Aug 16, 2015
b152b37
b f/i changes to chain():
rtrangucci Aug 17, 2015
8554a04
b f/i-125 preserves memory locality
rtrangucci Aug 17, 2015
31e5416
Merge branch 'feature/issue-125-cholesky-derivative' of https://githu…
rtrangucci Aug 17, 2015
61678f0
Merge branch 'develop' into feature/issue-125-cholesky-derivative
rtrangucci Aug 17, 2015
3f4f966
really end inner loop in the right place
bgoodri Aug 22, 2015
14815b3
back to non-memory locality preserving input/output
rtrangucci Aug 22, 2015
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 156 additions & 0 deletions stan/math/rev/mat/fun/cholesky_decompose.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
#ifndef STAN_MATH_REV_MAT_FUN_CHOLESKY_DECOMPOSE_HPP
#define STAN_MATH_REV_MAT_FUN_CHOLESKY_DECOMPOSE_HPP

#include <stan/math/prim/mat/fun/Eigen.hpp>
#include <stan/math/prim/mat/fun/typedefs.hpp>
#include <stan/math/prim/mat/fun/cholesky_decompose.hpp>
#include <stan/math/rev/scal/fun/value_of_rec.hpp>
#include <stan/math/rev/scal/fun/value_of.hpp>
#include <stan/math/rev/core.hpp>
#include <stan/math/prim/mat/fun/value_of_rec.hpp>
#include <stan/math/prim/mat/err/check_pos_definite.hpp>
#include <stan/math/prim/mat/err/check_square.hpp>
#include <stan/math/prim/mat/err/check_symmetric.hpp>

namespace stan {
namespace math {

class cholesky_decompose_v_vari : public vari {
public:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The variables should be private if you're being picky.

int M_; // A.rows() = A.cols()
vari** variRefA_;
vari** variRefL_;

/* ctor for cholesky function
*
* Stores varis for A
* Instantiates and stores varis for L
* Instantiates and stores dummy vari for
* upper triangular part of var result returned
* in cholesky_decompose function call
*
* variRefL aren't on the chainable
* autodiff stack, only used for storage
* and computation. Note that varis for
* L are constructed externally in
* cholesky_decompose.
*
* @param matrix A
* @param matrix L, cholesky factor of A
* */
cholesky_decompose_v_vari(const Eigen::Matrix<var, -1, -1>& A,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should choose either -1 or Eigen::Dynamic and not mix them in one program.

const Eigen::Matrix<double, -1, -1>& L_A)
: vari(0.0),
M_(A.rows()),
variRefA_(ChainableStack::memalloc_.alloc_array<vari*>
(A.rows() * (A.rows() + 1) / 2)),
variRefL_(ChainableStack::memalloc_.alloc_array<vari*>
(A.rows() * (A.rows() + 1) / 2)) {
size_t pos = 0;
for (size_type i = 0; i < M_; ++i) {
for (size_type j = 0; j <= i; ++j) {
variRefA_[pos] = A.coeffRef(i, j).vi_;
variRefL_[pos] = new vari(L_A.coeffRef(i, j), false);
++pos;
}
}
}

/* Reverse mode differentiation
* algorithm refernce:
*
* Mike Giles. An extended collection of matrix
* derivative results for forward and reverse mode AD.
* Jan. 2008.
*
* Note algorithm as laid out in Giles is
* row-major, so Eigen::Matrices are explicitly storage
* order RowMajor, whereas Eigen defaults to
* ColumnMajor. Also note algorithm
* starts by calculating the adjoint for
* A(M_ - 1, M_ - 1), hence pos on line 94 is decremented
* to start at pos = M_ * (M_ + 1) / 2.
* */
virtual void chain() {
using Eigen::Matrix;
using Eigen::RowMajor;
Matrix<double, -1, -1, RowMajor> adjL(M_, M_);
Matrix<double, -1, -1, RowMajor> LA(M_, M_);
Matrix<double, -1, -1, RowMajor> adjA(M_, M_);
size_t pos = 0;
for (size_type i = 0; i < M_; ++i) {
for (size_type j = 0; j <= i; ++j) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer brackets around anythng bigger than a one-liner.

adjL.coeffRef(i, j) = variRefL_[pos]->adj_;
LA.coeffRef(i, j) = variRefL_[pos]->val_;
++pos;
}
}

--pos;
for (int i = M_ - 1; i >= 0; --i) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A more idiomatic way to write this is

for (int i = M_; i-- > 0; ) { ... }

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The more idiomatic way is more confusing to me (personally)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bob-carpenter I'm going to leave this as is, the idiomatic way is confusing to me as well

for (int j = i; j >= 0; --j) {
if (i == j) {
adjA.coeffRef(i, j) = 0.5 * adjL.coeff(i, j)
/ LA.coeff(i, j);
} else {
adjA.coeffRef(i, j) = adjL.coeff(i, j)
/ LA.coeff(j, j);
adjL.coeffRef(j, j) -= adjL.coeff(i, j)
* LA.coeff(i, j) / LA.coeff(j, j);
}
for (int k = j - 1; k >=0; --k) {
adjL.coeffRef(i, k) -= adjA.coeff(i, j)
* LA.coeff(j, k);
adjL.coeffRef(j, k) -= adjA.coeff(i, j)
* LA.coeff(i, k);
}
variRefA_[pos--]->adj_ += adjA.coeffRef(i, j);
}
}
}
};

/* Reverse mode specialization of
* cholesky decomposition
*
* Internally calls llt rather than using
* stan::math::cholesky_decompose in order
* to use selfadjointView<Lower> optimization.
*
* Note chainable stack varis are created
* below in Matrix<var, -1, -1>
*
* @param Matrix A
* @return L cholesky factor of A
*/
Eigen::Matrix<var, -1, -1>
cholesky_decompose(const Eigen::Matrix<var, -1, -1> &A) {
stan::math::check_square("cholesky_decompose", "A", A);
stan::math::check_symmetric("cholesky_decompose", "A", A);

Eigen::Matrix<double, -1, -1> L_A(value_of_rec(A));
Eigen::LLT<Eigen::MatrixXd> L_factor
= L_A.selfadjointView<Eigen::Lower>().llt();
check_pos_definite("cholesky_decompose", "m", L_factor);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't you avoid this check_pos_definite and just look at hte result of the factorization? This test is super expensive.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This actually avoids the cost of check_pos_definite as there is a specialization which works directly on the Eigen::LLT object which is what gets passed in here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for all the clarifications. I didn't catch
the L_factor argument! That's a neat solution.

I've always seen those countdown loops done the way I
wrote it, but it's not big deal either way. It's not
an efficiency issue. Counting up might be, though. I
found big differences profiling in various cases.

  • Bob

On Aug 4, 2015, at 3:23 PM, Marcus Brubaker notifications@github.com wrote:

In stan/math/rev/mat/fun/cholesky_decompose.hpp:

  • *
    
  • \* Note chainable stack varis are created
    
  • \* below in Matrix<var, -1, -1>
    
  • *
    
  • \* @param Matrix A
    
  • \* @return L cholesky factor of A
    
  • */
    
  • Eigen::Matrix<var, Eigen::Dynamic, Eigen::Dynamic>
  • cholesky_decompose(const Eigen::Matrix<var, -1, -1> &A) {
  •  stan::math::check_square("cholesky_decompose", "A", A);
    
  •  stan::math::check_symmetric("cholesky_decompose", "A", A);
    
  •  Eigen::Matrix<double, -1, -1> L_A(value_of_rec(A));
    
  •  Eigen::LLTEigen::MatrixXd L_factor
    
  •    = L_A.selfadjointViewEigen::Lower().llt();
    
  •  check_pos_definite("cholesky_decompose", "m", L_factor);
    

This actually avoids the cost of check_pos_definite as there is a specialization which works directly on the Eigen::LLT object which is what gets passed in here.


Reply to this email directly or view it on GitHub.

L_A = L_factor.matrixL();

// NOTE: this is not a memory leak, this vari is used in the
// expression graph to evaluate the adjoint, but is not needed
// for the returned matrix. Memory will be cleaned up with the
// arena allocator.
cholesky_decompose_v_vari *baseVari
= new cholesky_decompose_v_vari(A, L_A);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why create a whole vari if you're not going to use it?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the pattern for matrix derivatives. This vari is the one which actually has chain() called. All the other varis that actually get used below are dummies and don't have their chain() method called because all the work gets done in one shot with this vari. See all mdivide_left and friends.

stan::math::vari dummy(0.0, false);
Eigen::Matrix<var, -1, -1> L(A.rows(), A.cols());
size_t pos = 0;
for (size_type i = 0; i < L.cols(); ++i) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is backward for memory locality. Is there a way to reorganize? Memory locality is huge in terms of efficiency issues.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is one of the matrices that should be row-major; good catch, I'll change the storage order.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now I'm not sure if this is a good idea. I don't know Eigen's internals well enough to determine what that'll do to computations done on L. What's the impact of multiplying a row-major storage order matrix by a column major storage order matrix? I'll run some tests to determine if there's any slowdown.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's all about memory locality. What Eigen will do
internally is transpose a column-major matrix with all the necessary
memory allocation and copying because it's faster if you have to
reuse the rows. So basically if the matrix is big enough that the
whole thing won't fit in cache.

On Aug 15, 2015, at 11:27 AM, Rob Trangucci notifications@github.com wrote:

In stan/math/rev/mat/fun/cholesky_decompose.hpp:

  •  Eigen::Matrix<double, -1, -1> L_A(value_of_rec(A));
    
  •  Eigen::LLTEigen::MatrixXd L_factor
    
  •    = L_A.selfadjointViewEigen::Lower().llt();
    
  •  check_pos_definite("cholesky_decompose", "m", L_factor);
    
  •  L_A = L_factor.matrixL();
    
  •  // NOTE: this is not a memory leak, this vari is used in the
    
  •  // expression graph to evaluate the adjoint, but is not needed
    
  •  // for the returned matrix.  Memory will be cleaned up with the
    
  •  // arena allocator.
    
  •  cholesky_decompose_v_vari *baseVari
    
  •    = new cholesky_decompose_v_vari(A, L_A);
    
  •  Eigen::Matrix<var, -1, -1> L(A.rows(), A.cols());
    
  •  size_t pos = 0;
    
  •  for (size_type i = 0; i < L.cols(); ++i) {
    

Now I'm not sure if this is a good idea. I don't know Eigen's internals well enough to determine what that'll do to computations done on L. What's the impact of multiplying a row-major storage order matrix by a column major storage order matrix? I'll run some tests to determine if there's any slowdown.


Reply to this email directly or view it on GitHub.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bob-carpenter I took care of this in the latest version that's testing right now. The changes are to lines 149 through 161 and in the constructor. I'm no longer iterating through any column-major matrix in the wrong order, and any row-major matrices are explicitly declared as such. However, it makes the code a little harder to read because instead of just updating the position in the array of pointers variRefL_ like ++pos, I'm doing the mapping from (i, j) to position in a row-major half-vector.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great! The reason to write code that works, get tests in place,
and optimize later is that it's often so damn hard to read optimized
code. Or as the saying goes, it's easier to optimize correct code
than correct optimized code.

  • Bob

On Aug 16, 2015, at 11:53 PM, Rob Trangucci notifications@github.com wrote:

In stan/math/rev/mat/fun/cholesky_decompose.hpp:

  •  Eigen::Matrix<double, -1, -1> L_A(value_of_rec(A));
    
  •  Eigen::LLTEigen::MatrixXd L_factor
    
  •    = L_A.selfadjointViewEigen::Lower().llt();
    
  •  check_pos_definite("cholesky_decompose", "m", L_factor);
    
  •  L_A = L_factor.matrixL();
    
  •  // NOTE: this is not a memory leak, this vari is used in the
    
  •  // expression graph to evaluate the adjoint, but is not needed
    
  •  // for the returned matrix.  Memory will be cleaned up with the
    
  •  // arena allocator.
    
  •  cholesky_decompose_v_vari *baseVari
    
  •    = new cholesky_decompose_v_vari(A, L_A);
    
  •  Eigen::Matrix<var, -1, -1> L(A.rows(), A.cols());
    
  •  size_t pos = 0;
    
  •  for (size_type i = 0; i < L.cols(); ++i) {
    

@bob-carpenter I took care of this in the latest version that's testing right now. The changes are to lines 149 through 161 and in the constructor. I'm no longer iterating through any column-major matrix in the wrong order, and any row-major matrices are explicitly declared as such. However, it makes the code a little harder to read because instead of just updating the position in the array of pointers variRefL_ like ++pos, I'm doing the mapping from (i, j) to position in a row-major half-vector.


Reply to this email directly or view it on GitHub.

for (size_type j = 0; j <= i; ++j)
L.coeffRef(i, j).vi_ = baseVari->variRefL_[pos++];
for (size_type k = (i + 1); k < L.cols(); ++k)
L.coeffRef(i, k).vi_ = &dummy;
}
return L;
}
}
}
#endif
45 changes: 5 additions & 40 deletions test/unit/math/mix/mat/fun/cholesky_decompose_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
#include <gtest/gtest.h>
#include <stan/math/fwd/scal/fun/value_of.hpp>
#include <stan/math/rev/scal/fun/value_of.hpp>
#include <stan/math/prim/mat/fun/value_of_rec.hpp>
#include <stan/math/rev/mat/fun/dot_self.hpp>
#include <stan/math/fwd/mat/fun/dot_self.hpp>
#include <stan/math/fwd/scal/fun/value_of_rec.hpp>
#include <stan/math/rev/scal/fun/value_of_rec.hpp>
#include <stan/math/rev/core.hpp>
Expand All @@ -17,10 +20,9 @@
#include <stan/math/fwd/scal/fun/exp.hpp>
#include <stan/math/fwd/scal/fun/fabs.hpp>
#include <stan/math/rev/scal/fun/fabs.hpp>
#include <stan/math/prim/mat/prob/multi_normal_cholesky_log.hpp>
#include <stan/math/prim/mat/fun/cholesky_decompose.hpp>
#include <stan/math/prim/mat/fun/cov_matrix_constrain.hpp>
#include <stan/math/rev/mat/functor/gradient.hpp>
#include <stan/math/prim/mat/functor/finite_diff_gradient.hpp>
#include <stan/math/mix/mat/functor/hessian.hpp>
#include <stan/math/prim/mat/functor/finite_diff_hessian.hpp>
#include <stan/math/mix/mat/functor/grad_hessian.hpp>
Expand All @@ -41,39 +43,6 @@ struct chol_functor {
}
};

void test_gradients(int size) {
std::vector<std::vector<chol_functor> > functowns;
std::vector<std::vector<Eigen::Matrix<double, -1, 1> > > grads_ad;
std::vector<std::vector<Eigen::Matrix<double, -1, 1> > > grads_fd;
Eigen::Matrix<double, -1, -1> evals_ad(size,size);
Eigen::Matrix<double, -1, -1> evals_fd(size,size);
functowns.resize(size);
grads_ad.resize(size);
grads_fd.resize(size);

for (int i = 0; i < size; ++i)
for (int j = 0; j < size; ++j) {
functowns[i].push_back(chol_functor(i, j, size));
grads_fd[i].push_back(Eigen::Matrix<double, -1, 1>(size));
grads_ad[i].push_back(Eigen::Matrix<double, -1, 1>(size));
}

int numels = size + size * (size - 1) / 2;
Eigen::Matrix<double, -1, 1> x(numels);
for (int i = 0; i < numels; ++i)
x(i) = i / 10.0;

for (size_t i = 0; i < static_cast<size_t>(size); ++i)
for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
stan::math::gradient(functowns[i][j], x, evals_ad(i,j), grads_ad[i][j]);
stan::math::finite_diff_gradient(functowns[i][j], x,
evals_fd(i,j), grads_fd[i][j]);
for (int k = 0; k < numels; ++k)
EXPECT_NEAR(grads_fd[i][j](k), grads_ad[i][j](k), 1e-11);
EXPECT_FLOAT_EQ(evals_fd(i, j), evals_ad(i, j));
}
}

void test_hessians(int size) {
std::vector<std::vector<chol_functor> > functowns;
std::vector<std::vector<Eigen::Matrix<double, -1, 1> > > grads_ad;
Expand All @@ -100,7 +69,7 @@ void test_hessians(int size) {
int numels = size + size * (size - 1) / 2;
Eigen::Matrix<double, -1, 1> x(numels);
for (int i = 0; i < numels; ++i)
x(i) = i / 10.0;
x(i) = i / 20.0;

for (size_t i = 0; i < static_cast<size_t>(size); ++i)
for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
Expand Down Expand Up @@ -199,10 +168,6 @@ TEST(AgradMixMatrixCholeskyDecompose, exception_mat_ffv) {
EXPECT_THROW(stan::math::cholesky_decompose(m), std::domain_error);
}

TEST(AgradMixMatrixCholeskyDecompose, mat_1st_deriv) {
test_gradients(3);
}

TEST(AgradMixMatrixCholeskyDecompose, mat_2nd_deriv) {
test_hessians(3);
}
Expand Down
Loading