#ifndef HAVE_TMBAD_HPP
#define HAVE_TMBAD_HPP
// Autogenerated - do not edit by hand !
#include "checkpoint.hpp"
#include "global.hpp"
#include "graph_transform.hpp"

namespace TMBad {

template <class ADFun>
struct Sparse;
template <class ADFun>
struct Decomp2;
template <class ADFun>
struct Decomp3;

namespace {

template <class I>
std::vector<I> cumsum0(const std::vector<bool> &x) {
  std::vector<I> y(x.size(), 0);
  for (size_t i = 1; i < x.size(); i++) {
    y[i] = y[i - 1] + x[i - 1];
  }
  return y;
}
}  // namespace

/** \brief Interoperability with other vector classes

    \details The TMBad interface can handle vector to vector mappings
    specified as *functors*. The evaluation operator is assumed to
    take `std::vector` as both input and output.

    However, it may be that our functor is implemeted using an `other`
    vector class.  In this case the `StdWrap` can help by
    automatically adding the `std::vector` evaluation operator.  To
    make it work one must make sure that conversion operators exist
    for `other`, i.e. CTOR and conversion methods must be implemented
    for the vector class:

    ```
    other(const std::vector<T> &x);
    other::operator std::vector<T>();
    ```

    Then one can proceed by

    ```
    Functor F;
    StdWrap<Functor, other> Fnew(F);
    other x;
    ADFun<> (Fnew, x);
    ```
*/
template <class Functor, class InterfaceVector>
struct StdWrap {
  Functor &F;
  typedef typename InterfaceVector::value_type Scalar;
  InterfaceVector tovec(const InterfaceVector &x) { return x; }
  InterfaceVector tovec(const Scalar &x) {
    InterfaceVector y(1);
    y[0] = x;
    return y;
  }
  StdWrap(Functor &F) : F(F) {}
  template <class T>
  std::vector<T> operator()(const std::vector<T> &x) {
    InterfaceVector xi(x);
    InterfaceVector yi = tovec(F(xi));
    std::vector<T> y(yi);
    return y;
  }
};

/** \brief Configuration parameters for SpJacFun() */
struct SpJacFun_config {
  SpJacFun_config();
  bool compress;
  bool index_remap;
};

/** \brief Automatic differentiation function object

    \details

    The ADFun object represents a mapping \f$F:R^n \rightarrow R^m \: (x
   \rightarrow F(x))\f$

    # API overview

    ### Function object transformations

    | Name                 | Description                             |
    |----------------------|-----------------------------------------|
    | JacFun()             | Get Jacobian function object            |
    | SpJacFun()           | Get Sparse Jacobian function object     |
    | WgtJacFun()          | Get weighted Jacobian function object   |

    ### Function object evaluators

    | Name                 | Description                             |
    |----------------------|-----------------------------------------|
    | operator()()         | Evaluate function object                |
    | Jacobian()           | Evaluate Jacobian of function object    |

    ### Argument conventions

    - Jacobian matrix is stored as a vector with input dimension fastest running
   and output dimension slowest running.
    - Equivalently the Jacobian matrix J is stored row major (OR equivalently
   t(J) stored column major)
    - `keep_x`, `keep_y`: Selects *inputs* and *outputs* to corresponding to
   `J[keep_y, keep_x]`.
*/
template <class ad = ad_aug>
struct ADFun {
  global glob;

  /** \brief Constructor of vector input / vector output function */
  template <class Functor, class ScalarVector>
  ADFun(Functor F, const ScalarVector &x_) : force_update_flag(false) {
    std::vector<ad> x(x_.size());
    for (size_t i = 0; i < x.size(); i++) x[i] = Value(x_[i]);
    global *glob_begin = get_glob();
    this->glob.ad_start();
    Independent(x);
    std::vector<ad> y = F(x);
    Dependent(y);
    this->glob.ad_stop();
    global *glob_end = get_glob();
    TMBAD_ASSERT(glob_begin == glob_end);
  }

  /** \brief Constructor of 1 scalar input / 1 scalar output function
      \warning Experimental - may be removed
  */
  template <class Functor>
  ADFun(Functor F, Scalar x0_) : force_update_flag(false) {
    global *glob_begin = get_glob();
    this->glob.ad_start();
    ad x0(x0_);
    x0.Independent();
    ad y0 = F(x0);
    y0.Dependent();
    this->glob.ad_stop();
    global *glob_end = get_glob();
    TMBAD_ASSERT(glob_begin == glob_end);
  }

  /** \brief Constructor of 2 scalar input / 1 scalar output function
      \warning Experimental - may be removed
  */
  template <class Functor>
  ADFun(Functor F, Scalar x0_, Scalar x1_) : force_update_flag(false) {
    global *glob_begin = get_glob();
    this->glob.ad_start();
    ad x0(x0_);
    x0.Independent();
    ad x1(x1_);
    x1.Independent();
    ad y0 = F(x0, x1);
    y0.Dependent();
    this->glob.ad_stop();
    global *glob_end = get_glob();
    TMBAD_ASSERT(glob_begin == glob_end);
  }

  ADFun() : force_update_flag(false) {}

  void forward() { glob.forward(); }
  void reverse() { glob.reverse(); }
  void clear_deriv() { glob.clear_deriv(); }
  Scalar &deriv_inv(Index i) { return glob.deriv_inv(i); }
  Scalar &deriv_dep(Index i) { return glob.deriv_dep(i); }

  /** \brief Print AD workspace */
  void print(print_config cfg = print_config()) { glob.print(cfg); }

  /** \brief Dead operator elimination */
  void eliminate() { glob.eliminate(); }

  /** \brief Tape optimizer
      \details Does the following two steps
      1. Identical sub-expressions a remapped to their first occurance
      2. Variables that do not affect the end result are removed

      \note Operators with `allow_remap=false` can have lots of
      implicit dependencies which would slow down the optimizer. We
      skip optimization if any such operators are present.
      \note If cached independent variable positions have been set, it
      is illegal to run the tape optimizer because these variables
      could be remapped.
  */
  void optimize() {
    TMBAD_ASSERT2(inv_pos.size() == 0,
                  "Tape has 'cached independent variable positions' which "
                  "would be invalidated by the optimizer");

    std::vector<bool> outer_mask;
    if (inner_outer_in_use()) {
      outer_mask = DomainOuterMask();
    }

    remap_identical_sub_expressions(glob);

    glob.eliminate();

    if (inner_outer_in_use()) {
      TMBAD_ASSERT(outer_mask.size() == Domain());
      set_inner_outer(*this, outer_mask);
    }
  }
  /** \brief Cache tape positions of independent variables.

      An operation sequence may be ordered such that independent
      variables that are *updated frequently* come as *late as
      possible*. By caching the independent variable positions, the
      forward pass can jump directly to the first position that must
      be updated.
  */
  void set_inv_positions() {
    std::vector<Position> pos = inv_positions(glob);
    inv_pos = subset(pos, invperm(order(glob.inv_index)));
  }
  /** \brief Reorder computational graph to allow quick updates of
      selected inputs.

      Permute the computational graph such that the selected
      independent variables `last` come as late as possible. Then
      cache the independent variable positions to allow fast updates
      of the `last` subset.

      \param last *Sorted* index vector of selected independent
      variables satisfying `0 <= last[i] < Domain()`.
  */
  void reorder(std::vector<Index> last) {
    std::vector<bool> outer_mask;
    if (inner_outer_in_use()) {
      outer_mask = DomainOuterMask();
    }
    reorder_graph(glob, last);

    if (inner_outer_in_use()) {
      TMBAD_ASSERT(outer_mask.size() == Domain());
      set_inner_outer(*this, outer_mask);
    }
    set_inv_positions();
  }

  size_t Domain() const { return glob.inv_index.size(); }
  size_t Range() const { return glob.dep_index.size(); }
  /** \brief Which inputs are affecting *some* outputs */
  std::vector<bool> activeDomain() {
    std::vector<bool> mark(glob.values.size(), false);
    for (size_t i = 0; i < glob.dep_index.size(); i++)
      mark[glob.dep_index[i]] = true;
    glob.reverse(mark);
    return subset(mark, glob.inv_index);
  }
  /** \brief Which outputs are affected by *some* inputs */
  std::vector<bool> activeRange() {
    std::vector<bool> mark(glob.values.size(), false);
    for (size_t i = 0; i < glob.inv_index.size(); i++)
      mark[glob.inv_index[i]] = true;
    glob.forward(mark);
    return subset(mark, glob.dep_index);
  }
  /** \brief Get most recent input parameter vector from the tape */
  std::vector<Scalar> DomainVec() {
    std::vector<Scalar> xd(Domain());
    for (size_t i = 0; i < xd.size(); i++) xd[i] = glob.value_inv(i);
    return xd;
  }
  /** \brief Get most recent result vector from the tape */
  IndirectAccessor<Scalar> RangeVec() {
    return IndirectAccessor<Scalar>(glob.values, glob.dep_index);
  }
  /** \brief Get necessary variables to keep for given input/output selection */
  std::vector<bool> get_keep_var(std::vector<bool> keep_x,
                                 std::vector<bool> keep_y) {
    std::vector<bool> keep_var(glob.values.size(), true);
    if (keep_x.size() > 0 || keep_y.size() > 0) {
      if (keep_x.size() == 0) keep_x.resize(glob.inv_index.size(), true);
      if (keep_y.size() == 0) keep_y.resize(glob.dep_index.size(), true);
      TMBAD_ASSERT(keep_x.size() == glob.inv_index.size());
      TMBAD_ASSERT(keep_y.size() == glob.dep_index.size());

      std::vector<bool> keep_var_init(keep_var.size(), false);
      for (size_t i = 0; i < glob.inv_index.size(); i++)
        if (keep_x[i]) keep_var_init[glob.inv_index[i]] = true;
      for (size_t i = 0; i < glob.dep_index.size(); i++)
        if (keep_y[i]) keep_var_init[glob.dep_index[i]] = true;

      std::vector<bool> keep_var_x = keep_var_init;
      glob.forward(keep_var_x);

      std::vector<bool> keep_var_y = keep_var_init;
      glob.reverse(keep_var_y);

      for (size_t i = 0; i < keep_var.size(); i++)
        keep_var[i] = keep_var_x[i] && keep_var_y[i];
    }
    return keep_var;
  }
  /** \brief Vector of positions by independent variables
      \details
      If in use, i.e. `inv_pos.size()>0`, it means that tail sweeping is
     enabled. The vector allows to lookup tape postions for any given
     independent variable. In particular it holds that `inv_index[i] ==
     start[i].ptr.second`. \note `inv_index` need not be sorted !
  */
  std::vector<Position> inv_pos;
  /** \brief Helper to find the tape position of an independent variable */
  Position find_pos(Index inv) {
    for (size_t i = 0; i < inv_pos.size(); i++) {
      if (inv_pos[i].ptr.second == inv) return inv_pos[i];
    }
    return Position(0, 0, 0);
  }
  /** \brief Mark the tail of the operation sequence
      A 'tail sweep' is on the subsequence `tail_start:end`.
      Only used by teh reverse sweep.
  */
  Position tail_start;
  /** \brief Are the independent variable indices consecutive?
      \details Cheep test to guarantee that independent variable
      indicies are consecutive.
  */
  bool inv_index_is_consecutive() {
    if (glob.inv_index.size() == 0) return true;

    bool is_sorted = (inv_pos.size() == 0 && !inner_outer_in_use());
    return is_sorted && (glob.inv_index.size() ==
                         1 + glob.inv_index.back() - glob.inv_index.front());
  }
  /** \brief Set start position needed to get selected independent variable
   * derivatives */
  void set_tail(const std::vector<Index> &random) {
    if (inv_pos.size() > 0) {
      std::vector<Position> pos = subset(inv_pos, random);
      tail_start = *std::min_element(pos.begin(), pos.end());
    } else {
      tail_start = Position(0, 0, 0);
    }
  }
  /** \brief Inactivate tail sweep to get derivatives wrt all independent
   * variables */
  void unset_tail() { tail_start = Position(0, 0, 0); }
  /** \brief Next forward pass must traverse the full graph */
  void force_update() { force_update_flag = true; }
  bool force_update_flag;
  /** \brief Set the input parameter vector on the tape */
  template <class InplaceVector>
  Position DomainVecSet(const InplaceVector &x) {
    TMBAD_ASSERT(x.size() == Domain());
    if (force_update_flag) {
      for (size_t i = 0; i < x.size(); i++) glob.value_inv(i) = x[i];
      force_update_flag = false;
      return Position(0, 0, 0);
    }
    if (inv_pos.size() > 0) {
      if (inner_outer_in_use()) {
        for (size_t i = 0; i < x.size(); i++) glob.value_inv(i) = x[i];
        Index min_inv =
            *std::min_element(glob.inv_index.begin(), glob.inv_index.end());
        return find_pos(min_inv);
      }
      TMBAD_ASSERT(inv_pos.size() == Domain());
      size_t min_var_changed = -1;
      size_t i_min = -1;
      for (size_t i = 0; i < x.size(); i++) {
        if (glob.value_inv(i) != x[i] && glob.inv_index[i] < min_var_changed) {
          min_var_changed = glob.inv_index[i];
          i_min = i;
        }
        glob.value_inv(i) = x[i];
      }
      if (min_var_changed == (size_t)-1)
        return glob.end();
      else
        return inv_pos[i_min];
    }
    if (x.size() > 0) {
      bool no_change = true;
      for (size_t i = 0; i < x.size(); i++) {
        if (glob.value_inv(i) != x[i]) {
          no_change = false;
          break;
        }
      }
      if (no_change) return glob.end();

      for (size_t i = 0; i < x.size(); i++) glob.value_inv(i) = x[i];
    }
    return Position(0, 0, 0);
  }
  /** \brief Forward sweep any vector class */
  template <class Vector>
  Vector forward(const Vector &x) {
    TMBAD_ASSERT((size_t)x.size() == Domain());
    for (size_t i = 0; i < (size_t)x.size(); i++) glob.value_inv(i) = x[i];
    glob.forward();
    Vector y(Range());
    for (size_t i = 0; i < (size_t)y.size(); i++) y[i] = glob.value_dep(i);
    return y;
  }
  /** \brief Reverse sweep any vector class */
  template <class Vector>
  Vector reverse(const Vector &w) {
    TMBAD_ASSERT((size_t)w.size() == Range());
    glob.clear_deriv();
    for (size_t i = 0; i < (size_t)w.size(); i++) glob.deriv_dep(i) = w[i];
    glob.reverse();
    Vector d(Domain());
    for (size_t i = 0; i < (size_t)d.size(); i++) d[i] = glob.deriv_inv(i);
    return d;
  }
  /** \brief Evaluate function for scalar vector input */
  std::vector<Scalar> operator()(const std::vector<Scalar> &x) {
    Position start = DomainVecSet(x);
    glob.forward(start);
    return RangeVec();
  }

  IndirectAccessor<Scalar> operator()(
      const segment_ref<ForwardArgs<Scalar>, x_read> &x) {
    Position start = DomainVecSet(x);
    glob.forward(start);
    return RangeVec();
  }
  /** \brief Evaluate function for ad vector input \details Runs a
      forward replay to current active tape `get_glob()`.  \warning
      There must be an active tape and the ad inputs must correspond
      to the active tape (FIXME: This comment is outdated I think).
  */
  std::vector<ad> operator()(const std::vector<ad> &x_) const {
    std::vector<ad> x(x_.begin(), x_.end());
    TMBAD_ASSERT(x.size() == Domain());
    for (size_t i = 0; i < x.size(); i++) {
      x[i].addToTape();
    }
    global *cur_glob = get_glob();
    for (size_t i = 0; i < x.size(); i++) {
      TMBAD_ASSERT(x[i].on_some_tape());
      TMBAD_ASSERT(x[i].glob() == cur_glob);
    }
    global::replay replay(this->glob, *get_glob());
    replay.start();
    for (size_t i = 0; i < this->Domain(); i++) {
      replay.value_inv(i) = x[i];
    }
    replay.forward(false, false);
    std::vector<ad> y(this->Range());
    for (size_t i = 0; i < this->Range(); i++) {
      y[i] = replay.value_dep(i);
    }
    replay.stop();
    return y;
  }
  /** \brief Evaluate function scalar version \warning Experimental -
      may be removed */
  ad operator()(ad x0) {
    TMBAD_ASSERT(Domain() == 1);
    TMBAD_ASSERT(Range() == 1);
    std::vector<ad> x(1);
    x[0] = x0;
    return (*this)(x)[0];
  }
  /** \brief Evaluate function scalar version \warning Experimental -
      may be removed */
  ad operator()(ad x0, ad x1) {
    TMBAD_ASSERT(Domain() == 2);
    TMBAD_ASSERT(Range() == 1);
    std::vector<ad> x(2);
    x[0] = x0;
    x[1] = x1;
    return (*this)(x)[0];
  }
  /** \brief Evaluate the Jacobian matrix
      \details Denote by f:R^n->R^m this function object.
      The Jacobian matrix is the m-by-n derivative matrix stored **row-major**.
  */
  std::vector<Scalar> Jacobian(const std::vector<Scalar> &x) {
    Position start = DomainVecSet(x);
    glob.forward(start);
    std::vector<Scalar> ans(Domain() * Range());
    for (size_t j = 0; j < Range(); j++) {
      glob.clear_deriv(tail_start);
      glob.deriv_dep(j) = 1;
      glob.reverse(tail_start);
      for (size_t k = 0; k < Domain(); k++)
        ans[j * Domain() + k] = glob.deriv_inv(k);
    }
    return ans;
  }
  /** \brief Evaluate the Jacobian matrix **subset**
      \details Denote by f:R^n->R^m this function object.
      The Jacobian matrix is the m-by-n derivative matrix stored **row-major**.
      This function evaluates `J[keep_y, keep_x]` (`keep_x` / `keep_y`
     cooresponds to input / output respectively)
  */
  std::vector<Scalar> Jacobian(const std::vector<Scalar> &x,
                               std::vector<bool> keep_x,
                               std::vector<bool> keep_y) {
    std::vector<Scalar> ans;

    std::vector<bool> keep_var = get_keep_var(keep_x, keep_y);

    graph G = this->glob.reverse_graph(keep_var, true);

    std::vector<size_t> which_keep_x = which(keep_x);
    std::vector<size_t> which_keep_y = which(keep_y);

    Position start = DomainVecSet(x);
    glob.forward(start);

    for (size_t w = 0; w < which_keep_y.size(); w++) {
      size_t k = which_keep_y[w];

      glob.subgraph_seq.resize(0);
      glob.subgraph_seq.push_back(G.dep2op[k]);
      G.search(glob.subgraph_seq);

      glob.clear_deriv_sub();
      for (size_t l = 0; l < which_keep_x.size(); l++)
        glob.deriv_inv(which_keep_x[l]) = Scalar(0);
      glob.deriv_dep(k) = 1.;
      glob.reverse_sub();

      for (size_t l = 0; l < which_keep_x.size(); l++) {
        ans.push_back(glob.deriv_inv(which_keep_x[l]));
      }
    }
    return ans;
  }
  /** \brief Evaluate the Jacobian matrix multiplied by a vector
      \details Denote by f:R^n->R^m this function object.
      The Jacobian matrix is the m-by-n derivative matrix stored **row-major**.
      This function calculates the derivative d/dx(sum(f(x)*w))
  */
  std::vector<Scalar> Jacobian(const std::vector<Scalar> &x,
                               const std::vector<Scalar> &w) {
    TMBAD_ASSERT(x.size() == Domain());
    TMBAD_ASSERT(w.size() == Range());
    Position start = DomainVecSet(x);
    glob.forward(start);
    glob.clear_deriv();
    for (size_t j = 0; j < Range(); j++) glob.deriv_dep(j) = w[j];
    glob.reverse();
    return IndirectAccessor<Scalar>(glob.derivs, glob.inv_index);
  }

  IndirectAccessor<Scalar> Jacobian(
      const segment_ref<ReverseArgs<Scalar>, x_read> &x,
      const segment_ref<ReverseArgs<Scalar>, dy_read> &w,
      bool clear_all = false, bool deriv_all = false) {
    TMBAD_ASSERT(x.size() == Domain());
    TMBAD_ASSERT(w.size() == Range());
    Position start = DomainVecSet(x);
    Position root = Position(0, 0, 0);
    glob.forward(start);
    glob.clear_deriv(clear_all ? root : tail_start);
    for (size_t j = 0; j < Range(); j++) glob.deriv_dep(j) = w[j];
    glob.reverse(deriv_all ? root : tail_start);
    Index tail_var = deriv_all ? root.ptr.second : tail_start.ptr.second;
    return IndirectAccessor<Scalar>(glob.derivs, glob.inv_index, tail_var);
  }
  std::vector<ad> Jacobian(const std::vector<ad> &x_,
                           const std::vector<ad> &w_) {
    std::vector<ad> x(x_.begin(), x_.end());
    std::vector<ad> w(w_.begin(), w_.end());
    global *cur_glob = get_glob();

    TMBAD_ASSERT(x.size() == Domain());
    for (size_t i = 0; i < x.size(); i++) {
      x[i].addToTape();
    }
    for (size_t i = 0; i < x.size(); i++) {
      TMBAD_ASSERT(x[i].on_some_tape());
      TMBAD_ASSERT(x[i].glob() == cur_glob);
    }

    TMBAD_ASSERT(w.size() == Range());
    for (size_t i = 0; i < w.size(); i++) {
      w[i].addToTape();
    }
    for (size_t i = 0; i < w.size(); i++) {
      TMBAD_ASSERT(w[i].on_some_tape());
      TMBAD_ASSERT(w[i].glob() == cur_glob);
    }

    global::replay replay(this->glob, *get_glob());
    replay.start();
    for (size_t i = 0; i < this->Domain(); i++) {
      replay.value_inv(i) = x[i];
    }
    replay.forward(false, false);
    replay.clear_deriv();
    for (size_t i = 0; i < this->Range(); i++) {
      replay.deriv_dep(i) = w[i];
    }
    replay.reverse(false, false);
    std::vector<ad> dx(this->Domain());
    for (size_t i = 0; i < dx.size(); i++) {
      dx[i] = replay.deriv_inv(i);
    }
    replay.stop();
    return dx;
  }
  template <bool range_weight>
  ADFun JacFun_(std::vector<bool> keep_x, std::vector<bool> keep_y) {
    ADFun ans;
    if (keep_x.size() == 0) keep_x.resize(Domain(), true);
    if (keep_y.size() == 0) keep_y.resize(Range(), true);
    std::vector<bool> keep = get_keep_var(keep_x, keep_y);
    graph G;
    if (!range_weight && Range() > 1) {
      G = this->glob.reverse_graph(keep, true);
    }
    keep = glob.var2op(keep);
    global::replay replay(this->glob, ans.glob);
    replay.start();
    replay.forward(true, false);
    if (!range_weight) {
      if (G.empty()) {
        for (size_t i = 0; i < this->Range(); i++) {
          if (!keep_y[i]) continue;
          replay.clear_deriv();
          replay.deriv_dep(i) = 1.;
          replay.reverse(false, false, tail_start, keep);
          for (size_t j = 0; j < this->Domain(); j++) {
            if (keep_x[j]) replay.deriv_inv(j).Dependent();
          }
        }
      } else {
        replay.clear_deriv();
        for (size_t i = 0; i < this->Range(); i++) {
          if (!keep_y[i]) continue;
          glob.subgraph_seq.resize(0);
          glob.subgraph_seq.push_back(G.dep2op[i]);
          G.search(glob.subgraph_seq);
          replay.deriv_dep(i) = 1.;
          replay.reverse_sub();
          for (size_t j = 0; j < this->Domain(); j++) {
            if (keep_x[j]) replay.deriv_inv(j).Dependent();
          }
          replay.clear_deriv_sub();
        }
      }
    } else {
      replay.clear_deriv();
      replay.reverse(false, true, tail_start, keep);
      for (size_t j = 0; j < this->Domain(); j++) {
        if (keep_x[j]) replay.deriv_inv(j).Dependent();
      }
    }
    replay.stop();
    set_inner_outer(ans);
    return ans;
  }
  /** \brief Get Jacobian function object

      \details Denote by \f$F:R^n \rightarrow R^m\f$ this function object.

      Let

      - `keep_x` denote a boolean subset of inputs (by default all) with
     `n_:=keep_x.count()`
      - `keep_y` denote a boolean subset of outputs (by default all) with
     `m_:=keep_y.count()`

      \return
      New function object \f$F':R^n \rightarrow R^{m\_*n\_}\f$ representing the
     Jacobian (subset).

      \param keep_x If specified, a boolean subset of inputs (Jacobian columns).
      \param keep_y If specified, a boolean subset of outputs (Jacobian rows).

      \note This function preserves the inner/outer parameter categories if in
     use.
  */
  ADFun JacFun(std::vector<bool> keep_x = std::vector<bool>(0),
               std::vector<bool> keep_y = std::vector<bool>(0)) {
    return JacFun_<false>(keep_x, keep_y);
  }
  /** \brief Get **weighted** Jacobian function object

      \details Denote by \f$F:R^n \rightarrow R^m\f$ this function object.

      Let

      - `keep_x` denote a boolean subset of inputs (by default all) with
     `n_:=keep_x.count()`
      - `keep_y` denote a boolean subset of outputs (by default all) with
     `m_:=keep_y.count()`

      \return New function object \f$F':R^{n+m} \rightarrow R^{n\_}
      ((x,w)\rightarrow w^T*F') \f$ representing the Jacobian (subset)
      multiplied by a weight vector.

      \note This function preserves the inner/outer parameter categories if in
     use.
  */
  ADFun WgtJacFun(std::vector<bool> keep_x = std::vector<bool>(0),
                  std::vector<bool> keep_y = std::vector<bool>(0)) {
    return JacFun_<true>(keep_x, keep_y);
  }
  /** \brief Turn this operation sequence into an atomic operator */
  ADFun atomic() {
    std::vector<Scalar> x = DomainVec();
    return ADFun(atomic_raw(), x);
  }
  typedef global::Complete<AtomOp<standard_derivative_table<ADFun> > > AtomOp_t;
  /** \brief Internal use only */
  AtomOp_t atomic_raw() { return *this; }
  /** \brief Parallel split this operation sequence
      Split function `f:R^n->R` by its accumulation tree. Then parallelize
      and accumulate each parallel component. Return a list of functions
      `f[i]:R^n->R` such that `f=sum_i f[i]`.
  */
  std::vector<ADFun> parallel_accumulate(size_t num_threads) {
    TMBAD_ASSERT(Range() == 1);
    global glob_split = accumulation_tree_split(glob);
    autopar ap(glob_split, num_threads);
    ap.do_aggregate = true;
    ap.keep_all_inv = true;
    ap.run();
    ap.extract();
    std::vector<ADFun> ans(num_threads);
    for (size_t i = 0; i < num_threads; i++) ans[i].glob = ap.vglob[i];
    return ans;
  }
  /** \brief Parallelize this operation sequence
      \warning **Reverse** replay is not supported after parallelization.
  */
  ADFun parallelize(size_t num_threads) {
    TMBAD_ASSERT(Range() == 1);
    global glob_split = accumulation_tree_split(glob);
    autopar ap(glob_split, num_threads);
    ap.do_aggregate = true;
    ap.keep_all_inv = false;
    ap.run();
    ap.extract();
    global::Complete<ParalOp> f_parallel(ap);
    ADFun F(f_parallel, DomainVec());
    aggregate(F.glob);
    return F;
  }
  /** \brief Replay this operation sequence to a new sequence
      \details Under rare circumstances this may reduce the tape size, e.g. by
     removing constant operations. \warning This is an experimental feature that
     may be removed.
  */
  void replay() { glob.forward_replay(true, true); }
  /** \brief Sparse Jacobian function generator
      \details Denote by `f:R^n->R^m` this function object.

      By default the return value is a new function object
      f':R^n->R^l representing the *sparse* Jacobian. Here `l`
      denotes the number of non zeros.  The function object is itself
      an `ADFun` object, but in addition the sparsity pattern
      is contained in the output.

      If the Jacobian is only needed on a subset of the sparsity
      pattern, one can use the boolean vectors `keep_x` and `keep_y`
      to select a subset of interest. Jacobian elements outside this
      subset are considered being identical zero, in order to reduce
      the caclulations. Note that indices are not remapped. Also note
      that `keep_x` / `keep_y` cooresponds to input / output
      respectively.

      \param keep_x Jacobian **columns** to consider
      \param keep_y Jacobian **rows** to consider
      \param compress Apply row-wise compression if it reduces memory ?
      \return `Sparse<ADFun>` containing function and sparsity pattern.

      \note This function preserves the inner/outer parameter categories if in
     use.
  */
  Sparse<ADFun> SpJacFun(std::vector<bool> keep_x = std::vector<bool>(0),
                         std::vector<bool> keep_y = std::vector<bool>(0),
                         SpJacFun_config config = SpJacFun_config()) {
    Sparse<ADFun> ans;

    ans.m = Range();
    ans.n = Domain();

    if (keep_x.size() == 0) keep_x.resize(Domain(), true);
    if (keep_y.size() == 0) keep_y.resize(Range(), true);
    std::vector<bool> keep_var = get_keep_var(keep_x, keep_y);

    size_t keep_x_count = std::count(keep_x.begin(), keep_x.end(), true);
    size_t keep_y_count = std::count(keep_y.begin(), keep_y.end(), true);

    graph G = this->glob.reverse_graph(keep_var, true);

    bool do_compress = config.compress;
    compress_helper cprs(this, keep_x, keep_y);

    global::replay replay(this->glob, ans.glob);
    replay.start();
    replay.forward(true, false);

    Index NA = -1;
    std::vector<Index> op2inv_idx = glob.op2idx(glob.inv_index, NA);

    std::fill(keep_var.begin(), keep_var.end(), true);

    std::vector<Index> col_idx;
    for (size_t k = 0; k < glob.dep_index.size(); k++) {
      size_t i = glob.dep_index[k];

      glob.subgraph_seq.resize(0);
      glob.subgraph_seq.push_back(G.dep2op[k]);
      G.search(glob.subgraph_seq);

      if (true) {
        glob.clear_array_subgraph(keep_var);
        keep_var[i] = true;
        glob.reverse_sub(keep_var);
      }

      col_idx.resize(0);
      for (size_t l = 0; l < glob.subgraph_seq.size(); l++) {
        Index idx = op2inv_idx[glob.subgraph_seq[l]];
        if (idx != NA) {
          Index nrep = glob.opstack[glob.subgraph_seq[l]]->output_size();
          for (Index r = 0; r < nrep; r++) {
            if (keep_var[glob.inv_index[idx]]) col_idx.push_back(idx);
            idx++;
          }
        }
      }

      ans.i.resize(ans.i.size() + col_idx.size(), k);
      ans.j.insert(ans.j.end(), col_idx.begin(), col_idx.end());
      if (!do_compress) {
        replay.clear_deriv_sub();

        replay.deriv_dep(k) = 1.;

        replay.reverse_sub();

      } else {
        cprs.initialize(replay);

        cprs.replay_row(replay, k, col_idx);
      }
      for (size_t l = 0; l < col_idx.size(); l++) {
        replay.deriv_inv(col_idx[l]).Dependent();
      }
    }
    replay.stop();
    if (config.index_remap) {
      if (keep_x.size() > 0) {
        std::vector<Index> remap_j = cumsum0<Index>(keep_x);
        ans.j = TMBad::subset(remap_j, ans.j);
        ans.n = keep_x_count;
      }
      if (keep_y.size() > 0) {
        std::vector<Index> remap_i = cumsum0<Index>(keep_y);
        ans.i = TMBad::subset(remap_i, ans.i);
        ans.m = keep_y_count;
      }
    }
    set_inner_outer(ans);
    return ans;
  }
  struct compress_helper {
    ADFun *adf;
    std::vector<bool> &keep_x;
    std::vector<bool> &keep_y;
    Index K;

    AtomOp_t atomic_jac_row;
    std::vector<bool> keep_x_sub;
    std::vector<bool> keep_y_sub;
    std::vector<Replay> vec;
    std::vector<Index> deriv_idx;
    compress_helper(ADFun *adf, std::vector<bool> &keep_x,
                    std::vector<bool> &keep_y)
        : adf(adf), keep_x(keep_x), keep_y(keep_y), K(0) {}
    void initialize(global::replay &replay) {
      if (K == 0) {
        replay.clear_deriv_sub();

        vec = std::vector<Replay>(adf->Domain() + adf->Range(), Replay(0));
        for (size_t i = 0; i < adf->Domain(); i++) {
          vec[i] = replay.value_inv(i);
        }
      }
    }
    Index findNextIndex(size_t k, std::vector<Index> &col_idx) {
      for (size_t i = 0; i < col_idx.size(); i++) {
        Index j = col_idx[i];
        TMBAD_ASSERT2((i == 0) || (j > col_idx[i - 1]),
                      "col_idx must be sorted");
        if (j == k + 1) k = j;
      }
      return k + 1;
    }
    void replay_row(global::replay &replay, size_t k,
                    std::vector<Index> &col_idx) {
      bool first_instance = (k == K);
      if (k == K) {
        K = findNextIndex(k, col_idx);
        if (!keep_y[k]) return;

        keep_y_sub = keep_y;
        for (size_t i = 0; i < keep_y_sub.size(); i++) {
          if ((i < k) || (K <= i)) keep_y_sub[i] = false;
        }
        ADFun jac_row = adf->WgtJacFun(keep_x, keep_y_sub);
        deriv_idx = which<Index>(keep_x);

        keep_x_sub = jac_row.activeDomain();
        jac_row.glob.inv_index = subset(jac_row.glob.inv_index, keep_x_sub);

        keep_y_sub = jac_row.activeRange();
        jac_row.glob.dep_index = subset(jac_row.glob.dep_index, keep_y_sub);
        deriv_idx = subset(deriv_idx, keep_y_sub);

        jac_row.eliminate();

        std::vector<bool> r(keep_x);
        r.resize(keep_x_sub.size(), true);
        r = subset(r, keep_x_sub);
        std::vector<Index> wr = which<Index>(r);
        jac_row.reorder(wr);

        jac_row.set_inv_positions();

        jac_row.set_tail(wr);

        atomic_jac_row = jac_row.atomic_raw();
      }
      bool last_instance = (k == K - 1);
      atomic_jac_row.Op.ctrl.clear_all = last_instance;
      atomic_jac_row.Op.ctrl.deriv_all = first_instance;
      vec[adf->Domain() + k] = 1.;
      std::vector<Replay> vec_sub = subset(vec, keep_x_sub);
      vec[adf->Domain() + k] = 0.;
      std::vector<Replay> r = atomic_jac_row(vec_sub);
      size_t I = atomic_jac_row.output_size();
      for (size_t i = 0; i < I; i++) {
        replay.deriv_inv(deriv_idx[i]) = r[i];
      }
    }
  };
  /** \brief Integrate as many univariate variables as possible
      \note Use `activeDomain()` to identify which variables have been
      integrated (integrated variables are no longer active).
  */
  ADFun marginal_gk(const std::vector<Index> &random,
                    gk_config cfg = gk_config()) {
    ADFun ans;
    old_state os(this->glob);
    aggregate(this->glob, -1);
    global glob_split = accumulation_tree_split(this->glob);
    os.restore();
    integrate_subgraph<ADFun> i_s(glob_split, random, cfg);
    ans.glob = i_s.gk();
    aggregate(ans.glob, -1);
    return ans;
  }
  /** \brief Integrate using sequential reduction */
  ADFun marginal_sr(const std::vector<Index> &random, std::vector<sr_grid> grid,
                    const std::vector<Index> &random2grid, bool perm = true) {
    ADFun ans;
    old_state os(this->glob);
    aggregate(this->glob, -1);
    global glob_split = accumulation_tree_split(this->glob);
    os.restore();
    sequential_reduction SR(glob_split, random, grid, random2grid, perm);
    ans.glob = SR.marginal();
    aggregate(ans.glob, -1);
    return ans;
  }
  /** \brief Integrate using sequential reduction */
  ADFun marginal_sr(const std::vector<Index> &random,
                    sr_grid grid = sr_grid()) {
    return marginal_sr(random, std::vector<sr_grid>(1, grid),
                       std::vector<Index>(0));
  }
  /** \brief Construct function composition
      \details Given this function (f) and another function (g) construct the
     composition f(g(x)).
  */
  ADFun compose(ADFun other) {
    TMBAD_ASSERT2(other.Range() == this->Domain(),
                  "Compostion of incompatible functions");
    struct composition {
      const ADFun &f;
      const ADFun &g;
      composition(const ADFun &f, const ADFun &g) : f(f), g(g) {}
      std::vector<ad> operator()(std::vector<ad> x) { return f(g(x)); }
    };
    composition fg(*this, other);
    return ADFun(fg, other.DomainVec());
  }
  /** \brief Decompose this computational graph
      \note This function preserves the inner/outer parameter categories if in
     use.
  */
  Decomp2<ADFun> decompose(std::vector<Index> nodes) {
    Decomp2<ADFun> ans;
    global &glob1 = ans.first.glob;
    global &glob2 = ans.second.glob;

    OperatorPure *invop = glob.getOperator<global::InvOp>();
    std::vector<bool> keep(nodes.size(), true);
    for (size_t i = 0; i < nodes.size(); i++)
      if (glob.opstack[nodes[i]] == invop) keep[i] = false;
    nodes = subset(nodes, keep);

    glob1 = this->glob;
    glob1.dep_index.resize(0);
    std::vector<Index> dep1 = glob1.op2var(nodes);
    glob1.ad_start();
    for (size_t i = 0; i < dep1.size(); i++) {
      ad_plain tmp;
      tmp.index = dep1[i];
      tmp.Dependent();
    }
    glob1.ad_stop();
    glob1.eliminate();

    glob2 = this->glob;
    substitute(glob2, nodes);
    glob2.eliminate();

    set_inner_outer(ans.first);
    set_inner_outer(ans.second);

    return ans;
  }
  /** \brief Decompose this computational graph by operator name
      \note This function preserves the inner/outer parameter categories if in
     use.
  */
  Decomp2<ADFun> decompose(const char *name) {
    std::vector<Index> nodes = find_op_by_name(this->glob, name);
    return decompose(nodes);
  }
  /** \brief Optional optimization step before resolving references
      \details Replay all RefOp-only dependent sub-expression to active glob.
      Expand this ADFun with boundary variables.
  */
  void decompose_refs() {
    if (find_op_by_name(glob, "RefOp").size() == 0) return;

    std::vector<bool> keep_x(Domain(), true);
    std::vector<bool> keep_y(Range(), true);
    std::vector<bool> vars = get_keep_var(keep_x, keep_y);

    vars = reverse_boundary(glob, vars);

    std::vector<Index> nodes = which<Index>(glob.var2op(vars));

    Decomp2<ADFun> decomp = decompose(nodes);

    size_t n_inner = decomp.first.Domain();
    size_t n_outer = decomp.first.Range();

    decomp.first.glob.inv_index.resize(0);

    std::vector<ad_aug> empty;
    std::vector<ad_aug> gx = decomp.first(empty);

    ADFun &f = decomp.second;

    f.replay();

    TMBAD_ASSERT(n_inner + n_outer == f.Domain());
    TMBAD_ASSERT(find_op_by_name(f.glob, "RefOp").size() == 0);
    TMBAD_ASSERT(find_op_by_name(f.glob, "InvOp").size() == f.Domain());
    TMBAD_ASSERT(gx.size() == n_outer);

    for (size_t i = 0; i < n_outer; i++) {
      Index j = f.glob.inv_index[n_inner + i];

      if (gx[i].constant()) {
        f.glob.opstack[j] = glob.getOperator<global::ConstOp>();
      } else {
        f.glob.opstack[j] = glob.getOperator<global::RefOp>(
            gx[i].data.glob, gx[i].taped_value.index);
      }
    }
    f.glob.inv_index.resize(n_inner);

    *this = f;
  }
  /** \brief Resolve references of this ADFun object
      \details
      Assume that an active context (glob) exists.
      1. Locate all 'RefOp' and play them on top of the active glob while
     storing the corresponding generated variables in a vector.
      2. Substitute all 'RefOp' by 'InvOp' and store the 'inv_index' of the
     newly generated independent variables. \return Vector with variables
     relative to the current active context (glob).
  */
  std::vector<ad_aug> resolve_refs() {
    TMBAD_ASSERT2(
        inner_inv_index.size() == 0 && outer_inv_index.size() == 0,
        "'resolve_refs' can only be run once for a given function object")

        ;
    std::vector<Index> seq = find_op_by_name(glob, "RefOp");
    std::vector<Replay> values(seq.size());
    std::vector<Index> dummy_inputs;
    ForwardArgs<Replay> args(dummy_inputs, values);
    for (size_t i = 0; i < seq.size(); i++) {
      TMBAD_ASSERT(glob.opstack[seq[i]]->input_size() == 0);
      TMBAD_ASSERT(glob.opstack[seq[i]]->output_size() == 1);
      glob.opstack[seq[i]]->forward_incr(args);
      glob.opstack[seq[i]]->deallocate();
      glob.opstack[seq[i]] = get_glob()->getOperator<global::InvOp>();
    }
    inner_inv_index = glob.inv_index;
    outer_inv_index = glob.op2var(seq);

    glob.inv_index.insert(glob.inv_index.end(), outer_inv_index.begin(),
                          outer_inv_index.end());
    return values;
  }
  std::vector<Index> inner_inv_index;
  std::vector<Index> outer_inv_index;
  /** \brief Number of inner parameters */
  size_t DomainInner() const { return inner_inv_index.size(); }
  /** \brief Number of outer parameters */
  size_t DomainOuter() const { return outer_inv_index.size(); }
  /** \brief Temporarily regard this object as function of inner parameters
      \warning Don't forget to swap back when done!
  */
  void SwapInner() {
    std::swap(glob.inv_index, inner_inv_index);
    force_update();
  }
  /** \brief Temporarily regard this object as function of outer parameters
      \warning Don't forget to swap back when done!
  */
  void SwapOuter() {
    std::swap(glob.inv_index, outer_inv_index);
    force_update();
  }
  /** \brief Helper: Does tape have inner/outer information ? */
  bool inner_outer_in_use() {
    return (DomainInner() > 0) || (DomainOuter() > 0);
  }
  /** \brief Helper: Boolean mask of outer parameters */
  std::vector<bool> DomainOuterMask() {
    std::vector<bool> mark_outer =
        glob.mark_space(glob.values.size(), outer_inv_index);
    return subset(mark_outer, glob.inv_index);
  }
  /** \brief Helper: Pass on inner/outer information to a new tape.
      Some parameters are marked as 'outer parameters'. All other
      prameters are 'inner'.  This function passes on inner/outer
      information to a new tape assuming that the new tape **preserves
      parameter ordering** but **may increase or reduce the parameter
      list**.
  */
  void set_inner_outer(ADFun &ans, const std::vector<bool> &outer_mask) {
    if (inner_outer_in_use()) {
      std::vector<bool> mark(outer_mask);
      mark.resize(ans.Domain(), false);

      ans.outer_inv_index = subset(ans.glob.inv_index, mark);

      mark.flip();

      ans.inner_inv_index = subset(ans.glob.inv_index, mark);
    }
  }
  void set_inner_outer(ADFun &ans) {
    if (inner_outer_in_use()) {
      set_inner_outer(ans, DomainOuterMask());
    }
  }
  void DomainReduce(const std::vector<bool> &inv_keep) {
    std::vector<bool> outer_mask = DomainOuterMask();
    outer_mask = subset(outer_mask, inv_keep);
    glob.inv_index = subset(glob.inv_index, inv_keep);
    set_inner_outer(*this, outer_mask);
  }
  /** \brief Substitute selected operators by void operators
      \param nodes Selected operator nodes
      \details This function was added as a way to get rid of protected
     operators.
  */
  void inactivate(std::vector<Index> nodes) {
    for (size_t i = 0; i < nodes.size(); i++) {
      OperatorPure *op = glob.opstack[nodes[i]];
      glob.opstack[nodes[i]] = glob.getOperator<global::NullOp2>(
          op->input_size(), op->output_size());
      op->deallocate();
    }
  }
};
/** \brief Construct ADFun that automatically retapes

    By default, retaping takes place whenever the parameter vector
    changes compared to previous evaluation. However, this can be
    controlled in more detail by passing a custom tester object - see
    the prototype `ParametersChanged`.

    \param F Functor to be taped
    \param x Evaluation point
    \tparam Test Class to test if parameters have changed
*/
template <class Functor, class Test = ParametersChanged>
ADFun<> ADFun_retaping(Functor &F, const std::vector<ad_aug> &x,
                       Test test = Test()) {
  typedef retaping_derivative_table<Functor, ADFun<>, Test> DTab;
  global::Complete<AtomOp<DTab> > Op(F, x, test);
  return ADFun<>(Op, x);
}

/** \brief Container of ADFun object with packed input and output */
template <class dummy = void>
struct ADFun_packed {
  ADFun<> Fp;
  ADFun_packed(const ADFun<> &Fp) : Fp(Fp) {}
  ADFun_packed() {}
  ad_segment operator()(const std::vector<ad_segment> &x) {
    std::vector<ad_segment> xp(x.size());
    for (size_t i = 0; i < xp.size(); i++) xp[i] = pack(x[i], true);
    std::vector<ad_aug> yp = Fp(concat(xp));
    return unpack(yp, 0);
  }
  bool initialized() { return Fp.Domain() != 0; }
};
/** \copydoc ADFun_retaping
    \details
    The resulting object is `ADFun_packed`, i.e. it has *packed*
    inputs **and** outputs. Such packed I/O can compactly represent
    e.g. matrices, vectors or other large objects with a consequtive
    memory layout.
*/
template <class Functor, class Test>
ADFun_packed<> ADFun_retaping(Functor &F, const std::vector<ad_segment> &x,
                              Test test) {
  static const bool packed = true;
  typedef retaping_derivative_table<PackWrap<Functor>, ADFun<>, PackWrap<Test>,
                                    packed>
      DTab;
  PackWrap<Functor> Fp(F);
  std::vector<ad_segment> xp(x.size());
  for (size_t i = 0; i < xp.size(); i++) xp[i] = pack(x[i], true);
  std::vector<ad_aug> xp_ = concat(xp);
  PackWrap<Test> testp(test);
  global::Complete<AtomOp<DTab> > Op(Fp, xp_, testp);
  ADFun<> TapeFp(Op, xp_);
  return ADFun_packed<>(TapeFp);
}

template <class ADFun>
struct Sparse : ADFun {
  std::vector<Index> i;
  std::vector<Index> j;
  Index m;
  Index n;
  Sparse() {}
  Sparse(const ADFun &f) : ADFun(f) {}
  std::vector<Index> a2v(const std::valarray<Index> &x) const {
    return std::vector<Index>(&x[0], &x[0] + x.size());
  }
  std::valarray<Index> v2a(const std::vector<Index> &x) const {
    return std::valarray<Index>(x.data(), x.size());
  }
  std::valarray<Index> row() const { return v2a(i); }
  std::valarray<Index> col() const { return v2a(j); }
  void subset_inplace(const std::valarray<bool> &x) {
    i = a2v(row()[x]);
    j = a2v(col()[x]);
    this->glob.dep_index = a2v(v2a(this->glob.dep_index)[x]);
  }
  void transpose_inplace() {
    std::swap(i, j);
    std::swap(m, n);
  }
};

/** \brief Decomposition of computational graph
    \details This structure holds a decomposition (f,g) of a computational graph
   of the form x -> f(x, g(x)) mapping R^n to R^m.
    - The member **first** represents the mapping g:R^n -> R^k
    - The member **second** represents the mapping f:R^(n+k) -> R^m
*/
template <class ADFun>
struct Decomp2 : std::pair<ADFun, ADFun> {
  struct composition {
    typedef ad_aug ad;
    const ADFun &f;
    const ADFun &g;
    composition(const ADFun &f, const ADFun &g) : f(f), g(g) {}
    std::vector<ad> operator()(std::vector<ad> x) {
      std::vector<ad> y = g(x);
      x.insert(x.end(), y.begin(), y.end());
      return f(x);
    }
  };
  operator ADFun() {
    ADFun &g = this->first;
    ADFun &f = this->second;
    composition fg(f, g);
    return ADFun(fg, g.DomainVec());
  }
  /** \brief Calculate a **sparse plus low rank** representation of the Hessian
      \details Recall that the present object represents a composition f(x,
     g(x)). Its Hessian is a sum of five terms:

      \f[
      \nabla_x^2 f(x, g(x)) = f''_{xx}(x, g(x)) + f''_{xy}(x, g(x)) g'(x)
      + g'(x) f''_{yx}(x, g(x)) + f'_y(x, g(x)) g''(x) + g'(x) f''_{yy}(x, g(x))
     g'(x) \f]

      The result is a structure of three components

      1. The first four terms are represented by a sparse matrix H
      2. The g' of the last term represented by a sparse matrix
      3. f''_{yy} represented by a sparse matrix

      \param keep_rc Rows/cols to keep. If used this mask selects a subset of x.
      \param sparse_1 Return sparse first component
      \param sparse_2 Return sparse second component
      \param sparse_3 Return sparse third component

      \note This function preserves the inner/outer parameter categories if in
     use.
  */
  Decomp3<ADFun> HesFun(std::vector<bool> keep_rc = std::vector<bool>(0),
                        bool sparse_1 = true, bool sparse_2 = true,
                        bool sparse_3 = true) {
    ADFun &g = this->first;
    ADFun &f = this->second;
    Decomp3<ADFun> ans;
    TMBAD_ASSERT(f.Range() == 1);

    std::vector<bool> keep_f = std::vector<bool>(f.Range(), true);
    std::vector<bool> keep_g = std::vector<bool>(g.Range(), true);

    typedef ad_aug ad;
    global &glob = ans.first.glob;
    glob.ad_start();
    std::vector<Scalar> x_ = f.DomainVec();
    size_t k = g.Range();
    size_t n = f.Domain() - k;

    std::vector<bool> mask_x(f.Domain(), false);
    for (size_t i = 0; i < n; i++) mask_x[i] = true;
    std::vector<bool> mask_s(mask_x);
    mask_s.flip();

    std::vector<ad> x(x_.begin(), x_.end() - k);
    Independent(x);
    std::vector<ad> s = g(x);
    std::vector<ad> s0(s.size());

    for (size_t i = 0; i < s.size(); i++) s0[i] = s[i].copy0();
    std::vector<ad> xs(x);
    xs.insert(xs.end(), s.begin(), s.end());
    std::vector<ad> xs0(x);
    xs0.insert(xs0.end(), s0.begin(), s0.end());
    if (false) {
      TMBAD_ASSERT(keep_rc.size() == n || keep_rc.size() == 0);
      std::vector<bool> keep_xy(keep_rc);
      keep_xy.resize(f.Domain(), true);
      ADFun f_grad = f.JacFun(keep_xy, keep_f);
    }
    ADFun f_grad = f.JacFun();
    std::vector<ad> z = subset(f_grad(xs), mask_x);
    std::vector<ad> z0 = subset(f_grad(xs0), mask_s);
    std::vector<ad> xw(x);
    xw.insert(xw.end(), z0.begin(), z0.end());
    std::vector<ad> z1 = g.WgtJacFun()(xw);
    for (size_t i = 0; i < n; i++) z[i] += z1[i];
    Dependent(z);
    glob.ad_stop();
    glob.eliminate();
    ans.first.glob = glob;

    if (sparse_1) {
      ans.first = ans.first.SpJacFun(keep_rc, keep_rc);
    } else {
      ans.first = ans.first.JacFun(keep_rc, keep_rc);
    }
    ans.first.glob.eliminate();
    f.set_inner_outer(ans.first);

    if (sparse_2) {
      ans.second = g.SpJacFun(keep_rc);
    } else {
      ans.second = g.JacFun(keep_rc);
    }
    ans.second.glob.eliminate();

    Sparse<ADFun> B;
    if (sparse_3) {
      B = f_grad.SpJacFun(mask_s, mask_s);
    } else {
      B = f_grad.JacFun(mask_s, mask_s);
    }
    ans.third.glob.ad_start();
    std::vector<ad> xx(x_.begin(), x_.end() - k);
    Independent(xx);
    s = g(xx);
    xs = xx;
    xs.insert(xs.end(), s.begin(), s.end());
    z = B(xs);
    Dependent(z);
    ans.third.glob.ad_stop();
    ans.third.glob.eliminate();
    ans.third.i = B.i;
    ans.third.j = B.j;
    f.set_inner_outer(ans.third);

    return ans;
  }
};

/** \brief Decomposition of computational graph
    \details This structure holds a decomposition (H,grad(g),H0) of *the
   Hessian* of a computational graph of the form x -> f(x, g(x)) mapping R^n to
   R^m. The Hessian of the original function is given by H +
   grad(g)*H0*grad(g)^T where
    - The member **first** holds a tape of the sparse matrix H
    - The member **second** holds the tape of the sparse Jacobian grad(g)
    - The member **third** holds the tape of the dense matrix H0
*/
template <class ADFun>
struct Decomp3 : Decomp2<Sparse<ADFun> > {
  Sparse<ADFun> third;
};

}  // namespace TMBad
#endif  // HAVE_TMBAD_HPP
