Merge pull request #14933 from sneaxiy/rewrite_ddim

Rewrite ddim

Merge pull request #14933 from sneaxiy/rewrite_ddim
Rewrite ddim
25b49a08 · Zeng Jinle · GitHub · a8bc05b5 · 73896eeb · 25b49a08
31 changed files
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -27,9 +27,10 @@ add_subdirectory(details)
 proto_library(framework_proto SRCS framework.proto)
 proto_library(async_executor_proto SRCS data_feed.proto)

-cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
+cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
+cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
 cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
 cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
 if(WITH_GPU)

--- a/paddle/fluid/framework/array.h
+++ b/paddle/fluid/framework/array.h
@@ -15,34 +15,123 @@
 #pragma once

 #include <cstdint>
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/fluid/framework/unroll_array_ops.h"
+#include "paddle/fluid/platform/enforce.h"

 namespace paddle {
 namespace framework {
+
 template <typename T, size_t N>
 class Array {
-  static_assert(N > 0, "The size of array must be larger than 0");
-
 public:
-  HOSTDEVICE Array() {}
+  static constexpr size_t kSize = N;
+
+  HOSTDEVICE inline Array() {}

-  HOSTDEVICE explicit Array(const T &val) {
-    for (size_t i = 0; i < N; ++i) data_[i] = val;
+  template <typename... Args>
+  HOSTDEVICE inline explicit Array(const T &val, Args... args) {
+    static_assert(N == sizeof...(Args) + 1, "Invalid argument");
+    UnrollVarArgsAssign<T>::Run(data_, val, args...);
  }

-  HOSTDEVICE const T *Get() const { return data_; }
+  HOSTDEVICE inline void Fill(const T &val) {
+    UnrollFillConstant<N>::Run(data_, val);
+  }

-  HOSTDEVICE T *GetMutable() { return data_; }
+  HOSTDEVICE inline const T *Get() const { return data_; }

-  HOSTDEVICE T &operator[](size_t index) { return data_[index]; }
+  HOSTDEVICE inline T *GetMutable() { return data_; }

-  HOSTDEVICE const T &operator[](size_t index) const { return data_[index]; }
+  HOSTDEVICE inline T &operator[](size_t i) { return *advance(data_, i); }
+
+  // Writing "return data_[i]" triggers the compilation warning/error
+  // "array subscript is above array bound" in the Python 35 CI build.
+  // This appears to be a false positive from GCC when the index is not
+  // bounds-checked. For performance, operator[] does not check bounds, just
+  // like the STL containers. Use at() when bounds checking is required.
+  HOSTDEVICE inline const T &operator[](size_t i) const {
+    return *advance(data_, i);
+  }
+
+  HOSTDEVICE inline T &at(size_t i) {
+#ifndef __CUDA_ARCH__
+    PADDLE_ENFORCE_LT(i, N, "Array index out of bounds");
+#endif
+    return (*this)[i];
+  }
+
+  HOSTDEVICE inline const T &at(size_t i) const {
+#ifndef __CUDA_ARCH__
+    PADDLE_ENFORCE_LT(i, N, "Array index out of bounds");
+#endif
+    return (*this)[i];
+  }

  HOSTDEVICE constexpr size_t size() const { return N; }

+  HOSTDEVICE inline bool operator==(const Array<T, N> &other) const {
+    return UnrollCompare<N>::Run(data_, other.data_);
+  }
+
+  HOSTDEVICE inline bool operator!=(const Array<T, N> &other) const {
+    return !(*this == other);
+  }
+
 private:
+  template <typename U>
+  HOSTDEVICE static inline U *advance(U *ptr, size_t i) {
+    return ptr + i;
+  }
+
  T data_[N];
 };

+template <typename T>
+class Array<T, 0> {  // Zero-length specialization: stores no elements.
+ public:
+  static constexpr size_t kSize = 0;
+
+  HOSTDEVICE inline Array() {}
+
+  HOSTDEVICE inline void Fill(const T &val) {}  // Nothing to fill.
+
+  HOSTDEVICE inline constexpr T *Get() const { return nullptr; }
+
+  // Adding constexpr to GetMutable() causes a warning on Mac.
+  HOSTDEVICE inline T *GetMutable() { return nullptr; }
+
+  HOSTDEVICE inline T &operator[](size_t) {  // Host build: PADDLE_THROW.
+#ifdef __CUDA_ARCH__
+    static T obj{};  // Braces: "obj()" would declare a function, not a T.
+    return obj;
+#else
+    PADDLE_THROW("Array<T, 0> has no element");
+#endif
+  }
+
+  HOSTDEVICE inline const T &operator[](size_t) const {
+#ifdef __CUDA_ARCH__
+    static const T obj{};  // Braces: "obj()" would declare a function.
+    return obj;
+#else
+    PADDLE_THROW("Array<T, 0> has no element");
+#endif
+  }
+
+  HOSTDEVICE inline T &at(size_t i) { return (*this)[i]; }  // Same as [].
+
+  HOSTDEVICE inline const T &at(size_t i) const { return (*this)[i]; }
+
+  HOSTDEVICE constexpr size_t size() const { return 0; }
+
+  HOSTDEVICE constexpr bool operator==(const Array<T, 0> &other) const {
+    return true;  // Two empty arrays always compare equal.
+  }
+
+  HOSTDEVICE constexpr bool operator!=(const Array<T, 0> &other) const {
+    return false;
+  }
+};
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ddim.cc
+++ b/paddle/fluid/framework/ddim.cc
@@ -18,312 +18,159 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-/// @cond HIDDEN
-
-template <int i>
-Dim<i> make_dim(const int64_t* d) {
-  return Dim<i>(*d, make_dim<i - 1>(d + 1));
-}
-
-template <>
-Dim<0> make_dim<0>(const int64_t* d) {
-  return Dim<0>(*d);
-}
-
-void make_ddim(DDim& ddim, const int64_t* dims, int n) {
-  switch (n) {
-    case 0:
-      ddim = make_dim<0>(dims);
-      break;
-    case 1:
-      ddim = make_dim<1>(dims);
-      break;
-    case 2:
-      ddim = make_dim<2>(dims);
-      break;
-    case 3:
-      ddim = make_dim<3>(dims);
-      break;
-    case 4:
-      ddim = make_dim<4>(dims);
-      break;
-    case 5:
-      ddim = make_dim<5>(dims);
-      break;
-    case 6:
-      ddim = make_dim<6>(dims);
-      break;
-    case 7:
-      ddim = make_dim<7>(dims);
-      break;
-    case 8:
-      ddim = make_dim<8>(dims);
-      break;
-    case 9:
-      ddim = make_dim<9>(dims);
-      break;
-    default:
-      PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions.");
-  }
-}
-
-/// @endcond
-
 DDim make_ddim(std::initializer_list<int64_t> dims) {
-  DDim result(make_dim(0));
-  make_ddim(result, dims.begin(), dims.size());
-  return result;
+  return DDim(dims.begin(), dims.size());
 }

 DDim make_ddim(const std::vector<int64_t>& dims) {
-  DDim result(make_dim(0));
-  make_ddim(result, &dims[0], dims.size());
-  return result;
+  return DDim(dims.data(), dims.size());
 }

 DDim make_ddim(const std::vector<int>& dims) {
-  std::vector<int64_t> res(dims.size());
-  std::transform(dims.begin(), dims.end(), res.begin(),
-                 [](int d) { return static_cast<int64_t>(d); });
-  return make_ddim(res);
+  return DDim(dims.data(), dims.size());
 }

-/// @cond HIDDEN
-// XXX For some reason, putting this in an anonymous namespace causes errors
-class DynamicMutableIndexer : public boost::static_visitor<int64_t&> {
- public:
-  explicit DynamicMutableIndexer(int idx) : idx_(idx) {}
+struct DDimEqualityVisitor {
+  explicit DDimEqualityVisitor(const int64_t* d) : d_(d) {}

  template <int D>
-  int64_t& operator()(Dim<D>& dim) const {
-    return dim[idx_];
+  inline bool operator()(const Dim<D>& self) const {
+    return UnrollCompare<D>::Run(self.Get(), d_);
  }

- private:
-  int idx_;
+  const int64_t* d_;
 };

-class DynamicConstIndexer : public boost::static_visitor<int64_t> {
- public:
-  explicit DynamicConstIndexer(int idx) : idx_(idx) {}
-
-  template <int D>
-  int64_t operator()(const Dim<D>& dim) const {
-    return dim[idx_];
-  }
-
- private:
-  int idx_;
-};
-
-/// @endcond
-
-int64_t& DDim::operator[](int idx) {
-  return boost::apply_visitor(DynamicMutableIndexer(idx), var);
+bool DDim::operator==(const DDim& d) const {
+  return size() == d.size() &&
+         this->apply_visitor(DDimEqualityVisitor(d.Get()));
 }

-int64_t DDim::operator[](int idx) const {
-  return boost::apply_visitor(DynamicConstIndexer(idx), var);
-}
+bool DDim::operator!=(const DDim& d) const { return !(*this == d); }

-int DDim::size() const { return arity(*this); }
+struct DDimPlusVisitor {
+  explicit DDimPlusVisitor(const int64_t* d1, const int64_t* d2)
+      : d1_(d1), d2_(d2) {}

-bool DDim::operator==(DDim d) const {
-  if (var.which() != d.getVar().which()) {
-    return false;
-  } else {
-    std::vector<int64_t> v1 = vectorize(*this);
-    std::vector<int64_t> v2 = vectorize(d);
-
-    for (unsigned int i = 0; i < v1.size(); i++) {
-      if (v1[i] != v2[i]) {
-        return false;
-      }
-    }
-
-    return true;
+  template <int D>
+  inline void operator()(Dim<D>& self) const {
+    UnrollAdd<D>::Run(d1_, d2_, self.GetMutable());
  }
-}
-
-bool DDim::operator!=(DDim d) const { return !(*this == d); }
-
-DDim DDim::operator+(DDim d) const {
-  std::vector<int64_t> v1 = vectorize(*this);
-  std::vector<int64_t> v2 = vectorize(d);
-
-  std::vector<int64_t> v3;

-  assert(v1.size() == v2.size());
-
-  for (unsigned int i = 0; i < v1.size(); i++) {
-    v3.push_back(v1[i] + v2[i]);
-  }
+  const int64_t* d1_;
+  const int64_t* d2_;
+};

-  return make_ddim(v3);
+DDim DDim::operator+(const DDim& d) const {
+  PADDLE_ENFORCE(size() == d.size());
+  DDim ret;
+  ret.rank_ = rank_;
+  ret.apply_visitor(DDimPlusVisitor(Get(), d.Get()));
+  return ret;
 }

-DDim DDim::operator*(DDim d) const {
-  std::vector<int64_t> v1 = vectorize(*this);
-  std::vector<int64_t> v2 = vectorize(d);
+struct DDimMulVisitor {
+  explicit DDimMulVisitor(const int64_t* d1, const int64_t* d2)
+      : d1_(d1), d2_(d2) {}

-  std::vector<int64_t> v3;
-
-  assert(v1.size() == v2.size());
-
-  for (unsigned int i = 0; i < v1.size(); i++) {
-    v3.push_back(v1[i] * v2[i]);
+  template <int D>
+  inline void operator()(Dim<D>& self) const {
+    UnrollMul<D>::Run(d1_, d2_, self.GetMutable());
  }

-  return make_ddim(v3);
+  const int64_t* d1_;
+  const int64_t* d2_;
+};
+
+DDim DDim::operator*(const DDim& d) const {
+  PADDLE_ENFORCE(size() == d.size());
+  DDim ret;
+  ret.rank_ = rank_;
+  ret.apply_visitor(DDimMulVisitor(Get(), d.Get()));
+  return ret;
 }

 int64_t get(const DDim& ddim, int idx) { return ddim[idx]; }

-void set(DDim& ddim, int idx, int value) { ddim[idx] = value; }
-
-/// @cond HIDDEN
-struct VectorizeVisitor : public boost::static_visitor<> {
-  std::vector<int64_t>& vector;
-
-  explicit VectorizeVisitor(std::vector<int64_t>& v) : vector(v) {}
-
-  template <typename T>
-  void operator()(const T& t) {
-    vector.push_back(t.head);
-    this->operator()(t.tail);
-  }
-
-  void operator()(const Dim<0>& t) {}
-};
-/// @endcond
+void set(DDim& ddim, int idx, int value) { ddim[idx] = value; }  // NOLINT

 std::vector<int64_t> vectorize(const DDim& ddim) {
-  std::vector<int64_t> result;
-  VectorizeVisitor visitor(result);
-  boost::apply_visitor(visitor, ddim);
+  std::vector<int64_t> result(DDim::kMaxRank);
+  dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
+  result.resize(ddim.size());
  return result;
 }

 // NOTE: framework::vectorize converts to type int64_t
 //       which does not fit cudnn inputs.
 std::vector<int> vectorize2int(const DDim& ddim) {
-  std::vector<int64_t> temp = vectorize(ddim);
-  std::vector<int> result(temp.begin(), temp.end());
+  std::vector<int> result(DDim::kMaxRank);
+  dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
+  result.resize(ddim.size());
  return result;
 }

-struct ProductVisitor : public boost::static_visitor<int64_t> {
+struct ProductVisitor {
  template <int D>
-  int64_t operator()(const Dim<D>& dim) {
+  inline int64_t operator()(const Dim<D>& dim) {
    return product(dim);
  }
 };

 int64_t product(const DDim& ddim) {
-  ProductVisitor visitor;
-  return boost::apply_visitor(visitor, ddim);
+  return ddim.apply_visitor(ProductVisitor());
 }

-struct SliceVectorizeVisitor : public boost::static_visitor<> {
-  std::vector<int64_t>& vector;
-  int begin;
-  int end;
-
-  SliceVectorizeVisitor(std::vector<int64_t>& v, int b, int e)
-      : vector(v), begin(b), end(e) {
-    PADDLE_ENFORCE(begin < end,
-                   "Begin index must be less than end index in ddim slice.");
-    PADDLE_ENFORCE(begin >= 0,
-                   "Begin index can't be less than zero in ddim slice.");
-  }
-
-  template <int S>
-  void operator()(const Dim<S>& dim) {
-    if (begin == 0) {
-      vector.push_back(dim.head);
-    } else {
-      --begin;
-    }
-    --end;
-    if (end > 0) {
-      this->operator()(dim.tail);
-    }
-  }
-
-  void operator()(const Dim<0>& dim) {
-    PADDLE_ENFORCE(end == 0, "End index in ddim slice is out of bound.");
-  }
-};
-
 DDim slice_ddim(const DDim& dim, int begin, int end) {
-  std::vector<int64_t> vec;
-  vec.reserve(end - begin);
-  SliceVectorizeVisitor visitor(vec, begin, end);
-  boost::apply_visitor(visitor, dim);
-  return make_ddim(vec);
+  PADDLE_ENFORCE(begin >= 0 && end <= dim.size(),
+                 "[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.",
+                 begin, end, dim.size());
+  // The DDim constructor rejects an end - begin outside [0, kMaxRank].
+  return DDim(dim.Get() + begin, end - begin);
 }

-/// \cond HIDDEN
-
-struct ArityVisitor : boost::static_visitor<int> {
-  template <int D>
-  int operator()(Dim<D>) const {
-    return D;
-  }
-};
-
-/// \endcond
-
-int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); }
+int arity(const DDim& d) { return d.size(); }

-/// \cond HIDDEN
-
-struct DDimPrinter : boost::static_visitor<void> {
+struct DDimPrinter {
  std::ostream& os;
  explicit DDimPrinter(std::ostream& os_) : os(os_) {}

-  template <typename T>
-  void operator()(const T& t) {
+  template <int D>
+  void operator()(const Dim<D>& t) {
    os << t;
  }
 };

-/// \endcond
-
 std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
-  DDimPrinter printer(os);
-  boost::apply_visitor(printer, ddim);
+  ddim.apply_visitor(DDimPrinter(os));
  return os;
 }

-DDim::DDim(std::initializer_list<int64_t> init_list) {
-  *this = make_ddim(init_list);
-}
-
 DDim flatten_to_2d(const DDim& src, int num_col_dims) {
-  int rank = src.size();
-  return make_ddim({product(slice_ddim(src, 0, num_col_dims)),
-                    product(slice_ddim(src, num_col_dims, rank))});
+  return DDim({product(slice_ddim(src, 0, num_col_dims)),
+               product(slice_ddim(src, num_col_dims, src.size()))});
 }

-DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); }
+DDim flatten_to_1d(const DDim& src) { return DDim({product(src)}); }

 DDim stride(const DDim& ddim) {
-  std::vector<int64_t> strides(ddim.size());
+  DDim strides;
+  strides.rank_ = ddim.size();
  strides[ddim.size() - 1] = 1;
  for (int i = ddim.size() - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * ddim[i + 1];
  }
-  return framework::make_ddim(strides);
+  return strides;
 }

-DDim stride_numel(const framework::DDim& ddim) {
-  std::vector<int64_t> strides(ddim.size());
+DDim stride_numel(const DDim& ddim) {
+  DDim strides;
+  strides.rank_ = ddim.size();
  strides[ddim.size() - 1] = ddim[ddim.size() - 1];
  for (int i = ddim.size() - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * ddim[i];
  }
-  return framework::make_ddim(strides);
+  return strides;
 }

 }  // namespace framework

--- a/paddle/fluid/framework/ddim.h
+++ b/paddle/fluid/framework/ddim.h
@@ -18,62 +18,145 @@ limitations under the License. */
 #include <stdexcept>
 #include <vector>
 #include "paddle/fluid/framework/dim.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/variant.h"

 namespace paddle {
 namespace framework {

+#define PADDLE_VISIT_DDIM_BASE(rank, callback) \
+  case (rank): {                               \
+    constexpr auto kRank = (rank);             \
+    return (callback);                         \
+  }
+
+#define PADDLE_VISIT_DDIM(rank, callback)    \
+  switch (rank) {                            \
+    PADDLE_VISIT_DDIM_BASE(0, callback);     \
+    PADDLE_VISIT_DDIM_BASE(1, callback);     \
+    PADDLE_VISIT_DDIM_BASE(2, callback);     \
+    PADDLE_VISIT_DDIM_BASE(3, callback);     \
+    PADDLE_VISIT_DDIM_BASE(4, callback);     \
+    PADDLE_VISIT_DDIM_BASE(5, callback);     \
+    PADDLE_VISIT_DDIM_BASE(6, callback);     \
+    PADDLE_VISIT_DDIM_BASE(7, callback);     \
+    PADDLE_VISIT_DDIM_BASE(8, callback);     \
+    PADDLE_VISIT_DDIM_BASE(9, callback);     \
+    default:                                 \
+      PADDLE_THROW("Invalid rank %d", rank); \
+  }
+
+template <typename T1, typename T2>
+inline void dynamic_dim_assign(const T1* in, T2* out, int n) {
+  PADDLE_VISIT_DDIM(n, (static_dim_assign<kRank, T1, T2>(in, out)));
+}
+
 /**
 * \brief A dynamically sized dimension.
 *
 * The number of dimensions must be between [1, 9].
 */
-struct DDim {
-  typedef boost::variant<Dim<0>, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>,
-                         Dim<7>, Dim<8>, Dim<9>>
-      DDimVar;
-  DDimVar var;
+class DDim {
+ public:
+  constexpr static int kMaxRank = 9;
+
+  DDim() : rank_(1) { dim_[0] = 0; }

-  DDim() : var(Dim<1>()) {}
+  DDim(const DDim& ddim) : dim_() { CopyFrom(ddim); }
+
+  DDim(const int* d, int n) : rank_(n) {
+    dynamic_dim_assign(d, dim_.GetMutable(), n);
+  }
+
+  DDim(const int64_t* d, int n) : rank_(n) {
+    dynamic_dim_assign(d, dim_.GetMutable(), n);
+  }

  template <int D>
-  explicit DDim(const Dim<D>& in) : var(in) {}
+  /*implicit*/ DDim(const Dim<D>& in) : rank_(D) {  // NOLINT
+    UnsafeCast<D>() = in;
+  }
+
+  /*implicit*/ DDim(std::initializer_list<int64_t> init_list)
+      : DDim(init_list.begin(), init_list.size()) {}

-  /*implicit*/ DDim(std::initializer_list<int64_t> init_list);
+  inline DDim& operator=(const DDim& ddim) { return CopyFrom(ddim); }

  template <int D>
-  DDim& operator=(const Dim<D>& in) {
-    var = in;
+  inline DDim& operator=(const Dim<D>& dim) {
+    rank_ = D;
+    UnsafeCast<D>() = dim;
    return *this;
  }

-  int64_t& operator[](int idx);
-  int64_t operator[](int idx) const;
+  inline int64_t& operator[](int idx) { return dim_[idx]; }
+
+  inline int64_t operator[](int idx) const { return dim_[idx]; }
+
+  inline int64_t& at(int idx) {
+    PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx);
+    return dim_[idx];
+  }
+
+  inline int64_t at(int idx) const {
+    PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx);
+    return dim_[idx];
+  }

  template <typename Visitor>
-  typename Visitor::result_type apply_visitor(Visitor& visitor) {
-    return var.apply_visitor(visitor);
+  typename std::result_of<Visitor(Dim<0>&)>::type apply_visitor(
+      Visitor&& visitor) {
+    PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
  }

  template <typename Visitor>
-  typename Visitor::result_type apply_visitor(Visitor& visitor) const {
-    return var.apply_visitor(visitor);
+  typename std::result_of<Visitor(const Dim<0>&)>::type apply_visitor(
+      Visitor&& visitor) const {
+    PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
  }

-  DDimVar getVar() { return var; }
+  bool operator==(const DDim& d) const;
+
+  bool operator!=(const DDim& d) const;
+
+  DDim operator+(const DDim& d) const;

-  bool operator==(DDim d) const;
+  DDim operator*(const DDim& d) const;

-  bool operator!=(DDim d) const;
+  inline const int64_t* Get() const { return dim_.Get(); }

-  DDim operator+(DDim d) const;
+  inline int64_t* GetMutable() { return dim_.GetMutable(); }

-  DDim operator*(DDim d) const;
+  inline int size() const { return rank_; }
+
+ private:
+  template <int D>
+  inline Dim<D>& UnsafeCast() {
+    static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
+    auto* p = static_cast<void*>(&dim_);
+    return *reinterpret_cast<Dim<D>*>(p);
+  }
+
+  template <int D>
+  inline const Dim<D>& UnsafeCast() const {
+    static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
+    auto* p = static_cast<const void*>(&dim_);
+    return *reinterpret_cast<const Dim<D>*>(p);
+  }

-  int size() const;
+  inline DDim& CopyFrom(const DDim& ddim) {
+    PADDLE_VISIT_DDIM(ddim.rank_, (*this = ddim.UnsafeCast<kRank>()));
+  }
+
+  friend DDim stride(const DDim& ddim);
+  friend DDim stride_numel(const DDim& ddim);
+
+ private:
+  Dim<kMaxRank> dim_;
+  int rank_;
 };

+#undef PADDLE_VISIT_DDIM_BASE
+#undef PADDLE_VISIT_DDIM
+
 /**
 * \brief Make a DDim from std::vector<int64_t>
 *
@@ -92,7 +175,7 @@ DDim make_ddim(const std::vector<int>& dims);
 DDim make_ddim(std::initializer_list<int64_t> dims);

 int64_t get(const DDim& dim, int idx);
-void set(DDim& dim, int idx, int val);
+void set(DDim& dim, int idx, int val);  // NOLINT

 std::vector<int64_t> vectorize(const DDim& ddim);
 std::vector<int> vectorize2int(const DDim& ddim);
@@ -129,12 +212,3 @@ DDim stride(const DDim& ddim);
 DDim stride_numel(const DDim& ddim);
 }  // namespace framework
 }  // namespace paddle
-
-namespace boost {
-
-template <typename T>
-T get(const paddle::framework::DDim& in) {
-  return boost::get<T>(in.var);
-}
-
-}  // namespace boost
--- a/paddle/fluid/framework/dim.h
+++ b/paddle/fluid/framework/dim.h
@@ -16,332 +16,184 @@
 #include <iostream>
 #include <sstream>
 #include <stdexcept>
+#include <string>
 #include <type_traits>

+#include "paddle/fluid/framework/array.h"
 #include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/hostdevice.h"

 namespace paddle {
 namespace framework {

 // Statically sized, statically indexed dimension
-template <int i>
-struct Dim {
-  static constexpr int dimensions = i;
+template <int D>
+class Dim : public Array<int64_t, D> {
+ public:
+  static_assert(D >= 0, "D must be not less than 0");

-  template <typename... Args>
-  HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) {
-    static_assert(sizeof...(_tail) == i - 1,
-                  "Dim initialized with the wrong number of parameters");
-  }
+  static constexpr int kRank = D;
+  using BaseClass = Array<int64_t, D>;

-  HOSTDEVICE
-  Dim(int64_t _head, const Dim<i - 1>& _tail) : head(_head), tail(_tail) {}
+  inline Dim(int64_t head, const Dim<D - 1>& tail) {
+    (*this)[0] = head;
+    new (this->GetMutable() + 1) Dim<D - 1>(tail);
+  }

-  HOSTDEVICE
-  Dim() : head(0), tail() {}
+  template <typename... Args>
+  HOSTDEVICE explicit Dim(int64_t head, Args... args)
+      : BaseClass(head, args...) {}

  /** Construct a Dim from a linear index and size.  Uses Fortran order
   * indexing. */
-  HOSTDEVICE
-  Dim(int64_t idx, const Dim<i>& size)
-      : head(idx % size.head), tail(idx / size.head, size.tail) {}
+  HOSTDEVICE Dim(int64_t idx, const Dim<D>& size);

  /** Construct a Dim with each dimension set to the given index */
-  HOSTDEVICE
-  Dim(int64_t idx) : head(idx), tail(idx) {}
+  HOSTDEVICE explicit Dim(int64_t idx) { this->Fill(idx); }

-  HOSTDEVICE
-  bool operator==(const Dim<i>& o) const {
-    return (head == o.head) && (tail == o.tail);
-  }
-
-  HOSTDEVICE
-  bool operator!=(const Dim<i>& o) const { return !(*this == o); }
-
-  HOSTDEVICE
-  int64_t& operator[](int idx);
-  HOSTDEVICE
-  int64_t operator[](int idx) const;
+  HOSTDEVICE Dim() = default;

  HOST std::string to_string() const;
-
-  int64_t head;
-  Dim<i - 1> tail;
 };

-// Base case specialization
-template <>
-struct Dim<0> {
-  static constexpr int dimensions = 0;
-
-  HOSTDEVICE
-  Dim(int64_t _head) {}
-
-  HOSTDEVICE
-  Dim() {}
-
-  HOSTDEVICE
-  Dim(int idx, const Dim<0>& size) {
-#ifndef __CUDA_ARCH__
-    if (idx > 0) {
-      throw std::invalid_argument("Index out of range.");
-    }
-#else
-    PADDLE_ASSERT(idx == 0);
-#endif
-  }
-
-  HOSTDEVICE
-  bool operator==(const Dim<0>& o) const { return true; }
-
-  HOSTDEVICE
-  bool operator!=(const Dim<0>& o) const { return false; }
-
-  HOSTDEVICE
-  int64_t& operator[](int idx);
-  HOSTDEVICE
-  int64_t operator[](int idx) const;
-};
-
-namespace {
-
-// Helper for accessing Dim classes
-template <int i>
-struct DimGetter {
-  // Return a copy if Dim is const
-  template <typename D>
-  HOSTDEVICE static int64_t impl(const D& d) {
-    return DimGetter<i - 1>::impl(d.tail);
-  }
-  // Return a reference if Dim is mutable
-  template <typename D>
-  HOSTDEVICE static int64_t& impl(D& d) {
-    return DimGetter<i - 1>::impl(d.tail);
+namespace detail {
+template <int kStart, int kEnd, bool kStop>
+struct FortranOrderIndexingConstructorFunctor {
+  HOSTDEVICE inline static void Run(const int64_t* in, int64_t* idx,
+                                    int64_t* out) {
+    out[kStart] = (*idx) % in[kStart];
+    (*idx) /= in[kStart];
+    FortranOrderIndexingConstructorFunctor<kStart + 1, kEnd,
+                                           kStart + 1 == kEnd>::Run(in, idx,
+                                                                    out);
  }
 };

-// Eureka! We found the element!
-template <>
-struct DimGetter<0> {
-  // Return a copy if Dim is const
-  template <typename D>
-  HOSTDEVICE static int64_t impl(const D& d) {
-    return d.head;
-  }
-  // Return a reference if Dim is mutable
-  template <typename D>
-  HOSTDEVICE static int64_t& impl(D& d) {
-    return d.head;
-  }
+template <int kStart, int kEnd>
+struct FortranOrderIndexingConstructorFunctor<kStart, kEnd, true> {
+  HOSTDEVICE inline static void Run(const int64_t* in, int64_t* idx,
+                                    int64_t* out) {}
 };
+}  // namespace detail

 template <int D>
-HOSTDEVICE int64_t& indexer(Dim<D>& dim, int idx) {
-#ifndef __CUDA_ARCH__
-  if (idx < 0) {
-    throw std::invalid_argument("Tried to access a negative dimension");
-  }
-#else
-  PADDLE_ASSERT(idx >= 0);
-#endif
-  if (idx == 0) {
-    return dim.head;
-  }
-  return indexer(dim.tail, idx - 1);
-}
-
-template <>
-HOSTDEVICE int64_t& indexer<0>(Dim<0>& dim, int idx) {
-#ifndef __CUDA_ARCH__
-  throw std::invalid_argument("Invalid index");
-#else
-  PADDLE_ASSERT(false);
-#if CUDA_VERSION < 8000
-  // On CUDA versions previous to 8.0, only __shared__ variables
-  // could be declared as static in the device code.
-  int64_t head = 0;
-#else
-  static int64_t head = 0;
-#endif
-  return head;
-#endif
-}
-
-template <int D>
-HOSTDEVICE int64_t indexer(const Dim<D>& dim, int idx) {
-#ifndef __CUDA_ARCH__
-  if (idx < 0) {
-    throw std::invalid_argument("Tried to access a negative dimension");
-  }
-#else
-  PADDLE_ASSERT(idx >= 0);
-#endif
-  if (idx == 0) {
-    return dim.head;
-  }
-  return indexer(dim.tail, idx - 1);
-}
-
-template <>
-HOSTDEVICE int64_t indexer<0>(const Dim<0>& dim, int idx) {
-#ifndef __CUDA_ARCH__
-  throw std::invalid_argument("Invalid index");
-#else
-  PADDLE_ASSERT(false);
-#if CUDA_VERSION < 8000
-  // On CUDA versions previous to 8.0, only __shared__ variables
-  // could be declared as static in the device code.
-  int64_t head = 0;
-#else
-  static int64_t head = 0;
-#endif
-  return head;
-#endif
-}
-
-}  // namespace
-// Static access to constant Dim
-template <int i, int l>
-HOSTDEVICE int64_t get(const Dim<l>& d) {
-  return DimGetter<i>::impl(d);
+HOSTDEVICE Dim<D>::Dim(int64_t idx, const Dim<D>& size) {
+  detail::FortranOrderIndexingConstructorFunctor<0, D, D == 0>::Run(
+      size.Get(), &idx, this->GetMutable());
 }

-// Static access to mutable Dim
-template <int i, int l>
-HOSTDEVICE int64_t& get(Dim<l>& d) {
-  return DimGetter<i>::impl(d);
+template <int idx, int D>
+HOSTDEVICE inline int64_t get(const Dim<D>& dim) {
+  return dim[idx];
 }

-// Dynamic access to constant Dim
-template <int l>
-HOSTDEVICE int64_t Dim<l>::operator[](int i) const {
-  return indexer(*this, i);
+template <int idx, int D>
+HOSTDEVICE inline int64_t& get(Dim<D>& dim) {  // NOLINT
+  return dim[idx];
 }

-// Dynamic access to mutable Dim
-template <int l>
-HOSTDEVICE int64_t& Dim<l>::operator[](int i) {
-  return indexer(*this, i);
-}
-
-// Dynamic access to constant Dim
-inline HOSTDEVICE int64_t Dim<0>::operator[](int i) const {
-  return indexer(*this, i);
-}
-
-// Dynamic access to mutable Dim
-inline HOSTDEVICE int64_t& Dim<0>::operator[](int i) {
-  return indexer(*this, i);
-}
-
-// Dynamic access to constant Dim
-// without std::enable_if will try to instantiate this on get<0>(d)
-template <int l>
-HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim<l>& d,
-                                                               int i) {
-  return d[i];
+template <int D>
+HOSTDEVICE inline int64_t get(const Dim<D>& dim, int idx) {
+  return dim[idx];
 }

-// Dynamic access to mutable Dim
-template <int l>
-HOSTDEVICE typename std::enable_if<(l > 0), int64_t&>::type get(Dim<l>& d,
-                                                                int i) {
-  return d[i];
+template <int D>
+HOSTDEVICE inline int64_t& get(Dim<D>& dim, int idx) {  // NOLINT
+  return dim[idx];
 }

 // Dot product of two dims
-template <int i>
-HOSTDEVICE int64_t linearize(const Dim<i>& a, const Dim<i>& b) {
-  return a.head * b.head + linearize(a.tail, b.tail);
-}
-
-// Base case dot product of two Dims
-// Notice it is inline because it is no longer a template
-template <>
-HOSTDEVICE inline int64_t linearize(const Dim<0>& a, const Dim<0>& b) {
-  return 0;
+template <int D>
+HOSTDEVICE inline int64_t linearize(const Dim<D>& a, const Dim<D>& b) {
+  return UnrollProduct<D>::Run(a.Get(), b.Get());
 }

 // Product of a Dim
-template <int i>
-HOSTDEVICE int64_t product(const Dim<i>& a, int prod = 1) {
-  return prod * a.head * product(a.tail);
-}
-
-// Base case product of a Dim
-// Notice it is inline because it is no longer a template
-template <>
-HOSTDEVICE inline int64_t product(const Dim<0>& a, int prod) {
-  return prod;
+template <int D>
+HOSTDEVICE inline int64_t product(const Dim<D>& a) {
+  return UnrollProduct<D>::Run(a.Get());
 }

 // Is 0 <= idx_i < size_i for all i?
-template <int i>
-HOSTDEVICE bool contained(const Dim<i>& idx, const Dim<i>& size) {
-  return ((0 <= idx.head) && (idx.head < size.head) &&
-          contained(idx.tail, size.tail));
-}
+namespace detail {
+template <int kStart, int kEnd, bool kStop>
+struct ContainedFunctor {
+  HOSTDEVICE static inline bool Run(const int64_t* idx, const int64_t* size) {
+    return (idx[kStart] >= 0 && idx[kStart] < size[kStart]) &&
+           ContainedFunctor<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(idx,
+                                                                       size);
+  }
+};

-// Base case of is 0 <= idx_i < size_i ?
-// Notice it is inline because it is no longer a template
-template <>
-HOSTDEVICE inline bool contained(const Dim<0>& idx, const Dim<0>& size) {
-  return true;
+template <int kStart, int kEnd>
+struct ContainedFunctor<kStart, kEnd, true> {
+  HOSTDEVICE static constexpr inline bool Run(const int64_t* idx,
+                                              const int64_t* size) {
+    return true;
+  }
+};
+}  // namespace detail
+
+template <int D>
+HOSTDEVICE inline bool contained(const Dim<D>& idx, const Dim<D>& size) {
+  return detail::ContainedFunctor<0, D, D == 0>::Run(idx.Get(), size.Get());
 }

 /**
 * \brief Compute exclusive prefix-multiply of a Dim.
 */
-template <int i>
-HOSTDEVICE Dim<i> ex_prefix_mul(const Dim<i>& src, int mul = 1) {
-  return Dim<i>(mul, ex_prefix_mul(src.tail, mul * src.head));
-}
+namespace detail {
+template <int kStart, int kEnd, bool kStop>
+struct ExPrefixMulFunctor {
+  HOSTDEVICE static inline void Run(const int64_t* in, int64_t* out) {
+    kStart == 0 ? out[kStart] = 1 : out[kStart] =
+                                        out[kStart - 1] * in[kStart - 1];
+    detail::ExPrefixMulFunctor<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(in,
+                                                                          out);
+  }
+};
+
+template <int kStart, int kEnd>
+struct ExPrefixMulFunctor<kStart, kEnd, true> {
+  HOSTDEVICE static inline void Run(const int64_t* in, int64_t* out) {}
+};
+}  // namespace detail

-///\cond HIDDEN
-// Base case of ex_prefix_mul
-// Notice it is inline because it is no longer a template
-template <>
-HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0>& src, int mul) {
-  return Dim<0>();
+template <int D>
+HOSTDEVICE inline Dim<D> ex_prefix_mul(const Dim<D>& src) {
+  Dim<D> ret;
+  detail::ExPrefixMulFunctor<0, D, D == 0>::Run(src.Get(), ret.GetMutable());
+  return ret;
 }
-///\endcond

 /**
 * Add two dimensions together
 */
-template <int i>
-HOSTDEVICE Dim<i> dim_plus(const Dim<i>& a, const Dim<i>& b) {
-  return Dim<i>(a.head + b.head, dim_plus(a.tail, b.tail));
-}
-
-// Base case
-template <>
-HOSTDEVICE inline Dim<0> dim_plus(const Dim<0>& a, const Dim<0>& b) {
-  return Dim<0>();
+template <int D>
+HOSTDEVICE inline Dim<D> dim_plus(const Dim<D>& a, const Dim<D>& b) {
+  Dim<D> ret;
+  UnrollAdd<D>::Run(a.Get(), b.Get(), ret.GetMutable());
+  return ret;
 }

-template <int i>
-HOSTDEVICE Dim<i> operator+(const Dim<i>& lhs, const Dim<i>& rhs) {
+template <int D>
+HOSTDEVICE inline Dim<D> operator+(const Dim<D>& lhs, const Dim<D>& rhs) {
  return dim_plus(lhs, rhs);
 }

 /**
 * Multiply two dimensions together
 */
-template <int i>
-HOSTDEVICE Dim<i> dim_mult(const Dim<i>& a, const Dim<i>& b) {
-  return Dim<i>(a.head * b.head, dim_mult(a.tail, b.tail));
-}
-
-// Base case
-template <>
-HOSTDEVICE inline Dim<0> dim_mult(const Dim<0>& a, const Dim<0>& b) {
-  return Dim<0>();
+template <int D>
+HOSTDEVICE inline Dim<D> dim_mult(const Dim<D>& a, const Dim<D>& b) {
+  Dim<D> ret;
+  UnrollMul<D>::Run(a.Get(), b.Get(), ret.GetMutable());
+  return ret;
 }

-template <int i>
-HOSTDEVICE Dim<i> operator*(const Dim<i>& lhs, const Dim<i>& rhs) {
+template <int D>
+HOSTDEVICE Dim<D> operator*(const Dim<D>& lhs, const Dim<D>& rhs) {
  return dim_mult(lhs, rhs);
 }

@@ -354,23 +206,32 @@ HOSTDEVICE Dim<i> operator*(const Dim<i>& lhs, const Dim<i>& rhs) {
 * \return Dim object the same size as \p size with normalized strides
 *
 */
+namespace detail {
+template <int kStart, int kEnd, bool kStop>
+struct NormalizeStridesFunctor {  // unrolled over indices [kStart, kEnd)
+  HOSTDEVICE static void Run(const int64_t* size, const int64_t* stride,
+                             int64_t* ret) {
+    ret[kStart] = (size[kStart] == 1 ? 0 : stride[kStart]);  // extent 1 -> 0
+    NormalizeStridesFunctor<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(
+        size, stride, ret);
+  }
+};

-template <int i>
-HOSTDEVICE Dim<i> normalize_strides(const Dim<i>& size, const Dim<i>& stride) {
-  int norm_stride = size.head == 1 ? 0 : stride.head;
-  return Dim<i>(norm_stride, normalize_strides(size.tail, stride.tail));
-}
-
-///\cond HIDDEN
+template <int kStart, int kEnd>
+struct NormalizeStridesFunctor<kStart, kEnd, true> {  // kStop: end recursion
+  HOSTDEVICE static void Run(const int64_t* size, const int64_t* stride,
+                             int64_t* ret) {}
+};
+}  // namespace detail

-template <>
-HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size,
-                                           const Dim<0>& stride) {
-  return Dim<0>();
+template <int D>
+HOSTDEVICE Dim<D> normalize_strides(const Dim<D>& size, const Dim<D>& stride) {
+  Dim<D> ret;
+  detail::NormalizeStridesFunctor<0, D, D == 0>::Run(size.Get(), stride.Get(),
+                                                     ret.GetMutable());
+  return ret;
 }

-///\endcond
-
 /**
 * Helper function to create a Dim
 *
@@ -379,25 +240,17 @@ HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size,
 */

 template <typename... Args>
-HOSTDEVICE Dim<sizeof...(Args)> make_dim(Args... idxes) {
+HOSTDEVICE inline Dim<sizeof...(Args)> make_dim(Args... idxes) {
  return Dim<sizeof...(Args)>(idxes...);
 }

 // Allows us to output a Dim
-// XXX For some reason, overloading fails to resolve this correctly
-template <int i>
-typename std::enable_if<(i > 1), std::ostream&>::type operator<<(
-    std::ostream& os, const Dim<i>& d) {
-  os << d.head << ", " << d.tail;
-  return os;
-}
-
-// Base case that allows us to output a Dim
-// XXX I wish this could be an overload instead of a template
-template <int i>
-typename std::enable_if<(i == 1), std::ostream&>::type operator<<(
-    std::ostream& os, const Dim<i>& d) {
-  os << d.head;
+template <int D>
+inline std::ostream& operator<<(std::ostream& os, const Dim<D>& d) {
+  os << d[0];
+  for (int i = 1; i < D; ++i) {
+    os << ", " << d[i];
+  }
  return os;
 }

@@ -405,17 +258,15 @@ inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) {
  return os;
 }

-template <int i>
-HOST std::string Dim<i>::to_string() const {
+template <int D>
+HOST std::string Dim<D>::to_string() const {
  std::stringstream stream;
-
  stream << *this;
-
  return stream.str();
 }

 template <int D>
-HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) {
+HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, const Dim<D>& extents) {
  Dim<D> result;

  for (int i = 0; i < D - 1; ++i) {
@@ -428,5 +279,10 @@ HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) {
  return result;
 }

+template <int D, typename T1, typename T2>
+inline void static_dim_assign(const T1* in, T2* out) {
+  UnrollAssign<D>::Run(in, out);  // out[i] = static_cast<T2>(in[i]), i < D
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/dlpack_tensor.cc
+++ b/paddle/fluid/framework/dlpack_tensor.cc
@@ -59,7 +59,7 @@ static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) {

 struct DLContextVisitor : public boost::static_visitor<::DLContext> {
  inline ::DLContext operator()(const platform::CPUPlace &place) const {
-    DLContext ctx;
+    ::DLContext ctx;
    ctx.device_type = kDLCPU;
    ctx.device_id = 0;
    return ctx;
@@ -67,7 +67,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {

  inline ::DLContext operator()(const platform::CUDAPlace &place) const {
 #ifdef PADDLE_WITH_CUDA
-    DLContext ctx;
+    ::DLContext ctx;
    ctx.device_type = kDLGPU;
    ctx.device_id = place.device;
    return ctx;
@@ -78,7 +78,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {

  inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const {
 #ifdef PADDLE_WITH_CUDA
-    DLContext ctx;
+    ::DLContext ctx;
    ctx.device_type = kDLCPUPinned;
    ctx.device_id = 0;
    return ctx;

--- a/paddle/fluid/framework/dlpack_tensor.h
+++ b/paddle/fluid/framework/dlpack_tensor.h
@@ -38,7 +38,7 @@ class DLPackTensor {

  // The shape in DLTensor is defined as int64_t*
  // Add this member to make TVMTensor init without heap allocation
-  ShapeType shape_[9];
+  ShapeType shape_[DDim::kMaxRank];
 };

 }  // namespace framework

--- a/paddle/fluid/framework/unroll_array_ops.h
+++ b/paddle/fluid/framework/unroll_array_ops.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <cstddef>
+#include <type_traits>
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace framework {
+
+namespace detail {
+
+// Compile-time unrolled fill: stores `val` into data[kStart] and then
+// recurses with kStart + 1, so data[kStart .. kEnd - 1] are all written.
+// Every instantiation in this file passes kStop = (kStart == kEnd), which
+// selects the empty specialization below once the range is exhausted.
+template <size_t kStart, size_t kEnd, bool kStop>
+struct UnrollFillConstant {
+  template <typename T>
+  HOSTDEVICE inline static void Run(T *data, T val) {
+    data[kStart] = val;
+    UnrollFillConstant<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(data, val);
+  }
+};
+
+// Recursion terminator (kStop == true): nothing left to fill.
+template <size_t kStart, size_t kEnd>
+struct UnrollFillConstant<kStart, kEnd, true> {
+  template <typename T>
+  HOSTDEVICE inline static void Run(T *data, T val) {}
+};
+
+// Compile-time unrolled copy: d2[i] = static_cast<Tout>(d1[i]) for
+// i in [kStart, kEnd). Source and destination element types may differ;
+// the conversion is the explicit static_cast in Run().
+template <size_t kStart, size_t kEnd, bool kStop>
+struct UnrollAssign {
+  template <typename Tin, typename Tout>
+  HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) {
+    d2[kStart] = static_cast<Tout>(d1[kStart]);
+    UnrollAssign<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2);
+  }
+};
+
+// Recursion terminator (kStop == true): nothing left to copy.
+template <size_t kStart, size_t kEnd>
+struct UnrollAssign<kStart, kEnd, true> {
+  template <typename Tin, typename Tout>
+  HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) {}
+};
+
+// Writes a variadic argument list into consecutive slots of `d`: the first
+// argument goes to d[kStart], the remaining ones are handed to the next
+// instantiation. The static_assert checks that the number of arguments
+// still to be stored equals the number of slots left (kEnd - kStart).
+template <typename T, size_t kStart, size_t kEnd, bool kStop>
+struct UnrollVarArgsAssignImpl {
+  template <typename... Args>
+  HOSTDEVICE inline static void Run(T *d, T val, Args... args) {
+    static_assert(sizeof...(args) + 1 == kEnd - kStart, "Wrong argument");
+    d[kStart] = val;
+    UnrollVarArgsAssignImpl<T, kStart + 1, kEnd, kStart + 1 == kEnd>::Run(
+        d, args...);
+  }
+};
+
+// Recursion terminator (kStop == true): the argument pack is empty, so Run
+// only accepts the destination pointer and does nothing.
+template <typename T, size_t kStart, size_t kEnd>
+struct UnrollVarArgsAssignImpl<T, kStart, kEnd, true> {
+  HOSTDEVICE inline static void Run(T *d) {}
+};
+
+// Entry point for variadic assignment: stores args... into d[0], d[1], ...
+// The element count is taken from the number of arguments passed, not from
+// a template parameter. Each argument is received as a `T val` parameter
+// of UnrollVarArgsAssignImpl::Run when it is stored.
+template <typename T>
+struct UnrollVarArgsAssign {
+  template <typename... Args>
+  HOSTDEVICE inline static void Run(T *d, Args... args) {
+    UnrollVarArgsAssignImpl<T, 0, sizeof...(Args), sizeof...(Args) == 0>::Run(
+        d, args...);
+  }
+};
+
+// Compile-time unrolled equality test: returns true iff d1[i] == d2[i] for
+// every i in [kStart, kEnd). The && chain stops evaluating at the first
+// mismatching element.
+template <size_t kStart, size_t kEnd, bool kStop>
+struct UnrollCompare {
+  template <typename T>
+  HOSTDEVICE inline static bool Run(const T *d1, const T *d2) {
+    return d1[kStart] == d2[kStart] &&
+           UnrollCompare<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2);
+  }
+};
+
+// Recursion terminator (kStop == true): an empty range compares equal.
+template <size_t kStart, size_t kEnd>
+struct UnrollCompare<kStart, kEnd, true> {
+  template <typename T>
+  HOSTDEVICE inline constexpr static bool Run(const T *d1, const T *d2) {
+    return true;
+  }
+};
+
+// Compile-time unrolled element-wise sum: d3[i] = d1[i] + d2[i] for
+// i in [kStart, kEnd).
+template <size_t kStart, size_t kEnd, bool kStop>
+struct UnrollAdd {
+  template <typename T>
+  HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {
+    d3[kStart] = d1[kStart] + d2[kStart];
+    UnrollAdd<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2, d3);
+  }
+};
+
+// Recursion terminator (kStop == true): nothing left to add.
+template <size_t kStart, size_t kEnd>
+struct UnrollAdd<kStart, kEnd, true> {
+  template <typename T>
+  HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {}
+};
+
+// Compile-time unrolled element-wise product: d3[i] = d1[i] * d2[i] for
+// i in [kStart, kEnd).
+template <size_t kStart, size_t kEnd, bool kStop>
+struct UnrollMul {
+  template <typename T>
+  HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {
+    d3[kStart] = d1[kStart] * d2[kStart];
+    UnrollMul<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2, d3);
+  }
+};
+
+// Recursion terminator (kStop == true): nothing left to multiply.
+template <size_t kStart, size_t kEnd>
+struct UnrollMul<kStart, kEnd, true> {
+  template <typename T>
+  HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {}
+};
+
+// Compile-time unrolled reductions over [kStart, kEnd). The two overloads
+// compute different things:
+//   Run(d)      -> d[kStart] * d[kStart + 1] * ... * d[kEnd - 1]
+//   Run(d1, d2) -> sum of d1[i] * d2[i], i.e. an inner product, despite the
+//                  struct name.
+template <size_t kStart, size_t kEnd, bool kStop>
+struct UnrollProduct {
+  template <typename T>
+  HOSTDEVICE inline static T Run(const T *d) {
+    return d[kStart] *
+           UnrollProduct<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d);
+  }
+
+  template <typename T>
+  HOSTDEVICE inline static T Run(const T *d1, const T *d2) {
+    return d1[kStart] * d2[kStart] +
+           UnrollProduct<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2);
+  }
+};
+
+// Recursion terminator (kStop == true): returns the identity of each
+// reduction, 1 for the product and 0 for the sum of products.
+template <size_t kStart, size_t kEnd>
+struct UnrollProduct<kStart, kEnd, true> {
+  template <typename T>
+  HOSTDEVICE inline constexpr static T Run(const T *d) {
+    return 1;
+  }
+
+  template <typename T>
+  HOSTDEVICE inline constexpr static T Run(const T *d1, const T *d2) {
+    return 0;
+  }
+};
+
+}  // namespace detail
+
+// Public aliases. Each one fixes the range to [0, N), i.e. the first N
+// elements; N == 0 selects the terminating specialization directly.
+template <size_t N>
+using UnrollFillConstant = detail::UnrollFillConstant<0, N, N == 0>;
+
+template <size_t N>
+using UnrollAssign = detail::UnrollAssign<0, N, N == 0>;
+
+// No size parameter: the element count is the number of arguments to Run().
+template <typename T>
+using UnrollVarArgsAssign = detail::UnrollVarArgsAssign<T>;
+
+template <size_t N>
+using UnrollCompare = detail::UnrollCompare<0, N, N == 0>;
+
+template <size_t N>
+using UnrollAdd = detail::UnrollAdd<0, N, N == 0>;
+
+template <size_t N>
+using UnrollMul = detail::UnrollMul<0, N, N == 0>;
+
+template <size_t N>
+using UnrollProduct = detail::UnrollProduct<0, N, N == 0>;
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/unroll_array_ops_test.cc
+++ b/paddle/fluid/framework/unroll_array_ops_test.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/unroll_array_ops.h"
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <array>
+#include <cstdint>
+
+namespace paddle {
+namespace framework {
+
+// Returns true when each of the n elements starting at p equals val
+// (std::all_of yields true for an empty range, so n == 0 passes).
+template <typename T>
+bool CheckEquality(const T* p, size_t n, T val) {
+  return std::all_of(p, p + n, [val](const T& v) { return v == val; });
+}
+
+// Fills the first D2 slots of a zeroed D1-element array with 1 and checks
+// that exactly those slots changed: the first D2 must be 1, the remaining
+// D1 - D2 must still be 0.
+template <int D1, int D2>
+bool FillConstantTestMain() {
+  static_assert(D1 >= D2, "");
+  std::array<int, D1> arr;
+  arr.fill(0);
+
+  UnrollFillConstant<D2>::Run(arr.data(), 1);
+  return CheckEquality(arr.data(), D2, 1) &&
+         CheckEquality(arr.data() + D2, arr.size() - D2, 0);
+}
+
+// Covers the empty fill (0), a single element, a partial fill and a fill of
+// the whole 9-element array.
+TEST(unroll_ops, fill_constant) {
+  EXPECT_TRUE((FillConstantTestMain<9, 0>()));
+  EXPECT_TRUE((FillConstantTestMain<9, 1>()));
+  EXPECT_TRUE((FillConstantTestMain<9, 4>()));
+  EXPECT_TRUE((FillConstantTestMain<9, 9>()));
+}
+
+// UnrollAssign<3> must copy only the first three elements; b[3] and b[4]
+// have to keep their initial 0.
+TEST(unroll_ops, assign) {
+  const int a[] = {1, 2, 3, 4, 5};
+  int b[] = {0, 0, 0, 0, 0};
+  UnrollAssign<3>::Run(a, b);
+  EXPECT_EQ(b[0], 1);
+  EXPECT_EQ(b[1], 2);
+  EXPECT_EQ(b[2], 3);
+  EXPECT_EQ(b[3], 0);
+  EXPECT_EQ(b[4], 0);
+}
+
+// Two values are passed, so only a[0] and a[1] are written; a[2] keeps its
+// initial 0.
+TEST(unroll_ops, var_args_assign) {
+  int a[] = {0, 0, 0};
+  UnrollVarArgsAssign<int>::Run(a, 1, 2);
+  EXPECT_EQ(a[0], 1);
+  EXPECT_EQ(a[1], 2);
+  EXPECT_EQ(a[2], 0);
+}
+
+// a and b differ only in the last element, so the first two compare equal
+// and all three do not. After b[0] is changed the very first element
+// differs: comparing zero elements is still true, comparing one is false.
+TEST(unroll_ops, compare) {
+  int a[] = {1, 2, 3};
+  int b[] = {1, 2, 4};
+  EXPECT_TRUE(UnrollCompare<2>::Run(a, b));
+  EXPECT_FALSE(UnrollCompare<3>::Run(a, b));
+
+  b[0] = -1;
+  EXPECT_TRUE(UnrollCompare<0>::Run(a, b));
+  EXPECT_FALSE(UnrollCompare<1>::Run(a, b));
+}
+
+// UnrollAdd<2> must write the sums of the first two pairs only; c[2] has to
+// stay 0.
+TEST(unroll_ops, add) {
+  int a[] = {2, 3, 4};
+  int b[] = {5, 10, 102};
+  int c[] = {0, 0, 0};
+  UnrollAdd<2>::Run(a, b, c);
+  EXPECT_EQ(a[0] + b[0], c[0]);
+  EXPECT_EQ(a[1] + b[1], c[1]);
+  EXPECT_EQ(c[2], 0);
+}
+
+// UnrollMul<2> must write the products of the first two pairs only; c[2]
+// has to stay 0.
+TEST(unroll_ops, mul) {
+  int a[] = {2, 3, 4};
+  int b[] = {5, 10, 102};
+  int c[] = {0, 0, 0};
+  UnrollMul<2>::Run(a, b, c);
+  EXPECT_EQ(a[0] * b[0], c[0]);
+  EXPECT_EQ(a[1] * b[1], c[1]);
+  EXPECT_EQ(c[2], 0);
+}
+
+// Exercises both UnrollProduct overloads: the one-array form multiplies all
+// elements together, the two-array form sums the pairwise products.
+TEST(unroll_ops, product) {
+  int a[] = {2, 3, 4};
+  int b[] = {5, 10, 102};
+
+  EXPECT_EQ(UnrollProduct<3>::Run(a), a[0] * a[1] * a[2]);
+
+  EXPECT_EQ(UnrollProduct<3>::Run(a, b),
+            a[0] * b[0] + a[1] * b[1] + a[2] * b[2]);
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/operators/controlflow/logical_op.cc
+++ b/paddle/fluid/operators/controlflow/logical_op.cc
@@ -86,8 +86,6 @@ class UnaryLogicalOpInferShape : public framework::InferShapeBase {
    OpComment comment;
    PADDLE_ENFORCE(context->HasInput("X"),
                   "Input(X) of %s operator must not be null", comment.type);
-    auto dim_x = context->GetInputDim("X");
-
    context->SetOutputDim("Out", context->GetInputDim("X"));
    context->ShareLoD("X", "Out");
  }

--- a/paddle/fluid/operators/crop_op.h
+++ b/paddle/fluid/operators/crop_op.h
@@ -68,7 +68,6 @@ void CropFunction(const framework::ExecutionContext& context) {
  }
  out->mutable_data<T>(out_dims, context.GetPlace());
  auto x_stride = framework::stride(x->dims());
-  auto out_stride = framework::stride(out->dims());
  auto offsets = GetOffsets(context);
  int64_t offset = 0;
  for (size_t i = 0; i < offsets.size(); ++i) {

--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -147,7 +147,6 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
            ->GetMutable<CudnnRNNCache>();

    auto input_dims = input->dims();
-    auto weight_dims = weight->dims();
    auto init_h_dims = init_h->dims();
    auto init_c_dims = init_c->dims();
    in_grad->mutable_data<T>(ctx.GetPlace());

--- a/paddle/fluid/operators/detail/strided_memcpy.h
+++ b/paddle/fluid/operators/detail/strided_memcpy.h
@@ -27,8 +27,8 @@ struct StridedMemcpyFunctor;
 template <typename T>
 struct StridedMemcpyFunctor<T, 0> {
  void operator()(const platform::DeviceContext& dev_ctx, const T* src,
-                  framework::Dim<0> src_stride, framework::Dim<0> dst_dim,
-                  framework::Dim<0> dst_stride, T* dst) const {
+                  const int64_t* src_stride, const int64_t* dst_dim,
+                  const int64_t* dst_stride, T* dst) const {
    auto place = dev_ctx.GetPlace();
    if (platform::is_cpu_place(place)) {
      auto& cpu_place = boost::get<platform::CPUPlace>(place);
@@ -50,18 +50,18 @@ struct StridedMemcpyFunctor<T, 0> {
 template <typename T>
 struct StridedMemcpyFunctor<T, 1> {
  void operator()(const platform::DeviceContext& dev_ctx, const T* src,
-                  framework::Dim<1> src_stride, framework::Dim<1> dst_dim,
-                  framework::Dim<1> dst_stride, T* dst) const {
+                  const int64_t* src_stride, const int64_t* dst_dim,
+                  const int64_t* dst_stride, T* dst) const {
    auto place = dev_ctx.GetPlace();
    if (platform::is_cpu_place(place)) {
      auto& cpu_place = boost::get<platform::CPUPlace>(place);
-      memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head);
+      memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]);
    } else {
 #ifdef PADDLE_WITH_CUDA
      auto& gpu_place = boost::get<platform::CUDAPlace>(place);
      auto& cuda_ctx =
          reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
-      memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head,
+      memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim[0],
                   cuda_ctx.stream());
 #else
      PADDLE_THROW("Paddle is not compiled with GPU");
@@ -73,19 +73,19 @@ struct StridedMemcpyFunctor<T, 1> {
 template <typename T, int Rank>
 struct StridedMemcpyFunctor {
  void operator()(const platform::DeviceContext& dev_ctx, const T* src,
-                  framework::Dim<Rank> src_stride, framework::Dim<Rank> dst_dim,
-                  framework::Dim<Rank> dst_stride, T* dst) const {
-    for (int64_t i = 0; i < dst_dim.head; ++i) {
+                  const int64_t* src_stride, const int64_t* dst_dim,
+                  const int64_t* dst_stride, T* dst) const {
+    for (int64_t i = 0; i < dst_dim[0]; ++i) {
      StridedMemcpyFunctor<T, Rank - 1> func;
-      func(dev_ctx, src, src_stride.tail, dst_dim.tail, dst_stride.tail, dst);
-      src += src_stride.head;
-      dst += dst_stride.head;
+      func(dev_ctx, src, src_stride + 1, dst_dim + 1, dst_stride + 1, dst);
+      src += src_stride[0];
+      dst += dst_stride[0];
    }
  }
 };

 template <typename T>
-struct StridedCopyDimVisitor : public boost::static_visitor<void> {
+struct StridedCopyDimVisitor {
  StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src,
                        const framework::DDim& src_stride,
                        const framework::DDim& dst_stride, T* dst)
@@ -95,13 +95,11 @@ struct StridedCopyDimVisitor : public boost::static_visitor<void> {
        dst_stride_(dst_stride),
        dst_(dst) {}

-  template <typename Dim>
-  void operator()(Dim dst_dim) const {
-    Dim src_stride = boost::get<Dim>(src_stride_);
-    Dim dst_stride = boost::get<Dim>(dst_stride_);
-    constexpr int dim = Dim::dimensions;
-    StridedMemcpyFunctor<T, dim> functor;
-    functor(dev_ctx_, src_, src_stride, dst_dim, dst_stride, dst_);
+  template <int D>
+  void operator()(const framework::Dim<D>& dst_dim) const {
+    StridedMemcpyFunctor<T, D> functor;
+    functor(dev_ctx_, src_, src_stride_.Get(), dst_dim.Get(), dst_stride_.Get(),
+            dst_);
  }

  const platform::DeviceContext& dev_ctx_;

--- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
@@ -64,8 +64,6 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
        "Output(BboxOutsideWeights) of RpnTargetAssignOp should not be null");

    auto rpn_rois_dims = ctx->GetInputDim("RpnRois");
-    auto gt_classes_dims = ctx->GetInputDim("GtClasses");
-    auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
    auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
    auto im_info_dims = ctx->GetInputDim("ImInfo");


--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@@ -53,12 +53,6 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasInput("Variances"),
                   "Input(Variances) shouldn't be null.");

-    auto scores_dims = ctx->GetInputDim("Scores");
-    auto bbox_deltas_dims = ctx->GetInputDim("BboxDeltas");
-    auto im_info_dims = ctx->GetInputDim("ImInfo");
-    auto anchors_dims = ctx->GetInputDim("Anchors");
-    auto variances_dims = ctx->GetInputDim("Variances");
-
    ctx->SetOutputDim("RpnRois", {-1, 4});
    ctx->SetOutputDim("RpnRoiProbs", {-1, 1});
  }

--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -58,7 +58,6 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {

    auto anchor_dims = ctx->GetInputDim("Anchor");
    auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
-    auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
    auto im_info_dims = ctx->GetInputDim("ImInfo");
    PADDLE_ENFORCE_EQ(anchor_dims.size(), 2,
                      "The rank of Input(Anchor) must be 2.");

--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -178,7 +178,6 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {

    auto x_dims = ctx->GetInputDim("X");
    auto y_dims = ctx->GetInputDim("Y");
-    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));

    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
                      "Rank of first input must >= rank of second input.");

--- a/paddle/fluid/operators/expand_op.h
+++ b/paddle/fluid/operators/expand_op.h
@@ -77,7 +77,6 @@ class ExpandKernel : public framework::OpKernel<T> {
    auto& expand_times = context.Attr<std::vector<int>>("expand_times");
    auto* out0 = context.Output<Tensor>("Out");
    Eigen::DSizes<int, Rank> bcast_dims;
-    auto x_dims = in0->dims();
    for (size_t i = 0; i < expand_times.size(); ++i) {
      bcast_dims[i] = expand_times[i];
    }

--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -146,7 +146,6 @@ class FCOpKernel : public framework::OpKernel<T> {
    auto w = ctx.Input<Tensor>("W");
    auto bias = ctx.Input<Tensor>("Bias");
    auto output = ctx.Output<Tensor>("Out");
-    auto in_dims = input->dims();
    auto w_dims = w->dims();
    auto out_dims = output->dims();
    int M = framework::product(out_dims) / out_dims[out_dims.size() - 1];

--- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
@@ -241,15 +241,15 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
  bool is_reverse = ctx.Attr<bool>("is_reverse");     \
  bool use_peepholes = ctx.Attr<bool>("use_peepholes");

-#define INIT_BASE_SIZES                       \
-  auto ids_dims = ids->dims();   /* T x M*/   \
-  auto ids_numel = ids->numel(); /* T x 1*/   \
-  auto wh_dims = wh->dims();     /* D x 4D*/  \
-  const int D = wh_dims[0];                   \
-  const int D2 = D * 2;                       \
-  const int D3 = D * 3;                       \
-  int64_t row_number = embeddings->dims()[0]; \
-  int64_t row_width = embeddings->dims()[1];  \
+#define INIT_BASE_SIZES                                      \
+  auto ids_dims = ids->dims();                   /* T x M*/  \
+  auto ids_numel = framework::product(ids_dims); /* T x 1*/  \
+  auto wh_dims = wh->dims();                     /* D x 4D*/ \
+  const int D = wh_dims[0];                                  \
+  const int D2 = D * 2;                                      \
+  const int D3 = D * 3;                                      \
+  int64_t row_number = embeddings->dims()[0];                \
+  int64_t row_width = embeddings->dims()[1];                 \
  const int D4 = wh_dims[1];

 #define INIT_BASE_INPUT_DATAS                                        \

--- a/paddle/fluid/operators/hinge_loss_op.cc
+++ b/paddle/fluid/operators/hinge_loss_op.cc
@@ -88,7 +88,6 @@ class HingeLossGradOp : public framework::OperatorWithKernel {
                   "Input(Logits@GRAD) should not be null.");

    auto pred_dims = ctx->GetInputDim("Logits");
-    auto lab_dims = ctx->GetInputDim("Labels");
    auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));

    PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);

--- a/paddle/fluid/operators/log_loss_op.cc
+++ b/paddle/fluid/operators/log_loss_op.cc
@@ -92,7 +92,6 @@ class LogLossGradOp : public framework::OperatorWithKernel {
                   "Output(Predicted@GRAD) should not be null.");

    auto pred_dims = ctx->GetInputDim("Predicted");
-    auto label_dims = ctx->GetInputDim("Labels");
    auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
    PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);


--- a/paddle/fluid/operators/math/math_function_impl.h
+++ b/paddle/fluid/operators/math/math_function_impl.h
@@ -37,9 +37,6 @@ void Transpose<DeviceContext, T, Rank>::operator()(
  for (int i = 0; i < Rank; i++) {
    permute[i] = axis[i];
  }
-  auto in_dim = in.dims();
-  auto out_dim = out->dims();
-
  auto eigen_in = framework::EigenTensor<T, Rank>::From(in);
  auto eigen_out = framework::EigenTensor<T, Rank>::From(*out);
  auto* dev = context.eigen_device();

--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -76,7 +76,6 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
  void operator()(const DeviceContext& context, const framework::Tensor* X,
                  framework::Tensor* Y) {
    auto in_dims = X->dims();
-    auto out_dims = Y->dims();
    const float* in_data = X->data<float>();
    float* out_data = Y->data<float>();
    const int kBatchDim = 0;

--- a/paddle/fluid/operators/modified_huber_loss_op.cc
+++ b/paddle/fluid/operators/modified_huber_loss_op.cc
@@ -87,7 +87,6 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
                   "Input(Out@Grad) must not be null.");

    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
    auto intermediate_dims = ctx->GetInputDim("IntermediateVal");
    auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));


--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -147,12 +147,6 @@ class MulGradOp : public framework::OperatorWithKernel {
                   "Input(Out@GRAD) should not be null");
    auto x_dims = ctx->GetInputDim("X");
    auto y_dims = ctx->GetInputDim("Y");
-    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-
-    auto x_mat_dims = framework::flatten_to_2d(
-        x_dims, ctx->Attrs().Get<int>("x_num_col_dims"));
-    auto y_mat_dims = framework::flatten_to_2d(
-        y_dims, ctx->Attrs().Get<int>("y_num_col_dims"));

    auto x_grad_name = framework::GradVarName("X");
    auto y_grad_name = framework::GradVarName("Y");

--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
@@ -36,7 +36,6 @@ class NCEOp : public framework::OperatorWithKernel {

    auto x_dims = ctx->GetInputDim("Input");
    auto label_dims = ctx->GetInputDim("Label");
-    auto w_dims = ctx->GetInputDim("Weight");
    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]);
    int num_true_classes = label_dims.size() == 2 ? label_dims[1] : 1;
    if (ctx->HasInput("Bias")) {

--- a/paddle/fluid/operators/norm_op.h
+++ b/paddle/fluid/operators/norm_op.h
@@ -43,7 +43,6 @@ class NormKernel : public framework::OpKernel<T> {
    out_norm->mutable_data<T>(ctx.GetPlace());

    auto xdim = in_x->dims();
-    auto ndim = out_norm->dims();
    T eps = static_cast<T>(ctx.Attr<float>("epsilon"));
    int axis = ctx.Attr<int>("axis");
    if (axis < 0) axis = xdim.size() + axis;

--- a/paddle/fluid/operators/psroi_pool_op.h
+++ b/paddle/fluid/operators/psroi_pool_op.h
@@ -41,7 +41,6 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel<T> {
    int rois_num = rois->dims()[0];

    auto in_stride = framework::stride(in_dims);
-    auto roi_stride = framework::stride(rois->dims());
    auto out_stride = framework::stride(out->dims());

    const T* input_data = in->data<T>();

--- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h
@@ -143,8 +143,6 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
      set_zero(ctx.template device_context<DeviceContext>(), x_grad,
               static_cast<T>(0));

-      auto out_grad_stride = framework::stride(out_grad->dims());
-
      for (size_t i = 0; i < out_lod[0].size() - 1; ++i) {
        Tensor out_grad_t =
            out_grad->Slice(static_cast<int>(out_lod[0][i]),

--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
@@ -40,7 +40,7 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
                          const framework::DDim& dst_stride, T* dst) {
  paddle::operators::detail::StridedCopyDimVisitor<T> func(
      dev_ctx, src, src_stride, dst_stride, dst);
-  boost::apply_visitor(func, dst_dim);
+  dst_dim.apply_visitor(func);
 }

 // Strided numel memory copy from src to dst by the specified axis