diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 6788cb34fbaf5941cbb1537c7a83577c623bf76a..b4458eb9551724021636b628c5bf8c96f6e659aa 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -29,7 +29,7 @@ cc_test(variable_test SRCS variable_test.cc)
 cc_library(scope SRCS scope.cc DEPS glog)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)
 
-cc_library(data_transform SRCS data_transform.cc DEPS tensor framework_proto)
+cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor framework_proto)
 cc_test(data_transform_test SRCS data_transform_test.cc DEPS data_transform device_context)
 
 cc_library(attribute SRCS attribute.cc DEPS framework_proto)
diff --git a/paddle/framework/data_transform.cc b/paddle/framework/data_transform.cc
index 376268888e70b0a70060c81384f79f8bf5d6dcc5..58780e386353845e380590f20540314dcd021649 100644
--- a/paddle/framework/data_transform.cc
+++ b/paddle/framework/data_transform.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/framework/data_transform.h"
 #include "paddle/framework/lod_tensor.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace framework {
@@ -23,5 +24,125 @@ DataTransformFnMap& DataTransformFnMap::Instance() {
   return data_transform_map;
 }
 
+// Kernel types used as registration keys at the bottom of this file:
+// (KernelFP32 -> KernelFP64) is registered with TransDataType and
+// (KernelNHWC -> KernelNCHW) is registered with TransDataLayout.
+auto KernelFP32 = OpKernelType(proto::DataType::FP32, platform::CPUPlace(),
+                               DataLayout::kNHWC, LibraryType::kPlain);
+
+auto KernelFP64 = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
+                               DataLayout::kNHWC, LibraryType::kPlain);
+
+auto KernelNHWC = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
+                               DataLayout::kNHWC, LibraryType::kPlain);
+
+auto KernelNCHW = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
+                               DataLayout::kNCHW, LibraryType::kPlain);
+
+// Converts the element type of the Tensor held by `in` from
+// kernel_pair.first.data_type_ to kernel_pair.second.data_type_ and writes
+// the result into the Tensor held by `out`.
+//
+// Enforces that `in` holds a Tensor and that both kernels are on the same
+// class of place; throws for source types other than FP32, FP64, INT32, INT64
+// and BOOL.
+void TransDataType(const platform::DeviceContext* ctx,
+                   const KernelTypePair& kernel_pair, const Variable& in,
+                   Variable* out) {
+  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!.");
+  PADDLE_ENFORCE(
+      platform::places_are_same_class(kernel_pair.first.place_,
+                                      kernel_pair.second.place_),
+      "TransDataType Only Support DataType transform on same place!");
+
+  // Bind by reference so the Tensor object is not copied here.
+  const auto& src = in.Get<Tensor>();
+  auto* dst = out->GetMutable<Tensor>();
+  dst->Resize(src.dims());
+
+  const auto src_type = kernel_pair.first.data_type_;
+  const auto dst_type = kernel_pair.second.data_type_;
+
+  // The source type selects the CastDataType<InType> instantiation; the
+  // destination type is resolved by VisitDataType.
+  switch (src_type) {
+    case proto::DataType::FP32:
+      framework::VisitDataType(dst_type, CastDataType<float>(src, dst, ctx));
+      break;
+    case proto::DataType::FP64:
+      framework::VisitDataType(dst_type, CastDataType<double>(src, dst, ctx));
+      break;
+    case proto::DataType::INT32:
+      framework::VisitDataType(dst_type, CastDataType<int>(src, dst, ctx));
+      break;
+    case proto::DataType::INT64:
+      framework::VisitDataType(dst_type, CastDataType<int64_t>(src, dst, ctx));
+      break;
+    case proto::DataType::BOOL:
+      framework::VisitDataType(dst_type, CastDataType<bool>(src, dst, ctx));
+      break;
+    default:
+      PADDLE_THROW("Not support type %d", src_type);
+  }
+}
+
+// Transposes the 4-D Tensor held by `in` from kernel_pair.first.data_layout_
+// to kernel_pair.second.data_layout_ (NHWC <-> NCHW) and writes the result,
+// tagged with the target layout, into the Tensor held by `out`.
+//
+// Enforces that `in` holds a Tensor of arity 4, that both kernels are on the
+// same class of place, and that the layout pair is NHWC/NCHW.
+void TransDataLayout(const platform::DeviceContext* ctx,
+                     const KernelTypePair& kernel_pair, const Variable& in,
+                     Variable* out) {
+  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!.");
+  PADDLE_ENFORCE(
+      platform::places_are_same_class(kernel_pair.first.place_,
+                                      kernel_pair.second.place_),
+      "TransDataLayout Only Support DataLayout transform on same place!");
+
+  const auto& src = in.Get<Tensor>();
+  auto* dst = out->GetMutable<Tensor>();
+  PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Support 4!");
+
+  // Pick the permutation from the requested direction. This assumes the
+  // Transpose functor takes output dimension i from input dimension axis[i],
+  // so NHWC -> NCHW is {0, 3, 1, 2} and NCHW -> NHWC is {0, 2, 3, 1}.
+  const auto from_layout = kernel_pair.first.data_layout_;
+  const auto to_layout = kernel_pair.second.data_layout_;
+  const bool to_nchw =
+      from_layout == DataLayout::kNHWC && to_layout == DataLayout::kNCHW;
+  const bool to_nhwc =
+      from_layout == DataLayout::kNCHW && to_layout == DataLayout::kNHWC;
+  PADDLE_ENFORCE(to_nchw || to_nhwc,
+                 "TransDataLayout Only Support NHWC <-> NCHW transform!");
+  const std::vector<int> axis = to_nchw ? std::vector<int>{0, 3, 1, 2}
+                                        : std::vector<int>{0, 2, 3, 1};
+
+  // NOTE(review): CopyFrom is kept from the original code; it is presumably
+  // what gives dst storage of src's element type on the target place. The
+  // copied values are overwritten by the transpose below.
+  dst->Resize(src.dims());
+  auto place = kernel_pair.second.place_;
+  CopyFrom(src, place, *ctx, dst);
+
+  // The transposed tensor has permuted dimensions, not src's dimensions.
+  const auto src_dims = src.dims();
+  std::vector<int64_t> dst_dims(axis.size());
+  for (size_t i = 0; i < axis.size(); ++i) {
+    dst_dims[i] = src_dims[axis[i]];
+  }
+  dst->Resize(make_ddim(dst_dims));
+
+  auto src_type = kernel_pair.first.data_type_;
+  framework::VisitDataType(src_type, CastDataLayout(src, dst, ctx, axis));
+
+  dst->set_layout(kernel_pair.second.data_layout_);
+}
+
 }  // namespace framework
 }  // namespace paddle
+
+namespace f = paddle::framework;
+REGISTER_DATA_TRANSFORM_FN(f::KernelFP32, f::KernelFP64, f::TransDataType);
+REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW, f::TransDataLayout);
diff --git a/paddle/framework/data_transform.h b/paddle/framework/data_transform.h
index bd6d301c12e0611c5b01c3ff58869dbeb96b268e..9abb3c99bf30fcf9deab59dc7ee9c02e7c7c775b 100644
--- a/paddle/framework/data_transform.h
+++ b/paddle/framework/data_transform.h
@@ -21,16 +21,20 @@ limitations under the License. */
 #include "paddle/framework/op_kernel_type.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/framework/variable.h"
+#include "paddle/operators/math/math_function.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/macros.h"
+#include "paddle/platform/transform.h"
 
 namespace paddle {
 namespace framework {
 
-using DataTransformFn = std::function<void(const platform::DeviceContext* ctx,
-                                           const Variable& in, Variable* out)>;
 using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
 
+using DataTransformFn =
+    std::function<void(const platform::DeviceContext*, const KernelTypePair&,
+                       const Variable&, Variable*)>;
+
 struct KernelTypePairHash {
   static void HashCombine(const OpKernelType& t, std::size_t* seed) {
     OpKernelType::Hash kernel_type_hasher;
@@ -45,6 +49,78 @@ struct KernelTypePairHash {
   }
 };
 
+// Element-wise functor that converts one value from InType to OutType with a
+// static_cast.
+template <typename InType, typename OutType>
+struct CastDataTypeFunctor {
+  HOSTDEVICE inline OutType operator()(InType in) const {
+    return static_cast<OutType>(in);
+  }
+};
+
+// Visitor that casts every element of `in` from InType to OutType and writes
+// them to `out`. InType is fixed by the class template; OutType is supplied
+// when operator()<OutType>() is invoked. Only CPU places are handled.
+template <typename InType>
+struct CastDataType {
+  const framework::Tensor in_;
+  framework::Tensor* out_;
+  const platform::DeviceContext* ctx_;
+
+  CastDataType(const framework::Tensor& in, framework::Tensor* out,
+               const platform::DeviceContext* ctx)
+      : in_(in), out_(out), ctx_(ctx) {}
+
+  template <typename OutType>
+  void operator()() {
+    const auto place = ctx_->GetPlace();
+
+    // Source range [first, last) and the start of the destination buffer,
+    // which is obtained from out_ on the context's place.
+    const InType* first = in_.data<InType>();
+    const InType* last = first + in_.numel();
+    OutType* result = out_->mutable_data<OutType>(place);
+
+    if (!platform::is_cpu_place(place)) {
+      // TODO(dzhwinter): enhance CopyFrom CPU<->GPU with different data type?
+      PADDLE_THROW("Unsupport CPU <-> GPU!");
+    }
+
+    const auto& cpu_ctx =
+        *static_cast<const platform::CPUDeviceContext*>(ctx_);
+    platform::Transform<platform::CPUDeviceContext> transform;
+    transform(cpu_ctx, first, last, result,
+              CastDataTypeFunctor<InType, OutType>());
+  }
+};
+
+// Visitor that transposes the rank-4 tensor `in` into `out` using the axis
+// permutation `axis`. The element type T is supplied when operator()<T>() is
+// invoked. Only CPU places are handled.
+struct CastDataLayout {
+  const framework::Tensor in_;
+  framework::Tensor* out_;
+  const platform::DeviceContext* ctx_;
+  const std::vector<int> axis_;
+
+  CastDataLayout(const framework::Tensor& in, framework::Tensor* out,
+                 const platform::DeviceContext* ctx,
+                 const std::vector<int>& axis)
+      : in_(in), out_(out), ctx_(ctx), axis_(axis) {}
+
+  template <typename T>
+  void operator()() {
+    if (!platform::is_cpu_place(ctx_->GetPlace())) {
+      PADDLE_THROW("Unsupport CPU <-> GPU!");
+    }
+
+    const auto& cpu_ctx =
+        *static_cast<const platform::CPUDeviceContext*>(ctx_);
+    operators::math::Transpose<platform::CPUDeviceContext, T, 4> transpose;
+    transpose(cpu_ctx, in_, out_, axis_);
+  }
+};
+
 using DataTransformMap =
     std::unordered_map<KernelTypePair, DataTransformFn, KernelTypePairHash>;
 
diff --git a/paddle/framework/data_transform_test.cc b/paddle/framework/data_transform_test.cc
index 5f05e881fa16eead1dc690f85375706bf3cd3e6d..5b01c8434b155fcb15f4bfb7d598c40bbac87d7b 100644
--- a/paddle/framework/data_transform_test.cc
+++ b/paddle/framework/data_transform_test.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 
 #include "paddle/framework/data_transform.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace framework {
@@ -31,16 +32,18 @@ using namespace platform;
  *       1111 -> FP64, GPUPlace, kNCHW, kMKLDNN
  */
 
-std::array<proto::DataType, 2> kDataType = {
-    {proto::DataType::FP32, proto::DataType::FP64}};
+std::array<proto::DataType, 2> kDataType = {proto::DataType::FP32,
+                                            proto::DataType::FP64};
 
-std::array<Place, 2> kPlace = {{CPUPlace(), CUDAPlace(0)}};
+std::array<Place, 2> kPlace = {CPUPlace(), CUDAPlace(0)};
 
 std::array<DataLayout, 2> kDataLayout = {
-    {DataLayout::kNHWC, DataLayout::kNCHW}};
+    DataLayout::kNHWC, DataLayout::kNCHW,
+};
 
 std::array<LibraryType, 2> kLibraryType = {
-    {LibraryType::kPlain, LibraryType::kMKLDNN}};
+    LibraryType::kPlain, LibraryType::kMKLDNN,
+};
 
 OpKernelType GenFromBit(const std::vector<bool> bits) {
   return OpKernelType(kDataType[bits[0]], kPlace[bits[1]], kDataLayout[bits[2]],
@@ -54,17 +57,20 @@ auto kernel1 = GenFromBit({0, 0, 0, 1});
 auto kernel2 = GenFromBit({0, 0, 1, 0});
 auto kernel3 = GenFromBit({0, 0, 1, 1});
 
-void TransDataType_t(const platform::DeviceContext* ctx, const Variable& in,
+void TransDataType_t(const platform::DeviceContext* ctx,
+                     const KernelTypePair& p, const Variable& in,
                      Variable* out) {
   test_value++;
 }
 
-void TransDataLayout_t(const platform::DeviceContext* ctx, const Variable& in,
+void TransDataLayout_t(const platform::DeviceContext* ctx,
+                       const KernelTypePair& p, const Variable& in,
                        Variable* out) {
   test_value--;
 }
 
-void TransLibraryType_t(const platform::DeviceContext* ctx, const Variable& in,
+void TransLibraryType_t(const platform::DeviceContext* ctx,
+                        const KernelTypePair& p, const Variable& in,
                         Variable* out) {
   test_value += 2;
 }
@@ -83,17 +89,87 @@ TEST(DataTransform, Register) {
   using namespace paddle::platform;
 
   auto& instance = DataTransformFnMap::Instance();
-  ASSERT_EQ(instance.Map().size(), 3UL);
-  DeviceContext* ctx = nullptr;
   paddle::framework::Variable in;
   paddle::framework::Variable out;
 
-  instance.Get(std::make_pair(frw::kernel0, frw::kernel1))(ctx, in, &out);
+  // Stack-allocated context: nothing to leak when an assertion fails.
+  CPUDeviceContext cpu_ctx;
+  DeviceContext* ctx = &cpu_ctx;
+  auto pair0 = std::make_pair(frw::kernel0, frw::kernel1);
+  instance.Get(pair0)(ctx, pair0, in, &out);
   ASSERT_EQ(test_value, 1);
 
-  instance.Get(std::make_pair(frw::kernel1, frw::kernel2))(ctx, in, &out);
+  auto pair1 = std::make_pair(frw::kernel1, frw::kernel2);
+  instance.Get(pair1)(ctx, pair1, in, &out);
   ASSERT_EQ(test_value, 0);
 
-  instance.Get(std::make_pair(frw::kernel0, frw::kernel2))(ctx, in, &out);
+  auto pair2 = std::make_pair(frw::kernel0, frw::kernel2);
+  instance.Get(pair2)(ctx, pair2, in, &out);
   ASSERT_EQ(test_value, 2);
 }
+
+TEST(DataTransform, Layout) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  auto& instance = DataTransformFnMap::Instance();
+  Variable in;
+  Variable out;
+  Tensor* src = in.GetMutable<Tensor>();
+  double* src_data =
+      src->mutable_data<double>(make_ddim({2, 3, 1, 2}), CPUPlace());
+  // Give the 2*3*1*2 source elements defined values before they are read.
+  for (int i = 0; i < 12; ++i) {
+    src_data[i] = i;
+  }
+  src->set_layout(DataLayout::kNHWC);
+
+  CPUDeviceContext cpu_ctx;
+  DeviceContext* ctx = &cpu_ctx;
+
+  {
+    // FP64/CPU/kPlain on both sides; only the layout differs (NHWC -> NCHW).
+    auto kernel1 = GenFromBit({1, 0, 0, 0});
+    auto kernel2 = GenFromBit({1, 0, 1, 0});
+    auto pair0 = std::make_pair(kernel1, kernel2);
+    instance.Get(pair0)(ctx, pair0, in, &out);
+  }
+
+  const Tensor& dst = out.Get<Tensor>();
+  EXPECT_TRUE(dst.layout() != src->layout());
+  EXPECT_TRUE(dst.layout() == DataLayout::kNCHW);
+  EXPECT_EQ(dst.numel(), src->numel());
+}
+
+TEST(DataTransform, DataType) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  auto& instance = DataTransformFnMap::Instance();
+  CPUDeviceContext cpu_ctx;
+  DeviceContext* ctx = &cpu_ctx;
+
+  Variable in;
+  Variable out;
+  Tensor* src = in.GetMutable<Tensor>();
+  float* ptr = src->mutable_data<float>(make_ddim({2, 3}), CPUPlace());
+  for (int i = 0; i < 6; ++i) {
+    ptr[i] = i / 3;
+  }
+
+  {
+    // FP32 -> FP64 with place, layout and library type unchanged.
+    auto kernel1 = GenFromBit({0, 0, 0, 0});
+    auto kernel2 = GenFromBit({1, 0, 0, 0});
+    auto pair0 = std::make_pair(kernel1, kernel2);
+    instance.Get(pair0)(ctx, pair0, in, &out);
+  }
+
+  const Tensor& dst = out.Get<Tensor>();
+  const double* casted = dst.data<double>();
+  ASSERT_TRUE(casted != nullptr);
+  // The cast must preserve every element's value.
+  for (int i = 0; i < 6; ++i) {
+    EXPECT_EQ(casted[i], static_cast<double>(ptr[i]));
+  }
+}
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index a3ce96c409675ad52a811586c736ca22b5c7e99e..fc7091f1c89f8b3f998f6d1b68f032b76bad2197 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -461,7 +461,7 @@ void OperatorWithKernel::Run(const Scope& scope,
         dev_ctx->Wait();
 
         for (auto var_name : need_trans) {
-          (*trans_fun)(trans_dev_ctx, *(scope.FindVar(var_name)),
+          (*trans_fun)(trans_dev_ctx, kernel_pair, *(scope.FindVar(var_name)),
                        scope.FindVar(var_name + framework::KernelTypeToString(
                                                     expected_kernel_key)));
         }
diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc
index d4f12f0a106e077ac31aa37f46857b74e1e99b59..dcf4b85e1aadf88e4b1ca70ac7e8b5416fc58cd8 100644
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -245,9 +245,19 @@ template struct SetConstant<platform::CPUDeviceContext, int>;
 template struct SetConstant<platform::CPUDeviceContext, int64_t>;
 template struct SetConstant<platform::CPUDeviceContext, bool>;
 
-#define DEFINE_CPU_TRANS(RANK)                                        \
-  template struct Transpose<platform::CPUDeviceContext, float, RANK>; \
-  template struct Transpose<platform::CPUDeviceContext, double, RANK>;
+// Explicitly instantiates the CPU Transpose functor for one element type T
+// and one rank.
+#define INSTANTIATE_CPU_TRANS(T, RANK) \
+  template struct Transpose<platform::CPUDeviceContext, T, RANK>;
+
+// Instantiates the CPU Transpose functor of the given rank for each of the
+// element types listed below.
+#define DEFINE_CPU_TRANS(RANK)         \
+  INSTANTIATE_CPU_TRANS(float, RANK)   \
+  INSTANTIATE_CPU_TRANS(double, RANK)  \
+  INSTANTIATE_CPU_TRANS(int, RANK)     \
+  INSTANTIATE_CPU_TRANS(int64_t, RANK) \
+  INSTANTIATE_CPU_TRANS(bool, RANK)
 
 DEFINE_CPU_TRANS(1);
 DEFINE_CPU_TRANS(2);