Feature/transform (#7111)

* "fix data transform" * "data transformer" * "add device pool" * "add test" * "fix ci" * "fix datalayout implementation " * "fix based on comment"

Feature/transform (#7111)
* "fix data transform" * "data transformer" * "add device pool" * "add test" * "fix ci" * "fix datalayout implementation " * "fix based on comment"
899a79cc · dzhwinter · GitHub · 90a33ddd · 899a79cc · 899a79cc
6 changed file
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -29,7 +29,7 @@ cc_test(variable_test SRCS variable_test.cc)
 cc_library(scope SRCS scope.cc DEPS glog)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)
-cc_library(data_transform SRCS data_transform.cc DEPS tensor framework_proto)
+cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor framework_proto)
 cc_test(data_transform_test SRCS data_transform_test.cc DEPS data_transform device_context)
 cc_library(attribute SRCS attribute.cc DEPS framework_proto)

--- a/paddle/framework/data_transform.cc
+++ b/paddle/framework/data_transform.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #include "paddle/framework/data_transform.h"
 #include "paddle/framework/lod_tensor.h"
+#include "paddle/platform/device_context.h"
 namespace paddle {
 namespace framework {
@@ -23,5 +24,83 @@ DataTransformFnMap& DataTransformFnMap::Instance() {
  return data_transform_map;
 }
+auto KernelFP32 = OpKernelType(proto::DataType::FP32, platform::CPUPlace(),
+                               DataLayout::kNHWC, LibraryType::kPlain);
+auto KernelFP64 = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
+                               DataLayout::kNHWC, LibraryType::kPlain);
+auto KernelNHWC = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
+                               DataLayout::kNHWC, LibraryType::kPlain);
+auto KernelNCHW = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
+                               DataLayout::kNCHW, LibraryType::kPlain);
+void TransDataType(const platform::DeviceContext* ctx,
+                   const KernelTypePair& kernel_pair, const Variable& in,
+                   Variable* out) {
+  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!.");
+  PADDLE_ENFORCE(
+      platform::places_are_same_class(kernel_pair.first.place_,
+                                      kernel_pair.second.place_),
+      "TransDataType Only Support DataType transform on same place!");
+  auto src = in.Get<Tensor>();
+  auto* dst = out->GetMutable<Tensor>();
+  auto dims = src.dims();
+  dst->Resize(dims);
+  auto dst_type = kernel_pair.second.data_type_;
+  auto src_type = kernel_pair.first.data_type_;
+  switch (src_type) {
+    case proto::DataType::FP32:
+      framework::VisitDataType(dst_type, CastDataType<float>(src, dst, ctx));
+      break;
+    case proto::DataType::FP64:
+      framework::VisitDataType(dst_type, CastDataType<double>(src, dst, ctx));
+      break;
+    case proto::DataType::INT32:
+      framework::VisitDataType(dst_type, CastDataType<int>(src, dst, ctx));
+      break;
+    case proto::DataType::INT64:
+      framework::VisitDataType(dst_type, CastDataType<int64_t>(src, dst, ctx));
+      break;
+    case proto::DataType::BOOL:
+      framework::VisitDataType(dst_type, CastDataType<bool>(src, dst, ctx));
+      break;
+    default:
+      PADDLE_THROW("Not support type %d", src_type);
+  }
+}
+void TransDataLayout(const platform::DeviceContext* ctx,
+                     const KernelTypePair& kernel_pair, const Variable& in,
+                     Variable* out) {
+  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!.");
+  PADDLE_ENFORCE(
+      platform::places_are_same_class(kernel_pair.first.place_,
+                                      kernel_pair.second.place_),
+      "TransDataType Only Support DataType transform on same place!");
+  auto src = in.Get<Tensor>();
+  auto* dst = out->GetMutable<Tensor>();
+  PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Suppport 4!");
+  dst->Resize(src.dims());
+  auto place = kernel_pair.second.place_;
+  CopyFrom(src, place, *ctx, dst);
+  const std::vector<int> axis = {0, 2, 3, 1};
+  auto src_type = kernel_pair.first.data_type_;
+  framework::VisitDataType(src_type, CastDataLayout(src, dst, ctx, axis));
+  dst->set_layout(kernel_pair.second.data_layout_);
+}
 }  // namespace framework
 }  // namespace paddle
+namespace f = paddle::framework;
+REGISTER_DATA_TRANSFORM_FN(f::KernelFP32, f::KernelFP64, f::TransDataType);
+REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW, f::TransDataLayout);
--- a/paddle/framework/data_transform.h
+++ b/paddle/framework/data_transform.h
@@ -21,16 +21,20 @@ limitations under the License. */
 #include "paddle/framework/op_kernel_type.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/framework/variable.h"
+#include "paddle/operators/math/math_function.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/macros.h"
+#include "paddle/platform/transform.h"
 namespace paddle {
 namespace framework {
-using DataTransformFn = std::function<void(const platform::DeviceContext* ctx,
-                                           const Variable& in, Variable* out)>;
 using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
+using DataTransformFn =
+    std::function<void(const platform::DeviceContext*, const KernelTypePair&,
+                       const Variable&, Variable*)>;
 struct KernelTypePairHash {
  static void HashCombine(const OpKernelType& t, std::size_t* seed) {
    OpKernelType::Hash kernel_type_hasher;
@@ -45,6 +49,65 @@ struct KernelTypePairHash {
  }
 };
+template <typename InType, typename OutType>
+struct CastDataTypeFunctor {
+  HOSTDEVICE inline OutType operator()(InType in) const {
+    return static_cast<OutType>(in);
+  }
+};
+template <typename InType>
+struct CastDataType {
+  CastDataType(const framework::Tensor& in, framework::Tensor* out,
+               const platform::DeviceContext* ctx)
+      : in_(in), out_(out), ctx_(ctx) {}
+  const framework::Tensor in_;
+  framework::Tensor* out_;
+  const platform::DeviceContext* ctx_;
+  template <typename OutType>
+  void operator()() {
+    auto place = ctx_->GetPlace();
+    auto* in_begin = in_.data<InType>();
+    auto numel = in_.numel();
+    auto* in_end = in_begin + numel;
+    auto* out_begin = out_->mutable_data<OutType>(place);
+    if (platform::is_cpu_place(place)) {
+      platform::Transform<platform::CPUDeviceContext> trans;
+      auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
+      trans(*context, in_begin, in_end, out_begin,
+            CastDataTypeFunctor<InType, OutType>());
+    } else {
+      // TODO(dzhwinter): enhance CopyFrom CPU<->GPU with different data type?
+      PADDLE_THROW("Unsupport CPU <-> GPU!");
+    }
+  }
+};
+struct CastDataLayout {
+  CastDataLayout(const framework::Tensor& in, framework::Tensor* out,
+                 const platform::DeviceContext* ctx,
+                 const std::vector<int>& axis)
+      : in_(in), out_(out), ctx_(ctx), axis_(axis) {}
+  const framework::Tensor in_;
+  framework::Tensor* out_;
+  const platform::DeviceContext* ctx_;
+  const std::vector<int> axis_;
+  template <typename T>
+  void operator()() {
+    auto place = ctx_->GetPlace();
+    if (platform::is_cpu_place(place)) {
+      operators::math::Transpose<platform::CPUDeviceContext, T, 4> trans4;
+      auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
+      trans4(*context, in_, out_, axis_);
+    } else {
+      PADDLE_THROW("Unsupport CPU <-> GPU!");
+    }
+  }
+};
 using DataTransformMap =
    std::unordered_map<KernelTypePair, DataTransformFn, KernelTypePairHash>;

--- a/paddle/framework/data_transform_test.cc
+++ b/paddle/framework/data_transform_test.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include "paddle/framework/data_transform.h"
+#include "paddle/platform/device_context.h"
 namespace paddle {
 namespace framework {
@@ -31,16 +32,18 @@ using namespace platform;
 *       1111 -> FP64, GPUPlace, kNCHW, kMKLDNN
 */
-std::array<proto::DataType, 2> kDataType = {
+std::array<proto::DataType, 2> kDataType = {proto::DataType::FP32,
-    {proto::DataType::FP32, proto::DataType::FP64}};
+                                            proto::DataType::FP64};
-std::array<Place, 2> kPlace = {{CPUPlace(), CUDAPlace(0)}};
+std::array<Place, 2> kPlace = {CPUPlace(), CUDAPlace(0)};
 std::array<DataLayout, 2> kDataLayout = {
-    {DataLayout::kNHWC, DataLayout::kNCHW}};
+    DataLayout::kNHWC, DataLayout::kNCHW,
+};
 std::array<LibraryType, 2> kLibraryType = {
-    {LibraryType::kPlain, LibraryType::kMKLDNN}};
+    LibraryType::kPlain, LibraryType::kMKLDNN,
+};
 OpKernelType GenFromBit(const std::vector<bool> bits) {
  return OpKernelType(kDataType[bits[0]], kPlace[bits[1]], kDataLayout[bits[2]],
@@ -54,17 +57,20 @@ auto kernel1 = GenFromBit({0, 0, 0, 1});
 auto kernel2 = GenFromBit({0, 0, 1, 0});
 auto kernel3 = GenFromBit({0, 0, 1, 1});
-void TransDataType_t(const platform::DeviceContext* ctx, const Variable& in,
+void TransDataType_t(const platform::DeviceContext* ctx,
+                     const KernelTypePair& p, const Variable& in,
                     Variable* out) {
  test_value++;
 }
-void TransDataLayout_t(const platform::DeviceContext* ctx, const Variable& in,
+void TransDataLayout_t(const platform::DeviceContext* ctx,
+                       const KernelTypePair& p, const Variable& in,
                       Variable* out) {
  test_value--;
 }
-void TransLibraryType_t(const platform::DeviceContext* ctx, const Variable& in,
+void TransLibraryType_t(const platform::DeviceContext* ctx,
+                        const KernelTypePair& p, const Variable& in,
                        Variable* out) {
  test_value += 2;
 }
@@ -83,17 +89,68 @@ TEST(DataTransform, Register) {
  using namespace paddle::platform;
  auto& instance = DataTransformFnMap::Instance();
-  ASSERT_EQ(instance.Map().size(), 3UL);
-  DeviceContext* ctx = nullptr;
  paddle::framework::Variable in;
  paddle::framework::Variable out;
-  instance.Get(std::make_pair(frw::kernel0, frw::kernel1))(ctx, in, &out);
+  DeviceContext* ctx = new CPUDeviceContext();
+  auto pair0 = std::make_pair(frw::kernel0, frw::kernel1);
+  instance.Get(pair0)(ctx, pair0, in, &out);
  ASSERT_EQ(test_value, 1);
-  instance.Get(std::make_pair(frw::kernel1, frw::kernel2))(ctx, in, &out);
+  auto pair1 = std::make_pair(frw::kernel1, frw::kernel2);
+  instance.Get(pair1)(ctx, pair1, in, &out);
  ASSERT_EQ(test_value, 0);
-  instance.Get(std::make_pair(frw::kernel0, frw::kernel2))(ctx, in, &out);
+  auto pair3 = std::make_pair(frw::kernel0, frw::kernel2);
+  instance.Get(pair3)(ctx, pair3, in, &out);
  ASSERT_EQ(test_value, 2);
 }
+TEST(DataTransform, Layout) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  auto& instance = DataTransformFnMap::Instance();
+  Variable in;
+  Variable out;
+  Tensor* src = in.GetMutable<Tensor>();
+  src->mutable_data<double>(make_ddim({2, 3, 1, 2}), CPUPlace());
+  src->set_layout(DataLayout::kNHWC);
+  DeviceContext* ctx = new CPUDeviceContext();
+  {
+    auto kernel1 = GenFromBit({1, 0, 0, 0});
+    auto kernel2 = GenFromBit({1, 0, 1, 0});
+    auto pair0 = std::make_pair(kernel1, kernel2);
+    instance.Get(pair0)(ctx, pair0, in, &out);
+  }
+  Tensor dst = out.Get<Tensor>();
+  EXPECT_TRUE(dst.layout() != src->layout());
+}
+TEST(DataTransform, DataType) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  auto& instance = DataTransformFnMap::Instance();
+  DeviceContext* ctx = new CPUDeviceContext();
+  Variable in;
+  Variable out;
+  Tensor* src = in.GetMutable<Tensor>();
+  float* ptr = src->mutable_data<float>(make_ddim({2, 3}), CPUPlace());
+  for (int i = 0; i < 6; ++i) {
+    ptr[i] = i / 3;
+  }
+  {
+    auto kernel1 = GenFromBit({0, 0, 0, 0});
+    auto kernel2 = GenFromBit({1, 0, 0, 0});
+    auto pair0 = std::make_pair(kernel1, kernel2);
+    instance.Get(pair0)(ctx, pair0, in, &out);
+  }
+  Tensor dst = out.Get<Tensor>();
+  EXPECT_TRUE(dst.data<double>() != nullptr);
+}
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -461,7 +461,7 @@ void OperatorWithKernel::Run(const Scope& scope,
        dev_ctx->Wait();
        for (auto var_name : need_trans) {
-          (*trans_fun)(trans_dev_ctx, *(scope.FindVar(var_name)),
+          (*trans_fun)(trans_dev_ctx, kernel_pair, *(scope.FindVar(var_name)),
                       scope.FindVar(var_name + framework::KernelTypeToString(
                                                    expected_kernel_key)));
        }

--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -247,7 +247,10 @@ template struct SetConstant<platform::CPUDeviceContext, bool>;
 #define DEFINE_CPU_TRANS(RANK)                                          \
  template struct Transpose<platform::CPUDeviceContext, float, RANK>;   \
-  template struct Transpose<platform::CPUDeviceContext, double, RANK>;
+  template struct Transpose<platform::CPUDeviceContext, double, RANK>;  \
+  template struct Transpose<platform::CPUDeviceContext, int, RANK>;     \
+  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>; \
+  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;
 DEFINE_CPU_TRANS(1);
 DEFINE_CPU_TRANS(2);