Merge pull request #3050 from QiJune/op_gpu_test

enable operator gpu unittest

Merge pull request #3050 from QiJune/op_gpu_test
enable operator gpu unittest
6824c09d · QI JUN · GitHub · bfaea910 · 043e983b · 6824c09d
20 changed file
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -9,6 +9,11 @@ function(CheckCompilerCXX11Flag)
        if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
            message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
        endif()
+        # TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem.
+        # Use Debug mode instead for now.
+        if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9) 
+            set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE)
+        endif()
    elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
        # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
        # Apple Clang is a different compiler than upstream Clang which havs different version numbers.

--- a/paddle/framework/detail/tensor-inl.h
+++ b/paddle/framework/detail/tensor-inl.h
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include "paddle/memory/memcpy.h"
 namespace paddle {
@@ -62,9 +61,11 @@ inline T* Tensor::mutable_data(platform::Place place) {
    if (platform::is_cpu_place(place)) {
      holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
          boost::get<platform::CPUPlace>(place), size));
+    } else if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_ONLY_CPU
+      PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
    }
-#ifndef PADDLE_ONLY_CPU
+#else
-    else if (platform::is_gpu_place(place)) {
      holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
          boost::get<platform::GPUPlace>(place), size));
    }

--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/add_op.cu
+#define EIGEN_USE_GPU
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/add_op.h"

--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
+#define EIGEN_USE_GPU
 #include "paddle/operators/cross_entropy_op.h"
 REGISTER_OP_GPU_KERNEL(onehot_cross_entropy,

--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu
@@ -12,6 +12,7 @@
   See the License for the specific language governing permissions and
   limitations under the License. */
+#define EIGEN_USE_GPU
 #include "paddle/operators/mul_op.h"
 REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<ops::GPUPlace, float>);
\ No newline at end of file
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
+#define EIGEN_USE_GPU
 #include "paddle/operators/rowwise_add_op.h"
 REGISTER_OP_GPU_KERNEL(rowwise_add,

--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
+#define EIGEN_USE_GPU
 #include "paddle/operators/sgd_op.h"
 REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel<ops::GPUPlace, float>);
\ No newline at end of file
--- a/paddle/operators/sigmoid_op.cu
+++ b/paddle/operators/sigmoid_op.cu
+#define EIGEN_USE_GPU
 #include "paddle/operators/sigmoid_op.h"
 REGISTER_OP_GPU_KERNEL(sigmoid, ops::SigmoidKernel<ops::GPUPlace, float>);
--- a/paddle/operators/softmax_op.cu
+++ b/paddle/operators/softmax_op.cu
+#define EIGEN_USE_GPU
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/softmax_op.h"

--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -148,7 +148,7 @@ inline void throw_on_error(T e) {
  do {                                                                 \
    throw ::paddle::platform::EnforceNotMet(                           \
        std::make_exception_ptr(                                       \
-            std::runtime_error(string::Sprintf(__VA_ARGS__))), \
+            std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \
        __FILE__, __LINE__);                                           \
  } while (0)

--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -20,6 +20,8 @@ limitations under the License. */
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/place.h"
 #include "paddle/pybind/tensor_bind.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
@@ -55,6 +57,14 @@ static size_t UniqueIntegerGenerator() {
  return generator.fetch_add(1);
 }
+bool IsCompileGPU() {
+#ifdef PADDLE_ONLY_CPU
+  return false;
+#else
+  return true;
+#endif
+}
 PYBIND11_PLUGIN(core) {
  py::module m("core", "C++ core of PaddlePaddle");
@@ -69,15 +79,27 @@ PYBIND11_PLUGIN(core) {
             self.Resize(pd::make_ddim(dim));
           })
      .def("alloc_float",
-           [](pd::Tensor& self) {
+           [](pd::Tensor& self, paddle::platform::GPUPlace& place) {
-             self.mutable_data<float>(paddle::platform::CPUPlace());
+             self.mutable_data<float>(place);
+           })
+      .def("alloc_float",
+           [](pd::Tensor& self, paddle::platform::CPUPlace& place) {
+             self.mutable_data<float>(place);
+           })
+      .def("alloc_int",
+           [](pd::Tensor& self, paddle::platform::CPUPlace& place) {
+             self.mutable_data<int>(place);
           })
      .def("alloc_int",
-           [](pd::Tensor& self) {
+           [](pd::Tensor& self, paddle::platform::GPUPlace& place) {
-             self.mutable_data<int>(paddle::platform::CPUPlace());
+             self.mutable_data<int>(place);
           })
-      .def("set", paddle::pybind::PyTensorSetFromArray<float>)
+      .def("set", paddle::pybind::PyCPUTensorSetFromArray<float>)
-      .def("set", paddle::pybind::PyTensorSetFromArray<int>)
+      .def("set", paddle::pybind::PyCPUTensorSetFromArray<int>)
+#ifndef PADDLE_ONLY_CPU
+      .def("set", paddle::pybind::PyCUDATensorSetFromArray<float>)
+      .def("set", paddle::pybind::PyCUDATensorSetFromArray<int>)
+#endif
      .def("shape",
           [](pd::Tensor& self) { return pd::vectorize(self.dims()); });
@@ -136,11 +158,27 @@ All parameter, weight, gradient are variables in Paddle.
       "The module will return special predefined variable name in Paddle")
      .def("empty", pd::OperatorBase::EMPTY_VAR_NAME)
      .def("temp", pd::OperatorBase::TMP_VAR_NAME);
+  // clang-format off
  py::class_<paddle::platform::DeviceContext>(m, "DeviceContext")
-      .def_static("cpu_context", []() -> paddle::platform::DeviceContext* {
+      .def_static("create",
+                  [](paddle::platform::CPUPlace& place)
+                      -> paddle::platform::DeviceContext* {
                    return new paddle::platform::CPUDeviceContext();
+                  })
+      .def_static("create",
+                  [](paddle::platform::GPUPlace& place)
+                      -> paddle::platform::DeviceContext* {
+#ifdef PADDLE_ONLY_CPU
+                    PADDLE_THROW("GPUPlace is not supported in CPU device.");
+#else
+                    return new paddle::platform::CUDADeviceContext(place);
+#endif
                  });
+  // clang-format on
+  py::class_<paddle::platform::GPUPlace>(m, "GPUPlace").def(py::init<int>());
+  py::class_<paddle::platform::CPUPlace>(m, "CPUPlace").def(py::init<>());
  py::class_<pd::OperatorBase, std::shared_ptr<pd::OperatorBase>> operator_base(
      m, "Operator");
@@ -176,5 +214,7 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("unique_integer", UniqueIntegerGenerator);
+  m.def("is_compile_gpu", IsCompileGPU);
  return m.ptr();
 }
--- a/paddle/pybind/tensor_bind.h
+++ b/paddle/pybind/tensor_bind.h
@@ -13,9 +13,11 @@
   limitations under the License. */
 #pragma once
-#include <paddle/framework/tensor.h>
+#include <string>
-#include <pybind11/numpy.h>
+#include "paddle/framework/tensor.h"
-#include <pybind11/pybind11.h>
+#include "paddle/memory/memcpy.h"
+#include "pybind11/numpy.h"
+#include "pybind11/pybind11.h"
 namespace py = pybind11;
@@ -40,9 +42,6 @@ template <size_t I, typename... ARGS>
 struct CastToPyBufferImpl<true, I, ARGS...> {
  using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
  py::buffer_info operator()(framework::Tensor &tensor) {
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(tensor.holder_->place()),
-                   "Only CPU tensor can cast to numpy array");
    if (std::type_index(typeid(CUR_TYPE)) == tensor.holder_->type()) {
      auto dim_vec = framework::vectorize(tensor.dims());
      std::vector<size_t> dims_outside;
@@ -56,12 +55,17 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
        strides[i - 1] = sizeof(CUR_TYPE) * prod;
        prod *= dims_outside[i - 1];
      }
+      framework::Tensor dst_tensor;
+      if (paddle::platform::is_gpu_place(tensor.holder_->place())) {
+        dst_tensor.CopyFrom<CUR_TYPE>(tensor, platform::CPUPlace());
+      } else if (paddle::platform::is_cpu_place(tensor.holder_->place())) {
+        dst_tensor = tensor;
+      }
      return py::buffer_info(
-          tensor.mutable_data<CUR_TYPE>(tensor.holder_->place()),
+          dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.holder_->place()),
          sizeof(CUR_TYPE),
          py::format_descriptor<CUR_TYPE>::format(),
-          (size_t)framework::arity(tensor.dims()),
+          (size_t)framework::arity(dst_tensor.dims()),
          dims_outside,
          strides);
    } else {
@@ -77,9 +81,10 @@ inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
 }
 template <typename T>
-void PyTensorSetFromArray(
+void PyCPUTensorSetFromArray(
    framework::Tensor &self,
-    py::array_t<T, py::array::c_style | py::array::forcecast> array) {
+    py::array_t<T, py::array::c_style | py::array::forcecast> array,
+    paddle::platform::CPUPlace &place) {
  std::vector<int> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
@@ -87,9 +92,28 @@ void PyTensorSetFromArray(
  }
  self.Resize(framework::make_ddim(dims));
-  auto *dst = self.mutable_data<T>(paddle::platform::CPUPlace());
+  auto *dst = self.mutable_data<T>(place);
  std::memcpy(dst, array.data(), sizeof(T) * array.size());
 }
+#ifndef PADDLE_ONLY_CPU
+template <typename T>
+void PyCUDATensorSetFromArray(
+    framework::Tensor &self,
+    py::array_t<T, py::array::c_style | py::array::forcecast> array,
+    paddle::platform::GPUPlace &place) {
+  std::vector<int> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back((int)array.shape()[i]);
+  }
+  self.Resize(framework::make_ddim(dims));
+  auto *dst = self.mutable_data<T>(place);
+  paddle::platform::GpuMemcpySync(
+      dst, array.data(), sizeof(T) * array.size(), cudaMemcpyHostToDevice);
+}
+#endif
 }  // namespace pybind
 }  // namespace paddle
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -8,7 +8,6 @@ add_python_test(test_framework
    test_fc_op.py
    test_add_two_op.py
    test_sgd_op.py
-    test_cross_entropy_op.py
    test_mul_op.py
    test_mean_op.py
    test_sigmoid_op.py

--- a/python/paddle/v2/framework/tests/op_test_util.py
+++ b/python/paddle/v2/framework/tests/op_test_util.py
@@ -26,14 +26,19 @@ class OpTestMeta(type):
            scope = core.Scope()
            kwargs = dict()
+            places = []
+            places.append(core.CPUPlace())
+            if core.is_compile_gpu():
+                places.append(core.GPUPlace(0))
+            for place in places:
                for in_name in func.all_input_args:
                    if hasattr(self, in_name):
                        kwargs[in_name] = in_name
                        var = scope.new_var(in_name).get_tensor()
                        arr = getattr(self, in_name)
                        var.set_dims(arr.shape)
-                    var.set(arr)
+                        var.set(arr, place)
                    else:
                        kwargs[in_name] = "@EMPTY@"
@@ -50,7 +55,7 @@ class OpTestMeta(type):
                op.infer_shape(scope)
-            ctx = core.DeviceContext.cpu_context()
+                ctx = core.DeviceContext.create(place)
                op.run(scope, ctx)
                for out_name in func.all_output_args:

--- a/python/paddle/v2/framework/tests/test_add_two_op.py
+++ b/python/paddle/v2/framework/tests/test_add_two_op.py
@@ -8,8 +8,8 @@ class TestAddOp(unittest.TestCase):
    def setUp(self):
        self.type = "add_two"
-        self.X = numpy.random.random((342, 345)).astype("float32")
+        self.X = numpy.random.random((102, 105)).astype("float32")
-        self.Y = numpy.random.random((342, 345)).astype("float32")
+        self.Y = numpy.random.random((102, 105)).astype("float32")
        self.Out = self.X + self.Y

--- a/python/paddle/v2/framework/tests/test_fc_op.py
+++ b/python/paddle/v2/framework/tests/test_fc_op.py
@@ -7,17 +7,19 @@ import paddle.v2.framework.create_op_creation_methods as creation
 class TestFc(unittest.TestCase):
    def test_fc(self):
        scope = core.Scope()
+        place = core.CPUPlace()
        x = scope.new_var("X")
        x_tensor = x.get_tensor()
        x_tensor.set_dims([1000, 784])
-        x_tensor.alloc_float()
+        x_tensor.alloc_float(place)
        w = scope.new_var("W")
        w_tensor = w.get_tensor()
        w_tensor.set_dims([784, 100])
-        w_tensor.alloc_float()
+        w_tensor.alloc_float(place)
-        w_tensor.set(numpy.random.random((784, 100)).astype("float32"))
+        w_tensor.set(numpy.random.random((784, 100)).astype("float32"), place)
        # Set a real numpy array here.
        # x_tensor.set(numpy.array([]))
@@ -32,7 +34,7 @@ class TestFc(unittest.TestCase):
        op.infer_shape(scope)
        self.assertEqual([1000, 100], tensor.shape())
-        ctx = core.DeviceContext.cpu_context()
+        ctx = core.DeviceContext.create(place)
        op.run(scope, ctx)

--- a/python/paddle/v2/framework/tests/test_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_mul_op.py
@@ -8,8 +8,8 @@ class TestMulOp(unittest.TestCase):
    def setUp(self):
        self.type = "mul"
-        self.X = np.random.random((32, 784)).astype("float32")
+        self.X = np.random.random((32, 84)).astype("float32")
-        self.Y = np.random.random((784, 100)).astype("float32")
+        self.Y = np.random.random((84, 100)).astype("float32")
        self.Out = np.dot(self.X, self.Y)

--- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
@@ -8,8 +8,8 @@ class TestRowwiseAddOp(unittest.TestCase):
    def setUp(self):
        self.type = "rowwise_add"
-        self.X = np.random.random((32, 784)).astype("float32")
+        self.X = np.random.random((32, 84)).astype("float32")
-        self.b = np.random.random(784).astype("float32")
+        self.b = np.random.random(84).astype("float32")
        self.Out = np.add(self.X, self.b)

--- a/python/paddle/v2/framework/tests/test_sgd_op.py
+++ b/python/paddle/v2/framework/tests/test_sgd_op.py
@@ -8,8 +8,8 @@ class TestSGD(unittest.TestCase):
    def setUp(self):
        self.type = "sgd"
-        self.param = numpy.random.random((342, 345)).astype("float32")
+        self.param = numpy.random.random((102, 105)).astype("float32")
-        self.grad = numpy.random.random((342, 345)).astype("float32")
+        self.grad = numpy.random.random((102, 105)).astype("float32")
        self.learning_rate = 0.1
        self.param_out = self.param - self.learning_rate * self.grad

--- a/python/paddle/v2/framework/tests/test_tensor.py
+++ b/python/paddle/v2/framework/tests/test_tensor.py
@@ -7,16 +7,17 @@ class TestScope(unittest.TestCase):
    def test_int_tensor(self):
        scope = core.Scope()
        var = scope.new_var("test_tensor")
+        place = core.CPUPlace()
        tensor = var.get_tensor()
        tensor.set_dims([1000, 784])
-        tensor.alloc_int()
+        tensor.alloc_int(place)
        tensor_array = numpy.array(tensor)
        self.assertEqual((1000, 784), tensor_array.shape)
        tensor_array[3, 9] = 1
        tensor_array[19, 11] = 2
-        tensor.set(tensor_array)
+        tensor.set(tensor_array, place)
        tensor_array_2 = numpy.array(tensor)
        self.assertEqual(1.0, tensor_array_2[3, 9])
@@ -25,16 +26,18 @@ class TestScope(unittest.TestCase):
    def test_float_tensor(self):
        scope = core.Scope()
        var = scope.new_var("test_tensor")
+        place = core.CPUPlace()
        tensor = var.get_tensor()
        tensor.set_dims([1000, 784])
-        tensor.alloc_float()
+        tensor.alloc_float(place)
        tensor_array = numpy.array(tensor)
        self.assertEqual((1000, 784), tensor_array.shape)
        tensor_array[3, 9] = 1.0
        tensor_array[19, 11] = 2.0
-        tensor.set(tensor_array)
+        tensor.set(tensor_array, place)
        tensor_array_2 = numpy.array(tensor)
        self.assertAlmostEqual(1.0, tensor_array_2[3, 9])