Speed up ColwiseSum in CPU (#6834)

* Remove unnecessary reshape in ColwiseSum Speed up 12s -> 10s. * Hand write ColwiseAdd in CPU

Speed up ColwiseSum in CPU (#6834)
* Remove unnecessary reshape in ColwiseSum Speed up 12s -> 10s. * Hand write ColwiseAdd in CPU
7e214b49 · Yu Yang · GitHub · 1a3d4b0d · 7e214b49
显示空白变更内容
内联并排

Showing with 33 addition and 6 deletion

paddle/operators/math/math_function_impl.h paddle/operators/math/math_function_impl.h +33 -6

未找到文件。
--- a/paddle/operators/math/math_function_impl.h
+++ b/paddle/operators/math/math_function_impl.h
@@ -67,18 +67,45 @@ void RowwiseAdd<DeviceContext, T>::operator()(const DeviceContext& context,
 template <typename DeviceContext, typename T>
 void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
                                              const framework::Tensor& input,
-                                              framework::Tensor* vector) {
+                                              framework::Tensor* out) {
  auto in_dims = input.dims();
  auto size = input.numel() / in_dims[0];
-  PADDLE_ENFORCE_EQ(vector->numel(), size);
+  PADDLE_ENFORCE_EQ(out->numel(), size);
-  auto vec = framework::EigenMatrix<T>::From(*vector);
  auto in = framework::EigenMatrix<T>::From(input);
-  Eigen::array<int, 2> shape({{1, static_cast<int>(size)}});
+  auto vec = framework::EigenVector<T>::Flatten(*out);
-  vec.reshape(shape).device(*context.eigen_device()) =
-      in.sum(Eigen::array<int, 1>({{0}})).reshape(shape);
+  vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{0}}));
 }
+// Specialize for CPU, since Eigen implement a general reduce. However,
+// colwise-sum can be easily implemented. General reduce has a huge overhead in
+// CPU
+template <typename T>
+class ColwiseSum<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* out) {
+    auto& in_dims = input.dims();
+    auto height = in_dims[0];
+    auto size = in_dims[1];
+    PADDLE_ENFORCE_EQ(out->numel(), size);
+    T* out_buf = out->mutable_data<T>(out->place());
+    const T* in_buf = input.data<T>();
+    for (size_t i = 0; i < height; ++i) {
+      for (size_t j = 0; j < size; ++j) {
+        if (i == 0) {
+          out_buf[j] = in_buf[i * size + j];
+        } else {
+          out_buf[j] += in_buf[i * size + j];
+        }
+      }
+    }
+  }
+};
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle