Merge pull request #643 from hedaoyuan/auto_compare

Auto compare cpu and gpu function.

Merge pull request #643 from hedaoyuan/auto_compare
Auto compare cpu and gpu function.
17f7125b · hedaoyuan · GitHub · 08556209 · 11479901 · 17f7125b
9 changed file
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -1584,11 +1584,6 @@ void BaseMatrixT<real>::minRows(BaseMatrixT& b) {
  applyRow(aggregate::min(), b);
 }
-template<>
-void BaseMatrixT<real>::sumCols(BaseMatrixT& b) {
-  applyCol(aggregate::sum(), b);
-}
 template<>
 void BaseMatrixT<real>::maxCols(BaseMatrixT& b) {
  applyCol(aggregate::max(), b);

--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -1018,8 +1018,6 @@ public:
  /// calculate the minimum value of each row of the matrix b.
  void minRows(BaseMatrixT& b);
-  /// calculate the sum of each column of the matrix b.
-  void sumCols(BaseMatrixT& b);
  /// calculate the maximum value of each column of the matrix b.
  void maxCols(BaseMatrixT& b);
  /// calculate the minimum value of each column of the matrix b.

--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -2,7 +2,7 @@
 add_simple_unittest(test_ExecViaCpu)
 add_simple_unittest(test_SIMDFunctions)
-add_simple_unittest(test_matrix)
+add_simple_unittest(test_SparseMatrix)
 # TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
 add_unittest(test_matrixCompare
@@ -15,3 +15,5 @@ add_simple_unittest(test_CpuGpuVector)
 add_simple_unittest(test_Allocator)
 add_simple_unittest(test_FPException)
 add_simple_unittest(test_GpuProfiler)
+add_simple_unittest(test_BaseMatrix)
+add_simple_unittest(test_Matrix)
--- a/paddle/math/tests/TensorCheck.h
+++ b/paddle/math/tests/TensorCheck.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+/**
+ * This file provides a TensorCheck template function, which can be used to
+ * compare CpuMatrix and GpuMatrix, CpuVector and GpuVector, and so on.
+ */
+#include <cmath>
+#include "paddle/math/Matrix.h"
+namespace autotest {
+using paddle::Matrix;
+using paddle::CpuMatrix;
+using paddle::GpuMatrix;
+using paddle::VectorT;
+using paddle::CpuVectorT;
+using paddle::GpuVectorT;
+class AssertEqual {
+public:
+  AssertEqual(real err = 0) : err_(err) {}
+  inline bool operator()(real a, real b) {
+    if (err_ == 0) {
+      if (a != b) {
+        return false;
+      }
+    } else {
+      if (std::fabs(a - b) > err_) {
+        if ((std::fabs(a - b) / std::fabs(a)) > (err_ / 10.0f)) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+private:
+  real err_;
+};
+template <typename Tensor>
+class CopyToCpu;
+template <>
+class CopyToCpu<CpuMatrix> {
+public:
+  explicit CopyToCpu(const CpuMatrix& arg) : arg_(arg) {}
+  const CpuMatrix& copiedArg() const { return arg_; }
+private:
+  const CpuMatrix& arg_;
+};
+template <>
+class CopyToCpu<GpuMatrix> {
+public:
+  explicit CopyToCpu(const GpuMatrix& arg)
+      : arg_(arg.getHeight(), arg.getWidth()) {
+    arg_.copyFrom(arg);
+  }
+  CpuMatrix& copiedArg() { return arg_; }
+private:
+  CpuMatrix arg_;
+};
+template <>
+class CopyToCpu<Matrix> {
+public:
+  explicit CopyToCpu(const Matrix& arg)
+      : arg_(arg.getHeight(), arg.getWidth()) {
+    arg_.copyFrom(arg);
+  }
+  CpuMatrix& copiedArg() { return arg_; }
+private:
+  CpuMatrix arg_;
+};
+template <typename T>
+class CopyToCpu<CpuVectorT<T>> {
+public:
+  explicit CopyToCpu(const CpuVectorT<T>& arg) : arg_(arg) {}
+  const CpuVectorT<T>& copiedArg() const { return arg_; }
+private:
+  const CpuVectorT<T>& arg_;
+};
+template <typename T>
+class CopyToCpu<GpuVectorT<T>> {
+public:
+  explicit CopyToCpu(const GpuVectorT<T>& arg) : arg_(arg.getSize()) {
+    arg_.copyFrom(arg);
+  }
+  CpuVectorT<T>& copiedArg() { return arg_; }
+private:
+  CpuVectorT<T> arg_;
+};
+template <typename T>
+class CopyToCpu<VectorT<T>> {
+public:
+  explicit CopyToCpu(const VectorT<T>& arg) : arg_(arg.getSize()) {
+    arg_.copyFrom(arg);
+  }
+  CpuVectorT<T>& copiedArg() { return arg_; }
+private:
+  CpuVectorT<T> arg_;
+};
+template <typename AssertEq>
+void TensorCheck(AssertEq compare,
+                 const CpuMatrix& matrix1,
+                 const CpuMatrix& matrix2) {
+  CHECK(matrix1.getHeight() == matrix2.getHeight());
+  CHECK(matrix1.getWidth() == matrix2.getWidth());
+  int height = matrix1.getHeight();
+  int width = matrix1.getWidth();
+  const real* data1 = matrix1.getData();
+  const real* data2 = matrix2.getData();
+  int count = 0;
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      real a = data1[i * width + j];
+      real b = data2[i * width + j];
+      if (!compare(a, b)) {
+        count++;
+      }
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+}
+template <typename AssertEq, class T>
+void TensorCheck(AssertEq compare,
+                 const CpuVectorT<T>& vector1,
+                 const CpuVectorT<T>& vector2) {
+  CHECK(vector1.getSize() == vector2.getSize());
+  const T* data1 = vector1.getData();
+  const T* data2 = vector2.getData();
+  size_t size = vector1.getSize();
+  int count = 0;
+  for (size_t i = 0; i < size; i++) {
+    real a = data1[i];
+    real b = data2[i];
+    if (!compare(a, b)) {
+      count++;
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+}
+template <typename AssertEq, typename Tensor1, typename Tensor2>
+void TensorCheck(AssertEq compare,
+                 const Tensor1& tensor1,
+                 const Tensor2& tensor2) {
+  TensorCheck(compare,
+              CopyToCpu<Tensor1>(tensor1).copiedArg(),
+              CopyToCpu<Tensor2>(tensor2).copiedArg());
+}
+template <typename AssertEq>
+void TensorCheck(AssertEq compare, real args1, real args2) {
+  EXPECT_EQ(compare(args1, args2), true) << "[Test error] args1 = " << args1
+                                         << ", args2 = " << args2;
+}
+template <typename AssertEq>
+void TensorCheck(AssertEq compare, size_t args1, size_t args2) {
+  EXPECT_EQ(args1, args2) << "[Test error] args1 = " << args1
+                          << ", args2 = " << args2;
+}
+template <typename Tensor1, typename Tensor2>
+void TensorCheckEqual(const Tensor1& tensor1, const Tensor2& tensor2) {
+  AssertEqual compare(0);
+  TensorCheck(compare,
+              CopyToCpu<Tensor1>(tensor1).copiedArg(),
+              CopyToCpu<Tensor2>(tensor2).copiedArg());
+}
+template <typename Tensor1, typename Tensor2>
+void TensorCheckErr(const Tensor1& tensor1, const Tensor2& tensor2) {
+#ifndef PADDLE_TYPE_DOUBLE
+  AssertEqual compare(1e-3);
+#else
+  AssertEqual compare(1e-10);
+#endif
+  TensorCheck(compare,
+              CopyToCpu<Tensor1>(tensor1).copiedArg(),
+              CopyToCpu<Tensor2>(tensor2).copiedArg());
+}
+}  // namespace autotest
--- a/paddle/math/tests/TestUtils.h
+++ b/paddle/math/tests/TestUtils.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+/**
+ * This file provides a AutoCompare calss to simplify the comparison
+ * of CPU and GPU member functions.
+ *
+ * This takes two steps
+ * 1. Construct an AutoCompare object.
+ *    When constructing an AutoCompare object, you can set the err argument
+ * to specify the maximum error for CPU and GPU functions.
+ *
+ * 2. Use the template functions cmpWithArg or cmpWithoutArg.
+ * A. [cmpWithArg] Requires the caller construct the cpu arguments.
+ *
+ *  AutoCompare test;
+ *  Init Argument arg1,arg2...
+ *  test.cmpWithArg(function, arg1, arg2....)
+ *
+ * B. [cmpWithoutArg] The caller do not need construct arguments.
+ *    If matrix used in these functions arguments is the same size.
+ *    Such as the element wise function and the aggregate function
+ *    defined in the BaseMatrix.cpp.
+ *
+ *  AutoCompare test;
+ *  test.cmpWithoutArg<I...>(function, height, width)
+*/
+#include <gtest/gtest.h>
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
+#include "TensorCheck.h"
+namespace autotest {
+using paddle::BaseMatrix;
+using paddle::CpuMatrix;
+using paddle::GpuMatrix;
+using paddle::CpuIVector;
+using paddle::GpuIVector;
+using paddle::CpuSparseMatrix;
+using paddle::GpuSparseMatrix;
+template <typename T1, typename T2>
+class ReplaceType {
+public:
+  typedef T1 type;
+};
+template <>
+class ReplaceType<BaseMatrix, CpuMatrix> {
+public:
+  typedef CpuMatrix type;
+};
+template <>
+class ReplaceType<BaseMatrix, GpuMatrix> {
+public:
+  typedef GpuMatrix type;
+};
+template <>
+class ReplaceType<Matrix, CpuMatrix> {
+public:
+  typedef CpuMatrix type;
+};
+template <>
+class ReplaceType<Matrix, GpuMatrix> {
+public:
+  typedef GpuMatrix type;
+};
+// construct a argument
+template <typename T>
+T construct(int height, int width);
+template <>
+float construct(int height, int width) {
+  return 0.5;
+}
+template <>
+double construct(int height, int width) {
+  return 0.5;
+}
+template <>
+size_t construct(int height, int width) {
+  size_t offset = std::rand() % (height < width ? height : width);
+  return offset;
+}
+template <>
+CpuMatrix construct(int height, int width) {
+  CpuMatrix a(height, width);
+  return a;
+}
+template <>
+GpuMatrix construct(int height, int width) {
+  GpuMatrix a(height, width);
+  return a;
+}
+// init a argument
+template <typename T>
+void init(T& v) {
+  return;
+}
+template <>
+void init(CpuMatrix& v) {
+  v.randomizeUniform();
+}
+template <>
+void init(GpuMatrix& v) {
+  v.randomizeUniform();
+}
+// init a tuple which contains a set of arguments.
+template <std::size_t I = 0, typename... Args>
+inline typename std::enable_if<I == sizeof...(Args), void>::type initTuple(
+    std::tuple<Args...>& t) {}
+template <std::size_t I = 0, typename... Args>
+    inline typename std::enable_if <
+    I<sizeof...(Args), void>::type initTuple(std::tuple<Args...>& t) {
+  init(std::get<I>(t));
+  initTuple<I + 1>(t);
+}
+// copy a argument, copy src to dest
+template <typename T1, typename T2>
+void copy(T1& dest, T2& src) {
+  dest = src;
+}
+template <>
+void copy(GpuMatrix& dest, CpuMatrix& src) {
+  dest.copyFrom(src);
+}
+// copy a tuple, copy src to dest
+template <std::size_t I = 0, typename... Args1, typename... Args2>
+inline typename std::enable_if<I == sizeof...(Args1), void>::type copyTuple(
+    std::tuple<Args1...>& dest, std::tuple<Args2...>& src) {}
+template <std::size_t I = 0, typename... Args1, typename... Args2>
+    inline typename std::enable_if <
+    I<sizeof...(Args1), void>::type copyTuple(std::tuple<Args1...>& dest,
+                                              std::tuple<Args2...>& src) {
+  copy(std::get<I>(dest), std::get<I>(src));
+  copyTuple<I + 1>(dest, src);
+}
+// call member function
+template <typename C,
+          typename FC,
+          typename R,
+          typename... FArgs,
+          typename... Args>
+R call(C& obj, R (FC::*f)(FArgs...), Args&&... args) {
+  return (obj.*f)(args...);
+}
+template <typename T>
+class ReturnType {
+public:
+  typedef T type;
+};
+template <>
+class ReturnType<CpuMatrix> {
+public:
+  typedef GpuMatrix type;
+};
+template <>
+class ReturnType<CpuIVector> {
+public:
+  typedef GpuIVector type;
+};
+template <>
+class ReturnType<CpuSparseMatrix> {
+public:
+  typedef GpuSparseMatrix type;
+};
+template <typename T>
+typename ReturnType<T>::type autoArgs(T& v) {
+  return v;
+}
+template <>
+GpuMatrix autoArgs(CpuMatrix& v) {
+  GpuMatrix a(v.getHeight(), v.getWidth());
+  a.copyFrom(v);
+  return a;
+}
+template <>
+GpuIVector autoArgs(CpuIVector& v) {
+  GpuIVector a(v.getSize());
+  a.copyFrom(v);
+  return a;
+}
+template <>
+GpuSparseMatrix autoArgs(CpuSparseMatrix& v) {
+  GpuSparseMatrix a(v.getHeight(),
+                    v.getWidth(),
+                    v.getElementCnt(),
+                    v.getValueType(),
+                    v.getFormat());
+  a.copyFrom(v, HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  return a;
+}
+class AutoCompare {
+public:
+  /**
+   * err is the allowed calculation error.
+   * The smaller the value of err,
+   * the stricter the comparison is between CPU and GPU calculations.
+   */
+  AutoCompare(size_t height, size_t width, real err = 1e-3)
+      : cpu(height, width), gpu(height, width), compare(err) {
+    init(cpu);
+    copy(gpu, cpu);
+  }
+  template <typename C, typename R, typename... FArgs, typename... Args>
+  void cmpWithArg(R (C::*f)(FArgs...), Args&&... args) {
+    static_assert(sizeof...(FArgs) == sizeof...(Args),
+                  "size of parameter packs are not equal");
+    call(cpu, f, args...);
+    call(gpu, f, autoArgs(args)...);
+    TensorCheck(compare, cpu, gpu);
+  }
+  template <std::size_t... I, typename C, typename R, typename... Args>
+  void cmpWithoutArg(R (C::*f)(Args...), size_t height, size_t width) {
+    static_assert(sizeof...(I) == sizeof...(Args),
+                  "size of parameter packs are not equal");
+    (void)height;
+    (void)width;
+    auto tuple1 = std::make_tuple(
+        construct<typename ReplaceType<
+            typename std::decay<
+                typename std::tuple_element<I,
+                                            std::tuple<Args...>>::type>::type,
+            CpuMatrix>::type>(height, width)...);
+    auto tuple2 = std::make_tuple(
+        construct<typename ReplaceType<
+            typename std::decay<
+                typename std::tuple_element<I,
+                                            std::tuple<Args...>>::type>::type,
+            GpuMatrix>::type>(height, width)...);
+    initTuple(tuple1);
+    copyTuple(tuple2, tuple1);
+    call(cpu, f, std::get<I>(tuple1)...);
+    call(gpu, f, std::get<I>(tuple2)...);
+    TensorCheck(compare, cpu, gpu);
+  }
+protected:
+  CpuMatrix cpu;
+  GpuMatrix gpu;
+  AssertEqual compare;
+};
+}  // namespace autotest
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifndef PADDLE_ONLY_CPU
+/**
+ * This test file use autotest::AutoCompare and cmpWithoutArg to compares the
+ * implementation of CPU and GPU member function in
+ * BaseMatrix.cpp and Matrix.cpp.
+ */
+#include <gtest/gtest.h>
+#include "paddle/math/BaseMatrix.h"
+#include "TestUtils.h"
+using paddle::BaseMatrix;
+using paddle::Matrix;
+using autotest::AutoCompare;
+// Test all void (BaseMatrix::*)() function
+TEST(BaseMatrix, void) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height, width](void (BaseMatrix::*f)()) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg(f, height, width);
+      };
+      compare(&BaseMatrix::neg);
+      compare(&BaseMatrix::exp);
+      compare(&BaseMatrix::log);
+      compare(&BaseMatrix::sqrt);
+      compare(&BaseMatrix::square);
+      compare(&BaseMatrix::reciprocal);
+      compare(&BaseMatrix::abs);
+      compare(&BaseMatrix::sign);
+      compare(&BaseMatrix::zero);
+      compare(&BaseMatrix::one);
+    }
+  }
+}
+// Test all void (BaseMatrix::*)(real) function
+TEST(BaseMatrix, real) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height, width](void (BaseMatrix::*f)(real)) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg<0>(f, height, width);
+      };
+      compare(&BaseMatrix::pow);
+      compare(&BaseMatrix::subScalar);
+      compare(&BaseMatrix::mulScalar);
+      compare(&BaseMatrix::divScalar);
+      compare(&BaseMatrix::assign);
+      compare(&BaseMatrix::add);
+      compare(&BaseMatrix::biggerThanScalar);
+      compare(&BaseMatrix::downClip);
+    }
+  }
+}
+// Test all void (BaseMatrix::*)(BaseMatrix&) function
+TEST(BaseMatrix, BaseMatrix) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&)) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg<0>(f, height, width);
+      };
+      compare(&BaseMatrix::assign);
+      compare(&BaseMatrix::add);
+      compare(&BaseMatrix::relu);
+      compare(&BaseMatrix::reluDerivative);
+      compare(&BaseMatrix::softrelu);
+      compare(&BaseMatrix::softreluDerivative);
+      compare(&BaseMatrix::brelu);
+      compare(&BaseMatrix::breluDerivative);
+      compare(&BaseMatrix::square);
+      compare(&BaseMatrix::squareDerivative);
+      compare(&BaseMatrix::tanh);
+      compare(&BaseMatrix::tanhDerivative);
+      compare(&BaseMatrix::reciprocal);
+      compare(&BaseMatrix::reciprocalDerivative);
+      compare(&BaseMatrix::abs);
+      compare(&BaseMatrix::absDerivative);
+      compare(&BaseMatrix::sigmoid);
+      compare(&BaseMatrix::sigmoidDerivative);
+      compare(&BaseMatrix::expDerivative);
+      compare(&BaseMatrix::sign);
+      compare(&BaseMatrix::exp);
+      compare(&BaseMatrix::log);
+      compare(&BaseMatrix::sqrt);
+      compare(&BaseMatrix::dotMul);
+      compare(&BaseMatrix::dotMulSquare);
+      compare(&BaseMatrix::dotSquareMul);
+      compare(&BaseMatrix::addColVector);
+      compare(&BaseMatrix::addRowVector);
+      compare(&BaseMatrix::mulRowVector);
+      compare(&BaseMatrix::divRowVector);
+      compare(&BaseMatrix::addP2P);
+      compare(&BaseMatrix::invSqrt);
+    }
+  }
+}
+// Test all void (BaseMatrix::*)(real, real) function
+TEST(BaseMatrix, real_real) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height, width](void (BaseMatrix::*f)(real, real)) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg<0, 1>(f, height, width);
+      };
+      compare(&BaseMatrix::add);
+      compare(&BaseMatrix::clip);
+    }
+  }
+}
+// Test all void (BaseMatrix::*)(BaseMatrix&, real) function
+TEST(BaseMatrix, BaseMatrix_real) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&, real)) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg<0, 1>(f, height, width);
+      };
+      compare(&BaseMatrix::addBias);
+      compare(&BaseMatrix::add);
+      compare(&BaseMatrix::sub);
+      compare(&BaseMatrix::pow);
+      compare(&BaseMatrix::addScalar);
+      compare(&BaseMatrix::subScalar);
+      compare(&BaseMatrix::mulScalar);
+      compare(&BaseMatrix::divScalar);
+      compare(&BaseMatrix::scalarDiv);
+      compare(&BaseMatrix::addSquare);
+      compare(&BaseMatrix::isEqualTo);
+    }
+  }
+}
+// Test all void (BaseMatrix::*)(BaseMatrix&, BaseMatrix&) function
+TEST(BaseMatrix, BaseMatrix_BaseMatrix) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height,
+                      width](void (BaseMatrix::*f)(BaseMatrix&, BaseMatrix&)) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg<0, 1>(f, height, width);
+      };
+      compare(&BaseMatrix::softCrossEntropy);
+      compare(&BaseMatrix::softCrossEntropyBp);
+      compare(&BaseMatrix::binaryLabelCrossEntropy);
+      compare(&BaseMatrix::binaryLabelCrossEntropyBp);
+      compare(&BaseMatrix::sub);
+      compare(&BaseMatrix::add2);
+      compare(&BaseMatrix::dotMul);
+      compare(&BaseMatrix::dotDiv);
+      compare(&BaseMatrix::logisticRegressionLoss);
+      compare(&BaseMatrix::logisticRegressionLossBp);
+      compare(&BaseMatrix::biggerThan);
+      compare(&BaseMatrix::max);
+      compare(&BaseMatrix::dotMulSquare);
+      compare(&BaseMatrix::dotSquareSquare);
+    }
+  }
+}
+void TestEelementWise(size_t height, size_t width) {
+  AutoCompare rowScale(height, width);
+  rowScale.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowScale, height, width);
+  AutoCompare rowDotMul(height, width);
+  rowDotMul.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowDotMul, height, width);
+  AutoCompare binaryClassificationError(height, width);
+  binaryClassificationError.cmpWithoutArg<0, 1, 2, 3>(
+      &BaseMatrix::binaryClassificationError, height, width);
+  AutoCompare sumOfSquaresBp(height, width);
+  sumOfSquaresBp.cmpWithoutArg<0, 1>(&Matrix::sumOfSquaresBp, height, width);
+}
+void TestAggregateToRow(size_t height, size_t width) {
+  AutoCompare maxCols(1, width);
+  maxCols.cmpWithoutArg<0>(&BaseMatrix::maxCols, height, width);
+  AutoCompare minCols(1, width);
+  minCols.cmpWithoutArg<0>(&BaseMatrix::minCols, height, width);
+  AutoCompare addDotMulVMM(1, width);
+  addDotMulVMM.cmpWithoutArg<0, 1>(&BaseMatrix::addDotMulVMM, height, width);
+  AutoCompare sumCols(1, width);
+  sumCols.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumCols, height, width);
+  AutoCompare collectBias(1, width);
+  collectBias.cmpWithoutArg<0, 1>(
+      static_cast<void (Matrix::*)(Matrix&, real)>(&Matrix::collectBias),
+      height,
+      width);
+}
+void TestAggregateToCol(size_t height, size_t width) {
+  AutoCompare maxRows(height, 1);
+  maxRows.cmpWithoutArg<0>(&BaseMatrix::maxRows, height, width);
+  AutoCompare minRows(height, 1);
+  minRows.cmpWithoutArg<0>(&BaseMatrix::minRows, height, width);
+  AutoCompare sumRows(height, 1);
+  sumRows.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumRows, height, width);
+  AutoCompare sumOfSquares(height, 1);
+  sumOfSquares.cmpWithoutArg<0, 1>(&Matrix::sumOfSquares, height, width);
+}
+TEST(BaseMatrix, Other) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      TestEelementWise(height, width);
+      TestAggregateToRow(height, width);
+      TestAggregateToCol(height, width);
+    }
+  }
+}
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  paddle::initMain(argc, argv);
+  return RUN_ALL_TESTS();
+}
+#endif
--- a/paddle/math/tests/test_Matrix.cpp
+++ b/paddle/math/tests/test_Matrix.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifndef PADDLE_ONLY_CPU
+/**
+ * This test file use autotest::AutoCompare and cmpWithArg to compares the
+ * implementation of CPU and GPU member function in Matrix.cpp.
+ */
+#include <gtest/gtest.h>
+#include "TestUtils.h"
+using paddle::BaseMatrix;
+using paddle::Matrix;
+using paddle::CpuMatrix;
+using paddle::CpuIVector;
+using paddle::CpuSparseMatrix;
+using autotest::AutoCompare;
+void testBilinearFwdBwd(int numSamples,
+                        int imgSizeH,
+                        int imgSizeW,
+                        int channels) {
+  int inWidth = imgSizeH * imgSizeW * channels;
+  int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels;
+  real ratioH = 0.5;
+  real ratioW = 0.5;
+  AutoCompare forward(numSamples, outWidth);
+  CpuMatrix arg1(numSamples, inWidth);
+  arg1.randomizeUniform();
+  forward.cmpWithArg(&Matrix::bilinearForward,
+                     arg1,
+                     imgSizeH,
+                     imgSizeW,
+                     2 * imgSizeH,
+                     2 * imgSizeW,
+                     channels,
+                     ratioH,
+                     ratioW);
+  AutoCompare backward(numSamples, inWidth);
+  CpuMatrix arg2(numSamples, outWidth);
+  arg2.randomizeUniform();
+  backward.cmpWithArg(&Matrix::bilinearBackward,
+                      arg2,
+                      2 * imgSizeH,
+                      2 * imgSizeW,
+                      imgSizeH,
+                      imgSizeW,
+                      channels,
+                      ratioH,
+                      ratioW);
+}
+TEST(Matrix, BilinearFwdBwd) {
+  for (auto numSamples : {5, 10}) {
+    for (auto channels : {8, 16}) {
+      for (auto imgSizeH : {14, 28}) {
+        for (auto imgSizeW : {16, 30}) {
+          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
+                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
+          testBilinearFwdBwd(numSamples, imgSizeH, imgSizeW, channels);
+        }
+      }
+    }
+  }
+}
+void testMatrixAddBias(int height, int width, real scale) {
+  AutoCompare test(height, width);
+  CpuMatrix arg1(1, width);
+  arg1.randomizeUniform();
+  test.cmpWithArg(
+      static_cast<void (Matrix::*)(Matrix&, real)>(&Matrix::addBias),
+      arg1,
+      scale);
+}
+void testMatrixAddDotMulMMV(int height, int width) {
+  AutoCompare test(height, width);
+  CpuMatrix arg1(height, width);
+  CpuMatrix arg2(1, width);
+  arg1.randomizeUniform();
+  arg2.randomizeUniform();
+  test.cmpWithArg(&BaseMatrix::addDotMulMMV, arg1, arg2);
+}
+TEST(Matrix, unary) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      VLOG(3) << " height=" << height << " width=" << width;
+      testMatrixAddBias(height, width, 1.0);
+      testMatrixAddBias(height, width, 3.5);
+      testMatrixAddDotMulMMV(height, width);
+    }
+  }
+}
+void testMatrixAddAtOffset(int height, int width1, int width2, int offset) {
+  AutoCompare test(height, width2);
+  CpuMatrix arg1(height, width1);
+  arg1.randomizeUniform();
+  test.cmpWithArg(&Matrix::addAtOffset, arg1, offset);
+}
+void testMatrixAssignAtOffset(int height, int width1, int width2, int offset) {
+  AutoCompare test(height, width2);
+  CpuMatrix arg1(height, width1);
+  arg1.randomizeUniform();
+  test.cmpWithArg(&Matrix::assignAtOffset, arg1, offset);
+}
+TEST(Matrix, AtOffset) {
+  for (auto height : {1, 11, 73, 128, 200}) {
+    for (auto width1 : {1, 32, 100, 512, 1000}) {
+      for (auto width2 : {1, 32, 100, 512, 1000}) {
+        int columnOffset = 0;
+        int offset = std::abs(width1 - width2);
+        if (offset) {
+          columnOffset = std::rand() % offset;
+        }
+        VLOG(3) << " height=" << height << " width1=" << width1
+                << " width2=" << width2 << " columnOffset = " << columnOffset;
+        testMatrixAddAtOffset(height, width1, width2, columnOffset);
+        testMatrixAssignAtOffset(height, width1, width2, columnOffset);
+      }
+    }
+  }
+}
+void testMatrixSelectRows(int numSamples, int tableSize, int inputDim) {
+  AutoCompare test(numSamples, inputDim);
+  CpuMatrix arg1(tableSize, inputDim);
+  CpuIVector arg2(numSamples);
+  arg1.randomizeUniform();
+  arg2.rand(tableSize);
+  test.cmpWithArg(&Matrix::selectRows, arg1, arg2);
+}
+TEST(Matrix, tableProjection) {
+  for (auto numSamples : {10, 100, 1000, 10000, 80000}) {
+    for (auto tableSize : {10, 100}) {
+      for (auto inputDim : {20, 50}) {
+        VLOG(3) << " numSamples=" << numSamples << " tableSize=" << tableSize
+                << " inputDim=" << inputDim;
+        testMatrixSelectRows(numSamples, tableSize, inputDim);
+      }
+    }
+  }
+}
+void testMatrixCopyByRowIndex(int outHeight, int inHeight, int width) {
+  AutoCompare test(outHeight, width);
+  CpuMatrix arg1(inHeight, width);
+  CpuIVector arg2(outHeight);
+  arg1.randomizeUniform();
+  arg2.rand(inHeight);
+  test.cmpWithArg(&Matrix::copyByRowIndex, arg1, arg2);
+}
+TEST(Matrix, copyByRowIndex) {
+  for (auto outHeight : {31, 500, 1000}) {
+    for (auto inHeight : {17, 257, 500, 1200}) {
+      for (auto width : {512, 1024}) {
+        VLOG(3) << outHeight << " " << inHeight << " " << width;
+        testMatrixCopyByRowIndex(outHeight, inHeight, width);
+      }
+    }
+  }
+}
+void testCosSim(int heightX, int heightY, int width, real scale) {
+  AutoCompare test(heightX, 1);
+  CpuMatrix arg1(heightX, width);
+  CpuMatrix arg2(heightY, width);
+  arg1.randomizeUniform();
+  arg2.randomizeUniform();
+  arg2.add(-0.5);
+  test.cmpWithArg(&Matrix::cosSim, arg1, arg2, scale);
+}
+TEST(Matrix, cosSim) {
+  for (auto heightX : {10, 100, 1000}) {
+    for (auto heightY : {1, heightX}) {
+      for (auto width : {10, 100, 1000}) {
+        for (auto scale : {1.0, 2.0}) {
+          testCosSim(heightX, heightY, width, scale);
+        }
+      }
+    }
+  }
+}
+void testParamReluForward(int height, int width, int w_height, int w_width) {
+  AutoCompare test(height, width);
+  CpuMatrix arg1(height, width);
+  CpuMatrix arg2(w_height, w_width);
+  arg1.randomizeUniform();
+  arg2.randomizeUniform();
+  arg1.add(-0.5);
+  test.cmpWithArg(&Matrix::paramReluForward, arg1, arg2);
+}
+void testParamReluBackwardW(int height, int width, int w_height, int w_width) {
+  AutoCompare test(w_height, w_width);
+  CpuMatrix arg1(height, width);
+  CpuMatrix arg2(height, width);
+  arg1.randomizeUniform();
+  arg2.randomizeUniform();
+  arg2.add(-0.5);
+  test.cmpWithArg(&Matrix::paramReluBackwardW, arg1, arg2);
+}
+TEST(Matrix, paramRelu) {
+  for (auto height : {10, 100}) {
+    for (auto width : {10, 100}) {
+      for (auto w_height : {1, 2}) {
+        for (auto w_width : {1, 2}) {
+          testParamReluForward(height, width, w_height, w_width);
+          testParamReluBackwardW(height, width, w_height, w_width);
+        }
+      }
+    }
+  }
+}
+void testAddSharedBias(int numSamples, int dim, int channel) {
+  AutoCompare test(numSamples, dim);
+  CpuMatrix arg1(1, channel);
+  arg1.randomizeUniform();
+  test.cmpWithArg(&Matrix::addSharedBias, arg1, 1.0);
+}
+void testCollectSharedBias(int numSamples, int dim, int channel) {
+  AutoCompare test(1, channel);
+  CpuMatrix arg1(numSamples, dim);
+  arg1.randomizeUniform();
+  test.cmpWithArg(&Matrix::collectSharedBias, arg1, 1.0);
+}
+TEST(Matrix, sharedBias) {
+  for (auto numSamples : {1, 100, 520}) {
+    for (auto dim : {100 * 16, 100 * 32}) {
+      for (auto channel : {8, 16}) {
+        VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
+                << " channel=" << channel;
+        testAddSharedBias(numSamples, dim, channel);
+        testCollectSharedBias(numSamples, dim, channel);
+      }
+    }
+  }
+}
+void testMultiBinaryLabelCrossEntropy(int numSamples, int dim) {
+  AutoCompare forward(numSamples, 1);
+  CpuMatrix arg1(numSamples, dim);
+  CpuSparseMatrix arg2(
+      numSamples, dim, numSamples, paddle::NO_VALUE, paddle::SPARSE_CSR);
+  CpuMatrix output1(numSamples, dim);
+  output1.randomizeUniform();
+  output1.softmax(arg1);
+  for (int i = 0; i < numSamples; i++) {
+    const unsigned int id = std::rand() % dim;
+    arg2.setRow(i, 1, &id, nullptr);
+  }
+  forward.cmpWithArg(&Matrix::multiBinaryLabelCrossEntropy, arg1, arg2);
+  AutoCompare backward(numSamples, dim);
+  backward.cmpWithArg(&Matrix::multiBinaryLabelCrossEntropyBp, arg1, arg2);
+}
+TEST(Matrix, multiBinaryCrossEntropy) {
+  for (auto numSamples : {100, 1000, 10000}) {
+    for (auto dim : {100, 1000, 10000}) {
+      VLOG(3) << " numSamples=" << numSamples << " dim=" << dim;
+      testMultiBinaryLabelCrossEntropy(numSamples, dim);
+    }
+  }
+}
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  paddle::initMain(argc, argv);
+  return RUN_ALL_TESTS();
+}
+#endif
--- a/paddle/math/tests/test_matrix.cpp
+++ b/paddle/math/tests/test_matrix.cpp
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp