diff --git a/paddle/math/tests/TensorCheck.h b/paddle/math/tests/TensorCheck.h
index beee9e7c0fbcad0784c21f463ad6d74f41dd4165..796f2fce6428edc55e745e5977df022973237a38 100644
--- a/paddle/math/tests/TensorCheck.h
+++ b/paddle/math/tests/TensorCheck.h
@@ -13,11 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include
-#include
 #include "paddle/math/Matrix.h"
 
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
+using paddle::Matrix;
+using paddle::CpuMatrix;
+using paddle::GpuMatrix;
+using paddle::VectorT;
+using paddle::CpuVectorT;
+using paddle::GpuVectorT;
 
 namespace autotest {
 
@@ -71,6 +74,53 @@ private:
   CpuMatrix arg_;
 };
 
+template <>
+class CopyToCpu<Matrix> {
+public:
+  explicit CopyToCpu(const Matrix& arg)
+      : arg_(arg.getHeight(), arg.getWidth()) {
+    arg_.copyFrom(arg);
+  }
+  CpuMatrix& copiedArg() { return arg_; }
+
+private:
+  CpuMatrix arg_;
+};
+
+template <class T>
+class CopyToCpu<CpuVectorT<T>> {
+public:
+  explicit CopyToCpu(const CpuVectorT<T>& arg) : arg_(arg) {}
+  const CpuVectorT<T>& copiedArg() const { return arg_; }
+
+private:
+  const CpuVectorT<T>& arg_;
+};
+
+template <class T>
+class CopyToCpu<GpuVectorT<T>> {
+public:
+  explicit CopyToCpu(const GpuVectorT<T>& arg) : arg_(arg.getSize()) {
+    arg_.copyFrom(arg);
+  }
+  CpuVectorT<T>& copiedArg() { return arg_; }
+
+private:
+  CpuVectorT<T> arg_;
+};
+
+template <class T>
+class CopyToCpu<VectorT<T>> {
+public:
+  explicit CopyToCpu(const VectorT<T>& arg) : arg_(arg.getSize()) {
+    arg_.copyFrom(arg);
+  }
+  CpuVectorT<T>& copiedArg() { return arg_; }
+
+private:
+  CpuVectorT<T> arg_;
+};
+
 template <typename AssertEq>
 void TensorCheck(AssertEq compare,
                  const CpuMatrix& matrix1,
@@ -95,10 +145,30 @@ void TensorCheck(AssertEq compare,
   EXPECT_EQ(count, 0) << "There are " << count << " different element.";
 }
 
+template <typename AssertEq, class T>
+void TensorCheck(AssertEq compare,
+                 const CpuVectorT<T>& vector1,
+                 const CpuVectorT<T>& vector2) {
+  CHECK(vector1.getSize() == vector2.getSize());
+
+  const T* data1 = vector1.getData();
+  const T* data2 = vector2.getData();
+  size_t size = vector1.getSize();
+  int count = 0;
+  for (size_t i = 0; i < size; i++) {
+    real a = data1[i];
+    real b = data2[i];
+    if (!compare(a, b)) {
+      count++;
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+}
+
 template <typename AssertEq, typename Tensor1, typename Tensor2>
-extern void TensorCheck(AssertEq compare,
-                        const Tensor1& tensor1,
-                        const Tensor2& tensor2) {
+void TensorCheck(AssertEq compare,
+                 const Tensor1& tensor1,
+                 const Tensor2& tensor2) {
   TensorCheck(compare,
               CopyToCpu<Tensor1>(tensor1).copiedArg(),
               CopyToCpu<Tensor2>(tensor2).copiedArg());
@@ -116,4 +186,24 @@ void TensorCheck(AssertEq compare, size_t args1, size_t args2) {
   << ", args2 = " << args2;
 }
 
+template <typename Tensor1, typename Tensor2>
+void TensorCheckEqual(const Tensor1& tensor1, const Tensor2& tensor2) {
+  AssertEqual compare(0);
+  TensorCheck(compare,
+              CopyToCpu<Tensor1>(tensor1).copiedArg(),
+              CopyToCpu<Tensor2>(tensor2).copiedArg());
+}
+
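+// Unlike TensorCheckEqual, TensorCheckErr allows a small per-element
+// tolerance, since CPU and GPU floating-point results are not expected
+// to be bitwise identical.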
+template <typename Tensor1, typename Tensor2>
+void TensorCheckErr(const Tensor1& tensor1, const Tensor2& tensor2) {
+#ifndef PADDLE_TYPE_DOUBLE
+  AssertEqual compare(1e-3);
+#else
+  AssertEqual compare(1e-10);
+#endif
+  TensorCheck(compare,
+              CopyToCpu<Tensor1>(tensor1).copiedArg(),
+              CopyToCpu<Tensor2>(tensor2).copiedArg());
+}
+
 }  // namespace autotest
diff --git a/paddle/math/tests/TestUtils.h b/paddle/math/tests/TestUtils.h
index 324ecf801783491a60d8c7ed8c5c80ee17e726e7..fe78f7bf09b1949f1491719483e5238d5409903b 100644
--- a/paddle/math/tests/TestUtils.h
+++ b/paddle/math/tests/TestUtils.h
@@ -14,21 +14,19 @@ limitations under the License. */
 
 /**
  * TestUtils.h is used to automatically compare CPU and GPU code is consistent.
- *
- * Auto compare BaseMatrix member function:
- * Use case:
- * a. void BaseMatrix::tanh(BaseMatrixT<T>& b);
- * Compare method: BaseMatrixCompare<0>(&BaseMatrix::tanh);
- *
- * b.
- *
+ * Refer to test_Matrix.cpp and test_BaseMatrix.cpp for how to use autotest.
 */
 
 #include
 #include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
 #include "TensorCheck.h"
 
-using namespace paddle;  // NOLINT
+using paddle::BaseMatrix;
+using paddle::CpuIVector;
+using paddle::GpuIVector;
+using paddle::CpuSparseMatrix;
+using paddle::GpuSparseMatrix;
 
 namespace autotest {
 
@@ -196,9 +194,7 @@ template
-void BaseMatrixCompare(R (C::*f)(Args...),
-                       AssertEq compare,
-                       bool checkArgs = false) {
+void BaseMatrixCompare(R (C::*f)(Args...), AssertEq compare) {
   for (auto height : {1, 11, 73, 128, 200, 330}) {
     for (auto width : {1, 3, 32, 100, 512, 1000}) {
       CpuMatrix obj1(AsRowVector ? 1 : height, AsColVector ? 1 : width);
@@ -227,17 +223,91 @@ void BaseMatrixCompare(R (C::*f)(Args...),
       call(obj2, f, std::get<I>(tuple2)...);
 
       TensorCheck(compare, obj1, obj2);
-      if (checkArgs) {
-        checkTuple(tuple1, tuple2, compare);
-      }
     }
   }
 }
 
+template <typename T>
+class ReturnType {
+public:
+  typedef T type;
+};
+
+template <>
+class ReturnType<CpuMatrix> {
+public:
+  typedef GpuMatrix type;
+};
+
+template <>
+class ReturnType<CpuIVector> {
+public:
+  typedef GpuIVector type;
+};
+
+template <>
+class ReturnType<CpuSparseMatrix> {
+public:
+  typedef GpuSparseMatrix type;
+};
+
+template <typename T>
+typename ReturnType<T>::type autoArgs(T v) {
+  return v;
+}
+
+template <>
+GpuMatrix autoArgs(CpuMatrix v) {
+  GpuMatrix a(v.getHeight(), v.getWidth());
+  a.copyFrom(v);
+  return a;
+}
+
+template <>
+GpuIVector autoArgs(CpuIVector v) {
+  GpuIVector a(v.getSize());
+  a.copyFrom(v);
+  return a;
+}
+
+template <>
+GpuSparseMatrix autoArgs(CpuSparseMatrix v) {
+  GpuSparseMatrix a(v.getHeight(),
+                    v.getWidth(),
+                    v.getElementCnt(),
+                    v.getValueType(),
+                    v.getFormat());
+
+  a.copyFrom(v, HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  return a;
+}
+
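+// AutoCompare owns a CpuMatrix/GpuMatrix pair with identical contents.
+// operator() calls the same member function on both objects, converting
+// any CPU arguments to their GPU counterparts through autoArgs, and then
+// compares the two results with TensorCheckErr.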
+class AutoCompare {
+public:
+  AutoCompare(size_t height, size_t width)
+      : cpu(height, width), gpu(height, width) {
+    init(cpu);
+    copy(gpu, cpu);
+  }
+
+  template <typename C, typename R, typename... FArgs, typename... Args>
+  void operator()(R (C::*f)(FArgs...), Args&&... args) {
+    call(cpu, f, args...);
+    call(gpu, f, autoArgs(args)...);
+
+    TensorCheckErr(cpu, gpu);
+  }
+
+protected:
+  CpuMatrix cpu;
+  GpuMatrix gpu;
+};
+
 }  // namespace autotest
 
 template <std::size_t... I, typename C, typename R, typename... Args>
-void BaseMatrixCompare(R (C::*f)(Args...), bool checkArgs = false) {
+void BaseMatrixCompare(R (C::*f)(Args...)) {
   static_assert(sizeof...(I) == sizeof...(Args),
                 "size of parameter packs are not equal");
@@ -247,7 +317,7 @@ void BaseMatrixCompare(R (C::*f)(Args...), bool checkArgs = false) {
   autotest::AssertEqual compare(1e-10);
 #endif
 
-  autotest::BaseMatrixCompare<false, false, I...>(f, compare, checkArgs);
+  autotest::BaseMatrixCompare<false, false, I...>(f, compare);
 }
 
 template <std::size_t... I, typename C, typename R, typename... Args>
diff --git a/paddle/math/tests/test_BaseMatrix.cpp b/paddle/math/tests/test_BaseMatrix.cpp
index 1d334135a0cadf4438067cc29f8714e852202d62..c68080057c31cc099b3cad79198862c594deb64a 100644
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifndef PADDLE_ONLY_CPU
 /**
  * This test file compares the implementation of CPU and GPU function
- * in BaseMatrix.cpp.
+ * in BaseMatrix.cpp or Matrix.cpp.
 */
 
 #include
@@ -188,17 +188,22 @@ TEST(BaseMatrix, Other) {
   BaseMatrixCompare<0, 1, 2>(&BaseMatrix::rowScale);
   BaseMatrixCompare<0, 1, 2>(&BaseMatrix::rowDotMul);
   BaseMatrixCompare<0, 1, 2, 3>(&BaseMatrix::binaryClassificationError);
+
+  BaseMatrixCompare<0, 1>(&Matrix::sumOfSquaresBp);
 }
 
 TEST(BaseMatrix, Aggregate) {
   BaseMatrixAsColVector<0>(&BaseMatrix::maxRows);
   BaseMatrixAsColVector<0>(&BaseMatrix::minRows);
   BaseMatrixAsColVector<0, 1, 2>(&BaseMatrix::sumRows);
+  BaseMatrixAsColVector<0, 1>(&Matrix::sumOfSquares);
 
   BaseMatrixAsRowVector<0>(&BaseMatrix::maxCols);
   BaseMatrixAsRowVector<0>(&BaseMatrix::minCols);
   BaseMatrixAsRowVector<0, 1>(&BaseMatrix::addDotMulVMM);
   BaseMatrixAsRowVector<0, 1, 2>(&BaseMatrix::sumCols);
+  BaseMatrixAsRowVector<0, 1>(
+      static_cast<void (Matrix::*)(Matrix&, real)>(&Matrix::collectBias));
 }
 
 int main(int argc, char** argv) {
diff --git a/paddle/math/tests/test_Matrix.cpp b/paddle/math/tests/test_Matrix.cpp
index 485e702a66f05a472919297b01500c94ef9f36b5..b766e5ebe27f087108b65235999de78aa4539d5a 100644
--- a/paddle/math/tests/test_Matrix.cpp
+++ b/paddle/math/tests/test_Matrix.cpp
@@ -14,25 +14,295 @@ limitations under the License. */
 
 #ifndef PADDLE_ONLY_CPU
 /**
- * This test file compares the implementation of CPU and GPU function
- * in Matrix.cpp.
+ * This test file uses AutoCompare to compare the CPU and GPU
+ * implementations of the member functions in Matrix.cpp.
+ *
+ * 1. Construct an AutoCompare object; an AutoCompare object contains
+ *    a CpuMatrix and a GpuMatrix.
+ * 2. Initialize the parameters required by the member function.
+ *    Only the CPU parameters need to be initialized.
+ * 3. Run the test through the operator() template, which calls the
+ *    member function on both matrices and compares the results.
+ *
+ * Use case:
+ *   AutoCompare test(...);
+ *   Init Argument arg1, arg2 ...
+ *   test(function, arg1, arg2....)
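+ *
+ * For example, to compare Matrix::selectRows on CPU and GPU:
+ *   AutoCompare test(numSamples, inputDim);
+ *   CpuMatrix arg1(tableSize, inputDim);
+ *   CpuIVector arg2(numSamples);
+ *   arg1.randomizeUniform();
+ *   arg2.rand(tableSize);
+ *   test(&Matrix::selectRows, arg1, arg2);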
+ * */ #include #include "TestUtils.h" -using namespace paddle; // NOLINT +using paddle::CpuMatrix; +using paddle::SparseValueType; +using paddle::SparseFormat; +using paddle::NO_VALUE; +using paddle::SPARSE_CSR; +using paddle::initMain; +using autotest::TensorCheckEqual; +using autotest::TensorCheckErr; +using autotest::AutoCompare; -TEST(Matrix, Matrix) { - BaseMatrixCompare<0>(&Matrix::softmax, true); - BaseMatrixCompare<0, 1>(&Matrix::sumOfSquaresBp); +void testBilinearFwdBwd(int numSamples, + int imgSizeH, + int imgSizeW, + int channels) { + int inWidth = imgSizeH * imgSizeW * channels; + int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels; + real ratioH = 0.5; + real ratioW = 0.5; + + AutoCompare forward(numSamples, outWidth); + CpuMatrix arg1(numSamples, inWidth); + arg1.randomizeUniform(); + forward(&Matrix::bilinearForward, + arg1, + imgSizeH, + imgSizeW, + 2 * imgSizeH, + 2 * imgSizeW, + channels, + ratioH, + ratioW); + + AutoCompare backward(numSamples, inWidth); + CpuMatrix arg2(numSamples, outWidth); + arg2.randomizeUniform(); + backward(&Matrix::bilinearBackward, + arg2, + 2 * imgSizeH, + 2 * imgSizeW, + imgSizeH, + imgSizeW, + channels, + ratioH, + ratioW); +} + +TEST(Matrix, BilinearFwdBwd) { + for (auto numSamples : {5, 10}) { + for (auto channels : {8, 16}) { + for (auto imgSizeH : {14, 28}) { + for (auto imgSizeW : {16, 30}) { + VLOG(3) << " numSamples=" << numSamples << " channels=" << channels + << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW; + testBilinearFwdBwd(numSamples, imgSizeH, imgSizeW, channels); + } + } + } + } +} + +void testMatrixAddBias(int height, int width, real scale) { + AutoCompare test(height, width); + CpuMatrix arg1(1, width); + arg1.randomizeUniform(); + test(static_cast(&Matrix::addBias), + arg1, + scale); +} + +void testMatrixAddDotMulMMV(int height, int width) { + AutoCompare test(height, width); + CpuMatrix arg1(height, width); + CpuMatrix arg2(1, width); + arg1.randomizeUniform(); + arg2.randomizeUniform(); + test(&BaseMatrix::addDotMulMMV, arg1, arg2); +} + +TEST(Matrix, unary) { + for (auto height : {1, 3, 11, 73, 128, 200, 330}) { + for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { + VLOG(3) << " height=" << height << " width=" << width; + testMatrixAddBias(height, width, 1.0); + testMatrixAddBias(height, width, 3.5); + testMatrixAddDotMulMMV(height, width); + } + } +} + +void testMatrixAddAtOffset(int height, int width1, int width2, int offset) { + AutoCompare test(height, width2); + CpuMatrix arg1(height, width1); + arg1.randomizeUniform(); + test(&Matrix::addAtOffset, arg1, offset); +} + +void testMatrixAssignAtOffset(int height, int width1, int width2, int offset) { + AutoCompare test(height, width2); + CpuMatrix arg1(height, width1); + arg1.randomizeUniform(); + test(&Matrix::assignAtOffset, arg1, offset); +} + +TEST(Matrix, AtOffset) { + for (auto height : {1, 11, 73, 128, 200}) { + for (auto width1 : {1, 32, 100, 512, 1000}) { + for (auto width2 : {1, 32, 100, 512, 1000}) { + int columnOffset = 0; + int offset = std::abs(width1 - width2); + if (offset) { + columnOffset = std::rand() % offset; + } + VLOG(3) << " height=" << height << " width1=" << width1 + << " width2=" << width2 << " columnOffset = " << columnOffset; + testMatrixAddAtOffset(height, width1, width2, columnOffset); + testMatrixAssignAtOffset(height, width1, width2, columnOffset); + } + } + } +} + +void testMatrixSelectRows(int numSamples, int tableSize, int inputDim) { + AutoCompare test(numSamples, inputDim); + CpuMatrix arg1(tableSize, inputDim); + 
+void testMatrixSelectRows(int numSamples, int tableSize, int inputDim) {
+  AutoCompare test(numSamples, inputDim);
+  CpuMatrix arg1(tableSize, inputDim);
+  CpuIVector arg2(numSamples);
+  arg1.randomizeUniform();
+  arg2.rand(tableSize);
+  test(&Matrix::selectRows, arg1, arg2);
+}
+
+TEST(Matrix, tableProjection) {
+  for (auto numSamples : {10, 100, 1000, 10000, 80000}) {
+    for (auto tableSize : {10, 100}) {
+      for (auto inputDim : {20, 50}) {
+        VLOG(3) << " numSamples=" << numSamples << " tableSize=" << tableSize
+                << " inputDim=" << inputDim;
+        testMatrixSelectRows(numSamples, tableSize, inputDim);
+      }
+    }
+  }
+}
+
+void testMatrixCopyByRowIndex(int outHeight, int inHeight, int width) {
+  AutoCompare test(outHeight, width);
+  CpuMatrix arg1(inHeight, width);
+  CpuIVector arg2(outHeight);
+  arg1.randomizeUniform();
+  arg2.rand(inHeight);
+  test(&Matrix::copyByRowIndex, arg1, arg2);
 }
 
-TEST(Matrix, Aggregate) {
-  BaseMatrixAsRowVector<0, 1>(
-      static_cast<void (Matrix::*)(Matrix&, real)>(&Matrix::collectBias));
+TEST(Matrix, copyByRowIndex) {
+  for (auto outHeight : {31, 500, 1000}) {
+    for (auto inHeight : {17, 257, 500, 1200}) {
+      for (auto width : {512, 1024}) {
+        VLOG(3) << outHeight << " " << inHeight << " " << width;
+        testMatrixCopyByRowIndex(outHeight, inHeight, width);
+      }
+    }
+  }
+}
+
+void testCosSim(int heightX, int heightY, int width, real scale) {
+  AutoCompare test(heightX, 1);
+  CpuMatrix arg1(heightX, width);
+  CpuMatrix arg2(heightY, width);
+  arg1.randomizeUniform();
+  arg2.randomizeUniform();
+  arg2.add(-0.5);
+  test(&Matrix::cosSim, arg1, arg2, scale);
+}
+
+TEST(Matrix, cosSim) {
+  for (auto heightX : {10, 100, 1000}) {
+    for (auto heightY : {1, heightX}) {
+      for (auto width : {10, 100, 1000}) {
+        for (auto scale : {1.0, 2.0}) {
+          testCosSim(heightX, heightY, width, scale);
+        }
+      }
+    }
+  }
+}
+
+void testParamReluForward(int height, int width, int w_height, int w_width) {
+  AutoCompare test(height, width);
+  CpuMatrix arg1(height, width);
+  CpuMatrix arg2(w_height, w_width);
+  arg1.randomizeUniform();
+  arg2.randomizeUniform();
+  arg1.add(-0.5);
+  test(&Matrix::paramReluForward, arg1, arg2);
+}
+
+void testParamReluBackwardW(int height, int width, int w_height, int w_width) {
+  AutoCompare test(w_height, w_width);
+  CpuMatrix arg1(height, width);
+  CpuMatrix arg2(height, width);
+  arg1.randomizeUniform();
+  arg2.randomizeUniform();
+  arg2.add(-0.5);
+  test(&Matrix::paramReluBackwardW, arg1, arg2);
+}
+
+TEST(Matrix, paramRelu) {
+  for (auto height : {10, 100}) {
+    for (auto width : {10, 100}) {
+      for (auto w_height : {1, 2}) {
+        for (auto w_width : {1, 2}) {
+          testParamReluForward(height, width, w_height, w_width);
+          testParamReluBackwardW(height, width, w_height, w_width);
+        }
+      }
+    }
+  }
+}
+
+void testAddSharedBias(int numSamples, int dim, int channel) {
+  AutoCompare test(numSamples, dim);
+  CpuMatrix arg1(1, channel);
+  arg1.randomizeUniform();
+  test(&Matrix::addSharedBias, arg1, 1.0);
+}
+
+void testCollectSharedBias(int numSamples, int dim, int channel) {
+  AutoCompare test(1, channel);
+  CpuMatrix arg1(numSamples, dim);
+  arg1.randomizeUniform();
+  test(&Matrix::collectSharedBias, arg1, 1.0);
+}
+
+TEST(Matrix, sharedBias) {
+  for (auto numSamples : {1, 100, 520}) {
+    for (auto dim : {100 * 16, 100 * 32}) {
+      for (auto channel : {8, 16}) {
+        VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
+                << " channel=" << channel;
+        testAddSharedBias(numSamples, dim, channel);
+        testCollectSharedBias(numSamples, dim, channel);
+      }
+    }
+  }
+}
+
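+// The CSR label matrix arg2 gets exactly one non-zero entry per row, at a
+// random column: every sample carries a single positive label.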
+void testMultiBinaryLabelCrossEntropy(int numSamples, int dim) {
+  AutoCompare forward(numSamples, 1);
+  CpuMatrix arg1(numSamples, dim);
+  CpuSparseMatrix arg2(numSamples, dim, numSamples, NO_VALUE, SPARSE_CSR);
+
+  CpuMatrix output1(numSamples, dim);
+  output1.randomizeUniform();
+  output1.softmax(arg1);
+  for (int i = 0; i < numSamples; i++) {
+    const unsigned int id = std::rand() % dim;
+    arg2.setRow(i, 1, &id, nullptr);
+  }
+  forward(&Matrix::multiBinaryLabelCrossEntropy, arg1, arg2);
+
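+  // Reuse the same softmax output and sparse labels to compare the
+  // gradients computed by multiBinaryLabelCrossEntropyBp.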
+  AutoCompare backward(numSamples, dim);
+  backward(&Matrix::multiBinaryLabelCrossEntropyBp, arg1, arg2);
+}
 
-  BaseMatrixAsColVector<0, 1>(&Matrix::sumOfSquares);
+TEST(Matrix, multiBinaryCrossEntropy) {
+  for (auto numSamples : {100, 1000, 10000}) {
+    for (auto dim : {100, 1000, 10000}) {
+      VLOG(3) << " numSamples=" << numSamples << " dim=" << dim;
+      testMultiBinaryLabelCrossEntropy(numSamples, dim);
+    }
+  }
 }
 
 int main(int argc, char** argv) {
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 86a4a0e5ec5a046c7339644280d7f67400485569..4895583d32675214e5f14583a2c2b7a1ca075558 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -22,163 +22,12 @@ limitations under the License. */
 #include
 #include "paddle/gserver/tests/TestUtil.h"
 #include "paddle/utils/Stat.h"
+#include "TensorCheck.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
-
-template <class T>
-void VectorCheckEqual(const VectorT<T>& vector1, const VectorT<T>& vector2) {
-  CHECK(vector1.getSize() == vector2.getSize());
-
-  const T* data1 = vector1.getData();
-  const T* data2 = vector2.getData();
-  size_t size = vector1.getSize();
-  int count = 0;
-  for (size_t i = 0; i < size; i++) {
-    if (data1[i] != data2[i]) {
-      count++;
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-void MatrixCheckEqual(const Matrix& matrix1, const Matrix& matrix2) {
-  CHECK(matrix1.getHeight() == matrix2.getHeight());
-  CHECK(matrix1.getWidth() == matrix2.getWidth());
-
-  int height = matrix1.getHeight();
-  int width = matrix1.getWidth();
-  const real* data1 = matrix1.getData();
-  const real* data2 = matrix2.getData();
-  int count = 0;
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      if (data1[i * width + j] != data2[i * width + j]) {
-        count++;
-      }
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-void MatrixCheckErr(const Matrix& matrix1, const Matrix& matrix2) {
-  CHECK(matrix1.getHeight() == matrix2.getHeight());
-  CHECK(matrix1.getWidth() == matrix2.getWidth());
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-
-  int height = matrix1.getHeight();
-  int width = matrix1.getWidth();
-  const real* data1 = matrix1.getData();
-  const real* data2 = matrix2.getData();
-  int count = 0;
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      real a = data1[i * width + j];
-      real b = data2[i * width + j];
-      if (fabs(a - b) > err) {
-        if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) {
-          count++;
-        }
-      }
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-void testBilinearFwdBwd(int numSamples,
-                        int imgSizeH,
-                        int imgSizeW,
-                        int channels) {
-  int inWidth = imgSizeH * imgSizeW * channels;
-  int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels;
-  real ratioH = 0.5;
-  real ratioW = 0.5;
-  // forward
-  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
-
-  input->randomizeUniform();
-  inputGpu->copyFrom(*input);
-
-  target->bilinearForward(*input,
-                          imgSizeH,
-                          imgSizeW,
-                          2 * imgSizeH,
-                          2 * imgSizeW,
-                          channels,
-                          ratioH,
-                          ratioW);
-  targetGpu->bilinearForward(*inputGpu,
-                             imgSizeH,
-                             imgSizeW,
-                             2 * imgSizeH,
-                             2 * imgSizeW,
-                             channels,
-                             ratioH,
-                             ratioW);
-
-  // check
-  targetCheck->copyFrom(*targetGpu);
-  MatrixCheckErr(*target, *targetCheck);
-
-  // backward
-  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad =
-      GpuMatrix::create(numSamples, outWidth, false, true);
-  MatrixPtr targetCheckGrad =
-      CpuMatrix::create(numSamples, inWidth, false, false);
-
-  inputGrad->randomizeUniform();
-  targetGrad->randomizeUniform();
-  inputGpuGrad->copyFrom(*inputGrad);
-  targetGpuGrad->copyFrom(*targetGrad);
-
-  inputGrad->bilinearBackward(*targetGrad,
-                              2 * imgSizeH,
-                              2 * imgSizeW,
-                              imgSizeH,
-                              imgSizeW,
-                              channels,
-                              ratioH,
-                              ratioW);
-  inputGpuGrad->bilinearBackward(*targetGpuGrad,
-                                 2 * imgSizeH,
-                                 2 * imgSizeW,
-                                 imgSizeH,
-                                 imgSizeW,
-                                 channels,
-                                 ratioH,
-                                 ratioW);
-
-  // check
-  targetCheckGrad->copyFrom(*inputGpuGrad);
-  MatrixCheckErr(*inputGrad, *targetCheckGrad);
-}
-
-TEST(Matrix, BilinearFwdBwd) {
-  for (auto numSamples : {5, 10}) {
-    for (auto channels : {8, 16}) {
-      for (auto imgSizeH : {14, 28}) {
-        for (auto imgSizeW : {16, 30}) {
-          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
-                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
-          testBilinearFwdBwd(numSamples, imgSizeH, imgSizeW, channels);
-        }
-      }
-    }
-  }
-}
+using autotest::TensorCheckEqual;
+using autotest::TensorCheckErr;
 
 void testMatrixProjectionForward(int contextStart,
                                  int contextLength,
@@ -232,12 +81,7 @@ void testMatrixProjectionForward(int contextStart,
                        beginPad,
                        padding);
 
-  // check
-  MatrixPtr outputCheck =
-      std::make_shared<CpuMatrix>(batchSize, inputDim * contextLength);
-  outputCheck->copyFrom(*gpuOutput);
-
-  MatrixCheckEqual(*cpuOutput, *outputCheck);
+  TensorCheckEqual(*cpuOutput, *gpuOutput);
 }
 
 void testMatrixProjectionBackward(int contextStart,
@@ -294,15 +138,9 @@ void testMatrixProjectionBackward(int contextStart,
                               beginPad);
   }
 
-  // check
-  MatrixPtr inputGradCheck = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  inputGradCheck->copyFrom(*gpuInputGrad);
-  MatrixCheckErr(*cpuInputGrad, *inputGradCheck);
-
+  TensorCheckErr(*cpuInputGrad, *gpuInputGrad);
   if (padding) {
-    MatrixPtr weightGradChcek = std::make_shared<CpuMatrix>(pad, inputDim);
-    weightGradChcek->copyFrom(*gpuWeightGrad);
-    MatrixCheckErr(*cpuWeightGrad, *weightGradChcek);
+    TensorCheckErr(*cpuWeightGrad, *gpuWeightGrad);
   }
 }
 
@@ -361,15 +199,8 @@ void testMatrixMaxSequence(int batchSize, int inputDim) {
   cpuOutput->maxSequenceForward(*cpuInput, *cpuSequence, *cpuIndex);
   gpuOutput->maxSequenceForward(*gpuInput, *gpuSequence, *gpuIndex);
 
-  // check
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
-  outputCheck->copyFrom(*gpuOutput);
-  MatrixCheckEqual(*cpuOutput, *outputCheck);
-
-  IVectorPtr indexCheck = nullptr;
-  IVector::resizeOrCreate(indexCheck, newBatchSize * inputDim, false);
-  indexCheck->copyFrom(*gpuIndex);
-  VectorCheckEqual(*cpuIndex, *indexCheck);
+  TensorCheckEqual(*cpuOutput, *gpuOutput);
+  TensorCheckEqual(*cpuIndex, *gpuIndex);
 
   // backward
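+  // maxSequenceBackward scatters the output gradients back to the input
+  // rows recorded in cpuIndex/gpuIndex during the forward pass.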
   MatrixPtr cpuOutputGrad = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
@@ -385,10 +216,7 @@ void testMatrixMaxSequence(int batchSize, int inputDim) {
   cpuInputGrad->maxSequenceBackward(*cpuOutputGrad, *cpuSequence, *cpuIndex);
   gpuInputGrad->maxSequenceBackward(*gpuOutputGrad, *gpuSequence, *gpuIndex);
 
-  // check
-  MatrixPtr inputGradCheck = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  inputGradCheck->copyFrom(*gpuInputGrad);
-  MatrixCheckEqual(*cpuInputGrad, *inputGradCheck);
+  TensorCheckEqual(*cpuInputGrad, *gpuInputGrad);
 }
 
 TEST(Matrix, maxSequence) {
@@ -431,6 +259,8 @@ void testMatrixZeroAtOffset(int height, int width) {
   int columnOffset = rand() % width;  // NOLINT we just use rand() for test.
   int numColumns = rand() % (width - columnOffset);  // NOLINT
 
+  if (numColumns == 0) return;
+
   cpuA->zeroAtOffset(columnOffset, numColumns);
   gpuA->zeroAtOffset(columnOffset, numColumns);
 
@@ -442,61 +272,8 @@ void testMatrixZeroAtOffset(int height, int width) {
     }
   }
 
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckEqual(*cpuA, *outputCheck);
-  MatrixCheckEqual(*cpuA, *cpuTest);
-}
-
-void testMatrixAddBias(int height, int width, real scale) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(1, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(1, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-
-  cpuA->addBias(*cpuB, scale);
-  gpuA->addBias(*gpuB, scale);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckErr(*cpuA, *outputCheck);
-}
-
-void testMatrixAddDotMulMMV(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuC = std::make_shared<CpuMatrix>(1, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuC = std::make_shared<GpuMatrix>(1, width);
-
-  MatrixPtr cpuA1 = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB1 = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuC1 = std::make_shared<CpuMatrix>(1, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  cpuC->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-  gpuC->copyFrom(*cpuC);
-  cpuA1->copyFrom(*cpuA);
-  cpuB1->copyFrom(*cpuB);
-  cpuC1->copyFrom(*cpuC);
-
-  cpuA->addDotMulMMV(*cpuB, *cpuC);
-  gpuA->addDotMulMMV(*gpuB, *gpuC);
-  cpuA1->addDotMulMMV2(*cpuB1, *cpuC1);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckErr(*cpuA, *outputCheck);
-  MatrixCheckEqual(*cpuA, *cpuA1);
+  TensorCheckEqual(*cpuA, *gpuA);
+  TensorCheckEqual(*cpuA, *cpuTest);
 }
 
 void testMatrixTranspose(int height, int width) {
@@ -510,9 +287,7 @@ void testMatrixTranspose(int height, int width) {
   cpu->transpose(cpuT, false);
   gpu->transpose(gpuT, false);
 
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(width, height);
-  outputCheck->copyFrom(*gpuT);
-  MatrixCheckEqual(*cpuT, *outputCheck);
+  TensorCheckEqual(*cpuT, *gpuT);
 }
 
 void testMatrixInverse(int height) {
@@ -533,12 +308,11 @@ void testMatrixInverse(int height) {
   cpu->inverse(cpuI, false);
   gpu->inverse(gpuI, false);
 
-  outputCheck->copyFrom(*gpuI);
-  MatrixCheckErr(*cpuI, *outputCheck);
+  TensorCheckErr(*cpuI, *gpuI);
 
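+  // cpu * cpuI should reproduce the identity, which is checked against a
+  // freshly set diagonal matrix below.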
   outputCheck->mul(cpu, cpuI);
   cpu->setDiag(1.0);
-  MatrixCheckErr(*cpu, *outputCheck);
+  TensorCheckErr(*cpu, *outputCheck);
 }
 
 TEST(Matrix, unary) {
@@ -546,15 +320,8 @@ {
     for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
       VLOG(3) << " height=" << height << " width=" << width;
 
-      // asRowVector
-      testMatrixAddBias(height, width, 1.0);
-      testMatrixAddBias(height, width, 3.5);
-      testMatrixAddDotMulMMV(height, width);
-
-      // sum
+      testMatrixZeroAtOffset(height, width);
       testMatrixGetSum(height, width);
-
-      // transpose
       testMatrixTranspose(height, width);
     }
     // inverse
@@ -562,6 +329,22 @@
   }
 }
 
+void testMatrixSoftmax(int height, int width) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
+
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+  cpuOutput->zero();
+  gpuOutput->zero();
+  cpuInput->softmax(*cpuOutput);
+  gpuInput->softmax(*gpuOutput);
+
+  TensorCheckErr(*cpuOutput, *gpuOutput);
+}
+
 void testSequenceSoftmax(int batchSize) {
   // forward
   int inputDim = 1;
@@ -578,10 +361,7 @@ void testSequenceSoftmax(int batchSize) {
   cpuInput->sequenceSoftmax(*cpuInput, *cpuSequence);
   gpuInput->sequenceSoftmax(*gpuInput, *gpuSequence);
 
-  // check
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  outputCheck->copyFrom(*gpuInput);
-  MatrixCheckErr(*cpuInput, *outputCheck);
+  TensorCheckErr(*cpuInput, *gpuInput);
 }
 
 void testMatrixSoftmaxThreshold(int height, int width) {
@@ -634,9 +414,7 @@ void testMatrixSoftmaxBp(int height, int width) {
   sftMaxSum->colMerge(*sftMaxDot);
   cpuOutput->softmaxDerivative(*cpuInput, *sftMaxSum);
 
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuOutput);
-  MatrixCheckErr(*cpuOutput, *outputCheck);
+  TensorCheckErr(*cpuOutput, *gpuOutput);
 }
 
 TEST(Matrix, softmax) {
@@ -644,6 +422,7 @@ TEST(Matrix, softmax) {
     for (auto width : {1, 32, 100, 512, 1000}) {
       VLOG(3) << " height=" << height << " width=" << width;
 
+      testMatrixSoftmax(height, width);
       testMatrixSoftmaxBp(height, width);
       testMatrixSoftmaxThreshold(height, width);
     }
@@ -651,95 +430,6 @@ TEST(Matrix, softmax) {
   }
 }
 
-void testMatrixAddAtOffset(int height, int width1, int width2) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width1);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width2);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width1);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width2);
-
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-  cpuOutput->randomizeUniform();
-  gpuOutput->copyFrom(*cpuOutput);
-
-  int columnOffset = 0;
-  int offset = std::abs(width1 - width2);
-  if (offset) {
-    columnOffset = rand() % offset;  // NOLINT
-  }
-  cpuOutput->addAtOffset(*cpuInput, columnOffset);
-  gpuOutput->addAtOffset(*gpuInput, columnOffset);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width2);
-  outputCheck->copyFrom(*gpuOutput);
-  MatrixCheckEqual(*cpuOutput, *outputCheck);
-}
-
-void testMatrixAssignAtOffset(int height, int width1, int width2) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width1);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width2);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width1);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width2);
-
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-  cpuOutput->randomizeUniform();
-  gpuOutput->copyFrom(*cpuOutput);
-
-  int columnOffset = 0;
-  int offset = std::abs(width1 - width2);
-  if (offset) {
-    columnOffset = rand() % offset;  // NOLINT
-  }
-  cpuOutput->assignAtOffset(*cpuInput, columnOffset);
-  gpuOutput->assignAtOffset(*gpuInput, columnOffset);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width2);
-  outputCheck->copyFrom(*gpuOutput);
-  MatrixCheckEqual(*cpuOutput, *outputCheck);
-}
-
-TEST(Matrix, AtOffset) {
-  for (auto height : {1, 11, 73, 128, 200}) {
-    for (auto width1 : {1, 32, 100, 512, 1000}) {
-      for (auto width2 : {1, 32, 100, 512, 1000}) {
-        VLOG(3) << " height=" << height << " width1=" << width1
-                << " width2=" << width2;
-
-        testMatrixAddAtOffset(height, width1, width2);
-        testMatrixAssignAtOffset(height, width1, width2);
-      }
-    }
-  }
-}
-
-void testMatrixSelectRows(int numSamples, int tableSize, int inputDim) {
-  MatrixPtr cpuTable = std::make_shared<CpuMatrix>(tableSize, inputDim);
-  MatrixPtr gpuTable = std::make_shared<GpuMatrix>(tableSize, inputDim);
-  cpuTable->randomizeUniform();
-  gpuTable->copyFrom(*cpuTable);
-
-  IVectorPtr cpuIds;
-  IVectorPtr gpuIds;
-  cpuIds = VectorT<int>::create(numSamples, false);
-  gpuIds = VectorT<int>::create(numSamples, true);
-  cpuIds->rand(tableSize);
-  gpuIds->copyFrom(*cpuIds);
-
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, inputDim);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(numSamples, inputDim);
-  cpuOutput->randomizeUniform();
-  gpuOutput->copyFrom(*cpuOutput);
-
-  cpuOutput->selectRows(*cpuTable, *cpuIds);
-  gpuOutput->selectRows(*gpuTable, *gpuIds);
-
-  // check
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(numSamples, inputDim);
-  outputCheck->copyFrom(*gpuOutput);
-  MatrixCheckEqual(*cpuOutput, *outputCheck);
-}
-
 void testMatrixAddToRows(int numSamples, int tableSize, int inputDim) {
   MatrixPtr cpuTable = std::make_shared<CpuMatrix>(tableSize, inputDim);
   MatrixPtr gpuTable = std::make_shared<GpuMatrix>(tableSize, inputDim);
@@ -761,10 +451,7 @@ void testMatrixAddToRows(int numSamples, int tableSize, int inputDim) {
   cpuOutput->addToRows(*cpuTable, *cpuIds);
   gpuOutput->addToRows(*gpuTable, *gpuIds);
 
-  // check
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(tableSize, inputDim);
-  outputCheck->copyFrom(*gpuTable);
-  MatrixCheckErr(*cpuTable, *outputCheck);
+  TensorCheckErr(*cpuTable, *gpuTable);
 }
 
 TEST(Matrix, tableProjection) {
@@ -773,7 +460,6 @@ TEST(Matrix, tableProjection) {
       for (auto inputDim : {20, 50}) {
         VLOG(3) << " numSamples=" << numSamples << " tableSize=" << tableSize
                 << " inputDim=" << inputDim;
-        testMatrixSelectRows(numSamples, tableSize, inputDim);
         testMatrixAddToRows(numSamples, tableSize, inputDim);
       }
     }
@@ -807,9 +493,7 @@ void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
   cpuC->mul(cpuA, cpuB, alpha, beta);
   gpuC->mul(gpuA, gpuB, alpha, beta);
 
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(heightC, widthC);
-  outputCheck->copyFrom(*gpuC);
-  MatrixCheckErr(*cpuC, *outputCheck);
+  TensorCheckErr(*cpuC, *gpuC);
 }
 
 void testSubMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
@@ -881,9 +565,7 @@ void testSubMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
   subCpuC->mul(subCpuA, subCpuB, alpha, beta);
   subGpuC->mul(subGpuA, subGpuB, alpha, beta);
 
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(heightC, widthC);
-  outputCheck->copyFrom(*gpuC);
-  MatrixCheckErr(*cpuC, *outputCheck);
+  TensorCheckErr(*cpuC, *gpuC);
 }
 
 TEST(Matrix, mul) {
@@ -937,9 +619,7 @@ void testVectorReset(int size) {
   cpu->reset(value);
   gpu->reset(value);
 
-  std::shared_ptr<CpuVectorT<T>> out = std::make_shared<CpuVectorT<T>>(size);
-  out->copyFrom(*gpu);
-  VectorCheckEqual(*cpu, *out);
+  TensorCheckEqual(*cpu, *gpu);
 }
 
 template <class T>
@@ -965,9 +645,7 @@ void testVecortSelectFrom(int size) {
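+  // selectFrom gathers src[ids[i]] into dst; the CPU and GPU gathers must
+  // produce identical elements.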
   cpuDst->selectFrom(*cpuSrc, *cpuIds);
   gpuDst->selectFrom(*gpuSrc, *gpuIds);
 
-  std::shared_ptr<CpuVectorT<T>> out = std::make_shared<CpuVectorT<T>>(size);
-  out->copyFrom(*gpuDst);
-  VectorCheckEqual(*cpuDst, *out);
+  TensorCheckEqual(*cpuDst, *gpuDst);
 }
 
 template <class T>
@@ -978,9 +656,7 @@ void testVecotrZeroMem(int size) {
   cpu->zeroMem();
   gpu->zeroMem();
 
-  std::shared_ptr<CpuVectorT<T>> out = std::make_shared<CpuVectorT<T>>(size);
-  out->copyFrom(*gpu);
-  VectorCheckEqual(*cpu, *out);
+  TensorCheckEqual(*cpu, *gpu);
 }
 
 template <class T>
@@ -1001,9 +677,7 @@ void testVectorIsEqual(int size) {
   cpuA->isEqualTo(*cpuB, value);
   gpuA->isEqualTo(*gpuB, value);
 
-  std::shared_ptr<CpuVectorT<T>> out = std::make_shared<CpuVectorT<T>>(size);
-  out->copyFrom(*gpuA);
-  VectorCheckEqual(*cpuA, *out);
+  TensorCheckEqual(*cpuA, *gpuA);
 }
 
 TEST(Vector, Equal) {
@@ -1034,9 +708,7 @@ void testMatrixTopK(int samples, int dim, int beamSize) {
   cpuSrc->rowMax(*cpuIds, *cpuVal);
   gpuSrc->rowMax(*gpuIds, *gpuVal);
 
-  MatrixPtr outVal = std::make_shared<CpuMatrix>(samples, beamSize);
-  outVal->copyFrom(*gpuVal);
-  MatrixCheckEqual(*cpuVal, *outVal);
+  TensorCheckEqual(*cpuVal, *gpuVal);
 }
 
 TEST(Matrix, topK) {
@@ -1072,9 +744,7 @@ void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) {
   cpuSrc->rowMax(*cpuIds, *cpuVal);
   gpuSrc->rowMax(*gpuIds, *gpuVal);
 
-  MatrixPtr outCheckMaxVal = std::make_shared<CpuMatrix>(samples, beamSize);
-  outCheckMaxVal->copyFrom(*gpuVal);
-  MatrixCheckEqual(*cpuVal, *outCheckMaxVal);
+  TensorCheckEqual(*cpuVal, *gpuVal);
 
   IVectorPtr outCheckIds = std::make_shared<CpuIVector>(samples * beamSize);
   outCheckIds->copyFrom(*gpuIds);
@@ -1104,42 +774,6 @@ TEST(SMatrix, topK) {
   }
 }
 
-void testMatrixCopyByRowIndex(int outHeight, int inHeight, int width) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(inHeight, width);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(inHeight, width);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(outHeight, width);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(outHeight, width);
-  cpuOutput->zero();
-  gpuOutput->zero();
-
-  IVectorPtr cpuRowIndex = IVector::create(outHeight, false);
-  IVectorPtr gpuRowIndex = IVector::create(outHeight, true);
-  cpuRowIndex->rand(inHeight);
-  gpuRowIndex->copyFrom(*cpuRowIndex);
-
-  cpuOutput->copyByRowIndex(*cpuInput, *cpuRowIndex);
-  gpuOutput->copyByRowIndex(*gpuInput, *gpuRowIndex);
-
-  // check
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(outHeight, width);
-  outputCheck->copyFrom(*gpuOutput);
-  MatrixCheckEqual(*cpuOutput, *outputCheck);
-}
-
-TEST(Matrix, copyByRowIndex) {
-  for (auto outHeight : {31, 500, 1000}) {
-    for (auto inHeight : {17, 257, 500, 1200}) {
-      for (auto width : {512, 1024}) {
-        VLOG(3) << outHeight << " " << inHeight << " " << width;
-        testMatrixCopyByRowIndex(outHeight, inHeight, width);
-      }
-    }
-  }
-}
-
 void testMatrixSequenceAvgForward(int batchSize, int inputDim, int mode) {
   MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
   MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
@@ -1160,10 +794,7 @@ void testMatrixSequenceAvgForward(int batchSize, int inputDim, int mode) {
   cpuOutput->sequenceAvgForward(*cpuInput, *cpuSequence, mode);
   gpuOutput->sequenceAvgForward(*gpuInput, *gpuSequence, mode);
 
-  // check
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
-  outputCheck->copyFrom(*gpuOutput);
-  MatrixCheckErr(*cpuOutput, *outputCheck);
+  TensorCheckErr(*cpuOutput, *gpuOutput);
 }
 
 TEST(Matrix, sequenceAvgForward) {
@@ -1178,45 +809,6 @@ TEST(Matrix, sequenceAvgForward) {
   }
 }
 
-void testCosSim(int heightX, int heightY, int width, real scale) {
-  MatrixPtr prevOutX = CpuMatrix::create(heightX, width, false, false);
-  MatrixPtr prevOutY = CpuMatrix::create(heightY, width, false, false);
-  MatrixPtr output = CpuMatrix::create(heightX, 1, false, false);
-
-  prevOutX->randomizeUniform();
-  prevOutY->randomizeUniform();
-  prevOutX->add(-0.5);
-  prevOutY->add(-0.5);
-  output->randomizeUniform();
-
-  MatrixPtr prevOutXGpu = GpuMatrix::create(heightX, width, false, true);
-  MatrixPtr prevOutYGpu = GpuMatrix::create(heightY, width, false, true);
-  MatrixPtr outputGpu = GpuMatrix::create(heightX, 1, false, true);
-
-  prevOutXGpu->copyFrom(*prevOutX);
-  prevOutYGpu->copyFrom(*prevOutY);
-  outputGpu->copyFrom(*output);
-
-  output->cosSim(*prevOutX, *prevOutY, scale);
-  outputGpu->cosSim(*prevOutXGpu, *prevOutYGpu, scale);
-
-  MatrixPtr outputCheck = CpuMatrix::create(heightX, 1, false, false);
-  outputCheck->copyFrom(*outputGpu);
-  MatrixCheckErr(*output, *outputCheck);
-}
-
-TEST(Matrix, cosSim) {
-  for (auto heightX : {10, 100, 1000}) {
-    for (auto heightY : {1, heightX}) {
-      for (auto width : {10, 100, 1000}) {
-        for (auto scale : {1.0, 2.0}) {
-          testCosSim(heightX, heightY, width, scale);
-        }
-      }
-    }
-  }
-}
-
 void testCosSimDerivate(int heightX, int heightY, int width, real scale) {
   MatrixPtr prevOutX = CpuMatrix::create(heightX, width, false, false);
   MatrixPtr prevOutY = CpuMatrix::create(heightY, width, false, false);
@@ -1256,12 +848,8 @@ void testCosSimDerivate(int heightX, int heightY, int width, real scale) {
                                 *prevGradYGpu,
                                 scale);
 
-  MatrixPtr prevGradXCheck = CpuMatrix::create(heightX, width, false, false);
-  MatrixPtr prevGradYCheck = CpuMatrix::create(heightY, width, false, false);
-  prevGradXCheck->copyFrom(*prevGradXGpu);
-  prevGradYCheck->copyFrom(*prevGradYGpu);
-  MatrixCheckErr(*prevGradX, *prevGradXCheck);
-  MatrixCheckErr(*prevGradY, *prevGradYCheck);
+  TensorCheckErr(*prevGradX, *prevGradXGpu);
+  TensorCheckErr(*prevGradY, *prevGradYGpu);
 }
 
 TEST(Matrix, cosSimDerivate) {
@@ -1276,80 +864,6 @@ TEST(Matrix, cosSimDerivate) {
   }
 }
 
-void testParamReluForward(int height, int width, int w_height, int w_width) {
-  MatrixPtr output = CpuMatrix::create(height, width, false, false);
-  MatrixPtr input = CpuMatrix::create(height, width, false, false);
-  MatrixPtr w = CpuMatrix::create(w_height, w_width, false, false);
-
-  output->randomizeUniform();
-  input->randomizeUniform();
-  w->randomizeUniform();
-  input->add(-0.5);
-
-  MatrixPtr outputGpu = GpuMatrix::create(height, width, false, true);
-  MatrixPtr inputGpu = GpuMatrix::create(height, width, false, true);
-  MatrixPtr wGpu = GpuMatrix::create(w_height, w_width, false, true);
-
-  inputGpu->copyFrom(*input);
-  wGpu->copyFrom(*w);
-
-  output->paramReluForward(*input, *w);
-  outputGpu->paramReluForward(*inputGpu, *wGpu);
-
-  MatrixPtr outputCheck = CpuMatrix::create(height, width, false, false);
-  outputCheck->copyFrom(*outputGpu);
-  MatrixCheckEqual(*output, *outputCheck);
-}
-
-TEST(Matrix, paramReluForward) {
-  for (auto height : {10, 100}) {
-    for (auto width : {10, 100}) {
-      for (auto w_height : {1, 2}) {
-        for (auto w_width : {1, 2}) {
-          testParamReluForward(height, width, w_height, w_width);
-        }
-      }
-    }
-  }
-}
-
-void testParamReluBackwardW(int height, int width, int w_height, int w_width) {
-  MatrixPtr oGrad = CpuMatrix::create(height, width, false, false);
-  MatrixPtr input = CpuMatrix::create(height, width, false, false);
-  MatrixPtr w = CpuMatrix::create(w_height, w_width, false, false);
-
-  oGrad->randomizeUniform();
-  input->randomizeUniform();
-  w->randomizeUniform();
-  input->add(-0.5);
-
-  MatrixPtr oGradGpu = GpuMatrix::create(height, width, false, true);
-  MatrixPtr inputGpu = GpuMatrix::create(height, width, false, true);
-  MatrixPtr wGpu = GpuMatrix::create(w_height, w_width, false, true);
-
-  oGradGpu->copyFrom(*oGrad);
-  inputGpu->copyFrom(*input);
-  wGpu->copyFrom(*w);
-
-  w->paramReluBackwardW(*oGrad, *input);
-  wGpu->paramReluBackwardW(*oGradGpu, *inputGpu);
-  MatrixPtr wCheck = CpuMatrix::create(w_height, w_width, false, false);
-  wCheck->copyFrom(*wGpu);
-  MatrixCheckErr(*w, *wCheck);
-}
-
-TEST(Matrix, paramReluBackwardW) {
-  for (auto height : {10, 100}) {
-    for (auto width : {10, 100}) {
-      for (auto w_height : {1, 2}) {
-        for (auto w_width : {1, 2}) {
-          testParamReluBackwardW(height, width, w_height, w_width);
-        }
-      }
-    }
-  }
-}
-
 void testParamReluBackwardDiff(int height,
                                int width,
                                int w_height,
@@ -1378,9 +892,7 @@ void testParamReluBackwardDiff(int height,
   diff->paramReluBackwardDiff(*oGrad, *input, *w);
   diffGpu->paramReluBackwardDiff(*oGradGpu, *inputGpu, *wGpu);
 
-  MatrixPtr diffCheck = CpuMatrix::create(height, width, false, false);
-  diffCheck->copyFrom(*diffGpu);
-  MatrixCheckErr(*diff, *diffCheck);
+  TensorCheckErr(*diff, *diffGpu);
 }
 
 TEST(Matrix, paramReluBackwardDiff) {
@@ -1411,9 +923,7 @@ void testClassificationError(int numSamples, int dim) {
   cpuError->classificationError(cpuOutput, cpuLabel);
   gpuError->classificationError(gpuOutput, gpuLabel);
 
-  MatrixPtr check = std::make_shared<CpuMatrix>(numSamples, 1);
-  check->copyFrom(*gpuError);
-  MatrixCheckEqual(*cpuError, *check);
+  TensorCheckEqual(*cpuError, *gpuError);
 }
 
 TEST(Matrix, classificationError) {
@@ -1578,9 +1088,8 @@ void testAvgPoolFwdBwd(int numSamples,
                          outW,
                          padH,
                          padW);
-  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
-  targetCheck->copyFrom(*targetGpu);
-  MatrixCheckErr(*target, *targetCheck);
+
+  TensorCheckErr(*target, *targetGpu);
 
   MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
   MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
@@ -1619,10 +1128,8 @@ void testAvgPoolFwdBwd(int numSamples,
                              1.0,
                              padH,
                              padW);
-  MatrixPtr targetBwdCheck =
-      CpuMatrix::create(numSamples, inWidth, false, false);
-  targetBwdCheck->copyFrom(*inputGpuGrad);
-  MatrixCheckErr(*inputGrad, *targetBwdCheck);
+
+  TensorCheckErr(*inputGrad, *inputGpuGrad);
 }
 
 TEST(Matrix, PoolFwdBwd) {
@@ -1687,11 +1194,9 @@ void testMaxOutFwdBwd(
   MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
   MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
 
   IVectorPtr id = CpuIVector::create(numSamples * outWidth, false);
   IVectorPtr idGpu = GpuIVector::create(numSamples * outWidth, true);
-  IVectorPtr idCheck = CpuIVector::create(numSamples * outWidth, false);
 
   input->randomizeUniform();
   inputGpu->copyFrom(*input);
@@ -1699,11 +1204,8 @@ void testMaxOutFwdBwd(
   target->maxoutForward(*input, *id, outChannels, groups);
   targetGpu->maxoutForward(*inputGpu, *idGpu, outChannels, groups);
 
-  // check
-  targetCheck->copyFrom(*targetGpu);
-  MatrixCheckErr(*target, *targetCheck);
-  idCheck->copyFrom(*idGpu);
-  VectorCheckEqual(*id, *idCheck);
+  TensorCheckErr(*target, *targetGpu);
+  TensorCheckEqual(*id, *idGpu);
 
   // backward
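+  // maxoutBackward routes the gradients through the argmax ids recorded in
+  // the forward pass, so the CPU and GPU gradients must agree elementwise.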
   MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
   MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
 
   MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
   MatrixPtr targetGpuGrad =
       GpuMatrix::create(numSamples, outWidth, false, true);
-  MatrixPtr targetCheckGrad =
-      CpuMatrix::create(numSamples, inWidth, false, false);
 
   inputGrad->randomizeUniform();
   targetGrad->randomizeUniform();
@@ -1723,9 +1223,7 @@ void testMaxOutFwdBwd(
   inputGrad->maxoutBackward(*targetGrad, *id, outChannels, groups);
   inputGpuGrad->maxoutBackward(*targetGpuGrad, *idGpu, outChannels, groups);
 
-  // check
-  targetCheckGrad->copyFrom(*inputGpuGrad);
-  MatrixCheckErr(*inputGrad, *targetCheckGrad);
+  TensorCheckErr(*inputGrad, *inputGpuGrad);
 }
 
 TEST(Matrix, MaxOutFwdBwd) {
@@ -1745,113 +1243,6 @@ TEST(Matrix, MaxOutFwdBwd) {
   }
 }
 
-void testAddSharedBias(int numSamples, int dim, int channel) {
-  MatrixPtr cpuData = std::make_shared<CpuMatrix>(numSamples, dim);
-  MatrixPtr gpuData = std::make_shared<GpuMatrix>(numSamples, dim);
-
-  MatrixPtr cpuBias = std::make_shared<CpuMatrix>(1, channel);
-  MatrixPtr gpuBias = std::make_shared<GpuMatrix>(1, channel);
-
-  cpuData->randomizeUniform();
-  gpuData->copyFrom(*cpuData);
-  cpuBias->randomizeUniform();
-  gpuBias->copyFrom(*cpuBias);
-
-  cpuData->addSharedBias(*cpuBias, 1.0);
-  gpuData->addSharedBias(*gpuBias, 1.0);
-
-  MatrixPtr check = std::make_shared<CpuMatrix>(numSamples, dim);
-  check->copyFrom(*gpuData);
-  MatrixCheckErr(*cpuData, *check);
-}
-
-void testCollectSharedBias(int numSamples, int dim, int channel) {
-  MatrixPtr cpuData = std::make_shared<CpuMatrix>(numSamples, dim);
-  MatrixPtr gpuData = std::make_shared<GpuMatrix>(numSamples, dim);
-
-  MatrixPtr cpuBias = std::make_shared<CpuMatrix>(1, channel);
-  MatrixPtr gpuBias = std::make_shared<GpuMatrix>(1, channel);
-
-  cpuData->randomizeUniform();
-  gpuData->copyFrom(*cpuData);
-  cpuBias->randomizeUniform();
-  gpuBias->copyFrom(*cpuBias);
-
-  cpuBias->collectSharedBias(*cpuData, 1.0);
-  gpuBias->collectSharedBias(*gpuData, 1.0);
-
-  MatrixPtr check = std::make_shared<CpuMatrix>(1, channel);
-  check->copyFrom(*gpuBias);
-  MatrixCheckErr(*cpuBias, *check);
-}
-
-TEST(Matrix, sharedBias) {
-  for (auto numSamples : {1, 100, 520}) {
-    for (auto dim : {100 * 16, 100 * 32}) {
-      for (auto channel : {8, 16}) {
-        VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
-                << " channel=" << channel;
-        testAddSharedBias(numSamples, dim, channel);
-        testCollectSharedBias(numSamples, dim, channel);
-      }
-    }
-  }
-}
-
-void testMultiBinaryLabelCrossEntropy(int numSamples, int dim) {
-  MatrixPtr output = std::make_shared<CpuMatrix>(numSamples, dim);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, dim);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(numSamples, dim);
-
-  MatrixPtr cpuEntropy = std::make_shared<CpuMatrix>(numSamples, 1);
-  MatrixPtr gpuEntropy = std::make_shared<GpuMatrix>(numSamples, 1);
-
-  MatrixPtr cpuGrad = std::make_shared<CpuMatrix>(numSamples, dim);
-  MatrixPtr gpuGrad = std::make_shared<GpuMatrix>(numSamples, dim);
-
-  MatrixPtr cpuLabel = std::make_shared<CpuSparseMatrix>(
-      numSamples, dim, numSamples, NO_VALUE, SPARSE_CSR, false);
-  MatrixPtr gpuLabel = std::make_shared<GpuSparseMatrix>(
-      numSamples, dim, numSamples, NO_VALUE, SPARSE_CSR, false);
-  for (int i = 0; i < numSamples; i++) {
-    const unsigned int id = rand() % dim;  // NOLINT
-    cpuLabel->setRow(i, 1, &id, nullptr);
-    gpuLabel->setRow(i, 1, &id, nullptr);
-  }
-
-  output->randomizeUniform();
-  cpuOutput->zeroMem();
-  output->softmax(*cpuOutput);
-  gpuOutput->copyFrom(*cpuOutput);
-
-  cpuEntropy->zeroMem();
-  gpuEntropy->zeroMem();
-  cpuEntropy->multiBinaryLabelCrossEntropy(*cpuOutput, *cpuLabel);
-  gpuEntropy->multiBinaryLabelCrossEntropy(*gpuOutput, *gpuLabel);
-
-  MatrixPtr check1 = std::make_shared<CpuMatrix>(numSamples, 1);
-  check1->copyFrom(*gpuEntropy);
-  MatrixCheckErr(*cpuEntropy, *check1);
-
-  cpuGrad->zeroMem();
-  gpuGrad->zeroMem();
-  cpuGrad->multiBinaryLabelCrossEntropyBp(*cpuOutput, *cpuLabel);
-  gpuGrad->multiBinaryLabelCrossEntropyBp(*gpuOutput, *gpuLabel);
-
-  MatrixPtr check2 = std::make_shared<CpuMatrix>(numSamples, dim);
-  check2->copyFrom(*gpuGrad);
-  MatrixCheckErr(*cpuGrad, *check2);
-}
-
-TEST(Matrix, multiBinaryCrossEntropy) {
-  for (auto numSamples : {100, 1000, 10000}) {
-    for (auto dim : {100, 1000, 10000}) {
-      VLOG(3) << " numSamples=" << numSamples << " dim=" << dim;
-      testMultiBinaryLabelCrossEntropy(numSamples, dim);
-    }
-  }
-}
-
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);