From a948eea3ed84f950c6c95d84431eeb3717951fc3 Mon Sep 17 00:00:00 2001 From: xutianbing Date: Wed, 4 Jan 2017 15:53:53 -0800 Subject: [PATCH] clean unused code. --- paddle/cuda/include/hl_matrix.h | 42 ----- paddle/cuda/include/stub/hl_matrix_stub.h | 19 --- paddle/cuda/src/hl_cuda_matrix.cu | 171 -------------------- paddle/function/CosSimOp.cpp | 52 +++--- paddle/function/CosSimOp.h | 20 +-- paddle/function/CosSimOpTest.cpp | 24 +-- paddle/gserver/layers/CosSimLayer.cpp | 12 +- paddle/gserver/layers/CosSimVecMatLayer.cpp | 14 +- paddle/gserver/layers/CosSimVecMatLayer.h | 2 + paddle/math/Matrix.cpp | 152 ----------------- paddle/math/Matrix.h | 36 ----- paddle/math/tests/test_Matrix.cpp | 22 --- paddle/math/tests/test_matrixCompare.cpp | 55 ------- 13 files changed, 64 insertions(+), 557 deletions(-) diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/cuda/include/hl_matrix.h index 40828dd5cc7..6f21b82afdc 100644 --- a/paddle/cuda/include/hl_matrix.h +++ b/paddle/cuda/include/hl_matrix.h @@ -188,48 +188,6 @@ extern void hl_param_relu_backward_diff(real* grad_o, int width, int height, int partial_sum); -/** - * @brief cos sim forward - * - * @param[out] output output data - * @param[in] input1 input1 data(matrix) - * @param[in] input2 input2 data(matrix or vector) - * @param[in] width matrix width - * @param[in] input1_height input1_height - * @param[in] input2_height input2_height - * @param[in] scale scale factor - */ -extern void hl_cossim(real* output, - real* input1, - real* input2, - int width, - int input1_height, - int input2_height, - real scale); -/** - * @brief cos sim derivate - * - * @param[in] grad output grad - * @param[in] output output data - * @param[in] prevOutX input1 data - * @param[in] prevOutY input2 data - * @param[out] prevGradX input1 grad - * @param[out] prevGradY input2 grad - * @param[in] width matrix width - * @param[in] input1_height input1 height - * @param[in] input2_height input2 height - * @param[in] scale scale factor - */ -extern void hl_cossim_derivative(real* grad, - real* output, - real* prevOutX, - real* prevOutY, - real* prevGradX, - real* prevGradY, - int width, - int input1_height, - int input2_height, - real scale); /** * @brief Matrix addition: A_d[i][j] += scale * B_d[j/channel]. 
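[Reviewer note] The hl_cossim / hl_cossim_derivative declarations removed above implemented a scaled, row-wise cosine similarity; after this patch the computation lives only in paddle/function/CosSimOp.*. For context (this restates the standard definition, it is not text from the patch), the forward pass produces, for each sample row i,

\[
\mathrm{output}[i] \;=\; s \cdot \frac{\langle x_i,\, y_k \rangle}{\lVert x_i \rVert \, \lVert y_k \rVert},
\qquad
k = \begin{cases} i & \text{if } \mathrm{input2\_height} > 1 \\ 0 & \text{if } \mathrm{input2\_height} = 1, \end{cases}
\]

where x_i is row i of input1, y_k is the matching row of input2 (broadcast when input2 has a single row), and s is the scale factor.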
diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/cuda/include/stub/hl_matrix_stub.h index a1712d1e4d2..f4e6461cdcf 100644 --- a/paddle/cuda/include/stub/hl_matrix_stub.h +++ b/paddle/cuda/include/stub/hl_matrix_stub.h @@ -74,25 +74,6 @@ inline void hl_param_relu_backward_diff(real* grad_o, int height, int partial_sum) {} -inline void hl_cossim(real* output, - real* input1, - real* input2, - int width, - int input1_height, - int input2_height, - real scale) {} - -inline void hl_cossim_derivative(real* grad, - real* output, - real* prevOutX, - real* prevOutY, - real* prevGradX, - real* prevGradY, - int width, - int input1_height, - int input2_height, - real scale) {} - inline void hl_matrix_add_shared_bias(real* A_d, real* B_d, const int channel, diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu index cd23bd31057..96c07d9c3b7 100644 --- a/paddle/cuda/src/hl_cuda_matrix.cu +++ b/paddle/cuda/src/hl_cuda_matrix.cu @@ -584,177 +584,6 @@ void hl_param_relu_backward_diff(real* grad_o, CHECK_SYNC("hl_param_relu_backward_diff failed"); } -template -__global__ void KeCosSim(real* output, - real* input1, - real* input2, - int width, - int input1_height, - int input2_height, - real scale) { - const int ty = blockIdx.y; - int tid = threadIdx.x; - - __shared__ real xx[blockSize]; - __shared__ real yy[blockSize]; - __shared__ real xy[blockSize]; - - xx[tid] = 0.0; - yy[tid] = 0.0; - xy[tid] = 0.0; - __syncthreads(); - - input1 += ty * width; - if (input2_height > 1) { - input2 += ty * width; - } - for (int index = tid; index < width; index += blockSize) { - real x = input1[index]; - real y = input2[index]; - xx[tid] += x * x; - yy[tid] += y * y; - xy[tid] += x * y; - } - __syncthreads(); - - for (int s = blockSize / 2; s > 0; s >>= 1) { - if (tid < s) { - xx[tid] += xx[tid + s]; - yy[tid] += yy[tid + s]; - xy[tid] += xy[tid + s]; - } - __syncthreads(); - } - if (tid == 0) { - output[ty] = scale * xy[0] / (sqrt(xx[0]) * sqrt(yy[0])); - } -} - -void hl_cossim(real* output, - real* input1, - real* input2, - int width, - int input1_height, - int input2_height, - real scale) { - CHECK_NOTNULL(output); - CHECK_NOTNULL(input1); - CHECK_NOTNULL(input2); - const int blockSize = 256; - dim3 threads(blockSize, 1); - dim3 grid(1, input1_height); - - KeCosSim<<>> - (output, input1, input2, width, input1_height, input2_height, scale); - CHECK_SYNC("hl_cossim failed"); -} - -template -__global__ void KeCosSimDerivative(real* grad, - real* output, - real* prevOutX, - real* prevOutY, - real* prevGradX, - real* prevGradY, - int width, - int input1_height, - int input2_height, - real scale) { - const int ty = blockIdx.y; - int tid = threadIdx.x; - - __shared__ real xx[blockSize]; - __shared__ real yy[blockSize]; - __shared__ real xy[blockSize]; - - xx[tid] = 0.0; - yy[tid] = 0.0; - xy[tid] = 0.0; - __syncthreads(); - - prevOutX += ty * width; - prevGradX += ty * width; - if (input2_height > 1) { - prevOutY += ty * width; - prevGradY += ty * width; - } - for (int index = tid; index < width; index += blockSize) { - real x = prevOutX[index]; - real y = prevOutY[index]; - xx[tid] += x * x; - yy[tid] += y * y; - xy[tid] += x * y; - } - __syncthreads(); - - for (int s = blockSize / 2; s > 0; s >>= 1) { - if (tid < s) { - xx[tid] += xx[tid + s]; - yy[tid] += yy[tid + s]; - xy[tid] += xy[tid + s]; - } - __syncthreads(); - } - if (xy[0] == 0) { - real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0])); - for (int index = tid; index < width; index += blockSize) { - prevGradX[index] += - 
scale * grad[ty] * prevOutY[index] * reciprocal; - if (input2_height > 1) { - prevGradY[index] += - scale * grad[ty] * prevOutX[index] * reciprocal; - } else { - paddle::paddleAtomicAdd(prevGradY + index, - scale * grad[ty] * prevOutX[index] * reciprocal); - } - } - } else { - real reciprocalXY = 1.0 / xy[0]; - real reciprocalSquareSumX = 1.0 / xx[0]; - real reciprocalSquareSumY = 1.0 / yy[0]; - for (int index = tid; index < width; index += blockSize) { - prevGradX[index] += output[ty] * grad[ty] * - (prevOutY[index] * reciprocalXY - - prevOutX[index] * reciprocalSquareSumX); - if (input2_height > 1) { - prevGradY[index] += output[ty] * grad[ty] * - (prevOutX[index] * reciprocalXY - - prevOutY[index] * reciprocalSquareSumY); - } else { - paddle::paddleAtomicAdd(prevGradY + index, output[ty] * grad[ty] * - (prevOutX[index] * reciprocalXY - - prevOutY[index] * reciprocalSquareSumY)); - } - } - } -} - - -void hl_cossim_derivative(real* grad, - real* output, - real* prevOutX, - real* prevOutY, - real* prevGradX, - real* prevGradY, - int width, - int input1_height, - int input2_height, - real scale) { - CHECK_NOTNULL(grad); - CHECK_NOTNULL(output); - CHECK_NOTNULL(prevOutX); - CHECK_NOTNULL(prevOutY); - CHECK_NOTNULL(prevGradX); - CHECK_NOTNULL(prevGradY); - const int blockSize = 256; - dim3 threads(blockSize, 1); - dim3 grid(1, input1_height); - KeCosSimDerivative<<>> - (grad, output, prevOutX, prevOutY, prevGradX, prevGradY, width, - input1_height, input2_height, scale); - CHECK_SYNC("hl_cossim_derivate failed"); -} - __global__ void KeMatrixAddSharedBias(real* A, real* B, const int channel, diff --git a/paddle/function/CosSimOp.cpp b/paddle/function/CosSimOp.cpp index 0ed5991ff13..bcc17fd5319 100644 --- a/paddle/function/CosSimOp.cpp +++ b/paddle/function/CosSimOp.cpp @@ -34,7 +34,6 @@ void CosSimForward(CpuMatrix* out_mat, CHECK(in2_mat->getHeight() == 1LU || in2_mat->getHeight() == num_samples); size_t inc = (in2_mat->getHeight() == 1LU) ? 0 : dim; for (size_t i = 0; i < num_samples; ++i, x += dim, y += inc) { - /// for each row, todo(tianbing), use TensorExpression square2 ? real square_sum_x = 0; real square_sum_y = 0; real xy = 0; @@ -147,12 +146,15 @@ void CosSimBackward(const CpuMatrix* out_grad, } /** - * \param inputs[0] output value 1, size: nSamples * 1. - * \param inputs[1] input value 1, size: nSamples * dim. - * \param inputs[2] input value 2, size: n2 * dim (n2 == 1 or n2 == nSamples). - * \param inputs[3] input grad 1, size: nSamples * dim. - * \param inputs[4] input grad 2, size: n2 * dim (n2 == 1 or n2 == nSamples). - * \param outputs[0] output grad, size : nSamples * 1. + * \param inouts[0] forward input grad 1, size: nSamples * dim. + * \param inouts[1] forward input grad 2, + * size: n2 * dim (n2 == 1 or n2 == nSamples). + * + * \param inputs[0] backward loss output grad, size : nSamples * 1. + * \param inputs[1] forward output value, size: nSamples * 1. + * \param inputs[2] forward input value 1, size: nSamples * dim. + * \param inputs[3] forward input value 2, + * size: n2 * dim (n2 == 1 or n2 == nSamples). 
*/ template class CosSimBackwardFunc : public FunctionBase { @@ -163,35 +165,35 @@ class CosSimBackwardFunc : public FunctionBase { void calc(const Arguments& inputs, const Arguments& outputs, const Arguments& inouts) override { - CHECK_EQ(inputs.size(), 5); - CHECK_EQ(outputs.size(), 1); - CHECK_EQ(inouts.size(), 0); + CHECK_EQ(inputs.size(), 4); + CHECK_EQ(outputs.size(), 0); + CHECK_EQ(inouts.size(), 2); /// dim of out_grad and out_val == 1, column vector - CHECK_EQ(outputs[0].dims_[1], 1UL); CHECK_EQ(inputs[0].dims_[1], 1UL); + CHECK_EQ(inputs[1].dims_[1], 1UL); /// nSamples of out_grad == out_val == in_val1 == in_grad1 - CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]); - CHECK_EQ(inputs[1].dims_[0], outputs[0].dims_[0]); - CHECK_EQ(inputs[3].dims_[0], outputs[0].dims_[0]); + CHECK_EQ(inputs[1].dims_[0], inputs[0].dims_[0]); + CHECK_EQ(inputs[0].dims_[0], inputs[0].dims_[0]); + CHECK_EQ(inouts[0].dims_[0], inputs[0].dims_[0]); /// dim of in1_val1 == in_val2 == in_grad1 == in_grad2 - CHECK_EQ(inputs[2].dims_[1], inputs[1].dims_[1]); - CHECK_EQ(inputs[3].dims_[1], inputs[1].dims_[1]); - CHECK_EQ(inputs[4].dims_[1], inputs[1].dims_[1]); + CHECK_EQ(inputs[3].dims_[1], inputs[2].dims_[1]); + CHECK_EQ(inouts[0].dims_[1], inputs[2].dims_[1]); + CHECK_EQ(inouts[1].dims_[1], inputs[2].dims_[1]); - CHECK(outputs[0].getData() && inputs[0].getData() && inputs[1].getData() && - inputs[2].getData() && inputs[3].getData() && inputs[4].getData()); + CHECK(inputs[0].getData() && inputs[1].getData() && inputs[2].getData() && + inputs[3].getData() && inouts[0].getData() && inouts[1].getData()); const auto out_grad = std::make_shared::type>( - outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]); - const auto out_val = std::make_shared::type>( inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]); - const auto in1_val = std::make_shared::type>( + const auto out_val = std::make_shared::type>( inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]); - const auto in2_val = std::make_shared::type>( + const auto in1_val = std::make_shared::type>( inputs[2].getData(), inputs[2].dims_[0], inputs[2].dims_[1]); - auto in1_grad = std::make_shared::type>( + const auto in2_val = std::make_shared::type>( inputs[3].getData(), inputs[3].dims_[0], inputs[3].dims_[1]); + auto in1_grad = std::make_shared::type>( + inouts[0].getData(), inouts[0].dims_[0], inouts[0].dims_[1]); auto in2_grad = std::make_shared::type>( - inputs[4].getData(), inputs[4].dims_[0], inputs[4].dims_[1]); + inouts[1].getData(), inouts[1].dims_[0], inouts[1].dims_[1]); CosSimBackward(out_grad.get(), out_val.get(), diff --git a/paddle/function/CosSimOp.h b/paddle/function/CosSimOp.h index f66a4344d09..ed1f1e4d534 100644 --- a/paddle/function/CosSimOp.h +++ b/paddle/function/CosSimOp.h @@ -25,9 +25,9 @@ namespace paddle { * = scale * \sum_j (in1[i][j] * in2[i][j]) / * sqrt(sum_j (in1[i][j]^2) * sum_j (in2[i][j])^2) * - * \param[out] output output data. - * \param[in] intput1 input data. - * \param[in] intput2 input data. + * \param[out] output output value. + * \param[in] intput1 input value. + * \param[in] intput2 input value. * \param[in] scale default 1.0. * */ @@ -40,13 +40,13 @@ void CosSimForward(typename MatrixT::type* output, /** * \brief Cosine Similarity BackWard for Derivative. * - * \param[out] output1 backward loss output grad. - * \param[in] input1 forward-output value. - * \param[in] input2 forward input value 1. - * \param[in] input3 forward input value 2. - * \param[in] input4 forward input grad 1. 
- * \param[in] input5 forward input grad 2. - * \param[in] scale default 1.0. + * \param[in] output grad backward loss output grad. + * \param[in] output val forward-output value. + * \param[in] input val1 forward input value 1. + * \param[in] input val2 forward input value 2. + * \param[in/out] input grad forward input grad 1. + * \param[in/out] input grad forward input grad 2. + * \param[in] scale default 1.0. * */ template diff --git a/paddle/function/CosSimOpTest.cpp b/paddle/function/CosSimOpTest.cpp index 0d2ece2b986..f0e81ee0434 100644 --- a/paddle/function/CosSimOpTest.cpp +++ b/paddle/function/CosSimOpTest.cpp @@ -97,22 +97,22 @@ void testCosSimBackward(size_t height_x, gpu_in2_grad.copyFrom(cpu_in2_grad); compare.getCpuFunction()->calc( - {Tensor(cpu_out_val.getData(), Dims{height_x, 1}), + {Tensor(cpu_out_grad.getData(), Dims{height_x, 1}), + Tensor(cpu_out_val.getData(), Dims{height_x, 1}), Tensor(cpu_in1_val.getData(), Dims{height_x, width}), - Tensor(cpu_in2_val.getData(), Dims{height_x, width}), - Tensor(cpu_in1_grad.getData(), Dims{height_x, width}), - Tensor(cpu_in2_grad.getData(), Dims{height_x, width})}, - {Tensor(cpu_out_grad.getData(), Dims{height_x, 1})}, - {}); + Tensor(cpu_in2_val.getData(), Dims{height_x, width})}, + {}, + {Tensor(cpu_in1_grad.getData(), Dims{height_x, width}), + Tensor(cpu_in2_grad.getData(), Dims{height_x, width})}); compare.getGpuFunction()->calc( - {Tensor(gpu_out_val.getData(), Dims{height_x, 1}), + {Tensor(gpu_out_grad.getData(), Dims{height_x, 1}), + Tensor(gpu_out_val.getData(), Dims{height_x, 1}), Tensor(gpu_in1_val.getData(), Dims{height_x, width}), - Tensor(gpu_in2_val.getData(), Dims{height_x, width}), - Tensor(gpu_in1_grad.getData(), Dims{height_x, width}), - Tensor(gpu_in2_grad.getData(), Dims{height_x, width})}, - {Tensor(gpu_out_grad.getData(), Dims{height_x, 1})}, - {}); + Tensor(gpu_in2_val.getData(), Dims{height_x, width})}, + {}, + {Tensor(gpu_in1_grad.getData(), Dims{height_x, width}), + Tensor(gpu_in2_grad.getData(), Dims{height_x, width})}); autotest::TensorCheckErr(cpu_in1_grad, gpu_in1_grad); autotest::TensorCheckErr(cpu_in2_grad, gpu_in2_grad); diff --git a/paddle/gserver/layers/CosSimLayer.cpp b/paddle/gserver/layers/CosSimLayer.cpp index ac66fd4712a..b00eda2f64a 100644 --- a/paddle/gserver/layers/CosSimLayer.cpp +++ b/paddle/gserver/layers/CosSimLayer.cpp @@ -79,13 +79,13 @@ void CosSimLayer::backward(const UpdateCallback& callback) { auto inG2 = this->getInputGrad(1); CHECK(outG && outV && inV1 && inV2 && inG1 && inG2); backward_[0]->calc( - {Tensor(outV->getData(), Dims{outV->getHeight(), outV->getWidth()}), + {Tensor(outG->getData(), Dims{outG->getHeight(), outG->getWidth()}), + Tensor(outV->getData(), Dims{outV->getHeight(), outV->getWidth()}), Tensor(inV1->getData(), Dims{inV1->getHeight(), inV1->getWidth()}), - Tensor(inV2->getData(), Dims{inV2->getHeight(), inV2->getWidth()}), - Tensor(inG1->getData(), Dims{inG1->getHeight(), inG1->getWidth()}), - Tensor(inG2->getData(), Dims{inG2->getHeight(), inG2->getWidth()})}, - {Tensor(outG->getData(), Dims{outG->getHeight(), outG->getWidth()})}, - {}); + Tensor(inV2->getData(), Dims{inV2->getHeight(), inV2->getWidth()})}, + {}, + {Tensor(inG1->getData(), Dims{inG1->getHeight(), inG1->getWidth()}), + Tensor(inG2->getData(), Dims{inG2->getHeight(), inG2->getWidth()})}); } } diff --git a/paddle/gserver/layers/CosSimVecMatLayer.cpp b/paddle/gserver/layers/CosSimVecMatLayer.cpp index 427191e5407..120c4e84c55 100644 --- a/paddle/gserver/layers/CosSimVecMatLayer.cpp +++ 
b/paddle/gserver/layers/CosSimVecMatLayer.cpp @@ -169,19 +169,19 @@ void CosSimVecMatLayer::backward(const UpdateCallback& callback) { tmpRow3->setData(outG->rowBuf(i)); backward_[0]->calc( - {Tensor(tmpRow2->getData(), + {Tensor(tmpRow3->getData(), + Dims{tmpRow3->getHeight(), tmpRow3->getWidth()}), + Tensor(tmpRow2->getData(), Dims{tmpRow2->getHeight(), tmpRow2->getWidth()}), Tensor(tmpMtx0->getData(), Dims{tmpMtx0->getHeight(), tmpMtx0->getWidth()}), Tensor(tmpRow0->getData(), - Dims{tmpRow0->getHeight(), tmpRow0->getWidth()}), - Tensor(tmpMtx1->getData(), + Dims{tmpRow0->getHeight(), tmpRow0->getWidth()})}, + {}, + {Tensor(tmpMtx1->getData(), Dims{tmpMtx1->getHeight(), tmpMtx1->getWidth()}), Tensor(tmpRow1->getData(), - Dims{tmpRow1->getHeight(), tmpRow1->getWidth()})}, - {Tensor(tmpRow3->getData(), - Dims{tmpRow3->getHeight(), tmpRow3->getWidth()})}, - {}); + Dims{tmpRow1->getHeight(), tmpRow1->getWidth()})}); } } diff --git a/paddle/gserver/layers/CosSimVecMatLayer.h b/paddle/gserver/layers/CosSimVecMatLayer.h index bee83b58154..df4e11848c4 100644 --- a/paddle/gserver/layers/CosSimVecMatLayer.h +++ b/paddle/gserver/layers/CosSimVecMatLayer.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma once + #include "Layer.h" #include "paddle/math/Matrix.h" diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index a8b53e2105b..1964b2f8bfa 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -941,59 +941,6 @@ void GpuMatrix::softreluDerivative(Matrix& output) { void GpuMatrix::scaledTanh(Matrix& output, real p1, real p2) { BaseMatrix::scaledTanh(output, p1, p2); } -void GpuMatrix::cosSim(Matrix& output1, Matrix& output2, real scale) { - CHECK(output1.useGpu_ == true && output2.useGpu_ == true) - << "Matrix type are not equal"; - size_t numSamples = getHeight(); - size_t dim = output1.getWidth(); - CHECK_EQ(getWidth(), 1UL); - CHECK_EQ(output1.getHeight(), numSamples); - CHECK_EQ(output1.getWidth(), output2.getWidth()); - real* out = getData(); - real* x = output1.getData(); - real* y = output2.getData(); - hl_cossim(out, x, y, dim, output1.getHeight(), output2.getHeight(), scale); -} -void GpuMatrix::cosSimDerivative(Matrix& output, - Matrix& prevOut1, - Matrix& prevOut2, - Matrix& prevGrad1, - Matrix& prevGrad2, - real scale) { - CHECK(output.useGpu_ == true && prevOut1.useGpu_ == true && - prevOut2.useGpu_ == true && prevGrad1.useGpu_ == true && - prevGrad2.useGpu_ == true) - << "Matrix type are not equal"; - CHECK_EQ(getWidth(), 1UL); - CHECK_EQ(output.getWidth(), 1UL); - - size_t numSamples = getHeight(); - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(prevOut1.getHeight(), numSamples); - CHECK_EQ(prevGrad1.getHeight(), numSamples); - - size_t dim = prevOut1.getWidth(); - CHECK_EQ(prevOut2.getWidth(), dim); - CHECK_EQ(prevGrad1.getWidth(), dim); - CHECK_EQ(prevGrad2.getWidth(), dim); - - real* grad = getData(); - real* out = output.getData(); - real* prevOutX = prevOut1.getData(); - real* prevOutY = prevOut2.getData(); - real* prevGradX = prevGrad1.getData(); - real* prevGradY = prevGrad2.getData(); - hl_cossim_derivative(grad, - out, - prevOutX, - prevOutY, - prevGradX, - prevGradY, - dim, - prevOut1.getHeight(), - prevOut2.getHeight(), - scale); -} void GpuMatrix::randomizeUniform() { CHECK(isContiguous()); @@ -3470,105 +3417,6 @@ void CpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) { } } 
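[Reviewer note] The CpuMatrix::cosSim / cosSimDerivative bodies removed below implement the same math as the retained CosSimBackward in paddle/function/CosSimOp.cpp. As a reference derivation (standard calculus, not text from the patch): with o_i the forward output defined above and g_i = grad[i] the incoming loss gradient, the backward pass accumulates

\[
\frac{\partial L}{\partial x_{ij}} \mathrel{+}= g_i \, o_i \left( \frac{y_{kj}}{\langle x_i, y_k \rangle} - \frac{x_{ij}}{\lVert x_i \rVert^2} \right),
\qquad
\frac{\partial L}{\partial y_{kj}} \mathrel{+}= g_i \, o_i \left( \frac{x_{ij}}{\langle x_i, y_k \rangle} - \frac{y_{kj}}{\lVert y_k \rVert^2} \right),
\]

falling back to \(\partial L / \partial x_{ij} \mathrel{+}= g_i \, s \, y_{kj} / (\lVert x_i \rVert \lVert y_k \rVert)\) (and symmetrically for y) when \(\langle x_i, y_k \rangle = 0\); when input2 has a single row, its gradient is accumulated across all samples i.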
-void CpuMatrix::cosSim(Matrix& output1, Matrix& output2, real scale) { - size_t numSamples = getHeight(); - size_t dim = output1.getWidth(); - CHECK_EQ(getWidth(), 1UL); - CHECK_EQ(output1.getHeight(), numSamples); - CHECK_EQ(output1.getWidth(), output2.getWidth()); - - real* out = getData(); - const real* x = output1.getData(); - const real* y = output2.getData(); - size_t yInc = dim; - if (output2.getHeight() == 1LU) { - yInc = 0; - } else { - CHECK_EQ(output2.getHeight(), numSamples); - } - for (size_t i = 0; i < numSamples; ++i, x += dim, y += yInc) { - real squareSumX = 0; - real squareSumY = 0; - real xy = 0; - for (size_t j = 0; j < dim; ++j) { - squareSumX += _square(x[j]); - squareSumY += _square(y[j]); - xy += x[j] * y[j]; - } - CHECK(squareSumX > 0 && squareSumY > 0); - out[i] = scale * xy / (std::sqrt(squareSumX) * std::sqrt(squareSumY)); - } -} - -void CpuMatrix::cosSimDerivative(Matrix& output, - Matrix& prevOut1, - Matrix& prevOut2, - Matrix& prevGrad1, - Matrix& prevGrad2, - real scale) { - CHECK(output.useGpu_ == false) << "Matrix type are not equal"; - - CHECK_EQ(getWidth(), 1UL); - CHECK_EQ(output.getWidth(), 1UL); - - size_t numSamples = getHeight(); - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(prevOut1.getHeight(), numSamples); - CHECK_EQ(prevGrad1.getHeight(), numSamples); - - size_t dim = prevOut1.getWidth(); - CHECK_EQ(prevOut2.getWidth(), dim); - CHECK_EQ(prevGrad1.getWidth(), dim); - CHECK_EQ(prevGrad2.getWidth(), dim); - - const real* grad = getData(); - const real* out = output.getData(); - const real* prevOutX = prevOut1.getData(); - const real* prevOutY = prevOut2.getData(); - real* prevGradX = prevGrad1.getData(); - real* prevGradY = prevGrad2.getData(); - size_t yInc = dim; - if (prevOut2.getHeight() == 1LU) { - yInc = 0; - CHECK_EQ(prevGrad2.getHeight(), 1LU); - } else { - CHECK_EQ(prevOut2.getHeight(), numSamples); - CHECK_EQ(prevGrad2.getHeight(), numSamples); - } - for (size_t i = 0; i < numSamples; ++i, - prevOutX += dim, - prevOutY += yInc, - prevGradX += dim, - prevGradY += yInc) { - real squareSumX = 0; - real squareSumY = 0; - real xy = 0; - for (size_t j = 0; j < dim; ++j) { - squareSumX += _square(prevOutX[j]); - squareSumY += _square(prevOutY[j]); - xy += prevOutX[j] * prevOutY[j]; - } - CHECK(squareSumX > 0 && squareSumY > 0); - if (xy == 0) { - real reciprocal = 1.0f / (std::sqrt(squareSumX) * std::sqrt(squareSumY)); - for (size_t j = 0; j < dim; ++j) { - prevGradX[j] += scale * grad[i] * prevOutY[j] * reciprocal; - prevGradY[j] += scale * grad[i] * prevOutX[j] * reciprocal; - } - } else { - real reciprocalXY = 1.0f / xy; - real reciprocalSquareSumX = 1.0f / squareSumX; - real reciprocalSquareSumY = 1.0f / squareSumY; - for (size_t j = 0; j < dim; ++j) { - prevGradX[j] += out[i] * grad[i] * (prevOutY[j] * reciprocalXY - - prevOutX[j] * reciprocalSquareSumX); - prevGradY[j] += out[i] * grad[i] * (prevOutX[j] * reciprocalXY - - prevOutY[j] * reciprocalSquareSumY); - } - } - } -} - void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) { CHECK(output.useGpu_ == false && label.useGpu_ == false) << "Matrix type are not equal"; diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index c92c0a272d5..ea4bbb86b05 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -799,26 +799,6 @@ public: LOG(FATAL) << "Not implemented"; } - /** - * cosine similarity, for each row i, - * this[i] = cos(output1[i], output2[i]) - * - * output2 can only have one row, then for each row i, - * this[i] = cos(output1[i], output2[0]) - */ - 
virtual void cosSim(Matrix& output1, Matrix& output2, real scale = 1.0f) { - LOG(FATAL) << "Not implemented"; - } - - virtual void cosSimDerivative(Matrix& output, - Matrix& prevOut1, - Matrix& prevOut2, - Matrix& prevGrad1, - Matrix& prevGrad2, - real scale = 1.0f) { - LOG(FATAL) << "Not implemented"; - } - /// print out the values of elements to os virtual void print(std::ostream& os) const { LOG(FATAL) << "Not implemented"; @@ -1324,14 +1304,6 @@ public: void softreluDerivative(Matrix& output); void scaledTanh(Matrix& output, real p1, real p2); - void cosSim(Matrix& output1, Matrix& output2, real scale); - void cosSimDerivative(Matrix& output, - Matrix& prevOut1, - Matrix& prevOut2, - Matrix& prevGrad1, - Matrix& prevGrad2, - real scale); - virtual void print(std::ostream& os) const; virtual void print(std::ostream& os, size_t height, size_t width) const; @@ -1752,14 +1724,6 @@ public: void softreluDerivative(Matrix& output); void scaledTanh(Matrix& output, real p1, real p2); - void cosSim(Matrix& output1, Matrix& output2, real scale); - void cosSimDerivative(Matrix& output, - Matrix& prevOut1, - Matrix& prevOut2, - Matrix& prevGrad1, - Matrix& prevGrad2, - real scale); - void print(std::ostream& os) const; void print(std::ostream& os, size_t height, size_t width) const; void printOneRow(std::ostream& os, size_t idx) const; diff --git a/paddle/math/tests/test_Matrix.cpp b/paddle/math/tests/test_Matrix.cpp index a4084bdf7c6..1c21da5b76e 100644 --- a/paddle/math/tests/test_Matrix.cpp +++ b/paddle/math/tests/test_Matrix.cpp @@ -181,28 +181,6 @@ TEST(Matrix, copyByRowIndex) { } } -void testCosSim(int heightX, int heightY, int width, real scale) { - AutoCompare test(heightX, 1); - CpuMatrix arg1(heightX, width); - CpuMatrix arg2(heightY, width); - arg1.randomizeUniform(); - arg2.randomizeUniform(); - arg2.add(-0.5); - test.cmpWithArg(&Matrix::cosSim, arg1, arg2, scale); -} - -TEST(Matrix, cosSim) { - for (auto heightX : {10, 100, 1000}) { - for (auto heightY : {1, heightX}) { - for (auto width : {10, 100, 1000}) { - for (auto scale : {1.0, 2.0}) { - testCosSim(heightX, heightY, width, scale); - } - } - } - } -} - void testParamReluForward(int height, int width, int w_height, int w_width) { AutoCompare test(height, width); CpuMatrix arg1(height, width); diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index e024f2cf1b9..6caaea443c1 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -720,61 +720,6 @@ TEST(Matrix, sequenceAvgForward) { } } -void testCosSimDerivate(int heightX, int heightY, int width, real scale) { - MatrixPtr prevOutX = CpuMatrix::create(heightX, width, false, false); - MatrixPtr prevOutY = CpuMatrix::create(heightY, width, false, false); - MatrixPtr grad = CpuMatrix::create(heightX, 1, false, false); - MatrixPtr output = CpuMatrix::create(heightX, 1, false, false); - MatrixPtr prevGradX = CpuMatrix::create(heightX, width, false, false); - MatrixPtr prevGradY = CpuMatrix::create(heightY, width, false, false); - - prevOutX->randomizeUniform(); - prevOutY->randomizeUniform(); - grad->randomizeUniform(); - output->randomizeUniform(); - prevGradX->randomizeUniform(); - prevGradY->randomizeUniform(); - - MatrixPtr prevOutXGpu = GpuMatrix::create(heightX, width, false, true); - MatrixPtr prevOutYGpu = GpuMatrix::create(heightY, width, false, true); - MatrixPtr gradGpu = GpuMatrix::create(heightX, 1, false, true); - MatrixPtr outputGpu = GpuMatrix::create(heightX, 1, false, true); - 
MatrixPtr prevGradXGpu = GpuMatrix::create(heightX, width, false, true); - MatrixPtr prevGradYGpu = GpuMatrix::create(heightY, width, false, true); - - prevOutXGpu->copyFrom(*prevOutX); - prevOutYGpu->copyFrom(*prevOutY); - gradGpu->copyFrom(*grad); - outputGpu->copyFrom(*output); - prevGradXGpu->copyFrom(*prevGradX); - prevGradYGpu->copyFrom(*prevGradY); - - grad->cosSimDerivative( - *output, *prevOutX, *prevOutY, *prevGradX, *prevGradY, scale); - - gradGpu->cosSimDerivative(*outputGpu, - *prevOutXGpu, - *prevOutYGpu, - *prevGradXGpu, - *prevGradYGpu, - scale); - - TensorCheckErr(*prevGradX, *prevGradXGpu); - TensorCheckErr(*prevGradY, *prevGradYGpu); -} - -TEST(Matrix, cosSimDerivate) { - for (auto heightX : {1, 10, 100}) { - for (auto heightY : {1, heightX}) { - for (auto width : {1, 10, 100}) { - for (auto scale : {1.0, 2.0}) { - testCosSimDerivate(heightX, heightY, width, scale); - } - } - } - } -} - void testParamReluBackwardDiff(int height, int width, int w_height, -- GitLab
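[Reviewer note] For completeness, a minimal sketch of the calling convention this patch introduces for CosSimBackwardFunc, mirroring the updated CosSimLayer::backward() and CosSimOpTest.cpp. It assumes the Tensor / Dims helpers those callers use (declared here as coming from paddle/function/Function.h); the function name and matrix variables are illustrative placeholders, not identifiers from the patch.

#include "paddle/function/Function.h"  // FunctionBase, Tensor, Dims (assumed location)
#include "paddle/math/Matrix.h"

using namespace paddle;

// Hypothetical helper showing the new argument layout: read-only values go in
// `inputs` (loss grad first), nothing goes in `outputs`, and the two input
// gradients go in `inouts` because they are accumulated in place.
void callCosSimBackward(FunctionBase* backward,
                        CpuMatrix& outGrad,    // nSamples x 1
                        CpuMatrix& outVal,     // nSamples x 1
                        CpuMatrix& in1Val,     // nSamples x dim
                        CpuMatrix& in2Val,     // n2 x dim, n2 == 1 or nSamples
                        CpuMatrix& in1Grad,    // nSamples x dim, updated in place
                        CpuMatrix& in2Grad) {  // n2 x dim, updated in place
  backward->calc(
      {Tensor(outGrad.getData(), Dims{outGrad.getHeight(), 1}),
       Tensor(outVal.getData(), Dims{outVal.getHeight(), 1}),
       Tensor(in1Val.getData(), Dims{in1Val.getHeight(), in1Val.getWidth()}),
       Tensor(in2Val.getData(), Dims{in2Val.getHeight(), in2Val.getWidth()})},
      {},  // outputs: none; the function writes no freshly allocated tensor
      {Tensor(in1Grad.getData(), Dims{in1Grad.getHeight(), in1Grad.getWidth()}),
       Tensor(in2Grad.getData(), Dims{in2Grad.getHeight(), in2Grad.getWidth()})});
}

Moving the gradients from inputs to inouts makes the read-modify-write contract explicit, which is why the CHECKs in CosSimBackwardFunc now expect 4 inputs, 0 outputs, and 2 inouts.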