diff --git a/paddle/function/CosSimOp.cpp b/paddle/function/CosSimOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c1473a19ede5c438de479cbf4109c0379cb32393
--- /dev/null
+++ b/paddle/function/CosSimOp.cpp
@@ -0,0 +1,93 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CosSimOp.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+template <>
+void CosSimForward<DEVICE_TYPE_CPU>(CpuMatrix* out_mat,
+                                    const CpuMatrix* in1_mat,
+                                    const CpuMatrix* in2_mat,
+                                    real scale) {
+  CHECK(out_mat && in1_mat && in2_mat);
+  const size_t num_samples = out_mat->getHeight();
+  const size_t dim = in1_mat->getWidth();
+  /// in1 must be [nSamples, dim]; in2 is one shared row or [nSamples, dim]
+  CHECK_EQ(in1_mat->getHeight(), num_samples);
+  CHECK_EQ(in2_mat->getWidth(), dim);
+  CHECK(in2_mat->getHeight() == 1LU || in2_mat->getHeight() == num_samples);
+  const size_t inc = (in2_mat->getHeight() == 1LU) ? 0 : dim;
+  real* out = out_mat->getData();  /// column vector [nSamples, 1]
+  const real* x = in1_mat->getData();
+  const real* y = in2_mat->getData();
+  for (size_t i = 0; i < num_samples; ++i, x += dim, y += inc) {
+    /// for each row, todo(tianbing), use TensorExpression square2 ?
+    real square_sum_x = 0;
+    real square_sum_y = 0;
+    real xy = 0;
+    for (size_t j = 0; j < dim; ++j) {
+      square_sum_x += x[j] * x[j];
+      square_sum_y += y[j] * y[j];
+      xy += x[j] * y[j];
+    }
+    CHECK(square_sum_x > 0 && square_sum_y > 0) << "zero-norm row " << i;
+    out[i] = scale * xy / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y));
+  }
+}
+
+/**
+ * \brief FunctionBase wrapper: validates shapes, then calls CosSimForward.
+ * \param inputs[0] input matrix 1, size: nSamples * dim.
+ * \param inputs[1] input matrix 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
+ * \param outputs[0] output matrix, size : nSamples * 1.
+ */
+template <DeviceType Device>
+class CosSimForwardFunc : public FunctionBase {
+  void init(const FuncConfig& config) override {
+    scale_ = config.get<real>("scale");
+  }
+
+  void calc(const Arguments& inputs,
+            const Arguments& outputs,
+            const Arguments& inouts) override {
+    CHECK_EQ(inputs.size(), 2UL);
+    CHECK_EQ(outputs.size(), 1UL);
+    CHECK_EQ(inouts.size(), 0UL);
+
+    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+    CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
+    CHECK_EQ(outputs[0].dims_[1], 1UL);
+    /// enforce n2 here so every device path rejects the same bad shapes
+    CHECK(inputs[1].dims_[0] == 1UL ||
+          inputs[1].dims_[0] == inputs[0].dims_[0]);
+
+    CHECK(outputs[0].getData() && inputs[0].getData() && inputs[1].getData());
+    using Mat = typename MatrixT<Device>::type;  /// built on the stack below
+    Mat out(outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    const Mat in1(inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+    const Mat in2(inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
+    CosSimForward<Device>(&out, &in1, &in2, scale_);
+  }
+
+private:
+  real scale_ = 1.0;  ///< multiplier for the cosine, overwritten by init()
+};
+
+REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc);
+#ifndef PADDLE_ONLY_CPU  // skip the GPU registration in CPU-only builds
+REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc);
+#endif  // PADDLE_ONLY_CPU
+}  // namespace paddle
diff --git a/paddle/function/CosSimOp.h b/paddle/function/CosSimOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..02250d6db9c644549589c0cf8f2cc110d5f740c3
--- /dev/null
+++ b/paddle/function/CosSimOp.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief   Cosine Similarity Forward.
+ * for each row i,
+ * out[i] = scale * cos(in1[i], in2[i])
+ *        = scale * \sum_j (in1[i][j] * in2[i][j]) /
+ *                  sqrt(\sum_j (in1[i][j]^2) * \sum_j (in2[i][j]^2))
+ * (in2[i] denotes the single row of input2 when input2 has only one row)
+ * \param[out]  output            one result per row, written as a column.
+ * \param[in]   input1            first input, one vector per row.
+ * \param[in]   input2            second input: a single row or one per row.
+ * \param[in]   scale             factor multiplied into the cosine value.
+ *
+ */
+template <DeviceType Device>
+void CosSimForward(typename MatrixT<Device>::type* output,
+                   const typename MatrixT<Device>::type* input1,
+                   const typename MatrixT<Device>::type* input2,
+                   real scale);
+
+}  // namespace paddle
diff --git a/paddle/function/CosSimOpGpu.cu b/paddle/function/CosSimOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..34835fa5d87c398faa3f52769d317995bbc44bfa
--- /dev/null
+++ b/paddle/function/CosSimOpGpu.cu
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "CosSimOp.h"
+
+namespace paddle {
+
+template<int block_size>
+__global__ void KeCosSim(real* output,
+                         const real* input1,
+                         const real* input2,
+                         int width,
+                         int input1_height,
+                         int input2_height,
+                         real scale) {
+  /// blockIdx.y picks the row; the block's threads stride across its columns
+  const int row = blockIdx.y;
+  const int lane = threadIdx.x;
+  __shared__ real xx[block_size];
+  __shared__ real yy[block_size];
+  __shared__ real xy[block_size];
+
+  const real* lhs = input1 + row * width;
+  /// when input2 has a single row, every block reads that same row
+  const real* rhs = (input2_height > 1) ? input2 + row * width : input2;
+  real sum_xx = 0.0;
+  real sum_yy = 0.0;
+  real sum_xy = 0.0;
+  for (int col = lane; col < width; col += block_size) {
+    const real a = lhs[col];
+    const real b = rhs[col];
+    sum_xx += a * a;
+    sum_yy += b * b;
+    sum_xy += a * b;
+  }
+  xx[lane] = sum_xx;
+  yy[lane] = sum_yy;
+  xy[lane] = sum_xy;
+  __syncthreads();
+
+  /// pairwise reduction: each pass folds the upper half onto the lower half
+  for (int half = block_size / 2; half > 0; half >>= 1) {
+    if (lane < half) {
+      xx[lane] += xx[lane + half];
+      yy[lane] += yy[lane + half];
+      xy[lane] += xy[lane + half];
+    }
+    __syncthreads();
+  }
+  if (lane == 0) {
+    output[row] = scale * xy[0] / (sqrt(xx[0]) * sqrt(yy[0]));
+  }
+}
+
+/// Launches KeCosSim with one 256-thread block per row of input1.
+void hlCossim(real* output,
+              const real* input1,
+              const real* input2,
+              size_t width,
+              size_t input1_height,
+              size_t input2_height,
+              real scale) {
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(input1);
+  CHECK_NOTNULL(input2);
+  const int kThreads = 256;
+  const dim3 block_dim(kThreads, 1);
+  const dim3 grid_dim(1, input1_height);
+  KeCosSim<kThreads><<<grid_dim, block_dim, 0, STREAM_DEFAULT>>>(
+      output, input1, input2, width, input1_height, input2_height, scale);
+  CHECK_SYNC("hl_cossim failed");
+}
+
+template <>
+void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix* out_mat,
+                                    const GpuMatrix* in1_mat,
+                                    const GpuMatrix* in2_mat,
+                                    real scale) {
+  CHECK(out_mat && in1_mat && in2_mat);
+  CHECK(in1_mat->useGpu_ && in2_mat->useGpu_) << "Matrix type are not GPU";
+  /// the kernel indexes rows unchecked, so enforce the shape contract here
+  const size_t num_samples = out_mat->getHeight();
+  const size_t dim = in1_mat->getWidth();
+  CHECK_EQ(in1_mat->getHeight(), num_samples);
+  CHECK_EQ(in2_mat->getWidth(), dim);
+  CHECK(in2_mat->getHeight() == 1LU || in2_mat->getHeight() == num_samples);
+  hlCossim(out_mat->getData(), in1_mat->getData(), in2_mat->getData(), dim,
+           num_samples, in2_mat->getHeight(), scale);
+}
+
+}  // namespace paddle
diff --git a/paddle/function/CosSimOpTest.cpp b/paddle/function/CosSimOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..49c54620feb3557ae60c2ea624e5f0f2e5934149
--- /dev/null
+++ b/paddle/function/CosSimOpTest.cpp
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+#include "paddle/math/Matrix.h"
+
+using namespace paddle;  // NOLINT
+
+/// Runs CosSimForward on CPU and GPU with the same data and compares outputs.
+void testCosSimForward(size_t height_x,
+                       size_t height_y,
+                       size_t width,
+                       real scale) {
+  FunctionCompare compare("CosSimForward", FuncConfig().set("scale", scale));
+  /// GPU operands are GpuMatrix (like gpu_out), filled from the CPU copies
+  CpuMatrix cpu_arg1(height_x, width);
+  GpuMatrix gpu_arg1(height_x, width);
+  CpuMatrix cpu_arg2(height_y, width);
+  GpuMatrix gpu_arg2(height_y, width);
+  cpu_arg1.randomizeUniform();
+  gpu_arg1.copyFrom(cpu_arg1);
+  cpu_arg2.randomizeUniform();
+  cpu_arg2.add(-0.5);
+  gpu_arg2.copyFrom(cpu_arg2);
+  CpuMatrix cpu_out(height_x, 1);
+  GpuMatrix gpu_out(height_x, 1);
+
+  compare.getCpuFunction()->calc(
+      {Tensor(cpu_arg1.getData(), Dims{height_x, width}),
+       Tensor(cpu_arg2.getData(), Dims{height_y, width})},
+      {Tensor(cpu_out.getData(), Dims{height_x, 1})},
+      {});
+  compare.getGpuFunction()->calc(
+      {Tensor(gpu_arg1.getData(), Dims{height_x, width}),
+       Tensor(gpu_arg2.getData(), Dims{height_y, width})},
+      {Tensor(gpu_out.getData(), Dims{height_x, 1})},
+      {});
+  autotest::TensorCheckErr(cpu_out, gpu_out);
+}
+
+TEST(Matrix, cosSim) {
+  for (auto rows_x : {10, 100, 1000}) {
+    for (auto rows_y : {1, rows_x}) {  // one shared row, or one row per sample
+      for (auto cols : {10, 100, 1000}) {
+        for (auto factor : {1.0, 2.0}) {
+          testCosSimForward(rows_x, rows_y, cols, factor);
+        }
+      }
+    }
+  }
+}