diff --git a/paddle/operators/modified_huber_loss_op.cc b/paddle/operators/modified_huber_loss_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..631464bc841563e15f01d773ae19e4f8fc0a52c3
--- /dev/null
+++ b/paddle/operators/modified_huber_loss_op.cc
@@ -0,0 +1,99 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/modified_huber_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ModifiedHuberLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext& context) const override {
+    PADDLE_ENFORCE_NOT_NULL(context.InputVar("X"), "X must be initialized.");
+    PADDLE_ENFORCE_NOT_NULL(context.InputVar("Y"), "Y must be initialized.");
+
+    auto* x = context.Input<Tensor>("X");
+    auto* y = context.Input<Tensor>("Y");
+
+    PADDLE_ENFORCE_EQ(x->dims(), y->dims(),
+                      "Dimensions of X and Y must be the same.");
+    PADDLE_ENFORCE_EQ(framework::arity(x->dims()), 2,
+                      "Tensor rank of X must be 2.");
+    PADDLE_ENFORCE_EQ(x->dims()[1], 1, "Second dimension of X must be 1.");
+
+    context.Output<Tensor>("intermediate_val")->Resize(x->dims());
+    context.Output<Tensor>("Out")->Resize({x->dims()[0], 1});
+  }
+};
+
+class ModifiedHuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ModifiedHuberLossOpMaker(framework::OpProto* proto,
+                           framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Prediction tensor with shape [batch_size, 1].");
+    AddInput("Y",
+             "Label tensor with shape [batch_size, 1]; each value is 0 or 1.");
+    AddOutput("intermediate_val",
+              "Intermediate tensor caching X * (2 * Y - 1), reused by the "
+              "backward op.")
+        .AsIntermediate();
+    AddOutput("Out", "Modified huber loss with shape [batch_size, 1].");
+    AddComment(R"DOC(
+Modified huber loss for binary classification. With the label scaled to
+{-1, +1}, let inter_val = X * (2 * Y - 1); the loss is
+  -4 * inter_val        if inter_val < -1
+  (1 - inter_val)^2     if -1 <= inter_val < 1
+  0                     otherwise
+)DOC");
+  }
+};
+
+class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* y = context.Input<Tensor>("Y");
+    auto* intermediate_val = context.Input<Tensor>("intermediate_val");
+    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* y_grad = context.Output<Tensor>(framework::GradVarName("Y"));
+
+    PADDLE_ENFORCE_NOT_NULL(x, "Input X must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(y, "Target Y must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(intermediate_val,
+                            "Intermediate value must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(out_grad, "Out gradient must not be null.");
+
+    PADDLE_ENFORCE_EQ(
+        intermediate_val->dims(), x->dims(),
+        "Dimension of X and intermediate value must be the same.");
+    PADDLE_ENFORCE_EQ(
+        out_grad->dims(), x->dims(),
+        "Dimension of Out gradient and X must be the same (N*1).");
+
+    if (x_grad) x_grad->Resize(x->dims());
+    if (y_grad) y_grad->Resize(y->dims());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(modified_huber_loss, ops::ModifiedHuberLossOp,
+            ops::ModifiedHuberLossOpMaker, modified_huber_loss_grad,
+            ops::ModifiedHuberLossGradOp);
+
+REGISTER_OP_CPU_KERNEL(
+    modified_huber_loss,
+    ops::ModifiedHuberLossKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(modified_huber_loss_grad,
+                       ops::ModifiedHuberLossGradCPUKernel<float>);
diff --git a/paddle/operators/modified_huber_loss_op.cu b/paddle/operators/modified_huber_loss_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..06c710e0c5ef1c9d86df629dccf57a4043b744e7
--- /dev/null
+++ b/paddle/operators/modified_huber_loss_op.cu
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/modified_huber_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+// GPU backward kernel: still a stub. The commented lines list the inputs and
+// outputs it will need once it is implemented (e.g. with the thrust library).
+template <typename T>
+class ModifiedHuberLossGradGPUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    // auto* in0 = context.Input<Tensor>("X");
+    // auto* in1 = context.Input<Tensor>("Y");
+    // auto* in2 = context.Input<Tensor>("intermediate_val");
+    // auto* in3 = context.Input<Tensor>(framework::GradVarName("Out"));
+    // auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+    // auto* out1 = context.Output<Tensor>(framework::GradVarName("Y"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    modified_huber_loss,
+    ops::ModifiedHuberLossKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(modified_huber_loss_grad,
+                       ops::ModifiedHuberLossGradGPUKernel<float>);
diff --git a/paddle/operators/modified_huber_loss_op.h b/paddle/operators/modified_huber_loss_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a429ab2e47117834f22534e1e9af3b9ca34827f
--- /dev/null
+++ b/paddle/operators/modified_huber_loss_op.h
@@ -0,0 +1,126 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename T>
+struct CheckLabelValue {
+  HOSTDEVICE T operator()(const T& val) const {
+    PADDLE_ASSERT(val == static_cast<T>(0) || val == static_cast<T>(1));
+    return val;
+  }
+};
+
+template <typename T>
+struct ModifiedHuberLossForward {
+  HOSTDEVICE T operator()(const T& val) const {
+    if (val < -1) {
+      return -4 * val;
+    } else if (val < 1) {
+      return (1 - val) * (1 - val);
+    } else {
+      return static_cast<T>(0);
+    }
+  }
+};
+
+template <typename Place, typename T>
+class ModifiedHuberLossKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("X");
+    auto* in1 = context.Input<Tensor>("Y");
+    auto* out0 = context.Output<Tensor>("intermediate_val");
+    auto* out1 = context.Output<Tensor>("Out");
+
+    out0->mutable_data<T>(context.GetPlace());
+    out1->mutable_data<T>(context.GetPlace());
+    auto place = context.GetEigenDevice<Place>();
+
+    auto x = EigenVector<T>::Flatten(*in0);
+    auto y = EigenVector<T>::Flatten(*in1);
+    // make sure the values of Y are in {0, 1}
+    y.unaryExpr(CheckLabelValue<T>());
+    auto inter_val = EigenVector<T>::Flatten(*out0);
+    // scale Y to {-1, +1} and compute X * Y
+    inter_val.device(place) = x * (2 * y - static_cast<T>(1));
+    auto loss = EigenVector<T>::Flatten(*out1);
+    loss.device(place) = inter_val.unaryExpr(ModifiedHuberLossForward<T>());
+  }
+};
+
+// TODO: use the thrust lib to unify the CPU and GPU backward kernels
+// CPU backward kernel
+template <typename T>
+class ModifiedHuberLossGradCPUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("X");
+    auto* in1 = context.Input<Tensor>("Y");
+    auto* in2 = context.Input<Tensor>("intermediate_val");
+    auto* in3 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* out1 = context.Output<Tensor>(framework::GradVarName("Y"));
+
+    const T* p_inter_val = in2->data<T>();
+    const T* p_out_grad = in3->data<T>();
+    size_t counts = static_cast<size_t>(framework::product(in2->dims()));
+
+    if (out0) {
+      T* p_x_grad = out0->mutable_data<T>(context.GetPlace());
+      const T* p_y = in1->data<T>();
+      // chain rule: d(inter_val)/dX = 2 * Y - 1
+      for (size_t i = 0; i < counts; ++i) {
+        p_x_grad[i] = ModifiedHuberLossBackward(p_inter_val[i]) *
+                      (2 * p_y[i] - 1) * p_out_grad[i];
+      }
+    }
+
+    if (out1) {
+      T* p_y_grad = out1->mutable_data<T>(context.GetPlace());
+      const T* p_x = in0->data<T>();
+      // chain rule: d(inter_val)/dY = 2 * X
+      for (size_t i = 0; i < counts; ++i) {
+        p_y_grad[i] = ModifiedHuberLossBackward(p_inter_val[i]) * 2 * p_x[i] *
+                      p_out_grad[i];
+      }
+    }
+  }
+
+ protected:
+  // derivative of the loss w.r.t. inter_val: -4 when (inter_val < -1),
+  // -2 * (1 - inter_val) when (inter_val < 1), and 0 otherwise
+  T ModifiedHuberLossBackward(const T& val) const {
+    if (val < -1) {
+      return static_cast<T>(-4);
+    } else if (val < 1) {
+      return -2 * (1 - val);
+    } else {
+      return static_cast<T>(0);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index c21ad3470ba97f533b6c42bc2966be04bc6f7976..79e02c57b842847fc0a84fbff9f8f27460645e17 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -50,6 +50,7 @@ USE_OP(cos_sim);
 USE_CPU_ONLY_OP(gather);
 USE_CPU_ONLY_OP(scatter);
 USE_OP(squared_l2_distance);
+USE_OP(modified_huber_loss);
 
 namespace paddle {
 namespace framework {
diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt
index a9c33ea1631e8358c41a8566de9db4bd00fc9b74..10e1c3396236685bbcbc7e3531b96387fe894fce 100644
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -34,3 +34,4 @@ py_test(test_lookup_table SRCS test_lookup_table.py)
 py_test(test_scale_and_identity_op SRCS test_scale_and_identity_op.py)
 py_test(mnist SRCS mnist.py)
 py_test(test_squared_l2_distance_op SRCS test_squared_l2_distance_op.py)
+py_test(test_modified_huber_loss_op SRCS test_modified_huber_loss_op.py)
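
Note on the forward computation: the piecewise loss evaluated by `ModifiedHuberLossForward` above can be reproduced outside the framework. The sketch below is a minimal standalone C++ program with no Paddle dependency; the function name `modified_huber_forward` and the toy batch values are illustrative only and are not part of the operator.

```cpp
#include <cstdio>
#include <vector>

// Same piecewise definition as ModifiedHuberLossForward in the header above,
// evaluated on inter_val = x * (2 * y - 1) with labels y in {0, 1}.
float modified_huber_forward(float inter_val) {
  if (inter_val < -1.f) return -4.f * inter_val;
  if (inter_val < 1.f) return (1.f - inter_val) * (1.f - inter_val);
  return 0.f;
}

int main() {
  // toy batch: predictions x and binary labels y (illustrative values)
  std::vector<float> x = {-2.f, -0.5f, 0.3f, 2.f};
  std::vector<float> y = {0.f, 1.f, 1.f, 0.f};

  for (size_t i = 0; i < x.size(); ++i) {
    float inter_val = x[i] * (2.f * y[i] - 1.f);  // scale y to {-1, +1}
    std::printf("x=%5.2f y=%1.0f inter=%5.2f loss=%6.3f\n", x[i], y[i],
                inter_val, modified_huber_forward(inter_val));
  }
  return 0;
}
```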
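Similarly, the piecewise derivative used by `ModifiedHuberLossGradCPUKernel` (-4 below -1, -2 * (1 - inter_val) on [-1, 1), and 0 elsewhere) can be sanity-checked against a central finite difference. This is a standalone sketch under the same assumptions as above; it is not the operator's unit test (`test_modified_huber_loss_op.py` is registered in CMakeLists.txt but not shown in this patch).

```cpp
#include <cmath>
#include <cstdio>

// Piecewise loss and its analytic derivative w.r.t. inter_val, mirroring
// ModifiedHuberLossForward and the branch logic of the backward kernel.
double loss(double z) {
  if (z < -1.0) return -4.0 * z;
  if (z < 1.0) return (1.0 - z) * (1.0 - z);
  return 0.0;
}

double dloss(double z) {
  if (z < -1.0) return -4.0;
  if (z < 1.0) return -2.0 * (1.0 - z);
  return 0.0;
}

int main() {
  const double eps = 1e-6;
  // sample points away from the kinks at -1 and +1
  const double zs[] = {-2.5, -1.2, -0.4, 0.0, 0.7, 1.8};
  for (double z : zs) {
    double numeric = (loss(z + eps) - loss(z - eps)) / (2.0 * eps);
    std::printf("z=%5.2f analytic=%8.5f numeric=%8.5f diff=%.2e\n", z,
                dloss(z), numeric, std::fabs(dloss(z) - numeric));
  }
  return 0;
}
```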