diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc
index 9e565bb23f29786fcfbb6569f019bb3ab7b20e8b..484a6dfd6ccb021d392db85275b35a8149b8a087 100644
--- a/paddle/operators/sigmoid_op.cc
+++ b/paddle/operators/sigmoid_op.cc
@@ -38,7 +38,8 @@ public:
 class SigmoidOpGrad : public OperatorWithKernel {
 protected:
   void InferShape(const InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE(ctx.InputSize() == 1,
+    // need to check input size 2 or 3, (dY, Y) or (dY, Y, X)
+    PADDLE_ENFORCE(ctx.InputSize() == 2,
                    "Sigmoid Gradient Op only have one input");
     PADDLE_ENFORCE(ctx.OutputSize() == 1,
                    "Sigmoid Gradient Op only have one output");
diff --git a/paddle/operators/sigmoid_op.h b/paddle/operators/sigmoid_op.h
index 2ea75b4885e19c536068f301a9f21af993068a8e..8cae2d30ec6f8fe80e67ff249c864eceba34e3f8 100644
--- a/paddle/operators/sigmoid_op.h
+++ b/paddle/operators/sigmoid_op.h
@@ -27,6 +27,7 @@ public:
     auto output = context.Output(0);
     output->mutable_data(context.GetPlace());
 
+    // The clipping is used in Paddle's raw implementation
     EigenVector::Flatten(*output).device(
         *(context.GetEigenDevice())) =
         1.0 / (1.0 + (-1.0 * EigenVector::Flatten(*input)).exp());
@@ -37,7 +38,7 @@ template class SigmoidGradKernel : public OpKernel {
 public:
   void Compute(const ExecutionContext& context) const override {
-    // TODO(qingqing) maybe a helper funciton is needed fo the name x@GRAD
+    // maybe a helper function is needed for the name x@GRAD
     auto y_t = context.Input("Y");
     auto dy_t = context.Input("Y@GRAD");
     auto dx_t = context.Output("X@GRAD");