diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 49a17416c84ac2161318e223993f7c5f5058b672..8d4e0556dd6a70e8436cd13c30dd84343e715d43 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -115,7 +115,7 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { "[batch_size, code_length], where code_length represents the " "maximum path length from root to leaf nodes.") .AsIntermediate(); - AddAttr("num_classes", "(int, required), The number of classes") + AddAttr("num_classes", "(int, optional), The number of classes") .SetDefault(2); AddComment(R"DOC( The hierarchical sigmoid operator organize the classes into a binary tree. diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d3ee80ad529b3076a8f0b0b19c02e949f1cb4ad3..835ec4506a9448c3a0d9c4661a42901f4b0e1fe2 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4348,12 +4348,14 @@ def nce(input, def hsigmoid(input, label, - num_classes, - ptabl=None, + num_classes=None, + non_leaf_num=None, + ptable=None, pcode=None, param_attr=None, bias_attr=None, - name=None): + name=None, + is_costum=False): """ The hierarchical sigmoid operator is used to accelerate the training process of language model. This operator organizes the classes into a @@ -4373,7 +4375,8 @@ def hsigmoid(input, and :math:`D` is the feature size. label (Variable): The tensor variable contains labels of training data. It's a tensor with shape is :math:`[N \\times 1]`. - num_classes: (int), The number of classes, must not be less than 2. + num_classes: (int), The number of classes, must not be less than 2. 
with the default tree this parameter has to be set + non_leaf_num: this defines the number of non-leaf nodes in the custom tree ptable: (Variable|None) this variable can store each batch of samples' path to root, it should be in leaf -> root order ptable should have the same shape with pcode, and for each sample i ptable[i] indicates a np.array like @@ -4409,20 +4412,33 @@ out = helper.create_variable_for_type_inference(dtype) pre_out = helper.create_variable_for_type_inference(dtype) dim = input.shape[1] - if num_classes < 2: - raise ValueError("num_classes must not be less than 2.") - if (ptable is not None) and (pcode is None): - raise ValueError("pcode should not be None when ptable has been set") - elif (ptable is None) and (pcode is not None): - raise ValueError("ptable should not be None when pcode has been set") + if ((num_classes is None) or (num_classes < 2)) and (not is_costum): + raise ValueError( + "num_classes must not be less than 2 with default tree") + + if (is_costum) and (pcode is None): + raise ValueError("pcode should not be None with custom tree") + elif (is_costum) and (ptable is None): + raise ValueError("ptable should not be None with custom tree") + elif (is_costum) and (non_leaf_num is None): + raise ValueError("non_leaf_num should not be None with custom tree") + else: + pass - weights = helper.create_parameter( - attr=helper.param_attr, - shape=[num_classes - 1, dim], - is_bias=False, - dtype=input.dtype) + weights = None + + if not is_costum: + weights = helper.create_parameter( + attr=helper.param_attr, + shape=[num_classes - 1, dim], + is_bias=False, + dtype=input.dtype) + else: + weights = helper.create_parameter( + attr=helper.param_attr, + shape=[non_leaf_num, dim], + is_bias=False, + dtype=input.dtype) inputs = { "X": input, "W": weights, @@ -4431,12 +4447,20 @@ "Label": label } if helper.bias_attr: - bias = helper.create_parameter( - attr=helper.bias_attr, - shape=[1, num_classes - 1], - is_bias=True, - 
dtype=input.dtype) - inputs['Bias'] = bias + if not is_costum: + bias = helper.create_parameter( + attr=helper.bias_attr, + shape=[1, num_classes - 1], + is_bias=True, + dtype=input.dtype) + inputs['Bias'] = bias + else: + bias = helper.create_parameter( + attr=helper.bias_attr, + shape=[1, non_leaf_num], + is_bias=True, + dtype=input.dtype) + inputs['Bias'] = bias helper.append_op( type="hierarchical_sigmoid", inputs=inputs, diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 50de468dba803d0a2a0c129ad04aac8a3822cdbc..b067e6213c8e54fce940b5c718c43f6cef783e70 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -185,6 +185,23 @@ class TestBook(unittest.TestCase): input=x, label=y, num_classes=2)) print(str(program)) + program2 = Program() + + with program_guard(program2): + x2 = layers.data(name='x2', shape=[4, 8], dtype='float32') + y2 = layers.data(name='y2', shape=[4], dtype='int64') + ptable = layers.data(name='ptable', shape=[4, 6], dtype='int64') + pcode = layers.data(name='pcode', shape=[4, 6], dtype='int64') + self.assertIsNotNone( + layers.hsigmoid( + input=x2, + label=y2, + non_leaf_num=6, + ptable=ptable, + pcode=pcode, + is_costum=True)) + print(str(program2)) + def test_sequence_expand(self): program = Program() with program_guard(program):