Commit f567603f authored by xixiaoyao

fix bugs

Parent fa5b0a9c
@@ -5,8 +5,8 @@ save_path: "output_model/firstrun"
backbone: "bert"
backbone_config_path: "pretrain_model/bert/bert_config.json"
-batch_size: 5
-num_epochs: 3
+batch_size: 4
+num_epochs: 2
optimizer: "adam"
learning_rate: 3e-5
warmup_proportion: 0.1
......
task_instance: "mrqa, mlm4mrqa, match4mrqa"
target_tag: 1, 0, 0
mix_ratio: 0.5, 1.0, 0.5
task_instance: "mrqa, match4mrqa"
target_tag: 1, 0
mix_ratio: 0.5, 0.5
save_path: "output_model/secondrun"
@@ -11,8 +11,8 @@ vocab_path: "pretrain_model/ernie/vocab.txt"
do_lower_case: True
max_seq_len: 512
-batch_size: 5
-num_epochs: 5
+batch_size: 4
+num_epochs: 2
optimizer: "adam"
learning_rate: 3e-5
warmup_proportion: 0.1
......
task_instance: "mrqa"
task_instance: "mlm4mrqa"
save_path: "output_model/firstrun"
backbone: "bert"
backbone_config_path: "pretrain_model/bert/bert_config.json"
backbone: "ernie"
backbone_config_path: "pretrain_model/ernie/ernie_config.json"
vocab_path: "pretrain_model/bert/vocab.txt"
vocab_path: "pretrain_model/ernie/vocab.txt"
do_lower_case: True
max_seq_len: 512
batch_size: 5
-num_epochs: 3
+num_epochs: 100
optimizer: "adam"
learning_rate: 3e-5
warmup_proportion: 0.1
......
@@ -5,6 +5,6 @@ if __name__ == '__main__':
controller.load_pretrain('pretrain_model/ernie/params')
controller.train()
-controller = palm.Controller(config='config_demo3.yaml', task_dir='demo3_tasks', for_train=False)
-controller.pred('cls4mrqa', inference_model_dir='output_model/thirdrun/infer_model')
+# controller = palm.Controller(config='config_demo3.yaml', task_dir='demo3_tasks', for_train=False)
+# controller.pred('cls4mrqa', inference_model_dir='output_model/thirdrun/infer_model')
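For reference, the demo scripts follow a single Controller pattern; below is a consolidated sketch of the train-then-predict flow whose prediction half this hunk comments out. It is assembled from the lines above (the paths, config names, and the cls4mrqa task are the demo placeholders, and a default for_train=True construction is assumed), not a verbatim copy of demo3.py.

```python
# Sketch assembled from the demo lines above; paths and task names are the
# demo placeholders and may not exist in your checkout.
import paddlepalm as palm

if __name__ == '__main__':
    # training pass (assumes Controller defaults to for_train=True)
    controller = palm.Controller(config='config_demo3.yaml', task_dir='demo3_tasks')
    controller.load_pretrain('pretrain_model/ernie/params')
    controller.train()

    # prediction pass -- the part this commit comments out in demo3.py
    controller = palm.Controller(config='config_demo3.yaml',
                                 task_dir='demo3_tasks', for_train=False)
    controller.pred('cls4mrqa', inference_model_dir='output_model/thirdrun/infer_model')
```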
train_file: "data/mlm4mrqa/train.tsv"
reader: mlm
paradigm: mlm
W1028 21:51:59.319365 9630 device_context.cc:235] Please NOTE: device: 0, CUDA Capability: 61, Driver API Version: 10.1, Runtime API Version: 9.0
W1028 21:51:59.323333 9630 device_context.cc:243] device: 0, cuDNN Version: 7.3.
I1028 21:52:26.817137 9630 parallel_executor.cc:421] The number of CUDAPlace, which is used in ParallelExecutor, is 8. And the Program will be copied 8 copies
W1028 21:52:41.982228 9630 fuse_all_reduce_op_pass.cc:72] Find all_reduce operators: 401. To make the speed faster, some all_reduce ops are fused during training, after fusion, the number of all_reduce ops is 255.
I1028 21:52:42.243458 9630 build_strategy.cc:363] SeqOnlyAllReduceOps:0, num_trainers:1
I1028 21:53:14.242537 9630 parallel_executor.cc:285] Inplace strategy is enabled, when build_strategy.enable_inplace = True
I1028 21:53:16.313246 9630 parallel_executor.cc:368] Garbage collection strategy is enabled, when FLAGS_eager_delete_tensor_gb = 0
/home/zhangyiming/env-bert/lib/python2.7/site-packages/paddle/fluid/executor.py:774: UserWarning: The following exception is not an EOF exception.
"The following exception is not an EOF exception.")
Traceback (most recent call last):
File "demo2.py", line 6, in <module>
controller.train()
File "/home/ssd7/yiming/release/PALM/paddlepalm/mtl_controller.py", line 669, in train
fluid.io.save_persistables(self.exe, save_path, saver_program)
File "/home/zhangyiming/env-bert/lib/python2.7/site-packages/paddle/fluid/io.py", line 571, in save_persistables
filename=filename)
File "/home/zhangyiming/env-bert/lib/python2.7/site-packages/paddle/fluid/io.py", line 216, in save_vars
filename=filename)
File "/home/zhangyiming/env-bert/lib/python2.7/site-packages/paddle/fluid/io.py", line 256, in save_vars
executor.run(save_program)
File "/home/zhangyiming/env-bert/lib/python2.7/site-packages/paddle/fluid/executor.py", line 775, in run
six.reraise(*sys.exc_info())
File "/home/zhangyiming/env-bert/lib/python2.7/site-packages/paddle/fluid/executor.py", line 770, in run
use_program_cache=use_program_cache)
File "/home/zhangyiming/env-bert/lib/python2.7/site-packages/paddle/fluid/executor.py", line 817, in _run_impl
use_program_cache=use_program_cache)
File "/home/zhangyiming/env-bert/lib/python2.7/site-packages/paddle/fluid/executor.py", line 894, in _run_program
fetch_var_name)
paddle.fluid.core_avx.EnforceNotMet:
--------------------------------------------
C++ Call Stacks (More useful to developers):
--------------------------------------------
0 std::string paddle::platform::GetTraceBackString<char const*>(char const*&&, char const*, int)
1 paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int)
2 paddle::operators::SaveOpKernel<paddle::platform::CUDADeviceContext, float>::SaveLodTensor(paddle::framework::ExecutionContext const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&, paddle::framework::Variable const*) const
3 paddle::operators::SaveOpKernel<paddle::platform::CUDADeviceContext, float>::Compute(paddle::framework::ExecutionContext const&) const
4 std::_Function_handler<void (paddle::framework::ExecutionContext const&), paddle::framework::OpKernelRegistrarFunctor<paddle::platform::CUDAPlace, false, 0ul, paddle::operators::SaveOpKernel<paddle::platform::CUDADeviceContext, float>, paddle::operators::SaveOpKernel<paddle::platform::CUDADeviceContext, double>, paddle::operators::SaveOpKernel<paddle::platform:
I1029 10:38:26.419725 30194 parallel_executor.cc:421] The number of CUDAPlace, which is used in ParallelExecutor, is 8. And the Program will be copied 8 copies
W1029 10:38:48.046470 30194 fuse_all_reduce_op_pass.cc:72] Find all_reduce operators: 401. To make the speed faster, some all_reduce ops are fused during training, after fusion, the number of all_reduce ops is 255.
I1029 10:38:48.322405 30194 build_strategy.cc:363] SeqOnlyAllReduceOps:0, num_trainers:1
I1029 10:39:23.302821 30194 parallel_executor.cc:285] Inplace strategy is enabled, when build_strategy.enable_inplace = True
I1029 10:39:25.419924 30194 parallel_executor.cc:368] Garbage collection strategy is enabled, when FLAGS_eager_delete_tensor_gb = 0
W1029 10:42:46.438006 30194 init.cc:212] *** Aborted at 1572316966 (unix time) try "date -d @1572316966" if you are using GNU date ***
W1029 10:42:46.440183 30194 init.cc:212] PC: @ 0x0 (unknown)
W1029 10:42:46.440296 30194 init.cc:212] *** SIGTERM (@0x1f80000785a) received by PID 30194 (TID 0x7f0773d5e700) from PID 30810; stack trace: ***
W1029 10:42:46.441951 30194 init.cc:212] @ 0x7f0773528160 (unknown)
W1029 10:42:46.443789 30194 init.cc:212] @ 0x7f07735243cc __pthread_cond_wait
W1029 10:42:46.444838 30194 init.cc:212] @ 0x7f0726a0c3cc std::condition_variable::wait()
W1029 10:42:46.449384 30194 init.cc:212] @ 0x7f070292290d paddle::framework::details::FastThreadedSSAGraphExecutor::Run()
W1029 10:42:46.450734 30194 init.cc:212] @ 0x7f07028836a7 _ZNSt17_Function_handlerIFvvEZN6paddle9framework7details29ScopeBufferedSSAGraphExecutor3RunERKSt6vectorISsSaISsEEEUlvE_E9_M_invokeERKSt9_Any_data
W1029 10:42:46.454063 30194 init.cc:212] @ 0x7f07028884bf paddle::framework::details::ScopeBufferedMonitor::Apply()
W1029 10:42:46.455735 30194 init.cc:212] @ 0x7f0702883e86 paddle::framework::details::ScopeBufferedSSAGraphExecutor::Run()
W1029 10:42:46.458518 30194 init.cc:212] @ 0x7f0700626038 paddle::framework::ParallelExecutor::Run()
W1029 10:42:46.459216 30194 init.cc:212] @ 0x7f0700409e78 _ZZN8pybind1112cpp_function10initializeIZN6paddle6pybindL22pybind11_init_core_avxERNS_6moduleEEUlRNS2_9framework16ParallelExecutorERKSt6vectorISsSaISsEEE188_S9_INS6_9LoDTensorESaISF_EEIS8_SD_EINS_4nameENS_9is_methodENS_7siblingEEEEvOT_PFT0_DpT1_EDpRKT2_ENUlRNS_6detail13function_callEE1_4_FUNESY_
W1029 10:42:46.460702 30194 init.cc:212] @ 0x7f0700453f56 pybind11::cpp_function::dispatcher()
W1029 10:42:46.462498 30194 init.cc:212] @ 0x7f0773841cc8 PyEval_EvalFrameEx
W1029 10:42:46.464206 30194 init.cc:212] @ 0x7f077384435d PyEval_EvalCodeEx
W1029 10:42:46.465894 30194 init.cc:212] @ 0x7f0773841d50 PyEval_EvalFrameEx
W1029 10:42:46.467593 30194 init.cc:212] @ 0x7f077384435d PyEval_EvalCodeEx
W1029 10:42:46.469327 30194 init.cc:212] @ 0x7f0773841d50 PyEval_EvalFrameEx
W1029 10:42:46.471053 30194 init.cc:212] @ 0x7f077384435d PyEval_EvalCodeEx
W1029 10:42:46.472759 30194 init.cc:212] @ 0x7f0773841d50 PyEval_EvalFrameEx
W1029 10:42:46.474479 30194 init.cc:212] @ 0x7f077384435d PyEval_EvalCodeEx
W1029 10:42:46.476193 30194 init.cc:212] @ 0x7f0773841d50 PyEval_EvalFrameEx
W1029 10:42:46.477926 30194 init.cc:212] @ 0x7f077384435d PyEval_EvalCodeEx
W1029 10:42:46.479651 30194 init.cc:212] @ 0x7f0773844492 PyEval_EvalCode
W1029 10:42:46.481353 30194 init.cc:212] @ 0x7f077386e1a2 PyRun_FileExFlags
W1029 10:42:46.483080 30194 init.cc:212] @ 0x7f077386f539 PyRun_SimpleFileExFlags
W1029 10:42:46.484786 30194 init.cc:212] @ 0x7f07738851bd Py_Main
W1029 10:42:46.487162 30194 init.cc:212] @ 0x7f0772a82bd5 __libc_start_main
W1029 10:42:46.487229 30194 init.cc:212] @ 0x4007a1 (unknown)
W1029 10:42:46.488940 30194 init.cc:212] @ 0x0 (unknown)
./run_demo2.sh: line 5: 30194 Terminated python demo2.py >> demo2.log
W1029 10:43:27.495725 32687 device_context.cc:235] Please NOTE: device: 0, CUDA Capability: 61, Driver API Version: 10.1, Runtime API Version: 9.0
W1029 10:43:27.500324 32687 device_context.cc:243] device: 0, cuDNN Version: 7.3.
I1029 10:43:41.409127 32687 parallel_executor.cc:421] The number of CUDAPlace, which is used in ParallelExecutor, is 8. And the Program will be copied 8 copies
W1029 10:44:03.299010 32687 fuse_all_reduce_op_pass.cc:72] Find all_reduce operators: 401. To make the speed faster, some all_reduce ops are fused during training, after fusion, the number of all_reduce ops is 255.
I1029 10:44:03.584228 32687 build_strategy.cc:363] SeqOnlyAllReduceOps:0, num_trainers:1
I1029 10:44:39.690382 32687 parallel_executor.cc:285] Inplace strategy is enabled, when build_strategy.enable_inplace = True
I1029 10:44:42.244774 32687 parallel_executor.cc:368] Garbage collection strategy is enabled, when FLAGS_eager_delete_tensor_gb = 0
W1029 10:48:20.253201 32687 init.cc:212] *** Aborted at 1572317300 (unix time) try "date -d @1572317300" if you are using GNU date ***
W1029 10:48:20.255347 32687 init.cc:212] PC: @ 0x0 (unknown)
W1029 10:48:20.255458 32687 init.cc:212] *** SIGTERM (@0x1f80000785a) received by PID 32687 (TID 0x7f0f71d25700) from PID 30810; stack trace: ***
W1029 10:48:20.257107 32687 init.cc:212] @ 0x7f0f714ef160 (unknown)
W1029 10:48:20.258708 32687 init.cc:212] @ 0x7f0f714eb3cc __pthread_cond_wait
W1029 10:48:20.259734 32687 init.cc:212] @ 0x7f0f249d33cc std::condition_variable::wait()
W1029 10:48:20.263964 32687 init.cc:212] @ 0x7f0f008e990d paddle::framework::details::FastThreadedSSAGraphExecutor::Run()
W1029 10:48:20.265229 32687 init.cc:212] @ 0x7f0f0084a6a7 _ZNSt17_Function_handlerIFvvEZN6paddle9framework7details29ScopeBufferedSSAGraphExecutor3RunERKSt6vectorISsSaISsEEEUlvE_E9_M_invokeERKSt9_Any_data
W1029 10:48:20.268503 32687 init.cc:212] @ 0x7f0f0084f4bf paddle::framework::details::ScopeBufferedMonitor::Apply()
W1029 10:48:20.270135 32687 init.cc:212] @ 0x7f0f0084ae86 paddle::framework::details::ScopeBufferedSSAGraphExecutor::Run()
W1029 10:48:20.272866 32687 init.cc:212] @ 0x7f0efe5ed038 paddle::framework::ParallelExecutor::Run()
W1029 10:48:20.273551 32687 init.cc:212] @ 0x7f0efe3d0e78 _ZZN8pybind1112cpp_function10initializeIZN6paddle6pybindL22pybind11_init_core_avxERNS_6moduleEEUlRNS2_9framework16ParallelExecutorERKSt6vectorISsSaISsEEE188_S9_INS6_9LoDTensorESaISF_EEIS8_SD_EINS_4nameENS_9is_methodENS_7siblingEEEEvOT_PFT0_DpT1_EDpRKT2_ENUlRNS_6detail13function_callEE1_4_FUNESY_
W1029 10:48:20.274988 32687 init.cc:212] @ 0x7f0efe41af56 pybind11::cpp_function::dispatcher()
W1029 10:48:20.276706 32687 init.cc:212] @ 0x7f0f71808cc8 PyEval_EvalFrameEx
W1029 10:48:20.278395 32687 init.cc:212] @ 0x7f0f7180b35d PyEval_EvalCodeEx
W1029 10:48:20.280076 32687 init.cc:212] @ 0x7f0f71808d50 PyEval_EvalFrameEx
W1029 10:48:20.281765 32687 init.cc:212] @ 0x7f0f7180b35d PyEval_EvalCodeEx
W1029 10:48:20.283442 32687 init.cc:212] @ 0x7f0f71808d50 PyEval_EvalFrameEx
W1029 10:48:20.285133 32687 init.cc:212] @ 0x7f0f7180b35d PyEval_EvalCodeEx
W1029 10:48:20.286808 32687 init.cc:212] @ 0x7f0f71808d50 PyEval_EvalFrameEx
W1029 10:48:20.288502 32687 init.cc:212] @ 0x7f0f7180b35d PyEval_EvalCodeEx
W1029 10:48:20.290176 32687 init.cc:212] @ 0x7f0f71808d50 PyEval_EvalFrameEx
W1029 10:48:20.291870 32687 init.cc:212] @ 0x7f0f7180b35d PyEval_EvalCodeEx
W1029 10:48:20.293542 32687 init.cc:212] @ 0x7f0f7180b492 PyEval_EvalCode
W1029 10:48:20.295228 32687 init.cc:212] @ 0x7f0f718351a2 PyRun_FileExFlags
W1029 10:48:20.296922 32687 init.cc:212] @ 0x7f0f71836539 PyRun_SimpleFileExFlags
W1029 10:48:20.298590 32687 init.cc:212] @ 0x7f0f7184c1bd Py_Main
W1029 10:48:20.300307 32687 init.cc:212] @ 0x7f0f70a49bd5 __libc_start_main
W1029 10:48:20.300364 32687 init.cc:212] @ 0x4007a1 (unknown)
W1029 10:48:20.302006 32687 init.cc:212] @ 0x0 (unknown)
@@ -557,7 +557,7 @@ class Controller(object):
inst.task_layer['pred'] = pred_parad
pred_joint_input_names, pred_joint_shape_and_dtypes, name_to_position = merge_input_attrs(
pred_backbone.inputs_attr, inst.task_layer['pred'].inputs_attrs['reader'],
-insert_taskid=False)
+insert_taskid=False, insert_batchsize=False, insert_seqlen=False, insert_batchsize_x_seqlen=False)
pred_prog = inst.load(infer_model_path)
# pred_prog = fluid.CompiledProgram(pred_prog).with_data_parallel()
@@ -664,9 +664,9 @@ class Controller(object):
"step_" + str(global_step))
fluid.io.save_persistables(self.exe, save_path, saver_program)
-save_path = os.path.join(main_conf['save_path'],
-"step_" + str(global_step) + "_final")
-fluid.io.save_persistables(self.exe, save_path, saver_program)
+# save_path = os.path.join(main_conf['save_path'],
+# "step_" + str(global_step) + "_final")
+# fluid.io.save_persistables(self.exe, save_path, saver_program)
def pred(self, task_instance, inference_model_dir=None):
if self._for_train:
......
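The traceback included earlier in this commit shows a run of demo2.py aborting inside fluid.io.save_persistables while writing a checkpoint, which is why the hunk above simply comments out the extra "_final" save. Purely as an illustration (not what the commit does), the save could instead be made best-effort; a minimal sketch assuming the Paddle 1.x fluid API and the surrounding names from mtl_controller.py:

```python
# Hypothetical alternative to the commented-out block above: a best-effort
# final save so a failing SaveOp cannot abort training. Not part of this commit.
import os
import paddle.fluid as fluid

def save_final_checkpoint(exe, main_conf, global_step, saver_program):
    save_path = os.path.join(main_conf['save_path'],
                             "step_" + str(global_step) + "_final")
    try:
        # same call that raises EnforceNotMet in the committed log above
        fluid.io.save_persistables(exe, save_path, saver_program)
    except Exception as exc:
        print("final checkpoint save skipped: {}".format(exc))
```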
@@ -42,7 +42,7 @@ class Reader(reader):
self._input_file = config['train_file']
self._num_epochs = None # prevent the iterator from terminating
self._shuffle = config.get('shuffle', False)
-self._shuffle_buffer = config.get('shuffle_buffer', 5000)
+# self._shuffle_buffer = config.get('shuffle_buffer', 5000)
elif phase == 'eval':
self._input_file = config['dev_file']
self._num_epochs = 1
@@ -56,7 +56,7 @@ class Reader(reader):
self._phase = phase
# self._batch_size =
-self._print_first_n = config.get('print_first_n', 1)
+self._print_first_n = config.get('print_first_n', 0)
@property
......
@@ -66,7 +66,7 @@ class Reader(reader):
"input_mask": [[-1, -1, 1], 'float32'],
"task_ids": [[-1, -1, 1], 'int64'],
"mask_label": [[-1, 1], 'int64'],
"mask_pos": [[-1, 1], 'int64']
"mask_pos": [[-1, 1], 'int64'],
}
@@ -79,6 +79,7 @@ class Reader(reader):
names = ['token_ids', 'position_ids', 'segment_ids', 'input_mask',
'task_ids', 'mask_label', 'mask_pos']
outputs = {n: i for n,i in zip(names, x)}
+# outputs['batchsize_x_seqlen'] = [self._batch_size * len(outputs['token_ids'][0]) - 1]
return outputs
for batch in self._data_generator():
......
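The commented-out line above sketches how the MLM reader could attach a batchsize_x_seqlen field to each batch: with token_ids padded to [batch_size, seq_len], the value batch_size * seq_len - 1 is the largest index a flattened gather over the encoder outputs may legally use, which is why it ends in - 1. A plain-Python sketch of that bookkeeping (hypothetical helper name, not the reader's actual code):

```python
# Hypothetical helper mirroring the commented-out reader line above.
def add_batchsize_x_seqlen(outputs, batch_size):
    seq_len = len(outputs['token_ids'][0])                       # padded length of this batch
    outputs['batchsize_x_seqlen'] = [batch_size * seq_len - 1]   # largest valid flat index
    return outputs

batch = {'token_ids': [[1, 2, 3, 0], [4, 5, 0, 0]]}
print(add_batchsize_x_seqlen(batch, batch_size=2)['batchsize_x_seqlen'])  # [7]
```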
@@ -34,9 +34,11 @@ class TaskParadigm(task_paradigm):
def inputs_attrs(self):
reader = {
"mask_label": [[-1, 1], 'int64'],
"batchsize_x_seqlen": [[1], 'int64'],
"mask_pos": [[-1, 1], 'int64']}
if not self._is_training:
del reader['mask_label']
+del reader['batchsize_x_seqlen']
bb = {
"encoder_outputs": [[-1, -1, self._hidden_size], 'float32'],
"embedding_table": [[-1, self._vocab_size, self._emb_size], 'float32']}
@@ -52,6 +54,8 @@ class TaskParadigm(task_paradigm):
def build(self, inputs):
if self._is_training:
mask_label = inputs["reader"]["mask_label"]
+# Only needed for multi-task learning: it keeps gather from going out of range when a step that runs another task leaves seqlen too small
+batchsize_x_seqlen = inputs["reader"]["batchsize_x_seqlen"]
mask_pos = inputs["reader"]["mask_pos"]
word_emb = inputs["backbone"]["embedding_table"]
enc_out = inputs["backbone"]["encoder_outputs"]
@@ -61,7 +65,12 @@ class TaskParadigm(task_paradigm):
_param_initializer = fluid.initializer.TruncatedNormal(
scale=self._initializer_range)
mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
+if self._is_training:
+# Only needed for multi-task training: it keeps gather from going out of range when a step that runs another task leaves seqlen too small
+#mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
+mask_pos = fluid.layers.elementwise_min(mask_pos, batchsize_x_seqlen)
+#print(fluid.default_main_program().blocks[0].vars)
reshaped_emb_out = fluid.layers.reshape(
x=enc_out, shape=[-1, emb_size])
......
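The elementwise_min added above clamps every masked-token position to the batchsize_x_seqlen value supplied by the reader before gathering from the reshaped encoder outputs. As the added comments note, this only matters in multi-task training, where a step that runs another task can feed a shorter batch than the one mask_pos was built for. A small NumPy sketch of the failure mode and the clamp (toy shapes and values, not the paradigm's actual code):

```python
import numpy as np

# Toy "encoder output" for a short batch from another task: 2 examples x 3 tokens.
batch_size, seq_len, hidden = 2, 3, 4
enc_out = np.zeros((batch_size * seq_len, hidden), dtype=np.float32)

# Positions computed for a longer batch (max flat index 7) would overflow here.
mask_pos = np.array([1, 7])
max_valid = batch_size * seq_len - 1          # the batchsize_x_seqlen value from the reader

clamped = np.minimum(mask_pos, max_valid)     # what fluid.layers.elementwise_min does
masked_vectors = enc_out[clamped]             # safe gather; enc_out[mask_pos] would raise IndexError
print(masked_vectors.shape)                   # (2, 4)
```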
@@ -143,6 +143,7 @@ def create_joint_iterator_fn(iterators, iterator_prefixes, joint_shape_and_dtype
def iterator():
v = verbose
+has_show_warn = False
while True:
id = np.random.choice(task_ids, p=weights)
results = fake_batch
@@ -150,16 +151,37 @@ def create_joint_iterator_fn(iterators, iterator_prefixes, joint_shape_and_dtype
print('----- debug joint iterator -----')
print('sampled task id: '+str(id))
task_id_tensor = np.array([[id]]).astype("int64")
-results[0] = task_id_tensor
+# results[0] = task_id_tensor
for i in range(dev_count):
-results[0] = task_id_tensor
+# these two should be equivalent
+# results[0] = task_id_tensor
+results[outname_to_pos['__task_id']] = task_id_tensor
+assert outname_to_pos['__task_id'] == 0
if id in outbuf:
outputs = outbuf[id]
del outbuf[id]
else:
outputs = next(iterators[id]) # dict type
+# if 'token_ids' in outputs:
+# val1 = len(outputs['token_ids'])
+# val = _check_and_adapt_shape_dtype([val1], [[1], 'int64'])
+# results[outname_to_pos['batch_size']] = val
+# val2 = len(outputs['token_ids'][0])
+# val = _check_and_adapt_shape_dtype([val2], [[1], 'int64'])
+# results[outname_to_pos['seqlen']] = val
+# val = _check_and_adapt_shape_dtype([val1*val2], [[1], 'int64'])
+# results[outname_to_pos['batchsize_x_seqlen']] = val
+# else:
+# if not has_show_warn:
+# print('WARNING: token_ids not found in current batch, failed to yield batch_size, seqlen and batchsize_x_seqlen. (This message would be shown only once.)')
+# has_show_warn = True
prefix = iterator_prefixes[id]
for outname, val in outputs.items():
if v > 0:
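Each step, the joint iterator draws a task id with np.random.choice(task_ids, p=weights) and stamps it into the feed; the change above replaces the hard-coded index 0 with outname_to_pos['__task_id'] and asserts the two agree. Below is a minimal sketch of turning the mix_ratio values from the demo configs into such sampling weights (an illustrative normalization, not necessarily PALM's exact scheme):

```python
import numpy as np

# mix_ratio from config_demo2.yaml after this commit: mrqa, match4mrqa
task_names = ['mrqa', 'match4mrqa']
mix_ratio = np.array([0.5, 0.5], dtype=np.float64)

weights = mix_ratio / mix_ratio.sum()         # sampling probabilities must sum to 1
task_ids = np.arange(len(task_names))

rng = np.random.default_rng(0)                # same idea as np.random.choice in the iterator
sampled = rng.choice(task_ids, size=10, p=weights)
print([task_names[i] for i in sampled])
```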
@@ -192,7 +214,7 @@ def create_joint_iterator_fn(iterators, iterator_prefixes, joint_shape_and_dtype
return iterator
-def merge_input_attrs(backbone_attr, task_attrs, insert_taskid=True):
+def merge_input_attrs(backbone_attr, task_attrs, insert_taskid=True, insert_batchsize=False, insert_seqlen=False, insert_batchsize_x_seqlen=False):
"""
Args:
task_attrs(list[dict]|dict): task input attributes, key=attr_name, val=[shape, dtype], support single task and nested tasks
@@ -200,14 +222,28 @@ def merge_input_attrs(backbone_attr, task_attrs, insert_taskid=True):
if isinstance(task_attrs, dict):
task_attrs = [task_attrs]
+ret = []
+names = []
+start = 0
if insert_taskid:
-ret = [([1,1], 'int64')]
-names = ['__task_id']
-start = 1
-else:
-ret = []
-names = []
-start = 0
+ret.append(([1,1], 'int64'))
+names.append('__task_id')
+start += 1
+if insert_batchsize:
+ret.append(([1], 'int64'))
+names.append('batch_size')
+start += 1
+if insert_seqlen:
+ret.append(([1], 'int64'))
+names.append('seqlen')
+start += 1
+if insert_batchsize_x_seqlen:
+ret.append(([1], 'int64'))
+names.append('batchsize_x_seqlen')
+start += 1
names += sorted(backbone_attr.keys())
ret.extend([backbone_attr[k] for k in names[start:]])
......
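After this change, merge_input_attrs can prepend up to four scalar pseudo-inputs (__task_id, batch_size, seqlen, batchsize_x_seqlen) ahead of the backbone and task inputs, with start counting how many slots were inserted so the later ret.extend over names[start:] skips them. A standalone sketch of the same bookkeeping (simplified, with made-up backbone attributes, not the library code itself):

```python
# Simplified sketch of the header-building logic added in this hunk.
def build_joint_header(insert_taskid=True, insert_batchsize=False,
                       insert_seqlen=False, insert_batchsize_x_seqlen=False):
    ret, names, start = [], [], 0
    flags = [(insert_taskid, '__task_id', ([1, 1], 'int64')),
             (insert_batchsize, 'batch_size', ([1], 'int64')),
             (insert_seqlen, 'seqlen', ([1], 'int64')),
             (insert_batchsize_x_seqlen, 'batchsize_x_seqlen', ([1], 'int64'))]
    for enabled, name, attr in flags:
        if enabled:
            ret.append(attr)
            names.append(name)
            start += 1
    return ret, names, start

# Usage sketch with hypothetical backbone attributes.
backbone_attr = {'token_ids': ([-1, -1, 1], 'int64'), 'input_mask': ([-1, -1, 1], 'float32')}
ret, names, start = build_joint_header(insert_taskid=True)
names += sorted(backbone_attr.keys())
ret.extend(backbone_attr[k] for k in names[start:])
print(names)  # ['__task_id', 'input_mask', 'token_ids']
```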
-export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export CUDA_VISIBLE_DEVICES=0
python demo1.py
-export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-while true
-do
-python demo2.py
-done
+export CUDA_VISIBLE_DEVICES=0
+python -u demo2.py
+# GLOG_vmodule=lookup_table_op=4 python -u demo2.py > debug2.log 2>&1
-export CUDA_VISIBLE_DEVICES=0,1
+export CUDA_VISIBLE_DEVICES=0
python demo3.py