Commit 6d13d7a4 authored by: Zeyu Chen

add max seq len tensor shape update for bert/ernie

Parent 3831c8ba
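Note: the core of this commit is the block in module.py that pins the static sequence dimension of the BERT/ERNIE input tensors to max_seq_len. Below is a minimal sketch, not part of the commit, of how that in-line block could be folded into the set_max_seq_len placeholder added here; the helper signature is an assumption, and it reuses the same paddle.fluid 1.x Variable.desc.set_shape call that the diff itself relies on.

MAX_SEQ_LENGTH = 512  # upper bound supported by the BERT/ERNIE pretrained models

def set_max_seq_len(program, feed_dict, max_seq_len):
    """Pin the static sequence dimension of the BERT/ERNIE input tensors."""
    if max_seq_len <= 0 or max_seq_len > MAX_SEQ_LENGTH:
        raise ValueError(
            "max_seq_len({}) should be in the range of [1, {}]".format(
                max_seq_len, MAX_SEQ_LENGTH))
    for tensor_name in ["input_ids", "position_ids", "segment_ids", "input_mask"]:
        # [-1, max_seq_len, 1]: keep the batch dimension dynamic and only
        # fix the sequence dimension to the requested length
        seq_tensor_shape = [-1, max_seq_len, 1]
        program.global_block().var(
            feed_dict[tensor_name].name).desc.set_shape(seq_tensor_shape)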
......@@ -27,6 +27,7 @@ import paddle.fluid as fluid
import paddle_hub as hub
import reader.cls as reader
import reader.task_reader as task_reader
from utils.args import ArgumentGroup, print_arguments
from paddle_hub.finetune.config import FinetuneConfig
......@@ -36,6 +37,7 @@ parser = argparse.ArgumentParser(__doc__)
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("hub_module_dir", str, None, "PaddleHub module directory")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay", "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1,
......@@ -43,12 +45,10 @@ train_g.add_arg("warmup_proportion", float, 0.1,
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("data_dir", str, None, "Path to training data.")
data_g.add_arg("checkpoint_dir", str, None, "Directory to model checkpoint")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("in_tokens", bool, False,
"If set, the batch size will be the maximum number of tokens in one batch. "
"Otherwise, it will be the maximum number of examples in one batch.")
args = parser.parse_args()
# yapf: enable.
......@@ -60,7 +60,7 @@ if __name__ == '__main__':
eval_interval=100,
save_ckpt_interval=200,
use_cuda=True,
checkpoint_dir="./bert_cls_ckpt",
checkpoint_dir=args.checkpoint_dir,
learning_rate=args.learning_rate,
num_epoch=args.epoch,
batch_size=args.batch_size,
......@@ -72,34 +72,31 @@ if __name__ == '__main__':
optimizer=None,
warmup_proportion=args.warmup_proportion)
# loading paddlehub BERT
# module = hub.Module(
# module_dir="./hub_module/chinese_L-12_H-768_A-12.hub_module")
module = hub.Module(module_dir="./hub_module/ernie-stable.hub_module")
# Load the PaddleHub BERT/ERNIE module
module = hub.Module(module_dir=args.hub_module_dir)
processor = reader.BERTClassifyReader(
reader = reader.BERTClassifyReader(
data_dir=args.data_dir,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
num_labels = len(processor.get_labels())
num_labels = len(reader.get_labels())
# BERT's input tensors, output tensors and forward graph
# If you want to fine-tune the pretrained model parameters, please set
# trainable to True
input_dict, output_dict, train_program = module.context(
sign_name="pooled_output", trainable=True)
input_dict, output_dict, program = module.context(
sign_name="tokens", trainable=True, max_seq_len=args.max_seq_len)
with fluid.program_guard(train_program):
with fluid.program_guard(program):
label = fluid.layers.data(name="label", shape=[1], dtype='int64')
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_outputs" for token-level output.
pooled_output = output_dict["pooled_output"]
# Set up the feed list for the data feeder
# Must feed all the tensors that the BERT module needs
feed_list = [
input_dict["src_ids"].name, input_dict["pos_ids"].name,
input_dict["sent_ids"].name, input_dict["input_mask"].name,
input_dict["input_ids"].name, input_dict["position_ids"].name,
input_dict["segment_ids"].name, input_dict["input_mask"].name,
label.name
]
# Define a classification finetune task by PaddleHub's API
......@@ -110,6 +107,6 @@ if __name__ == '__main__':
# will finish training, evaluation, testing and model saving automatically
hub.finetune_and_eval(
task=cls_task,
data_processor=processor,
data_reader=reader,
feed_list=feed_list,
config=config)
......@@ -148,7 +148,8 @@ def pad_batch_data(insts,
corresponding position data and input mask.
"""
return_list = []
max_len = max(len(inst) for inst in insts)
#max_len = max(len(inst) for inst in insts)
max_len = 50
# Any token included in dict can be used to pad, since the paddings' loss
# will be masked out by weights and have no effect on parameter gradients.
......
......@@ -2,13 +2,18 @@ export CUDA_VISIBLE_DEVICES=5
DATA_PATH=./chnsenticorp_data
rm -rf ./bert_cls_ckpt
#HUB_MODULE_DIR="./hub_module/bert_chinese_L-12_H-768_A-12.hub_module"
HUB_MODULE_DIR="./hub_module/ernie_stable.hub_module"
CKPT_DIR="./ckpt"
rm -rf $CKPT_DIR
python -u finetune_with_hub.py \
--batch_size 32 \
--in_tokens false \
--batch_size 64 \
--hub_module_dir=$HUB_MODULE_DIR \
--data_dir ${DATA_PATH} \
--weight_decay 0.01 \
--checkpoint_dir $CKPT_DIR \
--warmup_proportion 0.0 \
--epoch 3 \
--max_seq_len 128 \
--max_seq_len 50 \
--learning_rate 5e-5
......@@ -41,8 +41,7 @@ def _get_running_device_info(config):
return place, dev_count
def _finetune_model(task, data_processor, feed_list, config=None,
do_eval=False):
def _finetune_model(task, data_reader, feed_list, config=None, do_eval=False):
main_program = task.main_program()
startup_program = task.startup_program()
loss = task.variable("loss")
......@@ -52,10 +51,9 @@ def _finetune_model(task, data_processor, feed_list, config=None,
batch_size = config.batch_size
learning_rate = config.learning_rate
with_memory_optimization = config.with_memory_optimization
checkpoint_dir = config.checkpoint_dir
checkpoint_path = os.path.join(checkpoint_dir, CKPT_FILE)
checkpoint_path = os.path.join(config.checkpoint_dir, CKPT_FILE)
log_writter = LogWriter(
os.path.join(checkpoint_dir, "vdllog"), sync_cycle=10)
os.path.join(config.checkpoint_dir, "vdllog"), sync_cycle=10)
place, dev_count = _get_running_device_info(config)
with fluid.program_guard(main_program, startup_program):
......@@ -64,7 +62,7 @@ def _finetune_model(task, data_processor, feed_list, config=None,
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
if config.finetune_strategy == "bert_finetune":
scheduled_lr = bert_finetune(task, main_program, data_processor,
scheduled_lr = bert_finetune(task, main_program, data_reader,
config, dev_count)
elif config.optimizer == "adam":
optimizer = fluid.optimizer.Adam(learning_rate=config.learning_rate)
......@@ -112,7 +110,7 @@ def _finetune_model(task, data_processor, feed_list, config=None,
eval_acc_scalar = logw.scalar(tag="accuracy[evaluate]")
for epoch in range(last_epoch, num_epoch + 1):
train_reader = data_processor.data_generator(
train_reader = data_reader.data_generator(
batch_size=batch_size, phase='train')
num_trained_examples = acc_sum = loss_sum = 0
for batch in train_reader():
......@@ -144,7 +142,7 @@ def _finetune_model(task, data_processor, feed_list, config=None,
if global_step % config.save_ckpt_interval == 0:
model_saved_dir = os.path.join(
checkpoint_dir, "model_in_step_%d" % global_step)
config.checkpoint_dir, "model_in_step_%d" % global_step)
fluid.io.save_persistables(exe, dirname=model_saved_dir)
# NOTE: the current checkpoint-saving mechanism is not complete;
# it can't restore the dataset training status
......@@ -157,7 +155,7 @@ def _finetune_model(task, data_processor, feed_list, config=None,
if do_eval and global_step % config.eval_interval == 0:
eval_loss, eval_acc, eval_perf = evaluate(
task,
data_processor,
data_reader,
feed_list,
phase="val",
config=config)
......@@ -165,7 +163,7 @@ def _finetune_model(task, data_processor, feed_list, config=None,
eval_acc_scalar.add_record(global_step, eval_acc)
if eval_acc > best_eval_acc:
best_eval_acc = eval_acc
model_saved_dir = os.path.join(checkpoint_dir,
model_saved_dir = os.path.join(config.checkpoint_dir,
"best_model")
logger.info(
"best model saved to %s [best accuracy=%.5f]" %
......@@ -173,7 +171,7 @@ def _finetune_model(task, data_processor, feed_list, config=None,
fluid.io.save_persistables(exe, dirname=model_saved_dir)
# update model and checkpoint
model_saved_dir = os.path.join(checkpoint_dir, "final_model")
model_saved_dir = os.path.join(config.checkpoint_dir, "final_model")
fluid.io.save_persistables(exe, dirname=model_saved_dir)
# NOTE: the current checkpoint-saving mechanism is not complete; it can't
# restore the dataset training status
......@@ -184,20 +182,19 @@ def _finetune_model(task, data_processor, feed_list, config=None,
last_model_dir=model_saved_dir)
if do_eval:
evaluate(
task, data_processor, feed_list, phase="test", config=config)
evaluate(task, data_reader, feed_list, phase="test", config=config)
logger.info("PaddleHub finetune finished.")
def finetune_and_eval(task, data_processor, feed_list, config=None):
_finetune_model(task, data_processor, feed_list, config, do_eval=True)
def finetune_and_eval(task, data_reader, feed_list, config=None):
_finetune_model(task, data_reader, feed_list, config, do_eval=True)
def finetune(task, data_processor, feed_list, config=None):
_finetune_model(task, data_processor, feed_list, config, do_eval=False)
def finetune(task, data_reader, feed_list, config=None):
_finetune_model(task, data_reader, feed_list, config, do_eval=False)
def evaluate(task, data_processor, feed_list, phase="test", config=None):
def evaluate(task, data_reader, feed_list, phase="test", config=None):
inference_program = task.inference_program()
main_program = task.main_program()
loss = task.variable("loss")
......@@ -208,7 +205,7 @@ def evaluate(task, data_processor, feed_list, phase="test", config=None):
with fluid.program_guard(inference_program):
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
num_eval_examples = acc_sum = loss_sum = 0
test_reader = data_processor.data_generator(
test_reader = data_reader.data_generator(
batch_size=batch_size, phase=phase)
eval_time_begin = time.time()
eval_step = 0
......
......@@ -37,6 +37,11 @@ import paddle.fluid as fluid
__all__ = ['Module', 'create_module']
def set_max_seq_len(program, input_dict):
""" Set """
pass
def create_module(sign_arr,
module_dir,
processor=None,
......@@ -62,7 +67,7 @@ PROCESSOR_NAME = "processor"
HUB_VAR_PREFIX = "@HUB_%s@"
class ModuleHelper:
class ModuleHelper(object):
def __init__(self, module_dir):
self.module_dir = module_dir
......@@ -82,7 +87,7 @@ class ModuleHelper:
return os.path.join(self.module_dir, ASSETS_DIRNAME)
class Module:
class Module(object):
def __init__(self,
url=None,
module_dir=None,
......@@ -116,7 +121,7 @@ class Module:
self._generate_module_info(module_info)
self._init_with_signature(signatures=signatures)
else:
raise "Error! HubModule Can't init with nothing"
raise "Error! HubModule can't init with nothing"
def _init_with_url(self, url):
utils.check_url(url)
......@@ -405,7 +410,13 @@ class Module:
for_test=False,
trainable=False,
regularizer=None,
max_seq_len=128,
learning_rate=1e-3):
"""
Args:
max_seq_len(int): maximum sequence length; this option is only
available for BERT/ERNIE modules
"""
assert sign_name in self.signatures, "module did not have a signature with name %s" % sign_name
signature = self.signatures[sign_name]
......@@ -444,11 +455,32 @@ class Module:
if key:
fetch_dict[key] = program.global_block().var(var.name)
# TODO(ZeyuChen) encapsulate into a function
# update BERT/ERNIE's input tensor's sequence length to max_seq_len
if self.name.startswith("bert") or self.name.startswith("ernie"):
print("module_name", self.name)
MAX_SEQ_LENGTH = 512
if max_seq_len > MAX_SEQ_LENGTH or max_seq_len <= 0:
raise ValueError(
"max_seq_len({}) should be in the range of [1, {}]".format(
MAX_SEQ_LENGTH))
logger.info(
"update maximum sequence length of input tensor to {}".format(
max_seq_len))
for tensor_name in [
"input_ids", "position_ids", "segment_ids", "input_mask"
]:
seq_tensor_shape = [-1, max_seq_len, 1]
logger.info("The shape of input tensor[{}] set to {}".format(
tensor_name, seq_tensor_shape))
program.global_block().var(
feed_dict[tensor_name].name).desc.set_shape(
seq_tensor_shape)
# record the number of parameters loaded by PaddleHub
num_param_loaded = 0
for param in program.global_block().iter_parameters():
num_param_loaded += 1
# logger.debug("%s %s" % (param.name, param.optimize_attr))
logger.info(
"%d pretrained paramaters loaded by PaddleHub" % num_param_loaded)
......
......@@ -12,5 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
""" Store PaddleHub version string """
hub_version = "0.2.1.alpha"
hub_version = "0.3.0.alpha"
module_proto_version = "0.1.0"