提交 8d056a40 编写于 作者: Z zhangxuefei

update autofinetune md

上级 95243192
# PaddleHub 超参优化(Auto Fine-tune)——CV图像分类任务
使用PaddleHub Auto Fine-tune必须准备两个文件,并且这两个文件需要按照指定的格式书写。这两个文件分别是需要Fine-tune的python脚本finetunee.py和需要优化的超参数信息yaml文件hparam.yaml。
以Fine-tune图像分类任务为例,我们展示如何利用PaddleHub Auto Finetune进行超参优化。
以下是待优化超参数的yaml文件hparam.yaml,包含需要搜索的超参名字、类型、范围等信息。其中类型只支持float和int类型
```
param_list:
- name : learning_rate
init_value : 0.001
type : float
lower_than : 0.05
greater_than : 0.00005
- name : batch_size
init_value : 12
type : int
lower_than : 20
greater_than : 10
```
**NOTE:** 该yaml文件的最外层级的key必须是param_list
以下是图像分类的finetunee.py
```python
# coding:utf-8
import argparse
import os
import ast
import shutil
import paddle.fluid as fluid
import paddlehub as hub
import numpy as np
# yapf: disable
# Command-line options for one Auto Fine-tune trial.  The tunable options
# (--learning_rate, --batch_size) must match the "name" entries in hparam.yaml;
# --saved_params_dir and --model_path are consumed by the Auto Fine-tune driver.
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--num_epoch", type=int, default=1, help="Number of epoches for fine-tuning.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for fine-tuning.")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Path to save log data.")
parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.")
parser.add_argument("--saved_params_dir", type=str, default="", help="Directory for saving model")
parser.add_argument("--learning_rate", type=float, default=1e-4, help="learning_rate.")
parser.add_argument("--model_path", type=str, default="", help="load model path")
# yapf: enable.
def is_path_valid(path):
    """Return True if *path* is a usable save path, creating its parent directory if needed.

    An empty string means "no path given" and returns False.
    """
    if path == "":
        return False
    path = os.path.abspath(path)
    dirname = os.path.dirname(path)
    if not os.path.exists(dirname):
        # makedirs (not mkdir) so missing intermediate directories do not raise
        os.makedirs(dirname)
    return True
def finetune(args):
    """Fine-tune resnet_v2_50 on the Flowers dataset and print the dev accuracy.

    The final line printed starts with "AutoFinetuneEval" followed by a tab and
    the accuracy — the protocol PaddleHub Auto Fine-tune uses to read the
    evaluation result of one trial.
    """
    # NOTE(fix): logger was referenced below but never imported in the original example.
    from paddlehub.common.logger import logger

    module = hub.Module(name="resnet_v2_50_imagenet")
    input_dict, output_dict, program = module.context(trainable=True)
    dataset = hub.dataset.Flowers()
    data_reader = hub.reader.ImageClassificationReader(
        image_width=module.get_expected_image_width(),
        image_height=module.get_expected_image_height(),
        images_mean=module.get_pretrained_images_mean(),
        images_std=module.get_pretrained_images_std(),
        dataset=dataset)
    feature_map = output_dict["feature_map"]
    img = input_dict["image"]
    feed_list = [img.name]

    # Select finetune strategy, setup config and finetune
    strategy = hub.DefaultFinetuneStrategy(learning_rate=args.learning_rate)
    config = hub.RunConfig(
        # NOTE(fix): honor the --use_gpu flag instead of hard-coding True
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)
    task = hub.ImageClassifierTask(
        data_reader=data_reader,
        feed_list=feed_list,
        feature=feature_map,
        num_classes=dataset.num_labels,
        config=config)

    # Load model from the defined model path or not
    if args.model_path != "":
        with task.phase_guard(phase="train"):
            task.init_if_necessary()
            task.load_parameters(args.model_path)
            logger.info("PaddleHub has loaded model from %s" % args.model_path)

    task.finetune()
    run_states = task.eval()
    eval_avg_score, eval_avg_loss, eval_run_speed = task._calculate_metrics(run_states)

    # Move ckpt/best_model to the defined saved parameters directory
    if is_path_valid(args.saved_params_dir) and os.path.exists(config.checkpoint_dir + "/best_model/"):
        shutil.copytree(config.checkpoint_dir + "/best_model/", args.saved_params_dir)
        shutil.rmtree(config.checkpoint_dir)

    # Auto Fine-tune parses this line to obtain the trial's evaluation score
    print("AutoFinetuneEval" + "\t" + str(float(eval_avg_score["acc"])))


if __name__ == "__main__":
    args = parser.parse_args()
    finetune(args)
```
**Note**:以上是finetunee.py的写法。
> finetunee.py必须可以接收待优化超参数选项参数, 并且待搜索超参数选项名字和yaml文件中的超参数名字保持一致。
> finetunee.py必须有saved_params_dir这个选项。
> PaddleHub Auto Fine-tune超参评估策略选择为ModelBased,finetunee.py必须有model_path选项。
> PaddleHub Auto Fine-tune优化超参策略选择hazero时,必须提供两个以上的待优化超参。
> finetunee.py必须输出模型在数据集dev上的评价效果,同时以“AutoFinetuneEval"开始,和评价效果之间以“\t”分开,如print("AutoFinetuneEval"+"\t"+str(float(eval_avg_score["acc"])))。
# PaddleHub 超参优化(Auto Fine-tune)——NLP情感分类任务
使用PaddleHub Auto Fine-tune必须准备两个文件,并且这两个文件需要按照指定的格式书写。这两个文件分别是需要Fine-tune的python脚本finetunee.py和需要优化的超参数信息yaml文件hparam.yaml。
以Fine-tune中文情感分类任务为例,我们展示如何利用PaddleHub Auto Finetune进行超参优化。
以下是待优化超参数的yaml文件hparam.yaml,包含需要搜素的超参名字、类型、范围等信息。其中类型只支持float和int类型
```
param_list:
- name : learning_rate
init_value : 0.001
type : float
lower_than : 0.05
greater_than : 0.000005
- name : weight_decay
init_value : 0.1
type : float
lower_than : 1
greater_than : 0.0
- name : batch_size
init_value : 32
type : int
lower_than : 40
greater_than : 30
- name : warmup_prop
init_value : 0.1
type : float
lower_than : 0.2
greater_than : 0.0
```
**NOTE:** 该yaml文件的最外层级的key必须是param_list
以下是中文情感分类的finetunee.py
```python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import ast
import paddle.fluid as fluid
import paddlehub as hub
import os
from paddlehub.common.logger import logger
# yapf: disable
# Command-line options for one Auto Fine-tune trial.  The tunable options
# (--learning_rate, --warmup_prop, --weight_decay, --batch_size) must match
# the "name" entries in hparam.yaml; --saved_params_dir and --model_path are
# consumed by the Auto Fine-tune driver.
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--epochs", type=int, default=3, help="epochs.")
parser.add_argument("--batch_size", type=int, default=32, help="batch_size.")
parser.add_argument("--learning_rate", type=float, default=5e-5, help="learning_rate.")
parser.add_argument("--warmup_prop", type=float, default=0.1, help="warmup_prop.")
parser.add_argument("--weight_decay", type=float, default=0.01, help="weight_decay.")
parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest seqence.")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--saved_params_dir", type=str, default="", help="Directory for saving model during ")
parser.add_argument("--model_path", type=str, default="", help="load model path")
# Parsed at import time: this script is always run as a standalone trial process.
args = parser.parse_args()
# yapf: enable.
def is_path_valid(path):
    """Return True if *path* is a usable save path, creating its parent directory if needed.

    An empty string means "no path given" and returns False.
    """
    if path == "":
        return False
    path = os.path.abspath(path)
    dirname = os.path.dirname(path)
    if not os.path.exists(dirname):
        # makedirs (not mkdir) so missing intermediate directories do not raise
        os.makedirs(dirname)
    return True
if __name__ == '__main__':
    # NOTE(fix): shutil is used below but was never imported in the original example.
    import shutil

    # Load Paddlehub ERNIE pretrained model
    module = hub.Module(name="ernie")
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=args.max_seq_len)

    # Download dataset and use ClassifyReader to read dataset
    dataset = hub.dataset.ChnSentiCorp()
    metrics_choices = ["acc"]

    reader = hub.reader.ClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=args.max_seq_len)

    # Construct transfer learning network
    # Use "pooled_output" for classification tasks on an entire sentence.
    pooled_output = outputs["pooled_output"]

    # Setup feed list for data feeder
    # Must feed all the tensors that ERNIE's module needs
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Select finetune strategy, setup config and finetune
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=args.warmup_prop,
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        lr_scheduler="linear_decay")

    # Setup running config for PaddleHub Finetune API
    config = hub.RunConfig(
        checkpoint_dir=args.checkpoint_dir,
        use_cuda=True,
        num_epoch=args.epochs,
        batch_size=args.batch_size,
        enable_memory_optim=True,
        strategy=strategy)

    # Define a classification finetune task by PaddleHub's API
    cls_task = hub.TextClassifierTask(
        data_reader=reader,
        feature=pooled_output,
        feed_list=feed_list,
        num_classes=dataset.num_labels,
        config=config,
        metrics_choices=metrics_choices)

    # Load model from the defined model path or not
    if args.model_path != "":
        with cls_task.phase_guard(phase="train"):
            cls_task.init_if_necessary()
            cls_task.load_parameters(args.model_path)
            logger.info("PaddleHub has loaded model from %s" % args.model_path)

    cls_task.finetune()
    run_states = cls_task.eval()
    eval_avg_score, eval_avg_loss, eval_run_speed = cls_task._calculate_metrics(run_states)

    # Move ckpt/best_model to the defined saved parameters directory
    if is_path_valid(args.saved_params_dir) and os.path.exists(config.checkpoint_dir + "/best_model/"):
        shutil.copytree(config.checkpoint_dir + "/best_model/", args.saved_params_dir)
        shutil.rmtree(config.checkpoint_dir)

    # Auto Fine-tune parses this line to obtain the trial's evaluation score
    print("AutoFinetuneEval" + "\t" + str(float(eval_avg_score["acc"])))
```
**Note**:以上是finetunee.py的写法。
> finetunee.py必须可以接收待优化超参数选项参数, 并且待搜素超参数选项名字和yaml文件中的超参数名字保持一致。
> finetunee.py必须有saved_params_dir这个选项。
> PaddleHub Auto Fine-tune超参评估策略选择为ModelBased,finetunee.py必须有model_path选项。
> PaddleHub Auto Fine-tune优化超参策略选择hazero时,必须提供两个以上的待优化超参。
> finetunee.py必须输出模型在数据集dev上的评价效果,同时以“AutoFinetuneEval"开始,和评价效果之间以“\t”分开,如print("AutoFinetuneEval"+"\t"+str(float(eval_avg_score["acc"])))。
......@@ -10,11 +10,6 @@ PaddleHub Auto Fine-tune提供两种超参优化策略:
* PSHE2: 采用粒子群算法,最优超参数组合就是所求问题的解。现在想求得最优解就是要找到更新超参数组合,即如何更新超参数,才能让算法更快更好的收敛到最优解。PSHE2算法根据超参数本身历史的最优,在一定随机扰动的情况下决定下一步的更新方向。
PaddleHub Auto Fine-tune提供两种超参评估策略:
* FullTrail: 给定一组超参,利用这组超参从头开始Finetune一个新模型,之后在数据集dev部分评估这个模型
......@@ -25,158 +20,8 @@ PaddleHub Auto Fine-tune提供两种超参评估策略:
使用PaddleHub Auto Fine-tune必须准备两个文件,并且这两个文件需要按照指定的格式书写。这两个文件分别是需要Fine-tune的python脚本finetunee.py和需要优化的超参数信息yaml文件hparam.yaml。
以Fine-tune中文情感分类任务为例,我们展示如何利用PaddleHub Auto Finetune进行超参优化。
以下是待优化超参数的yaml文件hparam.yaml,包含需要搜素的超参名字、类型、范围等信息。其中类型只支持float和int类型
```
param_list:
- name : learning_rate
init_value : 0.001
type : float
lower_than : 0.05
greater_than : 0.000005
- name : weight_decay
init_value : 0.1
type : float
lower_than : 1
greater_than : 0.0
- name : batch_size
init_value : 32
type : int
lower_than : 40
greater_than : 30
- name : warmup_prop
init_value : 0.1
type : float
lower_than : 0.2
greater_than : 0.0
```
**NOTE:** 该yaml文件的最外层级的key必须是param_list
以下是中文情感分类的finetunee.py
```python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import ast
import paddle.fluid as fluid
import paddlehub as hub
import os
from paddlehub.common.logger import logger
# yapf: disable
# Command-line options for one Auto Fine-tune trial.  The tunable options
# (--learning_rate, --warmup_prop, --weight_decay, --batch_size) must match
# the "name" entries in hparam.yaml; --saved_params_dir and --model_path are
# consumed by the Auto Fine-tune driver.
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--epochs", type=int, default=3, help="epochs.")
parser.add_argument("--batch_size", type=int, default=32, help="batch_size.")
parser.add_argument("--learning_rate", type=float, default=5e-5, help="learning_rate.")
parser.add_argument("--warmup_prop", type=float, default=0.1, help="warmup_prop.")
parser.add_argument("--weight_decay", type=float, default=0.01, help="weight_decay.")
parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest seqence.")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--saved_params_dir", type=str, default="", help="Directory for saving model during ")
parser.add_argument("--model_path", type=str, default="", help="load model path")
# Parsed at import time: this script is always run as a standalone trial process.
args = parser.parse_args()
# yapf: enable.
def is_path_valid(path):
    """Return True if *path* is a usable save path, creating its parent directory if needed.

    An empty string means "no path given" and returns False.
    """
    if path == "":
        return False
    path = os.path.abspath(path)
    dirname = os.path.dirname(path)
    if not os.path.exists(dirname):
        # makedirs (not mkdir) so missing intermediate directories do not raise
        os.makedirs(dirname)
    return True
if __name__ == '__main__':
    # NOTE(fix): shutil is used below but was never imported in the original example.
    import shutil

    # Load Paddlehub ERNIE pretrained model
    module = hub.Module(name="ernie")
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=args.max_seq_len)

    # Download dataset and use ClassifyReader to read dataset
    dataset = hub.dataset.ChnSentiCorp()
    metrics_choices = ["acc"]

    reader = hub.reader.ClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=args.max_seq_len)

    # Construct transfer learning network
    # Use "pooled_output" for classification tasks on an entire sentence.
    pooled_output = outputs["pooled_output"]

    # Setup feed list for data feeder
    # Must feed all the tensors that ERNIE's module needs
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Select finetune strategy, setup config and finetune
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=args.warmup_prop,
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        lr_scheduler="linear_decay")

    # Setup running config for PaddleHub Finetune API
    config = hub.RunConfig(
        checkpoint_dir=args.checkpoint_dir,
        use_cuda=True,
        num_epoch=args.epochs,
        batch_size=args.batch_size,
        enable_memory_optim=True,
        strategy=strategy)

    # Define a classification finetune task by PaddleHub's API
    cls_task = hub.TextClassifierTask(
        data_reader=reader,
        feature=pooled_output,
        feed_list=feed_list,
        num_classes=dataset.num_labels,
        config=config,
        metrics_choices=metrics_choices)

    # Load model from the defined model path or not
    if args.model_path != "":
        with cls_task.phase_guard(phase="train"):
            cls_task.init_if_necessary()
            cls_task.load_parameters(args.model_path)
            logger.info("PaddleHub has loaded model from %s" % args.model_path)

    cls_task.finetune()
    run_states = cls_task.eval()
    eval_avg_score, eval_avg_loss, eval_run_speed = cls_task._calculate_metrics(run_states)

    # Move ckpt/best_model to the defined saved parameters directory
    if is_path_valid(args.saved_params_dir) and os.path.exists(config.checkpoint_dir + "/best_model/"):
        shutil.copytree(config.checkpoint_dir + "/best_model/", args.saved_params_dir)
        shutil.rmtree(config.checkpoint_dir)

    # Auto Fine-tune parses this line to obtain the trial's evaluation score
    print("AutoFinetuneEval" + "\t" + str(float(eval_avg_score["acc"])))
```
**Note**:以上是finetunee.py的写法。
> finetunee.py必须可以接收待优化超参数选项参数, 并且待搜素超参数选项名字和yaml文件中的超参数名字保持一致。
> finetunee.py必须有saved_params_dir这个选项。
> PaddleHub Auto Fine-tune超参评估策略选择为ModelBased,finetunee.py必须有model_path选项。
> PaddleHub Auto Fine-tune优化超参策略选择hazero时,必须提供两个以上的待优化超参。
> finetunee.py必须输出模型在数据集dev上的评价效果,同时以“AutoFinetuneEval"开始,和评价效果之间以“\t”分开,如print("AutoFinetuneEval"+"\t"+str(float(eval_avg_score["acc"])))。
[PaddleHub Auto Fine-tune超参优化--NLP情感分类任务]()
[PaddleHub Auto Fine-tune超参优化--CV图像分类任务]()
## 三、启动方式
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册