diff --git a/core/engine/local_cluster.py b/core/engine/local_cluster.py index cf9b6032162a61b16e4f01552c23cff7312b3965..88f21ef8bf7218a4b83db265ad534ad2266561a9 100755 --- a/core/engine/local_cluster.py +++ b/core/engine/local_cluster.py @@ -119,7 +119,8 @@ class LocalClusterEngine(Engine): "PADDLE_TRAINERS_NUM": str(worker_num), "TRAINING_ROLE": "TRAINER", "PADDLE_TRAINER_ID": str(i), - "FLAGS_selected_gpus": str(selected_gpus[i]) + "FLAGS_selected_gpus": str(selected_gpus[i]), + "PADDLEREC_GPU_NUMS": str(selected_gpus_num) }) os.system("mkdir -p {}".format(logs_dir)) diff --git a/core/model.py b/core/model.py index 265f5311d2a49601fb21addc9031358170a287fd..22e742374d4ac2d4ef079b6cb4157759ef3ffd51 100755 --- a/core/model.py +++ b/core/model.py @@ -177,6 +177,13 @@ class ModelBase(object): opt_name = envs.get_global_env("hyper_parameters.optimizer.class") opt_lr = envs.get_global_env( "hyper_parameters.optimizer.learning_rate") + if not isinstance(opt_lr, (float, Variable)): + try: + opt_lr = float(opt_lr) + except ValueError: + raise ValueError( + "In your config yaml, 'learning_rate': %s must be written as a floating piont number,such as 0.001 or 1e-3" + % opt_lr) opt_strategy = envs.get_global_env( "hyper_parameters.optimizer.strategy") diff --git a/core/trainers/framework/dataset.py b/core/trainers/framework/dataset.py index 239b568be34793c5ddb0830e9cca06951da143f4..3861813cdd7d8d5e0b64e8c568a9c70ede2b9c05 100644 --- a/core/trainers/framework/dataset.py +++ b/core/trainers/framework/dataset.py @@ -143,6 +143,8 @@ class QueueDataset(DatasetBase): if need_split_files: file_list = split_files(file_list, context["fleet"].worker_index(), context["fleet"].worker_num()) + + context["file_list"] = file_list print("File_list: {}".format(file_list)) dataset.set_filelist(file_list) diff --git a/core/trainers/framework/runner.py b/core/trainers/framework/runner.py index 79d7be66e58d0c4244980cf4bf871f42984d186e..839e3ed4d6e04b13f69e6c2cfc463e83aef130f7 100644 --- 
a/core/trainers/framework/runner.py +++ b/core/trainers/framework/runner.py @@ -18,11 +18,18 @@ import os import time import warnings import numpy as np +import random +import json +import logging import paddle.fluid as fluid from paddlerec.core.utils import envs +from paddlerec.core.utils.util import shuffle_files from paddlerec.core.metric import Metric +logging.basicConfig( + format='%(asctime)s - %(levelname)s: %(message)s', level=logging.INFO) + __all__ = [ "RunnerBase", "SingleRunner", "PSRunner", "CollectiveRunner", "PslibRunner" ] @@ -88,12 +95,12 @@ class RunnerBase(object): reader_name = model_dict["dataset_name"] model_name = model_dict["name"] model_class = context["model"][model_dict["name"]]["model"] - fetch_vars = [] fetch_alias = [] fetch_period = int( envs.get_global_env("runner." + context["runner_name"] + ".print_interval", 20)) + scope = context["model"][model_name]["scope"] program = context["model"][model_name]["main_program"] reader = context["dataset"][reader_name] @@ -133,6 +140,9 @@ class RunnerBase(object): fetch_period = int( envs.get_global_env("runner." + context["runner_name"] + ".print_interval", 20)) + save_step_interval = int( + envs.get_global_env("runner." 
+ context["runner_name"] + + ".save_step_interval", -1)) if context["is_infer"]: metrics = model_class.get_infer_results() else: @@ -140,18 +150,33 @@ class RunnerBase(object): metrics_varnames = [] metrics_format = [] + + if context["is_infer"]: + metrics_format.append("\t[Infer] {}: {{}}".format("batch")) + else: + metrics_format.append("\t[Train]") + if "current_epoch" in context: + metrics_format.append(" epoch: {}".format(context[ + "current_epoch"])) + metrics_format.append(" {}: {{}}".format("batch")) + + metrics_format.append("{}: {{:.2f}}s".format("time_each_interval")) + metrics_names = ["total_batch"] - metrics_format.append("{}: {{}}".format("batch")) + metrics_indexes = dict() for name, var in metrics.items(): metrics_names.append(name) metrics_varnames.append(var.name) + metrics_indexes[var.name] = len(metrics_varnames) - 1 metrics_format.append("{}: {{}}".format(name)) metrics_format = ", ".join(metrics_format) reader = context["model"][model_dict["name"]]["model"]._data_loader reader.start() batch_id = 0 + begin_time = time.time() scope = context["model"][model_name]["scope"] + runner_results = [] result = None with fluid.scope_guard(scope): try: @@ -160,20 +185,61 @@ class RunnerBase(object): program=program, fetch_list=metrics_varnames, return_numpy=False) - metrics = [batch_id] + metrics = [batch_id] metrics_rets = [ as_numpy(metrics_tensor) for metrics_tensor in metrics_tensors ] metrics.extend(metrics_rets) + batch_runner_result = {} + for k, v in metrics_indexes.items(): + batch_runner_result[k] = np.array(metrics_rets[ + v]).tolist() + runner_results.append(batch_runner_result) + if batch_id % fetch_period == 0 and batch_id != 0: - print(metrics_format.format(*metrics)) + end_time = time.time() + seconds = end_time - begin_time + metrics_logging = metrics[:] + metrics_logging.insert(1, seconds) + begin_time = end_time + logging.info(metrics_format.format(*metrics_logging)) + + if save_step_interval >= 1 and batch_id % save_step_interval == 0 
and context[ + "is_infer"] == False: + if context["fleet_mode"].upper() == "PS": + train_prog = context["model"][model_dict["name"]][ + "main_program"] + else: + train_prog = context["model"][model_dict["name"]][ + "default_main_program"] + startup_prog = context["model"][model_dict["name"]][ + "startup_program"] + with fluid.program_guard(train_prog, startup_prog): + self.save( + context, + is_fleet=context["is_fleet"], + epoch_id=None, + batch_id=batch_id) + batch_id += 1 except fluid.core.EOFException: reader.reset() + runner_result_save_path = envs.get_global_env( + "runner." + context["runner_name"] + ".runner_result_dump_path", + None) + if runner_result_save_path: + if "current_epoch" in context: + runner_result_save_path = runner_result_save_path + "_epoch_{}".format( + context["current_epoch"]) + logging.info("Dump runner result in {}".format( + runner_result_save_path)) + with open(runner_result_save_path, 'w+') as fout: + json.dump(runner_results, fout) + if batch_id > 0: result = dict(zip(metrics_names, metrics)) return result @@ -270,7 +336,7 @@ class RunnerBase(object): exec_strategy=_exe_strategy) return program - def save(self, epoch_id, context, is_fleet=False): + def save(self, context, is_fleet=False, epoch_id=None, batch_id=None): def need_save(epoch_id, epoch_interval, is_last=False): name = "runner." + context["runner_name"] + "." total_epoch = int(envs.get_global_env(name + "epochs", 1)) @@ -327,7 +393,8 @@ class RunnerBase(object): assert dirname is not None dirname = os.path.join(dirname, str(epoch_id)) - + logging.info("\tsave epoch_id:%d model into: \"%s\"" % + (epoch_id, dirname)) if is_fleet: warnings.warn( "Save inference model in cluster training is not recommended! 
Using save checkpoint instead.", @@ -350,14 +417,35 @@ class RunnerBase(object): if dirname is None or dirname == "": return dirname = os.path.join(dirname, str(epoch_id)) + logging.info("\tsave epoch_id:%d model into: \"%s\"" % + (epoch_id, dirname)) + if is_fleet: + if context["fleet"].worker_index() == 0: + context["fleet"].save_persistables(context["exe"], dirname) + else: + fluid.io.save_persistables(context["exe"], dirname) + + def save_checkpoint_step(): + name = "runner." + context["runner_name"] + "." + save_interval = int( + envs.get_global_env(name + "save_step_interval", -1)) + dirname = envs.get_global_env(name + "save_step_path", None) + if dirname is None or dirname == "": + return + dirname = os.path.join(dirname, str(batch_id)) + logging.info("\tsave batch_id:%d model into: \"%s\"" % + (batch_id, dirname)) if is_fleet: if context["fleet"].worker_index() == 0: context["fleet"].save_persistables(context["exe"], dirname) else: fluid.io.save_persistables(context["exe"], dirname) - save_persistables() - save_inference_model() + if isinstance(epoch_id, int): + save_persistables() + save_inference_model() + if isinstance(batch_id, int): + save_checkpoint_step() class SingleRunner(RunnerBase): @@ -376,7 +464,13 @@ class SingleRunner(RunnerBase): for model_dict in context["phases"]: model_class = context["model"][model_dict["name"]]["model"] metrics = model_class._metrics - + if "shuffle_filelist" in model_dict: + need_shuffle_files = model_dict.get("shuffle_filelist", + None) + filelist = context["file_list"] + context["file_list"] = shuffle_files(need_shuffle_files, + filelist) + context["current_epoch"] = epoch begin_time = time.time() result = self._run(context, model_dict) end_time = time.time() @@ -403,7 +497,7 @@ class SingleRunner(RunnerBase): startup_prog = context["model"][model_dict["name"]][ "startup_program"] with fluid.program_guard(train_prog, startup_prog): - self.save(epoch, context) + self.save(context=context, epoch_id=epoch) 
context["status"] = "terminal_pass" @@ -420,6 +514,12 @@ class PSRunner(RunnerBase): model_class = context["model"][model_dict["name"]]["model"] metrics = model_class._metrics for epoch in range(epochs): + if "shuffle_filelist" in model_dict: + need_shuffle_files = model_dict.get("shuffle_filelist", None) + filelist = context["file_list"] + context["file_list"] = shuffle_files(need_shuffle_files, + filelist) + context["current_epoch"] = epoch begin_time = time.time() result = self._run(context, model_dict) end_time = time.time() @@ -450,7 +550,7 @@ class PSRunner(RunnerBase): startup_prog = context["model"][model_dict["name"]][ "startup_program"] with fluid.program_guard(train_prog, startup_prog): - self.save(epoch, context, True) + self.save(context=context, is_fleet=True, epoch_id=epoch) context["status"] = "terminal_pass" @@ -465,6 +565,12 @@ class CollectiveRunner(RunnerBase): ".epochs")) model_dict = context["env"]["phase"][0] for epoch in range(epochs): + if "shuffle_filelist" in model_dict: + need_shuffle_files = model_dict.get("shuffle_filelist", None) + filelist = context["file_list"] + context["file_list"] = shuffle_files(need_shuffle_files, + filelist) + context["current_epoch"] = epoch begin_time = time.time() self._run(context, model_dict) end_time = time.time() @@ -477,7 +583,7 @@ class CollectiveRunner(RunnerBase): startup_prog = context["model"][model_dict["name"]][ "startup_program"] with fluid.program_guard(train_prog, startup_prog): - self.save(epoch, context, True) + self.save(context=context, is_fleet=True, epoch_id=epoch) context["status"] = "terminal_pass" @@ -493,6 +599,12 @@ class PslibRunner(RunnerBase): envs.get_global_env("runner." 
+ context["runner_name"] + ".epochs")) for epoch in range(epochs): + if "shuffle_filelist" in model_dict: + need_shuffle_files = model_dict.get("shuffle_filelist", None) + filelist = context["file_list"] + context["file_list"] = shuffle_files(need_shuffle_files, + filelist) + context["current_epoch"] = epoch begin_time = time.time() self._run(context, model_dict) end_time = time.time() @@ -555,6 +667,12 @@ class SingleInferRunner(RunnerBase): metrics = model_class._infer_results self._load(context, model_dict, self.epoch_model_path_list[index]) + if "shuffle_filelist" in model_dict: + need_shuffle_files = model_dict.get("shuffle_filelist", + None) + filelist = context["file_list"] + context["file_list"] = shuffle_files(need_shuffle_files, + filelist) begin_time = time.time() result = self._run(context, model_dict) end_time = time.time() diff --git a/core/utils/dataloader_instance.py b/core/utils/dataloader_instance.py index 03e6f0a67884917e9af2d02d13eb86576620ceef..69d3d3eb7c0ea5be8d2efa45d8e6abad356a2f64 100755 --- a/core/utils/dataloader_instance.py +++ b/core/utils/dataloader_instance.py @@ -14,6 +14,7 @@ from __future__ import print_function import os +import warnings from paddlerec.core.utils.envs import lazy_instance_by_fliename from paddlerec.core.utils.envs import get_global_env from paddlerec.core.utils.envs import get_runtime_environ @@ -47,6 +48,16 @@ def dataloader_by_name(readerclass, files.sort() + # for local cluster: discard some files if files cannot be divided equally between GPUs + if (context["device"] == "GPU") and "PADDLEREC_GPU_NUMS" in os.environ: + selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS")) + discard_file_nums = len(files) % selected_gpu_nums + if (discard_file_nums != 0): + warnings.warn( + "Because files cannot be divided equally between GPUs,discard these files:{}". 
+ format(files[-discard_file_nums:])) + files = files[:len(files) - discard_file_nums] + need_split_files = False if context["engine"] == EngineMode.LOCAL_CLUSTER: # for local cluster: split files for multi process @@ -59,7 +70,7 @@ def dataloader_by_name(readerclass, if need_split_files: files = split_files(files, context["fleet"].worker_index(), context["fleet"].worker_num()) - + context["file_list"] = files reader = reader_class(yaml_file) reader.init() @@ -109,6 +120,16 @@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context): files.sort() + # for local cluster: discard some files if files cannot be divided equally between GPUs + if (context["device"] == "GPU") and "PADDLEREC_GPU_NUMS" in os.environ: + selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS")) + discard_file_nums = len(files) % selected_gpu_nums + if (discard_file_nums != 0): + warnings.warn( + "Because files cannot be divided equally between GPUs,discard these files:{}". + format(files[-discard_file_nums:])) + files = files[:len(files) - discard_file_nums] + need_split_files = False if context["engine"] == EngineMode.LOCAL_CLUSTER: # for local cluster: split files for multi process @@ -121,7 +142,7 @@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context): if need_split_files: files = split_files(files, context["fleet"].worker_index(), context["fleet"].worker_num()) - + context["file_list"] = files sparse = get_global_env(name + "sparse_slots", "#") if sparse == "": sparse = "#" @@ -153,73 +174,3 @@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context): if hasattr(reader, 'generate_batch_from_trainfiles'): return gen_batch_reader() return gen_reader - - -def slotdataloader(readerclass, train, yaml_file, context): - if train == "TRAIN": - reader_name = "SlotReader" - namespace = "train.reader" - data_path = get_global_env("train_data_path", None, namespace) - else: - reader_name = "SlotReader" - namespace = "evaluate.reader" - data_path = 
get_global_env("test_data_path", None, namespace) - - if data_path.startswith("paddlerec::"): - package_base = get_runtime_environ("PACKAGE_BASE") - assert package_base is not None - data_path = os.path.join(package_base, data_path.split("::")[1]) - - hidden_file_list, files = check_filelist( - hidden_file_list=[], data_file_list=[], train_data_path=data_path) - if (hidden_file_list is not None): - print( - "Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}". - format(hidden_file_list)) - - files.sort() - - need_split_files = False - if context["engine"] == EngineMode.LOCAL_CLUSTER: - # for local cluster: split files for multi process - need_split_files = True - elif context["engine"] == EngineMode.CLUSTER and context[ - "cluster_type"] == "K8S": - # for k8s mount mode, split files for every node - need_split_files = True - - if need_split_files: - files = split_files(files, context["fleet"].worker_index(), - context["fleet"].worker_num()) - - sparse = get_global_env("sparse_slots", "#", namespace) - if sparse == "": - sparse = "#" - dense = get_global_env("dense_slots", "#", namespace) - if dense == "": - dense = "#" - padding = get_global_env("padding", 0, namespace) - reader = SlotReader(yaml_file) - reader.init(sparse, dense, int(padding)) - - def gen_reader(): - for file in files: - with open(file, 'r') as f: - for line in f: - line = line.rstrip('\n') - iter = reader.generate_sample(line) - for parsed_line in iter(): - if parsed_line is None: - continue - else: - values = [] - for pased in parsed_line: - values.append(pased[1]) - yield values - - def gen_batch_reader(): - return reader.generate_batch_from_trainfiles(files) - - if hasattr(reader, 'generate_batch_from_trainfiles'): - return gen_batch_reader() - return gen_reader diff --git a/core/utils/envs.py b/core/utils/envs.py index ddcc9a94b3adc47cda2023c4d9e196b9fb16faeb..6c2494a903ad821fecf4e3a5786606730e725ba6 100755 --- a/core/utils/envs.py +++ 
b/core/utils/envs.py @@ -20,7 +20,7 @@ import socket import sys import six import traceback -import six +import warnings global_envs = {} global_envs_flatten = {} @@ -98,6 +98,25 @@ def set_global_envs(envs): value = os_path_adapter(workspace_adapter(value)) global_envs[name] = value + for runner in envs["runner"]: + if "save_step_interval" in runner or "save_step_path" in runner: + phase_name = runner["phases"] + phase = [ + phase for phase in envs["phase"] + if phase["name"] == phase_name[0] + ] + dataset_name = phase[0].get("dataset_name") + dataset = [ + dataset for dataset in envs["dataset"] + if dataset["name"] == dataset_name + ] + if dataset[0].get("type") == "QueueDataset": + runner["save_step_interval"] = None + runner["save_step_path"] = None + warnings.warn( + "QueueDataset can not support save by step, please not config save_step_interval and save_step_path in your yaml" + ) + if get_platform() != "LINUX": for dataset in envs["dataset"]: name = ".".join(["dataset", dataset["name"], "type"]) diff --git a/core/utils/util.py b/core/utils/util.py index f6acfe203612326a77f41326581583278dac4183..09aece5e899c7eab3f71a5ac84d430c54274bf06 100755 --- a/core/utils/util.py +++ b/core/utils/util.py @@ -16,6 +16,8 @@ import datetime import os import sys import time +import warnings +import random import numpy as np from paddle import fluid @@ -223,6 +225,16 @@ def check_filelist(hidden_file_list, data_file_list, train_data_path): return hidden_file_list, data_file_list +def shuffle_files(need_shuffle_files, filelist): + if not isinstance(need_shuffle_files, bool): + raise ValueError( + "In your config yaml, 'shuffle_filelist': %s must be written as a boolean type,such as True or False" + % need_shuffle_files) + elif need_shuffle_files: + random.shuffle(filelist) + return filelist + + class CostPrinter(object): """ For count cost time && print cost log diff --git a/doc/pre_train_model.md b/doc/pre_train_model.md index 
134710a430992cc756cd37fcc1e01ee3aef2dfb1..42b5de4ab4029c4a5464bbcb3c3ecaed2253aa3b 100644 --- a/doc/pre_train_model.md +++ b/doc/pre_train_model.md @@ -7,9 +7,27 @@ PaddleRec基于业务实践,使用真实数据,产出了推荐领域算法 ### 获取地址 ```bash -wget xxx.tar.gz +wget https://paddlerec.bj.bcebos.com/textcnn_pretrain%2Fpretrain_model.tar.gz ``` ### 使用方法 -解压后,得到的是一个paddle的模型文件夹,使用`PaddleRec/models/contentunderstanding/classification_finetue`模型进行加载 +解压后,得到的是一个paddle的模型文件夹,使用`PaddleRec/models/contentunderstanding/textcnn`模型进行加载 +您可以在PaddleRec/models/contentunderstanding/textcnn_pretrain中找到finetune_startup.py文件,在config.yaml中配置startup_class_path和init_pretraining_model_path两个参数。 +在参数startup_class_path中配置finetune_startup.py文件的地址,在init_pretraining_model_path参数中配置您要加载的参数文件。 +以textcnn_pretrain为例,配置完的runner如下: +``` +runner: +- name: train_runner + class: train + epochs: 6 + device: cpu + save_checkpoint_interval: 1 + save_checkpoint_path: "increment" + init_model_path: "" + print_interval: 10 + startup_class_path: "{workspace}/finetune_startup.py" + init_pretraining_model_path: "{workspace}/pretrain_model/pretrain_model_params" + phases: phase_train +``` +具体使用方法请参照textcnn[使用预训练模型进行finetune](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/contentunderstanding/textcnn_pretrain) diff --git a/doc/train.md b/doc/train.md index 16fad1b23783b5fe0c2a785f5500ba88c42ae356..b275b66f3b0ec7c88424c4c18afd855182136c41 100644 --- a/doc/train.md +++ b/doc/train.md @@ -20,7 +20,7 @@ python -m paddlerec.run -m paddlerec.models.xxx.yyy 例如启动`recall`下的`word2vec`模型的默认配置; ```shell -python -m paddlerec.run -m models/recall/word2vec +python -m paddlerec.run -m models/recall/word2vec/config.yaml ``` ### 2. 
启动内置模型的个性化配置训练 diff --git a/doc/yaml.md b/doc/yaml.md index c96b3ee47ad56872d5d85fa6d674887ca083cf82..541b817d123ddd897b2056bc43b25d7aee78ddc2 100644 --- a/doc/yaml.md +++ b/doc/yaml.md @@ -27,6 +27,8 @@ | init_model_path | string | 路径 | 否 | 初始化模型地址 | | save_checkpoint_interval | int | >= 1 | 否 | Save参数的轮数间隔 | | save_checkpoint_path | string | 路径 | 否 | Save参数的地址 | +| save_step_interval | int | >= 1 | 否 | Step save参数的batch数间隔 | +| save_step_path | string | 路径 | 否 | Step save参数的地址 | | save_inference_interval | int | >= 1 | 否 | Save预测模型的轮数间隔 | | save_inference_path | string | 路径 | 否 | Save预测模型的地址 | | save_inference_feed_varnames | list[string] | 组网中指定Variable的name | 否 | 预测模型的入口变量name | @@ -37,6 +39,9 @@ | startup_class_path | string | 路径 | 否 | 自定义startup流程实现的地址 | | runner_class_path | string | 路径 | 否 | 自定义runner流程实现的地址 | | terminal_class_path | string | 路径 | 否 | 自定义terminal流程实现的地址 | +| init_pretraining_model_path | string | 路径 | 否 |自定义的startup流程中需要传入这个参数,finetune中需要加载的参数的地址 | +| runner_result_dump_path | string | 路径 | 否 | 运行中metrics的结果使用json.dump到文件的地址,若是在训练的runner中使用, 会自动加上epoch后缀 | + diff --git a/models/contentunderstanding/readme.md b/models/contentunderstanding/readme.md index 91d2a2db335fb45ef04d62db13f692aff5b2e500..cf6543733893bc3813779d7b7ed523c3c137767d 100644 --- a/models/contentunderstanding/readme.md +++ b/models/contentunderstanding/readme.md @@ -1,7 +1,7 @@ # 内容理解模型库 ## 简介 -我们提供了常见的内容理解任务中使用的模型算法的PaddleRec实现, 单机训练&预测效果指标以及分布式训练&预测性能指标等。实现的内容理解模型包括 [Tagspace](tagspace)、[文本分类](classification)等。 +我们提供了常见的内容理解任务中使用的模型算法的PaddleRec实现, 单机训练&预测效果指标以及分布式训练&预测性能指标等。实现的内容理解模型包括 [Tagspace](tagspace)、[文本分类](textcnn)、[基于textcnn的预训练模型](textcnn_pretrain)等。 模型算法库在持续添加中,欢迎关注。 @@ -23,7 +23,7 @@ | 模型 | 简介 | 论文 | | :------------------: | :--------------------: | :---------: | | TagSpace | 标签推荐 | [EMNLP 2014][TagSpace: Semantic Embeddings from Hashtags](https://www.aclweb.org/anthology/D14-1194.pdf) | -| Classification | 文本分类 | [EMNLP 2014][Convolutional neural networks for 
sentence classication](https://www.aclweb.org/anthology/D14-1181.pdf) | +| textcnn | 文本分类 | [EMNLP 2014][Convolutional neural networks for sentence classification](https://www.aclweb.org/anthology/D14-1181.pdf) | 下面是每个模型的简介(注:图片引用自链接中的论文) @@ -32,7 +32,7 @@ 

-[文本分类CNN模型](https://www.aclweb.org/anthology/D14-1181.pdf) +[textCNN模型](https://www.aclweb.org/anthology/D14-1181.pdf)

@@ -42,7 +42,7 @@ git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec cd PaddleRec python -m paddlerec.run -m models/contentunderstanding/tagspace/config.yaml -python -m paddlerec.run -m models/contentunderstanding/classification/config.yaml +python -m paddlerec.run -m models/contentunderstanding/textcnn/config.yaml ``` ## 使用教程(复现论文) @@ -134,7 +134,7 @@ batch: 13, acc: [0.928], loss: [0.01736144] batch: 14, acc: [0.93], loss: [0.01911209] ``` -**(2)Classification** +**(2)textcnn** ### 数据处理 情感倾向分析(Sentiment Classification,简称Senta)针对带有主观描述的中文文本,可自动判断该文本的情感极性类别并给出相应的置信度。情感类型分为积极、消极。情感倾向分析能够帮助企业理解用户消费习惯、分析热点话题和危机舆情监控,为企业提供有利的决策支持。 @@ -206,4 +206,4 @@ batch: 3, acc: [0.90234375], loss: [0.27907994] | 数据集 | 模型 | loss | acc | | :------------------: | :--------------------: | :---------: |:---------: | | ag news dataset | TagSpace | 0.0198 | 0.9177 | -| ChnSentiCorp | Classification | 0.2282 | 0.9127 | +| ChnSentiCorp | textcnn | 0.2282 | 0.9127 | diff --git a/models/contentunderstanding/classification/__init__.py b/models/contentunderstanding/textcnn/__init__.py similarity index 100% rename from models/contentunderstanding/classification/__init__.py rename to models/contentunderstanding/textcnn/__init__.py diff --git a/models/contentunderstanding/classification/config.yaml b/models/contentunderstanding/textcnn/config.yaml similarity index 97% rename from models/contentunderstanding/classification/config.yaml rename to models/contentunderstanding/textcnn/config.yaml index 70a439afbf1348621f64a810e61b4c504ad91012..c52e98c9500a5619e7748855e29a8545b74217b1 100644 --- a/models/contentunderstanding/classification/config.yaml +++ b/models/contentunderstanding/textcnn/config.yaml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-workspace: "models/contentunderstanding/classification" +workspace: "models/contentunderstanding/textcnn" dataset: - name: data1 diff --git a/models/contentunderstanding/classification/data/preprocess.py b/models/contentunderstanding/textcnn/data/preprocess.py similarity index 100% rename from models/contentunderstanding/classification/data/preprocess.py rename to models/contentunderstanding/textcnn/data/preprocess.py diff --git a/models/contentunderstanding/classification/data/test/test.txt b/models/contentunderstanding/textcnn/data/test/test.txt similarity index 100% rename from models/contentunderstanding/classification/data/test/test.txt rename to models/contentunderstanding/textcnn/data/test/test.txt diff --git a/models/contentunderstanding/classification/data/train/train.txt b/models/contentunderstanding/textcnn/data/train/train.txt similarity index 100% rename from models/contentunderstanding/classification/data/train/train.txt rename to models/contentunderstanding/textcnn/data/train/train.txt diff --git a/models/contentunderstanding/classification/model.py b/models/contentunderstanding/textcnn/model.py similarity index 100% rename from models/contentunderstanding/classification/model.py rename to models/contentunderstanding/textcnn/model.py diff --git a/models/contentunderstanding/classification/reader.py b/models/contentunderstanding/textcnn/reader.py similarity index 100% rename from models/contentunderstanding/classification/reader.py rename to models/contentunderstanding/textcnn/reader.py diff --git a/models/contentunderstanding/classification/readme.md b/models/contentunderstanding/textcnn/readme.md similarity index 94% rename from models/contentunderstanding/classification/readme.md rename to models/contentunderstanding/textcnn/readme.md index 1326a55d9cbe9650b45ea99db8569bd7ef13b3a7..95fa9319096d1af0893b3b910773034c14554ee9 100644 --- a/models/contentunderstanding/classification/readme.md +++ b/models/contentunderstanding/textcnn/readme.md @@ -1,21 
+1,20 @@ -# classification文本分类模型 +# textcnn文本分类模型 以下是本例的简要目录结构及说明: ``` ├── data #样例数据 - ├── train - ├── train.txt #训练数据样例 - ├── test - ├── test.txt #测试数据样例 - ├── preprocess.py #数据处理程序 + ├── train + ├── train.txt #训练数据样例 + ├── test + ├── test.txt #测试数据样例 + ├── preprocess.py #数据处理程序 ├── __init__.py ├── README.md #文档 ├── model.py #模型文件 ├── config.yaml #配置文件 ├── reader.py #读取程序 ``` - 注:在阅读该示例前,建议您先了解以下内容: [paddlerec入门教程](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md) @@ -44,7 +43,7 @@ Yoon Kim在论文[EMNLP 2014][Convolutional neural networks for sentence classic | 模型 | dev | test | | :------| :------ | :------ -| TextCNN | 90.75% | 92.19% | +| TextCNN | 90.75% | 91.27% | 您可以直接执行以下命令下载我们分词完毕后的数据集,文件解压之后,senta_data目录下会存在训练数据(train.tsv)、开发集数据(dev.tsv)、测试集数据(test.tsv)以及对应的词典(word_dict.txt): @@ -73,13 +72,13 @@ os : windows/linux/macos 本文提供了样例数据可以供您快速体验,在paddlerec目录下直接执行下面的命令即可启动训练: ``` -python -m paddlerec.run -m models/contentunderstanding/classification/config.yaml +python -m paddlerec.run -m models/contentunderstanding/textcnn/config.yaml ``` ## 效果复现 为了方便使用者能够快速的跑通每一个模型,我们在每个模型下都提供了样例数据。如果需要复现readme中的效果,请按如下步骤依次操作即可。 -1. 确认您当前所在目录为PaddleRec/models/contentunderstanding/classification +1. 确认您当前所在目录为PaddleRec/models/contentunderstanding/textcnn 2. 下载并解压数据集,命令如下: ``` wget https://baidu-nlp.bj.bcebos.com/sentiment_classification-dataset-1.0.0.tar.gz diff --git a/models/contentunderstanding/textcnn_pretrain/__init__.py b/models/contentunderstanding/textcnn_pretrain/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/models/contentunderstanding/textcnn_pretrain/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/models/contentunderstanding/textcnn_pretrain/basemodel.py b/models/contentunderstanding/textcnn_pretrain/basemodel.py new file mode 100644 index 0000000000000000000000000000000000000000..3f183644934cd1b4d88b868ce93425d923ae8ca1 --- /dev/null +++ b/models/contentunderstanding/textcnn_pretrain/basemodel.py @@ -0,0 +1,118 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle.fluid as fluid +from paddlerec.core.utils import envs +from paddlerec.core.model import ModelBase +from paddlerec.core.metrics import RecallK + + +class Model(ModelBase): + def __init__(self, config): + ModelBase.__init__(self, config) + self.dict_size = 2000000 + 1 + self.max_seq_len = 1024 + self.emb_dim = 128 + self.cnn_hid_dim = 128 + self.cnn_win_size = 3 + self.cnn_win_size2 = 5 + self.hid_dim1 = 96 + self.class_dim = 30 + self.is_sparse = True + + def input_data(self, is_infer=False, **kwargs): + + text = fluid.data( + name="text", shape=[None, self.max_seq_len, 1], dtype='int64') + label = fluid.data(name="category", shape=[None, 1], dtype='int64') + seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64') + return [text, label, seq_len] + + def net(self, inputs, is_infer=False): + """ network definition """ + #text label + self.data = inputs[0] + self.label = inputs[1] + self.seq_len = inputs[2] + emb = embedding(self.data, self.dict_size, self.emb_dim, + self.is_sparse) + concat = multi_convs(emb, self.seq_len, self.cnn_hid_dim, + self.cnn_win_size, self.cnn_win_size2) + self.fc_1 = full_connect(concat, self.hid_dim1) + self.metrics(is_infer) + + def metrics(self, is_infer=False): + """ classification and metrics """ + # softmax layer + prediction = fluid.layers.fc(input=[self.fc_1], + size=self.class_dim, + act="softmax", + name="pretrain_fc_1") + cost = fluid.layers.cross_entropy(input=prediction, label=self.label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=self.label) + #acc = RecallK(input=prediction, label=label, k=1) + + self._cost = avg_cost + if is_infer: + self._infer_results["acc"] = acc + else: + self._metrics["acc"] = acc + + +def embedding(inputs, dict_size, emb_dim, is_sparse): + """ embeding definition """ + emb = fluid.layers.embedding( + input=inputs, + size=[dict_size, emb_dim], + is_sparse=is_sparse, + param_attr=fluid.ParamAttr( + 
name='pretrain_word_embedding', + initializer=fluid.initializer.Xavier())) + return emb + + +def multi_convs(input_layer, seq_len, cnn_hid_dim, cnn_win_size, + cnn_win_size2): + """conv and concat""" + emb = fluid.layers.sequence_unpad( + input_layer, length=seq_len, name="pretrain_unpad") + conv = fluid.nets.sequence_conv_pool( + param_attr=fluid.ParamAttr(name="pretrain_conv0_w"), + bias_attr=fluid.ParamAttr(name="pretrain_conv0_b"), + input=emb, + num_filters=cnn_hid_dim, + filter_size=cnn_win_size, + act="tanh", + pool_type="max") + conv2 = fluid.nets.sequence_conv_pool( + param_attr=fluid.ParamAttr(name="pretrain_conv1_w"), + bias_attr=fluid.ParamAttr(name="pretrain_conv1_b"), + input=emb, + num_filters=cnn_hid_dim, + filter_size=cnn_win_size2, + act="tanh", + pool_type="max") + concat = fluid.layers.concat( + input=[conv, conv2], axis=1, name="pretrain_concat") + return concat + + +def full_connect(input_layer, hid_dim1): + """full connect layer""" + fc_1 = fluid.layers.fc(name="pretrain_fc_0", + input=input_layer, + size=hid_dim1, + act="tanh") + return fc_1 diff --git a/models/contentunderstanding/textcnn_pretrain/config.yaml b/models/contentunderstanding/textcnn_pretrain/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e39559ef4a8fbf498da33b4a6994863883428515 --- /dev/null +++ b/models/contentunderstanding/textcnn_pretrain/config.yaml @@ -0,0 +1,70 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +workspace: "models/contentunderstanding/textcnn_pretrain" + +dataset: +- name: dataset_train + batch_size: 128 + type: DataLoader + data_path: "{workspace}/senta_data/train" + data_converter: "{workspace}/reader.py" +- name: dataset_infer + batch_size: 256 + type: DataLoader + data_path: "{workspace}/senta_data/test" + data_converter: "{workspace}/reader.py" + +hyper_parameters: + optimizer: + class: adam + learning_rate: 0.001 + strategy: async + +mode: [train_runner,infer_runner] + +runner: +- name: train_runner + class: train + epochs: 6 + device: cpu + save_checkpoint_interval: 1 + save_checkpoint_path: "increment" + init_model_path: "" + print_interval: 10 + # startup class for finetuning + startup_class_path: "{workspace}/finetune_startup.py" + # path of pretrained model. Please set empty if you don't use finetune function. + init_pretraining_model_path: "{workspace}/pretrain_model/pretrain_model_params" + + phases: phase_train + +- name: infer_runner + class: infer + # device to run training or infer + device: cpu + print_interval: 1 + init_model_path: "increment/3" # load model path + phases: phase_infer + + +phase: +- name: phase_train + model: "{workspace}/model.py" + dataset_name: dataset_train + thread_num: 1 +- name: phase_infer + model: "{workspace}/model.py" # user-defined model + dataset_name: dataset_infer # select dataset by name + thread_num: 1 diff --git a/models/contentunderstanding/textcnn_pretrain/data/preprocess.py b/models/contentunderstanding/textcnn_pretrain/data/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..990a07040b82dd654dce3155fc89a37f8da1aedc --- /dev/null +++ b/models/contentunderstanding/textcnn_pretrain/data/preprocess.py @@ -0,0 +1,67 @@ +# encoding=utf-8 +import os +import sys + + +def build_word_dict(): + word_file = "word_dict.txt" + f = open(word_file, "r") + word_dict = {} + lines 
= f.readlines() + for line in lines: + word = line.strip().split("\t") + word_dict[word[0]] = word[1] + f.close() + return word_dict + + +def build_token_data(word_dict, txt_file, token_file): + max_text_size = 100 + + f = open(txt_file, "r") + fout = open(token_file, "w") + lines = f.readlines() + i = 0 + + for line in lines: + line = line.strip("\n").split("\t") + text = line[0].strip("\n").split(" ") + tokens = [] + label = line[1] + for word in text: + if word in word_dict: + tokens.append(str(word_dict[word])) + else: + tokens.append("0") + + seg_len = len(tokens) + if seg_len < 5: + continue + if seg_len >= max_text_size: + tokens = tokens[:max_text_size] + seg_len = max_text_size + else: + tokens = tokens + ["0"] * (max_text_size - seg_len) + text_tokens = " ".join(tokens) + fout.write(text_tokens + " " + str(seg_len) + " " + label + "\n") + if (i + 1) % 100 == 0: + print(str(i + 1) + " lines OK") + i += 1 + + fout.close() + f.close() + + +word_dict = build_word_dict() + +txt_file = "test.tsv" +token_file = "test.txt" +build_token_data(word_dict, txt_file, token_file) + +txt_file = "dev.tsv" +token_file = "dev.txt" +build_token_data(word_dict, txt_file, token_file) + +txt_file = "train.tsv" +token_file = "train.txt" +build_token_data(word_dict, txt_file, token_file) diff --git a/models/contentunderstanding/textcnn_pretrain/data/test/test.txt b/models/contentunderstanding/textcnn_pretrain/data/test/test.txt new file mode 100644 index 0000000000000000000000000000000000000000..d5dd69e3f35c29bb02f070be16a0af02ecfeae89 --- /dev/null +++ b/models/contentunderstanding/textcnn_pretrain/data/test/test.txt @@ -0,0 +1,20 @@ +5681 17044 4352 7574 16576 3574 32952 12211 18835 28961 15320 2019 21675 30604 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14 1 +9054 31881 4449 12211 12488 5975 3574 28592 2547 2547 14132 3574 24908 5975 24285 10010 3574 
31872 20925 9886 12211 26530 3567 30818 19640 22506 28312 19887 12211 28212 8576 3574 28592 12306 14132 539 33049 9039 14160 113 3567 19675 5511 2111 623 12068 12211 3574 18416 12068 19680 12211 30781 21946 1525 9886 3574 28109 31201 3567 25710 30503 30781 12068 19887 12211 22052 3574 2050 5402 10217 31201 1525 9698 14160 19887 3574 26209 24908 539 33049 9039 32949 8890 29693 3566 3566 11053 30781 26853 3567 3567 0 0 0 0 0 0 0 0 92 0 +19640 32771 31526 16576 13354 3574 5087 30781 7902 19037 12211 0 3574 4756 15048 11063 0 15019 16576 2019 29812 2276 22804 13275 2019 24599 12211 30294 6983 26606 1467 3574 18448 8052 16576 23091 32440 11034 16576 3574 1470 6983 1346 31382 13354 3574 11711 10074 28587 5030 19058 16576 2019 16497 6890 12223 30035 6983 1112 18448 30837 11280 24599 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 64 0 +7513 19838 3562 32737 15474 3562 1887 15474 0 0 18835 19813 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 12 1 +30325 3574 30788 12211 25843 11533 30150 8937 11309 8690 12211 14166 2200 3574 15802 0 20424 14166 25336 113 16576 11533 24294 12211 26301 16576 3574 28592 16191 12211 8690 13743 0 517 12211 0 0 23958 3574 31019 19680 13841 15337 12211 23958 30781 28630 3574 8690 12700 11280 12211 23958 24908 20409 7481 8052 6094 4002 30245 3574 1526 9904 27032 31347 24006 12211 14166 0 9910 24908 12211 0 2019 25469 17293 27438 29774 13757 24908 22301 28505 25450 12211 14039 3574 28801 4621 4879 3574 623 9904 23958 14166 18417 4895 113 11114 2018 113 100 1 +113 16576 17947 28955 12211 24253 3574 22068 30167 12211 14039 30818 28640 7801 2019 7985 30167 5402 6805 0 12211 27645 33067 30151 3574 11110 12211 10710 4549 22708 4308 24908 25975 12211 26957 0 2019 17942 25575 227 19641 1525 13129 113 15492 23224 3574 21163 15565 23273 29004 12452 13233 27573 12211 12046 2019 302 19367 16576 27914 0 0 
113 12211 28035 0 13743 13330 24390 12466 1525 12537 3574 18131 2019 9315 25720 27416 2276 15038 18162 10024 28955 3574 10097 18162 26594 12211 21949 3574 30788 12133 26362 1779 27386 21017 14295 1525 454 100 1 +33022 4169 19038 25096 3574 19185 113 25010 0 0 10511 17460 28972 6574 3574 1409 0 10010 3574 33022 129 16186 10511 17460 15182 3574 20235 10511 17460 11226 27150 13166 3562 18835 19038 5391 3574 22195 8052 28892 31948 10960 3574 13367 29338 15048 11030 22185 18621 28776 5205 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 52 0 +23439 330 0 0 29655 12211 3574 4211 3574 19650 19640 13757 3562 0 0 8990 330 0 0 18920 12211 31924 6688 31857 15364 3574 19641 30781 18416 28952 9209 12211 118 10710 16912 3562 0 0 27771 330 0 0 10126 30325 3574 15374 4348 0 6356 28420 24193 29526 12211 10523 21872 3571 24383 1580 3574 17536 1525 14745 21674 10710 4952 14871 3574 14590 20306 7695 0 32718 3562 0 0 13260 330 0 0 5847 30325 3574 25951 26995 21163 22787 15535 20889 3574 27914 5391 130 2276 15243 6356 0 16576 3562 0 0 100 1 +24908 32568 24044 28952 16576 27914 28955 3574 14160 13543 16582 5536 2019 11711 3527 19675 12211 15474 3574 0 14160 31857 30927 2019 18416 9231 12486 12211 20374 3574 1111 30173 19058 3574 31857 31825 3574 30170 15501 21070 2019 31383 19640 5004 3574 31858 12211 6408 2733 8034 24870 12730 12211 16401 2019 18416 19640 9072 18416 12211 2313 12211 20374 3574 18416 2313 25575 19315 31383 20374 20161 24160 3574 11711 3527 3574 31383 20374 31857 28378 2019 1296 5402 23273 16576 2019 16497 28952 2019 9512 15038 5536 3574 11711 10486 15168 19641 21994 0 2019 100 1 +0 7902 5402 29107 16576 15535 15535 15535 0 19634 21017 12211 26505 14160 15129 0 15535 15535 15535 26211 4002 9749 23360 16576 15535 15535 15535 26040 15535 15535 15535 15535 11698 32986 19641 0 22421 15535 15535 15535 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 40 0 
+28955 17755 3574 1735 18232 19262 12992 12230 3574 18416 30781 7388 19680 19643 16576 12211 3574 28952 9209 3574 16572 22360 2019 19680 19643 6414 12211 2011 27666 2012 3574 13757 32205 3574 14754 11280 12211 22186 7628 1827 17413 3574 19641 30781 31383 12211 4853 2019 33140 113 6047 6414 3310 31383 3574 4654 22360 6580 26147 12211 18696 2019 12306 6414 20539 3574 12680 22360 18624 8051 29384 1146 2019 18046 33188 16582 29384 12211 17311 13222 3574 18416 7453 28961 8014 3574 11711 18416 28961 17658 3574 29384 30781 19893 19643 15073 12211 32171 12211 2019 100 0 +28955 12211 30964 14590 28961 4412 29183 29493 6393 17111 29183 11670 12211 19636 23233 28961 4412 29183 25469 1112 16603 14590 16720 28961 9749 32365 23958 12211 33245 1525 11271 29183 29607 4694 8052 12068 32247 26813 29183 12229 6856 3674 330 30326 972 32948 29183 18416 28961 20161 1120 19641 30054 28955 330 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 55 0 +28587 26594 16393 14439 20100 8452 12211 11738 3574 20288 2276 2770 9051 29266 3574 27097 12211 0 14648 7902 5827 4308 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 22 1 +19083 3561 20034 30173 8356 3574 18416 18016 6154 13757 30827 23410 4879 5213 3566 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 15 1 +28587 14745 2018 1580 3574 19636 9052 14160 19683 16576 0 0 6007 5361 26370 5391 785 3574 0 17010 28587 27857 19048 20558 9051 3574 6007 0 0 22897 18323 1447 2019 0 0 32391 17536 24961 19048 9749 18448 3574 24283 6356 7648 26789 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 47 0 +24908 18920 1400 665 16167 12211 17293 3574 13518 28952 8393 23504 3574 31266 12211 30781 4477 2019 4654 18896 4289 13841 4822 3574 
24908 27376 15243 18416 8052 20077 17493 17317 3574 14842 16949 3574 12081 28961 2276 0 14399 20158 14398 16335 12211 3699 7697 6318 69 2019 11924 8053 27376 12211 14039 3574 21210 23273 3574 1732 30818 17942 22561 3083 2019 17268 12700 28892 9108 16576 26203 19037 23872 3574 14988 31773 3574 33140 1725 24908 0 8053 8052 13841 3574 25944 0 2019 4032 5025 13841 19185 12211 14039 3574 665 0 12211 4822 6988 100 1 +29728 31619 6149 5402 113 7317 11738 3574 31482 11924 16576 17657 6541 9761 3574 31224 5402 21141 3574 6356 16191 19640 14451 26154 7192 16076 3567 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 27 0 +29302 11364 19059 13652 12211 3574 7898 30781 6356 7961 14954 21752 7340 2019 29302 11401 8328 3574 20384 20034 1460 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 21 0 +4592 12211 31382 11030 3574 7961 6356 136 11714 31881 31478 3574 7957 11533 17413 3574 18835 14451 14550 11533 389 3574 14444 20444 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 24 1 +18416 24908 0 5233 22185 12211 29183 18956 30781 9668 8904 15168 18416 16108 29183 18416 29123 4351 28845 11709 11731 30486 21200 3574 4351 32986 8052 13757 11711 16497 25138 18448 3006 30326 20837 6356 16060 11231 13757 18448 11731 29173 3576 18835 27924 11711 11533 11225 3574 17386 15934 7288 0 26216 12211 1542 3574 24908 12511 18416 16060 11231 32842 18448 11731 29173 3574 18956 9668 31387 755 32986 18416 28972 18855 30781 18448 3006 30326 20837 30781 8052 13757 15048 18448 11731 29173 12211 3574 19640 18584 18416 32986 25710 18416 2276 29173 12211 22052 24908 100 0 diff --git a/models/contentunderstanding/textcnn_pretrain/data/train/train.txt 
b/models/contentunderstanding/textcnn_pretrain/data/train/train.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0afee9764604a96731197ebe94a0247efcf0552 --- /dev/null +++ b/models/contentunderstanding/textcnn_pretrain/data/train/train.txt @@ -0,0 +1,100 @@ +18416 31857 28378 25778 3574 16021 14449 16576 2019 33140 3574 5787 3574 19916 10505 12211 20017 23235 113 14681 24558 3574 20424 4895 11533 5901 28955 11533 15033 28955 12211 16603 32948 3574 11406 12211 30781 21299 12211 14871 2019 11698 12700 24160 14160 18448 25473 12211 16603 10671 23154 11280 28955 12211 24558 10006 3566 4247 18416 25336 22608 31382 16576 3574 8314 19916 13367 10367 12211 14039 20061 8475 3574 9951 9904 16586 28093 12211 14871 26235 21017 3574 19641 30781 21599 6811 8855 12211 8052 25825 3574 7628 21599 2721 11280 12211 8052 25825 2019 850 100 1 +23403 12211 19185 24908 3367 13526 14150 5402 32094 30560 4347 26961 16576 33148 5231 16576 3574 32901 7513 12795 19838 12211 33148 28938 14150 218 33148 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 27 1 +17925 30781 3079 4906 12211 32941 16576 29183 31514 30781 9683 12211 11101 330 19521 31670 29183 31719 30325 330 0 0 10038 7513 28961 19640 13757 330 0 0 18835 28587 3404 15492 1378 6536 12211 7272 29183 33140 12700 18381 21897 12211 755 2404 1378 31382 29183 0 0 7404 29183 30518 29183 10018 18189 2238 10505 16576 31382 28253 25663 6210 330 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 65 1 +31619 28955 32665 18448 16576 20205 12211 21872 23273 16576 2019 1272 25336 28030 13330 27519 1149 8011 12211 5536 3574 10710 31082 16582 23403 1525 4982 12211 8855 2019 1067 10362 22360 1409 22739 3574 18046 32698 8011 32665 23959 26050 13256 19080 2019 23273 8011 3574 6154 13233 3574 1112 16603 12700 19641 28109 12439 8011 3574 4694 19038 15369 16576 2019 11711 6988 12700 33137 8011 12211 13806 14006 28972 
8452 3574 24073 1112 16603 5132 2877 88 1525 30788 24131 12211 17961 2019 22360 17174 22496 16582 8045 12211 27056 3574 1525 433 4895 1614 27401 100 1 +19037 17415 3574 31482 24908 33082 19922 12211 6798 8053 30781 118 3434 12211 6053 16576 2019 31619 22223 14132 5202 28961 15567 3574 18448 19080 28587 15492 11738 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 30 1 +18416 30781 16255 22185 12211 3574 30752 3574 9218 12211 30781 28541 12675 22540 11711 33022 17824 29723 23773 6356 16576 21440 17460 12211 14444 18401 3574 2404 31382 26954 31382 2019 15802 31562 12634 28584 12211 3574 19601 1525 3731 19838 3434 3574 6917 18416 28961 31562 3750 12211 5048 6400 28584 2019 31906 30781 18835 26370 11225 2018 25517 6400 16716 3610 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 65 1 +32197 19680 18743 15474 13128 5402 19893 18416 12115 28952 16576 28961 5511 30781 18416 10968 12211 11738 8331 6105 12211 8011 18416 623 24906 14160 14451 19680 16448 25566 18416 12115 12211 16576 20653 20653 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 36 0 +19185 10960 1346 16144 7574 29183 11711 3610 26219 29183 26281 14897 30818 20485 15474 6010 29183 6544 1467 29183 11711 6698 12211 30214 14897 13841 30781 10018 5354 12211 29183 17087 16364 5213 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 34 1 +19289 8452 3574 1470 19037 3574 4348 14599 14957 118 6789 29356 3574 29271 13783 11533 5903 3574 13932 28961 27924 2019 2885 13443 3574 2276 20051 19081 19640 19675 19838 3574 14898 20403 12211 27359 24908 8736 5391 13443 29888 3574 16868 14160 23249 5354 2019 6007 16576 5361 1525 11175 26234 24294 3574 26574 7317 14160 24882 2019 19048 18448 12211 2885 30781 32391 24215 31527 12211 2019 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 70 1 +30033 23176 22185 22056 19185 3574 2216 1525 10023 370 31591 15759 3574 6988 14480 19641 29128 19185 3574 14988 3574 19185 24690 13872 32020 19638 3574 33140 3574 24908 11851 12211 10881 24535 113 29723 12211 25758 3571 19821 13354 23104 21674 12491 19185 2019 19185 12211 18835 27924 3574 26710 113 2181 3574 18416 11714 16576 15343 27801 23983 14444 12211 18835 3574 11755 11231 11030 3366 28039 18897 10681 12211 18835 3574 14160 13757 665 15143 3574 15896 3574 2882 17460 24130 12211 18835 24193 4895 28670 3574 6400 19433 2019 19185 12211 26281 28961 31857 15364 100 1 +6400 23653 330 11714 12211 13987 18401 330 18835 5391 16576 13354 330 21163 4352 5289 12211 330 28782 30781 14458 12211 29183 12677 5935 5087 28109 15474 29183 32062 11528 16603 31224 19080 5186 12211 330 10505 9142 10272 29183 24193 16586 7105 28961 30325 12211 330 24908 580 12211 10 2412 29183 19640 19838 12211 330 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 58 1 +5361 8053 7353 18448 3544 3574 113 11991 5004 4879 10006 3566 3191 113 31924 8707 12211 30096 12211 26041 24870 8035 31924 3574 6007 16576 1732 6141 8052 20273 3574 13354 18410 13841 14451 16062 2019 5847 31525 20815 18416 16546 0 0 16020 10074 26968 3574 20815 18416 12211 14798 21113 29276 31382 12211 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 56 0 +2213 26361 30299 3830 29732 31331 23773 3574 14274 21516 23443 12211 27097 19048 27801 11175 12211 3574 28587 5361 24294 12211 19048 3574 29123 30084 15934 4348 7648 8053 3574 13841 7648 8053 6141 8052 20273 3867 3574 30209 28161 6154 30607 3574 8707 3366 12627 2019 25710 14451 2952 13757 1112 11632 11030 32971 22421 19038 13543 16576 28961 3567 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 62 0 +22056 19185 30781 2928 12775 12211 11169 12211 3574 5372 11533 25096 3574 28768 755 4032 24908 19598 12211 2456 29406 2018 30227 2018 7 3223 
3574 31509 3567 0 0 25373 12824 0 0 31604 0 0 3561 0 0 19037 18835 31527 28587 5391 28782 3574 10960 11578 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 51 0 +22185 12211 14039 18416 18673 9951 3366 9796 12211 18835 3574 16460 7961 4312 12211 14039 14132 11280 22195 4002 31562 20910 15243 2019 0 0 23896 10202 32062 8053 3574 19703 25713 21017 17886 12211 25085 30083 6757 3574 31482 2807 3574 8657 12211 6793 24908 16720 19058 28961 3234 2457 30083 3562 0 0 23896 11230 19640 13757 28030 1881 7005 8219 3223 12211 13932 2018 10157 2888 24382 12211 13932 2019 0 0 3717 13642 3574 8483 5213 3567 0 0 23896 31088 32842 23360 18835 3574 17248 12211 4352 23954 3574 31514 24193 29812 12211 18835 100 0 +7628 19680 20303 11528 26620 25663 6154 28952 12211 3574 6259 8314 33192 19037 24379 33192 19037 3574 15565 4002 5681 26336 10002 2019 32794 8855 141 23484 31527 12211 16401 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 32 0 +129 22185 18855 3574 19680 16576 25478 12211 12824 6988 20034 9324 12211 2019 22185 10010 6400 19640 13757 3574 24908 4798 5681 27359 3574 32986 4694 12211 10671 11714 6356 18162 12211 17044 8630 32986 30781 30325 16576 2019 19081 13841 4798 18060 11030 22056 19185 12211 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 49 1 +18835 23653 3574 19059 19037 3574 21163 4429 113 13740 7206 12211 23964 3574 9922 12211 21264 6988 19038 24598 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 20 0 +22056 19185 22195 5402 32033 5354 9603 12675 29183 30781 32857 9792 20894 12211 29183 22739 4002 31514 12211 11101 29183 13944 18416 6570 11714 16576 13285 29183 939 14648 746 29183 14124 5407 29338 4324 3500 29183 28500 3816 12211 30781 16600 24863 24383 
29183 31593 11533 22175 29183 2457 32857 12211 31223 29183 2755 25803 4002 26849 12211 5930 29183 26211 28184 12211 16842 4364 30781 12736 29183 4196 21852 26281 11533 25096 21852 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 76 0 +23653 12211 19185 29183 0 20718 15474 29183 0 0 5402 24908 22607 20342 330 0 0 8060 17460 26370 9995 29183 0 0 113 30526 28961 113 21017 24661 12211 7840 29183 0 0 21163 8052 10784 20561 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 39 1 +23273 16576 3574 11182 12211 2436 14572 3574 33140 11670 3561 5067 19611 12211 14518 3574 15320 5205 3567 25778 32294 6400 14399 27735 8171 22867 14398 2019 16497 2929 13841 26789 8393 3280 5402 11528 16576 3574 6563 31930 28952 16576 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 43 0 +10126 3045 30781 11502 24080 24129 12211 3574 7695 31523 5233 2019 19334 2019 0 0 19650 29276 2729 3574 10126 5202 1443 16568 3574 13443 18674 4352 19210 3574 21009 32197 1409 6149 5391 2019 9052 21782 23653 9998 2019 0 0 20493 6259 22315 15880 3574 19106 4352 3662 3574 671 1164 30604 3574 15768 21401 16576 30326 153 8097 12211 20249 3574 25695 12760 3574 7453 30215 19767 3574 13783 12441 2019 0 0 27519 30173 5391 12211 11738 3574 18896 13757 14223 11280 12211 2929 21872 3574 28961 25863 32028 3574 6359 2019 0 0 0 97 1 +15567 12211 3574 4319 3574 14388 3574 24458 12211 11661 28961 28587 4844 12068 12211 2521 25096 2019 30604 7695 3940 16576 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 23 1 +18835 3880 19038 162 29183 19636 11421 27801 4412 18448 29183 24863 24752 23690 29183 14988 18416 20273 16576 25801 0 0 15896 4002 3880 30273 8052 2075 29183 12497 30861 6400 27801 21960 24586 29183 26965 244 12211 16400 1364 330 330 330 0 0 32294 6400 7957 19640 13757 29183 
3880 11101 5930 330 330 330 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 58 1 +32941 9661 19593 30781 31857 6789 16737 14221 12211 27359 29183 7317 10960 25096 16582 31679 16156 29183 7957 26814 29183 26281 27653 4352 11578 29183 30991 4352 15369 330 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 30 1 +18065 18416 20288 11714 15243 28892 18162 12211 6259 27771 4222 3574 18416 28961 32948 7502 2019 2019 2019 0 0 875 4694 30781 26979 16576 3574 18835 19538 6988 19643 12211 13841 3574 33140 1887 11053 4002 26705 12211 3562 17946 4412 11697 3574 7961 12653 4002 9749 18448 16576 3574 29123 16576 32986 30781 27416 7410 3562 1556 2276 16576 4369 0 0 17800 3106 0 0 10018 16576 9756 14451 10008 20206 3562 6223 23959 26050 3146 2018 29680 9213 1558 17056 3574 7961 25868 12211 13197 13652 16576 13221 3562 0 0 18295 5667 12211 16578 100 0 +6594 4211 3574 14075 28961 30325 3574 29607 10505 23485 12211 21872 26370 27593 5213 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 15 1 +4006 2082 0 0 31711 15960 12211 31924 28587 19976 7823 20076 15492 18448 12211 0 0 6007 12211 14039 14160 5511 0 0 8052 6007 5266 0 0 20034 6937 22131 18448 0 0 6007 5266 5511 3375 30781 3375 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 41 0 +7288 28955 5402 19819 31138 16576 8035 15565 3574 12700 30604 3574 6400 113 17174 21924 22358 15243 15048 29418 454 3574 18448 11280 12211 11178 12589 19643 12211 6400 3574 31482 15038 11528 11533 22943 3574 28892 30781 9638 3228 3574 14274 8052 28892 28955 2019 30131 28952 3567 5511 30788 12211 9860 15492 30781 15492 3574 11711 688 28955 2106 3574 31857 3292 3574 21009 30781 8551 13354 28955 28961 19643 19643 27416 24908 12211 15492 16603 14160 
10671 14318 12211 14945 3608 10002 3567 0 0 0 0 0 0 0 0 0 0 0 0 0 87 0 +25778 30781 6811 2313 17293 12211 3574 16191 25104 30781 32229 12728 25172 5205 3567 25710 30781 20288 1384 28505 3574 24073 113 20341 12211 9662 3574 19663 12211 30781 31962 11698 23401 28505 12211 29694 3574 4781 28505 12211 29694 3574 7353 14564 20478 12211 3513 3574 7353 4181 14039 25761 1086 3574 7353 21909 19024 11738 3574 7353 18837 1525 16070 28505 12211 20005 2019 25710 11698 30781 11280 4242 28505 459 12700 1324 7192 31382 3574 24908 11698 2106 20341 3574 11711 20424 28505 32948 6580 31857 7822 12211 12114 3736 2019 18188 16576 3574 459 26027 100 1 +18072 1443 1525 19650 14160 30325 3574 10126 2018 17536 2018 5847 4117 3574 25951 0 0 25448 22512 27967 18416 12211 5675 3574 30604 10979 3940 2019 21516 1164 13757 12211 3574 11935 3057 12542 8082 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 38 1 +26420 12211 18835 4352 5391 3574 32197 30781 24908 685 3574 25469 32544 28961 28109 15474 3574 19362 12211 10960 16716 31382 3574 14550 22810 3574 18416 11714 12211 2050 29128 4648 4037 16576 3574 11711 32197 18956 18358 12211 25336 32062 16576 3574 25469 28961 28587 13841 23360 3574 33140 12488 5402 4412 16576 2019 118 8052 28378 12211 30781 15202 12211 1887 3574 32197 6811 454 14871 3574 25469 3817 7340 16576 32139 3574 26609 28961 28587 13197 24889 32986 31331 31088 7340 3574 11711 6356 16576 33022 10010 5402 18232 17750 8052 20077 12211 2971 3561 623 100 0 +11110 11670 20034 6536 3574 28892 28199 2018 13789 14148 15243 19019 10010 12452 16448 10002 3574 21163 11110 31527 12211 16132 6988 16100 3574 20374 14988 2245 3574 33140 16292 24001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 32 0 +15186 22185 18624 7014 3750 28893 3574 4181 7272 4412 3574 19157 19730 11231 3574 9583 113 29829 6279 12211 1166 2019 7477 11231 3571 
19037 19055 26234 21674 478 2454 3574 22185 11231 18232 9980 30781 20102 7132 12921 3574 15802 10734 21617 2019 23925 8052 16006 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 49 0 +43 2162 28587 10525 12211 22686 3574 28587 28038 12211 18555 3574 28500 28587 5300 12211 26619 19367 3574 31383 113 12211 30781 1183 27064 12211 4008 3574 30781 10024 24073 3664 22068 11528 17956 3574 1112 1370 22068 16576 10439 2236 6483 12211 11610 2019 24229 22068 43 2162 3574 4116 31527 5402 31872 23009 16522 29493 3574 28892 13934 19680 12211 26530 22102 3574 2050 29493 4477 14274 6735 2019 30170 7961 32247 18855 22068 4879 3574 6400 15227 3567 21163 25778 13459 10210 1525 16434 8038 13740 3574 685 12211 11110 14160 28587 3574 11408 18416 12700 100 1 +16699 12211 15661 2276 20076 12211 12217 15413 3574 10505 23485 12211 14039 5402 20409 4895 2068 22608 26050 3574 2547 17273 16635 12211 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 25 0 +11110 8060 25497 5402 21488 16576 4895 26235 27782 12211 31116 3574 18416 5511 28312 30788 12211 7194 17623 2019 1409 11342 8052 28986 24906 5395 25778 8051 13092 6235 25920 4352 2019 14274 24648 18896 4002 30239 28587 11525 12211 4895 23723 2019 21163 8060 25497 5402 20274 8035 17517 20206 2019 12306 6154 14132 19641 6988 302 4895 23126 12211 14039 2019 18416 3360 20909 14399 19356 14398 29338 30781 29391 10002 2019 20981 19770 30608 3574 5681 1112 18416 15881 2019 17174 23712 19363 28961 20108 16576 22870 11110 12858 2019 13334 7230 5681 26779 10671 5395 100 1 +24379 3567 24135 30781 113 6022 12211 14844 11670 12211 28955 22408 3567 16460 22195 28587 3567 17305 2717 16576 4879 32665 6143 6356 24229 16576 3567 19514 12211 32971 3567 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 31 0 +14988 
28952 21300 30934 16576 3574 28592 18416 25575 8102 24908 28541 29552 12211 5930 15243 2019 20484 3574 19680 16576 19262 12211 22176 3574 4694 30781 28014 3574 13518 12068 6132 8393 28955 21017 16461 11280 2019 19641 30781 8393 10704 30809 22470 19055 16457 19055 2973 3574 5244 413 3574 14645 19055 2159 19055 21987 640 12211 10238 10899 2019 6305 6305 15634 32698 3574 12700 15740 20112 3574 13518 7133 29463 6421 19080 2019 20567 3574 19640 21024 2416 1904 3574 1125 32986 22176 30325 3574 3375 24830 22421 3574 6359 3567 17840 3574 21982 14842 17154 100 1 +20288 22185 14120 3574 16862 10199 3574 15759 8106 2019 18835 28993 21231 2018 31662 3574 30861 3610 9744 3574 31482 113 16694 1525 19649 19887 3574 113 9231 12211 6400 3574 7957 17413 29711 2019 17294 7604 12211 30781 20263 26146 3574 21163 17925 26563 5402 13757 3876 2019 11714 10010 1598 3561 7972 12211 7957 3574 15202 28500 8645 3562 7972 12211 9958 3574 15202 28500 7882 3567 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 70 1 +24832 30788 1257 13460 30572 3574 11711 28961 17716 11712 19826 3567 30918 11670 2246 26050 21599 29717 12211 11409 1525 32887 3574 18416 12068 19641 10648 8052 4799 30781 4247 12211 1816 1525 8855 15895 3567 30918 12211 22467 11429 30781 8060 12837 9075 3574 11711 9209 6700 5402 12881 4895 12837 12211 10997 14648 14648 28587 6455 6356 6146 24194 6356 12211 8855 6700 32965 2018 10905 1607 2018 20051 11924 14150 12211 6611 3574 18448 4059 12211 26812 21017 19024 26574 12211 11738 3574 31039 30781 18717 12211 3567 30918 32197 19916 12211 17293 3400 3574 18416 100 0 +16716 28378 15423 30452 3574 25234 6608 19358 3574 6988 28541 12211 13896 31382 3571 13896 0 0 30783 3574 26345 10064 0 0 28281 0 0 3078 3574 14160 10074 26753 0 0 15423 30452 12211 29371 21674 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 39 0 +5087 30301 12211 24708 1525 11309 12211 16716 22102 3566 20424 4895 113 15364 13119 4066 12211 
16603 32948 3574 25778 31527 12211 27914 14871 14160 16716 21802 3574 17294 21802 12211 29399 16497 4002 3561 12876 1118 26050 3574 23617 1525 19611 30781 19872 1526 6611 12211 25641 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 50 0 +30299 24294 30990 19514 3567 26602 5402 27439 863 2019 5202 19037 19041 6146 23960 3574 13061 23427 147 5202 17111 18415 14095 12211 16603 9270 30781 24908 21628 3567 4921 20245 3574 8707 15661 10010 26965 8639 22002 3562 28587 29140 26787 3562 12673 9738 19398 3562 29061 3367 16162 31633 3567 10126 1525 17137 12211 16877 3567 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 59 0 +5681 26697 12211 19289 6988 12558 30325 12211 3574 18416 19992 25336 11533 21293 24906 7237 19643 5681 26697 16576 2019 20424 21264 8052 4324 30596 12211 5365 32948 3574 5681 25687 12211 22512 25336 22483 30325 16576 2019 2158 29265 11962 12211 3574 6563 12211 25732 16258 8395 15768 2276 24598 23673 0 0 19650 30325 3574 31375 11533 11063 0 0 1095 31418 0 0 14487 31857 28378 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 70 1 +8397 12211 25341 14006 9766 2117 3574 30781 130 17936 2019 26211 32568 32568 27241 12211 20395 12211 19641 28955 531 1378 25096 3567 22068 6356 31224 12211 14039 3574 685 12211 9347 5402 21020 16576 3574 33197 3574 25336 113 28896 27126 20206 16576 2019 19675 1112 16603 3896 2019 19675 30388 10519 24073 113 531 26620 25096 12211 28955 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 62 0 +29103 10439 30847 16816 3574 20235 28640 8052 16576 24294 3574 10973 11231 13932 27924 3574 25336 26235 30827 23360 4895 17056 3574 8314 16699 10671 471 4243 18416 26235 17056 3574 14988 20288 24908 16699 21566 5402 564 26620 23424 3574 11711 6988 3628 16699 3574 20235 8314 2721 20076 11528 21371 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 54 0 +17926 30325 3574 
22512 28961 13757 3574 4002 12775 15243 15038 12211 19376 15802 30781 11449 3574 26073 23773 16576 2019 2019 2019 1112 18416 15944 8053 16576 19048 2019 2019 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 32 1 +20860 28184 1110 12211 31498 3574 18920 32986 30781 343 3574 6356 27359 21853 623 6393 28892 3574 29123 755 19641 30781 19789 10006 3566 5776 3574 30781 12211 5213 3574 14334 27172 15243 18448 7464 7696 18847 12211 167 32630 2019 19641 4002 343 3574 18416 16164 2019 12283 25614 9578 3574 20993 30781 343 3574 31498 30781 16919 17460 3574 28587 343 3574 31514 15701 2019 18835 27924 3574 3880 11533 11225 3574 28784 28961 11533 11225 2019 21163 10033 32400 3574 29147 30781 8467 2019 26281 16716 31382 3574 2001 28587 25536 3574 25626 28961 28109 19038 100 0 +28109 30504 15535 21163 28961 9749 32842 10611 21852 21264 31285 21852 21440 330 2945 17339 6400 28801 21852 32637 4352 5390 21852 24460 4352 22943 21852 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 27 0 +8442 1378 24883 3574 21163 10960 10167 18016 3574 7898 30781 28738 13987 18401 3574 5268 28587 21752 12211 18835 2019 0 0 22185 10010 17392 11034 15168 12921 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 30 1 +14988 23360 16576 3120 3574 17399 19456 23677 3576 13367 8630 29338 15048 11714 16576 3574 19289 14489 5205 0 0 17044 84 0 0 15824 0 0 3561 0 0 451 8789 21599 130 19185 12211 12070 3567 19470 19185 113 23555 12211 11634 20635 7104 20544 3574 0 0 6811 5395 2739 24906 29498 26629 12211 7957 12675 3574 0 0 17149 8789 32299 23247 3574 0 0 31682 28426 8789 31797 3574 0 0 9860 7928 21852 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 80 1 +19185 12283 28892 20552 27538 8060 17460 9583 3574 6259 12211 14399 10372 14398 12211 
12890 10074 18416 11714 19643 12211 7894 4895 9885 12211 9701 28541 9465 19640 5391 3574 10864 28782 2443 11528 13736 11528 30482 27874 14160 5354 3574 28500 27886 21267 12211 30781 18746 5391 11528 31228 6167 24073 17456 3567 19641 30781 18416 11714 19643 12211 118 11378 12211 10372 3574 16477 19640 1749 26581 13285 3574 21024 9904 14342 19680 11528 19038 6061 16576 3567 3935 1151 113 18162 21608 19185 21024 160 11280 12211 3120 3567 22063 28184 21823 12360 10084 21608 13615 100 0 +25710 8397 31127 15874 3574 16497 18454 2220 10896 28381 3574 30217 32254 2600 113 11706 24721 20641 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 18 1 +30781 21608 11533 16523 12211 19185 3574 7513 19838 3574 18835 28961 18010 3574 7961 28587 13197 32976 3574 11714 16576 21931 3574 113 6047 11714 10141 18835 3574 24323 4797 8052 32007 3574 21009 30781 28212 11738 3576 7957 531 15474 3574 6570 22185 11231 28587 7362 31382 17946 3574 755 20615 5402 28585 19024 16576 3576 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 58 1 +19185 113 17388 3574 11226 24863 3574 19893 16603 12211 17926 6400 11533 11861 3574 22195 9749 7920 343 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 19 0 +6759 12700 8052 11935 8060 28952 2019 8052 28378 141 29676 12211 11324 2019 28378 25778 12211 3574 28385 30781 23115 12211 27401 3574 22971 32986 30781 28378 25778 3574 8052 10362 32986 30781 28378 30788 24761 20206 12211 2404 3574 30781 23115 1112 4844 11896 16576 11472 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 49 0 +15622 12211 29620 2019 16100 2019 14988 31619 12211 28955 20815 18416 5210 12211 30781 18203 12211 2019 15345 30781 20815 28541 12211 28955 16877 14599 12211 2019 3495 
12211 7628 8393 25810 12211 28955 2019 33140 3574 17926 6400 23653 2019 22711 4358 28760 12211 27742 28760 12211 25066 32121 26050 547 1525 29620 2019 8376 18232 23179 24800 12211 5901 2019 29620 12211 69 16100 3574 8998 12211 18666 8034 17746 12211 13372 5402 26620 3646 6356 11698 17942 2019 14451 18411 12211 5402 113 19709 9974 16576 2019 28077 8034 12211 3375 24355 3574 6811 26231 7966 100 1 +24908 30740 24729 1906 16942 330 10869 3216 26345 8052 20206 330 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 12 1 +18416 17293 4844 2902 3696 16576 4895 3741 3574 13757 10608 31383 12211 2050 130 2011 13363 2012 3574 20424 8314 9315 29717 13874 1525 21488 11280 12211 16603 32948 3574 19641 30781 8393 31857 23358 12211 28955 3574 13801 8034 27581 26050 13255 1525 5901 2019 15673 11698 24073 16448 31383 2276 24761 19893 16603 4895 23809 12211 1802 2019 13128 17534 19680 2011 29864 2012 25778 3574 11698 6154 5087 113 28500 6536 12211 27596 3574 15802 20366 10928 2276 1932 25778 12211 14039 5402 25336 113 16576 4895 15299 3574 2050 4002 1112 11280 4384 3574 1112 100 1 +18835 28993 26370 11225 16576 3574 13031 16074 3574 8046 28587 17946 2929 3574 7957 27283 8034 8025 3574 14451 15492 1378 3574 6400 21912 12211 7840 9190 3574 113 32976 13197 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 33 0 +5681 19185 10960 19640 13757 3574 26281 28961 19640 13757 3574 23740 12211 8526 28961 8106 2019 0 0 6811 11714 12211 30781 12688 8097 18401 3574 18835 31527 20780 8097 3574 2929 23445 28961 20615 2019 4002 18835 11533 5391 3574 9356 8051 28782 11533 19635 3574 6141 8097 12211 14039 19636 28565 28961 9749 12230 2019 11832 30781 8174 3574 6789 4821 1525 26128 15535 15535 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 69 1 +21264 2276 30781 10671 15048 
15616 5402 21371 16576 3567 16699 8630 19058 14948 32952 25714 21852 27822 12211 1887 28109 15474 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 22 0 +4243 19262 16842 16576 3541 12211 18835 3574 2692 755 16477 32986 28587 9218 3541 3574 32842 16060 3574 0 0 31482 2404 8724 4412 3574 19640 18416 19262 14755 3574 20815 18416 13197 19267 30118 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 37 0 +25778 15474 3574 8052 11711 4348 17214 12211 1443 7492 3574 28961 24908 17214 28676 12211 25603 20076 16576 27914 12211 12775 2019 21599 12171 32948 3574 11533 113 27212 3574 6811 25761 16576 517 12211 27519 9679 28676 15343 23841 12211 17399 7492 3574 24193 4002 25569 8311 22868 12211 17399 12705 1525 16727 2019 25778 15168 17214 28676 7636 23443 16576 26761 27322 12211 21949 1525 7492 2019 25710 11698 30781 27424 9679 28676 7636 9860 3574 25710 11698 27424 25569 8311 12211 22868 9860 3574 2521 22063 11698 13600 29969 25778 3574 3830 15740 21725 2019 0 99 1 +25010 6755 2050 21019 2723 12211 14871 15602 12211 8052 25433 3574 29497 19051 15751 2457 26598 113 22703 3574 1470 8630 19640 13757 23773 15168 15423 1470 3574 29941 10126 3045 28109 11533 32111 3574 23112 22512 14039 13459 19268 3574 29812 12211 6356 28961 14451 15492 4412 12211 16576 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 52 0 +3664 19680 16576 10297 3574 1370 19680 16576 24870 6146 4098 12211 8393 28955 2019 12382 31619 10010 3574 113 16720 24379 2019 14075 19038 4666 3574 22220 12211 30173 1525 581 24073 10784 22108 12211 25341 2019 15896 3574 581 1707 2009 15864 2010 2019 7395 29600 11110 13061 12211 581 30781 22102 12211 2019 14274 8849 17567 31527 7395 11110 14160 113 6611 12211 24188 12211 8016 3574 11670 28961 11533 113 112 2019 13459 16720 3574 4002 25778 
12211 30173 1459 3574 8849 17567 12211 30173 19813 3574 3664 14590 24073 32020 31737 3574 6789 1370 22068 100 1 +5681 5391 6742 28961 4694 30781 19038 25096 16576 3574 19692 11533 25096 3574 20921 19488 10010 28587 27474 12497 3574 3880 7574 3574 10272 990 5919 8615 1723 3574 18746 6161 31575 33076 2019 19641 29276 13757 23154 18415 32837 29145 12211 2019 30326 32055 3574 16477 24908 9803 25336 6115 23954 6839 16576 3574 16477 5 28441 17293 5681 24690 18636 32448 12211 5391 6742 3574 19640 32986 21117 25730 3574 4694 30781 28475 1112 16603 1719 15302 2019 0 0 18416 24908 18020 12068 16842 4895 27874 18636 11533 19635 12211 17044 3574 5402 24908 28184 16842 100 0 +33113 28184 5434 3574 22185 9382 3574 6400 21264 1525 18835 820 1459 3574 18835 7574 2018 18746 19813 2018 1378 30781 17386 2050 26208 4895 162 13636 3574 22739 7601 12211 11101 14160 28587 18162 12211 17386 16576 2019 26281 30325 3574 7957 28587 27596 6356 2019 18835 21264 24908 7286 19055 29128 8630 16877 16576 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 58 1 +19185 19593 30325 3574 7513 19838 2019 7317 32737 14988 2809 4352 11225 16576 3574 11711 11138 8052 19041 32063 2019 1887 17399 27886 14095 2019 17946 20104 23445 30325 3574 4002 19640 12133 2509 19267 33022 31510 2580 8046 28587 15492 13282 7823 3574 16710 19838 2019 18835 31527 31463 16576 32353 3574 33140 28587 22665 3574 12133 11280 19267 755 13539 2019 24690 28776 4654 12211 26281 16710 3139 3574 19640 14451 6356 4223 21872 3571 25096 7226 21674 3574 6811 24229 10018 3574 755 24229 24908 20286 5402 9904 21924 25626 2018 8476 32271 12230 16576 3574 100 1 +19289 8452 29183 5202 28961 11533 5039 29183 7648 28961 23443 19350 19048 29183 9051 19350 31881 11578 330 0 0 24294 11248 6735 29183 15374 20914 16576 24294 29183 29832 30863 3835 13783 30827 23504 6277 0 0 1981 12211 12919 10671 750 517 16951 29183 7695 19080 15900 19838 330 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 52 1 +17248 13804 28952 8097 10505 22608 3574 26211 6356 32954 23954 12972 16576 3574 20191 10505 3574 31463 26639 16576 3574 8052 18318 3574 19081 21566 31331 14871 8917 13757 3431 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 31 1 +23439 330 5847 22512 22332 0 0 8990 330 24415 115 10155 11533 113 30913 3574 28961 19838 2973 0 0 27771 330 27619 14216 3574 31375 19650 30325 3574 5202 728 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 32 1 +19470 5619 26217 15168 3561 3649 3574 302 8154 28955 2011 26400 2012 10519 0 0 5390 4242 24215 0 0 2019 10751 26154 19024 3574 18416 13841 20311 21931 3574 840 3097 18670 2019 0 0 13128 18416 25117 15003 26683 19055 23439 19055 21039 0 0 23439 26371 22343 19404 24359 14648 25898 30084 16430 3574 16398 15168 21137 2019 19132 3736 2019 10519 12211 18670 26914 3574 1112 18416 9904 28955 27322 21583 30827 3574 28451 30453 2019 32986 16243 9277 8035 0 0 16603 27801 18162 7192 12211 2019 18416 32986 18162 19058 785 3574 2721 100 0 +1470 19037 3574 8395 28961 728 2019 10544 7902 8630 20615 6154 21599 3574 33140 16240 19362 26345 12211 30781 28981 330 13757 20032 24383 1580 2018 17536 3574 6400 2436 19838 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 32 1 +17111 3574 19636 450 27801 26545 14572 12211 3574 7628 29655 3574 20444 15535 1470 3255 2436 8452 3574 7961 4876 118 16598 25470 15535 0 0 3814 21872 28587 2521 27593 3574 15565 21773 5233 3574 21163 28961 12760 16576 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42 1 +13841 1390 13354 16690 8965 21853 3574 31375 12211 11499 11533 5391 3574 2882 21862 12211 10126 3814 22325 3574 10196 17208 903 13011 19650 1525 16954 23459 
3574 13443 18674 11533 19210 2018 10095 3574 10074 30604 31375 31382 18448 3574 5847 31382 3574 20273 11528 21872 15048 27593 28961 8052 15625 3574 29302 13932 11533 11063 24752 2974 2019 1470 11533 3373 3574 20444 0 0 3574 10126 11533 32111 3574 13757 14876 32193 3574 17399 16413 3644 29747 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 81 1 +1525 26193 22102 12211 16954 0 0 24752 22102 12211 7695 23157 0 0 3574 0 0 14988 28587 3544 946 0 0 3574 11711 3375 20796 9951 13303 2019 1470 23112 3255 11533 16598 3574 29655 11533 15944 3574 28587 18856 24710 32124 12211 8576 2019 10345 20191 18492 16179 2019 24908 7695 18299 10010 3574 6149 6144 28587 31872 27150 12211 12226 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 65 1 +6007 16576 25211 0 0 11154 0 0 5361 3574 31514 32602 15802 20076 12211 28961 30325 3574 28592 7363 24073 18448 22408 3574 22048 16576 22048 5402 9904 7902 3099 6558 16576 5827 20076 3574 2885 26447 23799 7928 3574 16240 8034 28961 9749 10507 27857 18415 28981 2019 6007 16576 32973 12211 32866 18424 31924 11924 3574 17399 28587 28099 22175 3574 22048 19643 29377 3574 32235 22399 8053 13757 11248 6356 22343 14648 28216 19055 3852 3574 6854 15874 11364 8052 6735 3574 10126 11364 11533 24922 3574 9745 29183 11253 5402 13233 15768 18790 3574 29812 100 1 +19048 11533 25412 12211 3574 24908 7648 15243 14160 113 3574 11280 26231 28952 16576 8035 11680 12211 21416 12211 15436 3574 18448 16576 8060 5615 6400 15474 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 28 1 +22056 19185 25801 12211 31906 4002 3561 12861 3567 18174 12211 19037 30083 6223 25575 24908 32273 3574 19362 26231 113 30524 24908 9995 3567 6223 24752 14451 26235 4312 3567 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 31 0 +24460 13011 3574 10126 21872 7259 3574 13443 14805 8192 3574 23925 
18448 29699 5044 8344 12211 8132 9209 3574 6400 32063 13755 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 23 0 +8052 10505 22608 3574 6400 30196 3574 21163 12542 5603 28952 12211 31035 12211 9376 6988 7170 30325 12211 3567 24193 4002 15661 3495 11231 14673 26370 11738 3574 27822 32986 30781 14719 12211 14039 27126 16576 3574 18416 19680 31035 12211 20932 12211 28961 28109 15474 3574 31482 6356 16191 19640 14451 15492 11738 15535 15535 15535 15535 15535 15535 15535 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 62 0 +23439 0 0 17926 19650 2729 0 0 8990 0 0 29655 12211 17399 29338 113 9119 12211 28015 3574 19641 30781 18416 11030 29655 12211 10710 16912 0 0 27771 0 0 13443 12211 16954 31382 3562 27619 12211 19058 13354 19106 24073 113 8645 0 0 13260 0 0 10126 3814 27645 11364 11533 20245 3574 28353 0 0 31679 0 0 9285 28587 5210 12211 2521 25096 3574 21470 25467 9860 1525 2929 9270 12760 2019 33100 10802 13668 31924 26958 13757 7928 9051 16535 11248 3574 29437 23445 10074 33100 9285 2276 22175 14008 3571 31418 100 1 +5202 17111 3574 22512 31829 3574 18298 26211 6735 11248 6854 8644 1525 1549 33100 13783 2019 24908 10011 19726 10101 8034 11520 1525 20403 14160 32390 12211 16872 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 30 1 +7288 28955 16576 3574 6594 31382 1872 5213 3574 31482 31382 113 17208 13636 3574 28378 11528 11626 16576 551 20485 15543 23783 15543 6010 551 15535 18423 32698 3574 25778 15038 12211 15567 2019 27555 3574 28378 15605 1525 3128 3574 28961 28378 2050 14343 15622 12211 4298 12211 22047 2019 11110 8034 12211 18456 23126 11226 1122 25260 1525 13119 27564 14160 11533 28892 15874 2011 24497 2012 8034 12211 18456 23126 15535 15535 18416 19680 19643 30788 12211 12211 2011 14086 2012 2011 31461 2012 2011 30712 2012 3574 
14160 30325 22421 3567 4895 23617 1525 4351 100 1 +24908 19961 20206 14440 15143 3731 22150 4032 20772 18415 28858 3731 6356 19557 5402 13757 19488 19185 20263 16576 3574 3306 5354 2266 25294 3574 24908 32840 24870 22150 24738 3574 13841 19185 12211 7513 15802 8106 2019 0 0 19185 16719 113 11773 6356 2859 1525 5520 12211 29723 22100 3731 13757 7806 14930 3574 13757 13841 6356 20545 3574 2476 3574 18636 26557 24423 3574 13994 11924 24906 3574 19640 13757 1112 6793 24908 5065 20718 28454 3574 6793 15474 3223 3574 18060 28585 30860 27203 3574 4348 19185 13841 29812 27359 15802 28961 8106 2019 24908 100 1 +18416 30781 24908 12767 15243 21488 16576 4247 12211 30788 12211 3574 25469 26906 24044 28952 12211 31383 12211 28955 2019 32781 16207 18416 23093 30767 15533 3574 17030 12211 14039 3574 32568 18416 665 25778 11231 3574 10648 30781 31383 17111 12211 1443 32365 16576 18416 3574 1112 18416 15573 3574 22068 12211 14039 31044 13934 24908 22068 31893 28955 22102 3574 30788 12211 21290 19608 3574 19893 16603 8376 13566 2019 7961 32568 18416 311 11231 18416 5402 18448 28955 31527 11525 12211 25456 3574 19675 12211 20615 5402 28640 16576 13505 2019 17994 5395 25234 4242 6356 100 1 +18835 29276 13757 3574 11711 7317 10960 18465 3574 7957 5930 28961 15320 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14 1 +19185 24690 17911 11533 19635 3574 22185 12211 13260 17460 24908 18835 13757 665 15143 2019 0 0 11711 24690 15250 1525 18582 31143 19038 19638 16576 3574 7513 16716 19838 2019 0 0 27933 29123 19643 755 5022 5233 16060 12906 12133 16842 17226 3574 0 755 32986 28506 14160 113 17226 1843 16842 3574 16460 2692 0 0 12860 8034 24908 19185 20263 11924 16576 8204 6154 6141 6356 11060 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 73 1 +5087 16572 14954 26146 3574 25469 16603 4352 5390 3574 32737 15474 1157 0 0 18416 22185 12211 30781 21560 
12211 13987 0 0 18835 18182 11533 6903 3574 11832 16872 3574 27924 3574 31824 30325 0 0 18835 32854 3574 2174 19037 3574 18401 8452 3574 28782 28961 11533 9744 3574 28961 28587 32976 13197 0 0 17294 31906 4002 14435 19038 5391 2019 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 66 1 +24908 18416 8053 9213 23272 10010 3574 18416 1525 28505 5402 810 9904 15907 14160 26232 16576 18602 3574 32568 18416 14675 9904 14455 30860 30827 12211 14039 3574 28505 1378 28144 3574 5004 3574 1411 3574 11698 27861 9616 15227 22421 3574 10671 28952 26620 31382 12211 28955 19893 18416 2019 28955 12211 27766 1525 11110 14160 19433 3574 28505 18202 1112 18416 19893 31383 22068 3574 22068 6356 2011 18416 8052 12068 18232 13508 2012 19362 7703 12211 19042 18232 5411 19037 15564 7676 12211 14039 3574 28505 1378 32053 3574 28966 8035 8052 20077 3574 1870 32325 100 1 +18835 30325 29183 0 0 11711 7957 31377 24807 11713 29183 755 2404 22129 29183 17608 330 22185 23324 29183 0 14988 22 21872 14160 24908 14961 25485 29183 0 0 11711 28961 14451 5390 19680 755 30364 29183 3216 30103 8053 17226 12211 30680 28961 10271 16576 22056 19185 5621 12211 755 330 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 54 0 +10751 12500 7513 19838 3574 25469 11030 16576 19954 9593 2019 130 24135 19954 9593 30781 4352 23555 12211 19185 3574 26211 8060 17814 5402 6400 26417 14394 14394 12283 22483 12211 5391 3571 8630 32986 30781 28587 12283 3574 9593 5202 19893 16603 6400 23653 26211 10313 10089 21674 2019 18835 14550 28961 16144 7574 16576 2019 28500 22779 12211 30781 18416 24908 28184 18920 5434 11231 3574 665 25664 10372 30781 113 1679 26281 12211 2019 26211 28451 22185 14808 14039 3574 12515 18956 13459 16466 12211 29031 3574 33022 26209 19893 16576 1183 26281 6556 2019 18416 100 0 +4348 28184 7477 4011 3574 15787 22203 4694 8052 15181 2019 4348 25736 5402 30818 4724 8671 3574 18835 7628 2019 31482 19703 30781 5398 18613 12211 2019 118 4210 
12211 30781 27156 1110 12211 18835 10000 14061 26281 3574 33022 26209 19893 1183 12211 23025 3574 19640 32986 2276 26205 1183 12211 22052 2276 11760 30860 18338 21017 16108 2019 18416 3366 3574 11309 4378 2875 24661 28961 2276 18338 16576 2019 18416 21855 16576 14061 12211 32868 3574 11859 26209 19893 18416 1183 2019 18416 12211 26205 1183 3574 18416 9904 23025 6347 16576 6143 16576 3574 17281 100 0 +31857 31857 30604 12211 19185 3574 20558 20815 3955 24710 6542 3574 5268 19640 3844 28972 15168 30781 7137 15853 19185 3574 31276 5213 2019 20288 11942 18835 31527 113 31535 18099 3574 6988 14399 26587 14398 12211 3574 30781 17174 20273 16576 8236 11880 5758 15753 12211 3567 3567 3567 28782 28961 19358 3574 16343 18232 24073 13153 3574 22630 2019 2019 2019 7460 12211 22630 5213 3574 25336 12392 14451 30052 28966 2019 17294 18014 12211 30781 9661 19593 3574 3750 10018 12211 17499 12211 20981 4115 2019 0 0 0 0 0 0 0 0 0 0 90 0 +2276 6737 24366 3574 9749 19038 30130 16576 3567 1118 13687 6988 2276 29434 32608 3574 6611 12211 13687 18448 6611 12211 9715 3574 10362 27285 23576 22559 2018 11474 2018 5314 2018 10707 11924 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 36 1 +21470 18448 19675 12211 15567 3574 10074 2929 130 20245 19058 16576 2019 23445 2436 23954 3574 16687 5391 3574 20403 30604 2019 32557 19640 13757 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 26 1 +10710 30781 18835 19433 3574 8697 9218 16576 19640 5354 6327 12211 29349 29128 3574 3406 16576 19814 12091 2019 17294 7604 12211 4002 18835 26370 14276 2019 19861 20255 25357 16576 8035 3334 3574 25469 8630 24073 113 4352 16523 12211 21264 21017 32365 16603 12211 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 48 1 +4852 23653 3574 14451 14132 15492 3574 4002 17937 
32952 11231 28952 8053 16576 2019 31713 16576 29112 16191 3434 16576 15871 3574 17942 11533 19358 3574 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 27 1 diff --git a/models/contentunderstanding/textcnn_pretrain/finetune_startup.py b/models/contentunderstanding/textcnn_pretrain/finetune_startup.py new file mode 100644 index 0000000000000000000000000000000000000000..eae331cf1dfe2fe6efc1b8537ccd1e3a404616eb --- /dev/null +++ b/models/contentunderstanding/textcnn_pretrain/finetune_startup.py @@ -0,0 +1,154 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import warnings +import os +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddlerec.core.utils import envs +from paddlerec.core.trainers.framework.startup import StartupBase +from paddlerec.core.trainer import EngineMode + +__all__ = ["Startup"] + + +class Startup(StartupBase): + """R + """ + + def __init__(self, context): + self.op_name_scope = "op_namescope" + self.clip_op_name_scope = "@CLIP" + self.op_role_var_attr_name = core.op_proto_and_checker_maker.kOpRoleVarAttrName( + ) + print("Running FineTuningStartup.") + + def _is_opt_role_op(self, op): + # NOTE: depend on oprole to find out whether this op is for + # optimize + op_maker = core.op_proto_and_checker_maker + optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize + if op_maker.kOpRoleAttrName() in op.attr_names and \ + int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role): + return True + return False + + def _get_params_grads(self, program): + """ + Get optimizer operators, parameters and gradients from origin_program + Returns: + opt_ops (list): optimize operators. + params_grads (dict): parameter->gradient. 
+ """ + block = program.global_block() + params_grads = [] + # tmp set to dedup + optimize_params = set() + origin_var_dict = program.global_block().vars + for op in block.ops: + if self._is_opt_role_op(op): + # Todo(chengmo): Whether clip related op belongs to Optimize guard should be discussed + # delete clip op from opt_ops when run in Parameter Server mode + if self.op_name_scope in op.all_attrs( + ) and self.clip_op_name_scope in op.attr(self.op_name_scope): + op._set_attr( + "op_role", + int(core.op_proto_and_checker_maker.OpRole.Backward)) + continue + + if op.attr(self.op_role_var_attr_name): + param_name = op.attr(self.op_role_var_attr_name)[0] + grad_name = op.attr(self.op_role_var_attr_name)[1] + if not param_name in optimize_params: + optimize_params.add(param_name) + params_grads.append([ + origin_var_dict[param_name], + origin_var_dict[grad_name] + ]) + return params_grads + + @staticmethod + def is_persistable(var): + """ + Check whether the given variable is persistable. + + Args: + var(Variable): The variable to be checked. + + Returns: + bool: True if the given `var` is persistable + False if not. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + param = fluid.default_main_program().global_block().var('fc.b') + res = fluid.io.is_persistable(param) + """ + if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ + var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ + var.desc.type() == core.VarDesc.VarType.READER: + return False + return var.persistable + + def load(self, context, is_fleet=False, main_program=None): + dirname = envs.get_global_env("runner." + context["runner_name"] + + ".init_pretraining_model_path", "") + hotstart_dirname = envs.get_global_env( + "runner." 
+ context["runner_name"] + ".init_model_path", "") + + def existed_params(var): + if not isinstance(var, fluid.framework.Parameter): + return False + if os.path.exists(os.path.join(dirname, var.name)): + print("INIT %s" % var.name) + return True + else: + #print("SKIP %s" % var.name) + return False + + if hotstart_dirname != "": + #If init_model_path exists, hot start is first choice + print("going to load ", hotstart_dirname) + fluid.io.load_persistables( + context["exe"], hotstart_dirname, main_program=main_program) + print("load from {} success".format(hotstart_dirname)) + elif dirname != "": + #If init_pretraining_model_path exists ,pretrained model load parameters + print("going to load ", dirname) + fluid.io.load_vars( + context["exe"], + dirname, + main_program=main_program, + predicate=existed_params) + print("load from {} success".format(dirname)) + else: + #If both of the above are empty, cold start model + return + + def startup(self, context): + for model_dict in context["phases"]: + with fluid.scope_guard(context["model"][model_dict["name"]][ + "scope"]): + train_prog = context["model"][model_dict["name"]][ + "main_program"] + startup_prog = context["model"][model_dict["name"]][ + "startup_program"] + with fluid.program_guard(train_prog, startup_prog): + context["exe"].run(startup_prog) + self.load(context, main_program=train_prog) + context["status"] = "train_pass" diff --git a/models/contentunderstanding/textcnn_pretrain/model.py b/models/contentunderstanding/textcnn_pretrain/model.py new file mode 100644 index 0000000000000000000000000000000000000000..66c66a6b5f055721a450b81cdd70c249c6fa440f --- /dev/null +++ b/models/contentunderstanding/textcnn_pretrain/model.py @@ -0,0 +1,92 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +from paddlerec.core.utils import envs +from paddlerec.core.model import ModelBase +from basemodel import embedding + + +class Model(ModelBase): + def __init__(self, config): + ModelBase.__init__(self, config) + self.dict_size = 2000001 + self.max_len = 100 + self.cnn_dim = 128 + self.cnn_filter_size1 = 1 + self.cnn_filter_size2 = 2 + self.cnn_filter_size3 = 3 + self.emb_dim = 128 + self.hid_dim = 96 + self.class_dim = 2 + self.is_sparse = True + + def input_data(self, is_infer=False, **kwargs): + data = fluid.data( + name="input", shape=[None, self.max_len, 1], dtype='int64') + seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64') + label = fluid.data(name="label", shape=[None, 1], dtype='int64') + return [data, seq_len, label] + + def net(self, input, is_infer=False): + """ network definition """ + self.data = input[0] + self.seq_len = input[1] + self.label = input[2] + + # embedding layer + emb = embedding(self.data, self.dict_size, self.emb_dim, + self.is_sparse) + emb = fluid.layers.sequence_unpad(emb, length=self.seq_len) + # convolution layer + conv1 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=self.cnn_dim, + filter_size=self.cnn_filter_size1, + act="tanh", + pool_type="max") + + conv2 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=self.cnn_dim, + filter_size=self.cnn_filter_size2, + act="tanh", + pool_type="max") + + conv3 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=self.cnn_dim, + filter_size=self.cnn_filter_size3, + act="tanh", + pool_type="max") + + convs_out 
= fluid.layers.concat(input=[conv1, conv2, conv3], axis=1) + + # full connect layer + fc_1 = fluid.layers.fc(input=convs_out, size=self.hid_dim, act="tanh") + # softmax layer + prediction = fluid.layers.fc(input=[fc_1], + size=self.class_dim, + act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=self.label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=self.label) + + self._cost = avg_cost + if is_infer: + self._infer_results["acc"] = acc + self._infer_results["loss"] = avg_cost + else: + self._metrics["acc"] = acc + self._metrics["loss"] = avg_cost diff --git a/models/contentunderstanding/textcnn_pretrain/reader.py b/models/contentunderstanding/textcnn_pretrain/reader.py new file mode 100644 index 0000000000000000000000000000000000000000..810c5d08698467a42b7b0b46752ff0657b2049c7 --- /dev/null +++ b/models/contentunderstanding/textcnn_pretrain/reader.py @@ -0,0 +1,43 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys + +from paddlerec.core.reader import ReaderBase + + +class Reader(ReaderBase): + def init(self): + pass + + def _process_line(self, l): + l = l.strip().split() + data = l[0:100] + seq_len = l[100:101] + label = l[101:] + + return data, label, seq_len + + def generate_sample(self, line): + def data_iter(): + data, label, seq_len = self._process_line(line) + if data is None: + yield None + return + data = [int(i) for i in data] + label = [int(i) for i in label] + seq_len = [int(i) for i in seq_len] + yield [('data', data), ('seq_len', seq_len), ('label', label)] + + return data_iter diff --git a/models/contentunderstanding/textcnn_pretrain/readme.md b/models/contentunderstanding/textcnn_pretrain/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..b6f6f87c3bcd19e6bb8149f9269abe9c87b68f48 --- /dev/null +++ b/models/contentunderstanding/textcnn_pretrain/readme.md @@ -0,0 +1,145 @@ +# 使用文本分类模型作为预训练模型对textcnn模型进行fine-tuning + +以下是本例的简要目录结构及说明: + +``` +├── data #样例数据 + ├── train + ├── train.txt #训练数据样例 + ├── test + ├── test.txt #测试数据样例 + ├── preprocess.py #数据处理程序 +├── __init__.py +├── README.md #文档 +├── model.py #模型文件 +├── basemodel.py #预训练模型 +├── config.yaml #配置文件 +├── reader.py #读取程序 +├── finetune_startup.py #加载参数 +``` + +注:在阅读该示例前,建议您先了解以下内容: +[paddlerec入门教程](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md) + + +## 内容 + +- [模型简介](#模型简介) +- [数据准备](#数据准备) +- [运行环境](#运行环境) +- [快速开始](#快速开始) +- [效果复现](#效果复现) +- [进阶使用](#进阶使用) +- [FAQ](#FAQ) + +## 模型简介 +情感倾向分析(Sentiment Classification,简称Senta)针对带有主观描述的中文文本,可自动判断该文本的情感极性类别并给出相应的置信度。情感类型分为积极、消极。在本文中,我们提供了一个使用大规模的对文章数据进行多分类的textCNN模型(2个卷积核的cnn模型)作为预训练模型。本文会使用这个预训练模型对contentunderstanding目录下的textcnn模型(3个卷积核的cnn模型)进行fine-tuning。本文将预训练模型中的embedding层迁移到了contentunderstanding目录下的textcnn模型中,依然进行情感分析的二分类任务。最终获得了模型准确率上的基本持平以及更快速的收敛 +Yoon Kim在论文[EMNLP 2014][Convolutional neural networks for sentence 
classification](https://www.aclweb.org/anthology/D14-1181.pdf)提出了TextCNN并给出基本的结构。将卷积神经网络CNN应用到文本分类任务,利用多个不同size的kernel来提取句子中的关键信息(类似于多窗口大小的ngram),从而能够更好地捕捉局部相关性。模型的主体结构如图所示: +

+ +

+ +## 数据准备 +情感倾向分析(Sentiment Classification,简称Senta)针对带有主观描述的中文文本,可自动判断该文本的情感极性类别并给出相应的置信度。情感类型分为积极、消极。情感倾向分析能够帮助企业理解用户消费习惯、分析热点话题和危机舆情监控,为企业提供有利的决策支持。 +情感是人类的一种高级智能行为,为了识别文本的情感倾向,需要深入的语义建模。另外,不同领域(如餐饮、体育)在情感的表达各不相同,因而需要有大规模覆盖各个领域的数据进行模型训练。为此,我们通过基于深度学习的语义模型和大规模数据挖掘解决上述两个问题。效果上,我们和contentunderstanding目录下的textcnn模型一样基于开源情感倾向分类数据集ChnSentiCorp进行评测。 +您可以直接执行以下命令获取我们的预训练模型(basemodel.py,pretrain_model_params)以及对应的字典(word_dict.txt): +``` +wget https://paddlerec.bj.bcebos.com/textcnn_pretrain%2Fpretrain_model.tar.gz +tar -zxvf textcnn_pretrain%2Fpretrain_model.tar.gz +``` +您可以直接执行以下命令下载我们分词完毕后的数据集,文件解压之后,senta_data目录下会存在训练数据(train.tsv)、开发集数据(dev.tsv)、测试集数据(test.tsv)以及对应的词典(word_dict.txt): +``` +wget https://baidu-nlp.bj.bcebos.com/sentiment_classification-dataset-1.0.0.tar.gz +tar -zxvf sentiment_classification-dataset-1.0.0.tar.gz +``` +数据格式为一句中文的评价语句,和一个代表情感信息的标签。两者之间用\t分隔,中文的评价语句已经分词,词之间用空格分隔。 +``` +15.4寸 笔记本 的 键盘 确实 爽 , 基本 跟 台式机 差不多 了 , 蛮 喜欢 数字 小 键盘 , 输 数字 特 方便 , 样子 也 很 美观 , 做工 也 相当 不错 1 +跟 心灵 鸡汤 没 什么 本质 区别 嘛 , 至少 我 不 喜欢 这样 读 经典 , 把 经典 都 解读 成 这样 有点 去 中国 化 的 味道 了 0 +``` + +## 运行环境 +PaddlePaddle>=1.7.2 + +python 2.7/3.5/3.6/3.7 + +PaddleRec >=0.1 + +os : windows/linux/macos + +## 快速开始 +本文需要下载模型的参数文件和finetune的数据集才可以体现出finetune的效果,所以暂不提供快速一键运行。若想体验finetune的效果,请按照下面【效果复现】模块的步骤依次执行。 + +## 效果复现 +在本模块,我们希望用户可以理解如何使用预训练模型来对自己的模型进行fine-tuning。 +1. 确认您当前所在目录为PaddleRec/models/contentunderstanding/textcnn_pretrain + +2. 下载并解压数据集,命令如下。解压后您可以看到出现senta_data目录 +``` +wget https://baidu-nlp.bj.bcebos.com/sentiment_classification-dataset-1.0.0.tar.gz +tar -zxvf sentiment_classification-dataset-1.0.0.tar.gz +``` + +3. 下载并解压预训练模型,命令如下。 +``` +wget https://paddlerec.bj.bcebos.com/textcnn_pretrain%2Fpretrain_model.tar.gz +tar -zxvf textcnn_pretrain%2Fpretrain_model.tar.gz +``` + +4. 
本文提供了快速将数据集中的汉字数据处理为可训练格式数据的脚本。在您下载预训练模型后,将word_dict.txt复制到senta_data文件中。您在解压数据集后,将preprocess.py复制到senta_data文件中。 +执行preprocess.py,即可将数据集中提供的dev.tsv,test.tsv,train.tsv按照词典提供的对应关系转化为可直接训练的txt文件.命令如下: +``` +rm -f senta_data/word_dict.txt +cp pretrain_model/word_dict.txt senta_data +cp data/preprocess.py senta_data/ +cd senta_data +python3 preprocess.py +mkdir train +mv train.txt train +mkdir test +mv test.txt test +cd .. +``` + +5. 打开文件config.yaml,更改其中的参数 +将workspace改为您当前的绝对路径。(可用pwd命令获取绝对路径) + + +6. 执行命令,开始训练: +``` +python -m paddlerec.run -m ./config.yaml +``` + +7. 运行结果: +``` +PaddleRec: Runner infer_runner Begin +Executor Mode: infer +processor_register begin +Running SingleInstance. +Running SingleNetwork. +Running SingleInferStartup. +Running SingleInferRunner. +load persistables from increment/3 +batch: 1, acc: [0.8828125], loss: [0.35940486] +batch: 2, acc: [0.91796875], loss: [0.24300358] +batch: 3, acc: [0.91015625], loss: [0.2490797] +Infer phase_infer of epoch increment/3 done, use time: 0.78388094902, global metrics: acc=[0.91015625], loss=[0.2490797] +PaddleRec Finish +``` + +## 进阶使用 +在观察完model.py和config.yaml两个文件后,相信大家会发现和之前的模型相比有些改变。本章将详细解析这些改动,方便大家理解并灵活应用到自己的程序中. 
+1.在model.py中,大家会发现在构建embedding层的时候,直接传参使用了basemodel.py中的embedding层。 +这是因为本文使用了预训练模型(basemodel.py)中embedding层,经过大量语料的训练后的embedding层中本身已经蕴含了大量的先验知识。而这些先验知识对于下游任务,尤其是小数据集来讲,是非常有帮助的。 + +2.在config.yaml中,大家会发现在train_runner中多了startup_class_path和init_pretraining_model_path两个参数。 +参数startup_class_path的作用是自定义训练的流程。我们将在自定义的finetune_startup.py文件中将训练好的参数加载入模型当中。 +参数init_pretraining_model_path的作用就是指明加载参数的路径。若路径下的参数文件和模型中的var具有相同的名字,就会将参数加载进模型当中。 +在您设置init_model_path参数时,程序会优先试图按您设置的路径热启动。当没有init_model_path参数,无法热启动时,程序会试图加载init_pretraining_model_path路径下的参数,进行finetune训练。 +只有在两者均为空的情况下,模型会冷启动从头开始训练。 +若您希望进一步了解自定义流程的操作,可以参考以下内容:[如何添加自定义流程](https://github.com/PaddlePaddle/PaddleRec/blob/master/doc/trainer_develop.md#%E5%A6%82%E4%BD%95%E6%B7%BB%E5%8A%A0%E8%87%AA%E5%AE%9A%E4%B9%89%E6%B5%81%E7%A8%8B) + +3.在basemodel.py中,我们准备了embedding,multi_convs,full_connect三个模块供您在有需要时直接import使用。 +相关参数可以从本文提供的预训练模型下载链接里的pretrain_model/pretrain_model_params中找到。 + +## FAQ diff --git a/models/demo/movie_recommand/README.md b/models/demo/movie_recommand/README.md new file mode 100644 index 0000000000000000000000000000000000000000..552806840877c356a26d4a535ffee927a7ae0ed4 --- /dev/null +++ b/models/demo/movie_recommand/README.md @@ -0,0 +1,29 @@ +# PaddleRec 基于 Movielens 数据集的全流程示例 + +## 模型的详细教程可以查阅: [十分钟!全流程!从零搭建推荐系统](https://aistudio.baidu.com/aistudio/projectdetail/559336) + +## 本地运行流程 + +在本地需要安装`PaddleRec`及`PaddlePaddle`,推荐在`Linux` + `python2.7` 环境下执行此demo + +本地运行流程与AiStudio流程基本一致,细节略有区别 + +### 离线训练 +```shell +sh train.sh +``` + +### 离线测试 +```shell +sh offline_test.sh +``` + +### 模拟在线召回 +```shell +sh online_recall.sh +``` + +### 模拟在线排序 +```shell +sh online_rank.sh +``` diff --git a/models/demo/movie_recommand/data_prepare.sh b/models/demo/movie_recommand/data_prepare.sh index f99b5b273b4ed496030cfe46bf228ae32159ee26..bf9812c352dff030e358a78e3bbf9a646058c89c 100644 --- a/models/demo/movie_recommand/data_prepare.sh +++ b/models/demo/movie_recommand/data_prepare.sh @@ -1,13 +1,18 @@ cd data +echo "---> 
Download movielens 1M data ..." wget http://files.grouplens.org/datasets/movielens/ml-1m.zip +echo "---> Unzip ml-1m.zip ..." unzip ml-1m.zip +rm ml-1m.zip +echo "---> Split movielens data ..." python split.py -mkdir train/ -mkdir test/ +mkdir -p train/ +mkdir -p test/ +echo "---> Process train & test data ..." python process_ml_1m.py process_raw ./ml-1m/train.dat | sort -t $'\t' -k 9 -n > log.data.train python process_ml_1m.py process_raw ./ml-1m/test.dat | sort -t $'\t' -k 9 -n > log.data.test python process_ml_1m.py hash log.data.train > ./train/data.txt @@ -15,4 +20,6 @@ python process_ml_1m.py hash log.data.test > ./test/data.txt rm log.data.train rm log.data.test -cd ../ +cd .. + +echo "---> Finish data process" diff --git a/models/demo/movie_recommand/offline_test.sh b/models/demo/movie_recommand/offline_test.sh index 88bf29cebf25b185bcdbb13cf64db5b0984b7704..98a04fd1712e53e11633cc5e87327492a47e6213 100644 --- a/models/demo/movie_recommand/offline_test.sh +++ b/models/demo/movie_recommand/offline_test.sh @@ -1,12 +1,15 @@ ## modify config.yaml to infer mode at first -cd recall -python -m paddlerec.run -m ./config.yaml -cd ../rank -python -m paddlerec.run -m ./config.yaml -cd .. +echo "Recall offline test ..." +echo "Model config at models/demo/movie_recommand/recall/config_offline_test.yaml" +python -m paddlerec.run -m ./recall/config_test_offline.yaml + +echo "Rank offline test ..." 
+echo "Model config at models/demo/movie_recommand/rank/config_offline_test.yaml" +python -m paddlerec.run -m ./rank/config_test_offline.yaml echo "recall offline test result:" python parse.py recall_offline recall/infer_result + echo "rank offline test result:" python parse.py rank_offline rank/infer_result diff --git a/models/demo/movie_recommand/online_rank.sh b/models/demo/movie_recommand/online_rank.sh index f2f5f167493e1c35f824f0bd87a922d25f832191..9a9c376ffcec4581c2c5212f645d1a9aafbdf7a3 100644 --- a/models/demo/movie_recommand/online_rank.sh +++ b/models/demo/movie_recommand/online_rank.sh @@ -1,8 +1,9 @@ cd data +echo "Create online test data ..." python process_ml_1m.py data_rank > online_user/test/data.txt -## modify recall/config.yaml to online_infer mode -cd ../rank -python -m paddlerec.run -m ./config.yaml -cd ../ -python parse.py rank_online rank/infer_result +cd .. +echo "Rank online test ..." +echo "Model config at models/demo/movie_recommand/rank/config_online_test.yaml" +python -m paddlerec.run -m ./rank/config_test_online.yaml +python parse.py rank_online ./rank/infer_result diff --git a/models/demo/movie_recommand/online_recall.sh b/models/demo/movie_recommand/online_recall.sh index 23fa7912c2f173310da7f73694833aeaa59646df..2cd47aa321f213313e2edf279f5e8c9ce8fcdd34 100644 --- a/models/demo/movie_recommand/online_recall.sh +++ b/models/demo/movie_recommand/online_recall.sh @@ -1,9 +1,10 @@ cd data +echo "Create online test data ..." mkdir online_user/test python process_ml_1m.py data_recall > online_user/test/data.txt -## modify recall/config.yaml to online_infer mode -cd ../recall -python -m paddlerec.run -m ./config.yaml -cd ../ +cd .. +echo "Recall online test ..." 
+echo "Model config at models/demo/movie_recommand/recall/config_online_test.yaml" +python -m paddlerec.run -m ./recall/config_test_online.yaml python parse.py recall_online recall/infer_result diff --git a/models/demo/movie_recommand/rank/config.yaml b/models/demo/movie_recommand/rank/config.yaml index e5834178a98e7132fc85ed25f4a2a509dc979e9c..bce49150488330d0d42e6ad45657f0d4bae3cdba 100644 --- a/models/demo/movie_recommand/rank/config.yaml +++ b/models/demo/movie_recommand/rank/config.yaml @@ -12,28 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -workspace: "models/demo/movie_recommand" +workspace: "./" # list of dataset dataset: - name: dataset_train # name of dataset to distinguish different datasets batch_size: 128 - type: QueueDataset + type: DataLoader data_path: "{workspace}/data/train" sparse_slots: "logid time userid gender age occupation movieid title genres label" dense_slots: "" -- name: dataset_infer # name - batch_size: 128 - type: DataLoader - data_path: "{workspace}/data/test" - sparse_slots: "logid time userid gender age occupation movieid title genres label" - dense_slots: "" -- name: dataset_online_infer # name - batch_size: 10 - type: DataLoader - data_path: "{workspace}/data/online_user/test" - sparse_slots: "logid time userid gender age occupation movieid title genres label" - dense_slots: "" # hyper parameters of user-defined network hyper_parameters: @@ -51,42 +39,17 @@ hyper_parameters: # train mode: runner_train -## online or offline infer -#mode: runner_infer runner: - name: runner_train class: train save_checkpoint_interval: 1 # save model interval of epochs - save_inference_interval: 1 # save inference - save_checkpoint_path: "increment" # save checkpoint path - save_inference_path: "inference" # save inference path + save_checkpoint_path: "increment_rank" # save checkpoint path epochs: 10 device: cpu -- name: runner_infer - class: infer - print_interval: 10000 - 
init_model_path: "increment/9" # load model path - #train phase: - name: phase1 - model: "{workspace}/model.py" # user-defined model + model: "{workspace}/rank/model.py" # user-defined model dataset_name: dataset_train # select dataset by name - thread_num: 12 - -##offline infer -#phase: -#- name: phase1 -# model: "{workspace}/model.py" # user-defined model -# dataset_name: dataset_infer # select dataset by name -# save_path: "./infer_result" -# thread_num: 1 - -##offline infer -#phase: -#- name: phase1 -# model: "{workspace}/model.py" # user-defined model -# dataset_name: dataset_online_infer # select dataset by name -# save_path: "./infer_result" -# thread_num: 1 + thread_num: 4 diff --git a/models/demo/movie_recommand/rank/config_test_offline.yaml b/models/demo/movie_recommand/rank/config_test_offline.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9281e844c03ae358a1af45b3aa78d1295c0d8b12 --- /dev/null +++ b/models/demo/movie_recommand/rank/config_test_offline.yaml @@ -0,0 +1,60 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#workspace: "paddlerec.models.demo.movie_recommand" +workspace: "./" + +# list of dataset +dataset: +- name: dataset_infer # name + batch_size: 128 + type: DataLoader + data_path: "{workspace}/data/test" + sparse_slots: "logid time userid gender age occupation movieid title genres label" + dense_slots: "" + +# hyper parameters of user-defined network +hyper_parameters: + # optimizer config + optimizer: + class: Adam + learning_rate: 0.001 + strategy: async + # user-defined pairs + sparse_feature_number: 60000000 + sparse_feature_dim: 9 + dense_input_dim: 13 + fc_sizes: [512, 256, 128, 32] + +# train +mode: runner_infer + +## online or offline infer +#mode: runner_infer +runner: +- name: runner_infer + epochs: 1 + device: cpu + class: infer + print_interval: 10000 + runner_result_dump_path: "{workspace}/rank/infer_result" + init_model_path: "increment_rank/9" # load model path + +#offline infer +phase: +- name: phase1 + model: "{workspace}/rank/model.py" # user-defined model + dataset_name: dataset_infer # select dataset by name + thread_num: 1 + diff --git a/models/demo/movie_recommand/rank/config_test_online.yaml b/models/demo/movie_recommand/rank/config_test_online.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0ade082c4731b9d0d1f81975aa719cd873e150ab --- /dev/null +++ b/models/demo/movie_recommand/rank/config_test_online.yaml @@ -0,0 +1,57 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +workspace: "./" + +# list of dataset +dataset: +- name: dataset_online_infer # name + batch_size: 10 + type: DataLoader + data_path: "{workspace}/data/online_user/test" + sparse_slots: "logid time userid gender age occupation movieid title genres label" + dense_slots: "" + +# hyper parameters of user-defined network +hyper_parameters: + # optimizer config + optimizer: + class: Adam + learning_rate: 0.001 + strategy: async + # user-defined pairs + sparse_feature_number: 60000000 + sparse_feature_dim: 9 + dense_input_dim: 13 + fc_sizes: [512, 256, 128, 32] + +# train +mode: runner_infer + +runner: +- name: runner_infer + epochs: 1 + device: cpu + class: infer + print_interval: 10000 + runner_result_dump_path: "{workspace}/rank/infer_result" + init_model_path: "increment_rank/9" # load model path + +#offline infer +phase: +- name: phase1 + model: "{workspace}/rank/model.py" # user-defined model + dataset_name: dataset_online_infer # select dataset by name + thread_num: 1 + diff --git a/models/demo/movie_recommand/recall/config.yaml b/models/demo/movie_recommand/recall/config.yaml index 63ca1c9c42cc232c4873578991b4534f1aa5f325..852241f3a5f24654cac340d088848faf23c597f1 100644 --- a/models/demo/movie_recommand/recall/config.yaml +++ b/models/demo/movie_recommand/recall/config.yaml @@ -12,28 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-workspace: "models/demo/movie_recommand" +workspace: "./" # list of dataset dataset: - name: dataset_train # name of dataset to distinguish different datasets batch_size: 128 - type: QueueDataset + type: DataLoader data_path: "{workspace}/data/train" sparse_slots: "logid time userid gender age occupation movieid title genres label" dense_slots: "" -- name: dataset_infer # name - batch_size: 128 - type: DataLoader - data_path: "{workspace}/data/test" - sparse_slots: "logid time userid gender age occupation movieid title genres label" - dense_slots: "" -- name: dataset_online_infer # name - batch_size: 128 - type: DataLoader - data_path: "{workspace}/data/online_user/test" - sparse_slots: "logid time userid gender age occupation movieid title genres label" - dense_slots: "" # hyper parameters of user-defined network hyper_parameters: @@ -50,43 +38,17 @@ hyper_parameters: # train mode: runner_train - -## online or offline infer -#mode: runner_infer runner: - name: runner_train class: train save_checkpoint_interval: 1 # save model interval of epochs - save_inference_interval: 1 # save inference - save_checkpoint_path: "increment" # save checkpoint path - save_inference_path: "inference" # save inference path + save_checkpoint_path: "increment_recall" # save checkpoint path epochs: 10 device: cpu -- name: runner_infer - class: infer - print_interval: 10000 - init_model_path: "increment/9" # load model path - #train phase: - name: phase1 - model: "{workspace}/model.py" # user-defined model + model: "{workspace}/recall/model.py" # user-defined model dataset_name: dataset_train # select dataset by name - thread_num: 12 - -##offline infer -#phase: -#- name: phase1 -# model: "{workspace}/model.py" # user-defined model -# dataset_name: dataset_infer # select dataset by name -# save_path: "./infer_result" -# thread_num: 1 - -##offline infer -#phase: -#- name: phase1 -# model: "{workspace}/model.py" # user-defined model -# dataset_name: dataset_online_infer # select dataset by 
name -# save_path: "./infer_result" -# thread_num: 1 + thread_num: 4 diff --git a/models/demo/movie_recommand/recall/config_test_offline.yaml b/models/demo/movie_recommand/recall/config_test_offline.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c246ad15a767634e569aa6a3fd851ea210d519d2 --- /dev/null +++ b/models/demo/movie_recommand/recall/config_test_offline.yaml @@ -0,0 +1,57 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#workspace: "paddlerec.models.demo.movie_recommand" +workspace: "./" +# list of dataset +dataset: +- name: dataset_infer # name + batch_size: 128 + type: DataLoader + data_path: "{workspace}/data/test" + sparse_slots: "logid time userid gender age occupation movieid title genres label" + dense_slots: "" + +# hyper parameters of user-defined network +hyper_parameters: + # optimizer config + optimizer: + class: Adam + learning_rate: 0.001 + strategy: async + # user-defined pairs + sparse_feature_number: 60000000 + sparse_feature_dim: 9 + dense_input_dim: 13 + fc_sizes: [512, 256, 128, 32] + +# train +mode: runner_infer + +runner: +- name: runner_infer + epochs: 1 + device: cpu + class: infer + print_interval: 100000 + runner_result_dump_path: "{workspace}/recall/infer_result" + init_model_path: "increment_recall/9" # load model path + + +#offline infer +phase: +- name: phase1 + model: "{workspace}/recall/model.py" # user-defined model + dataset_name: dataset_infer + thread_num: 1 diff --git a/models/demo/movie_recommand/recall/config_test_online.yaml b/models/demo/movie_recommand/recall/config_test_online.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d21f7fe49ef99dc24748209386f9f28fc43be083 --- /dev/null +++ b/models/demo/movie_recommand/recall/config_test_online.yaml @@ -0,0 +1,59 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#workspace: "paddlerec.models.demo.movie_recommand" +workspace: ./ +# list of dataset +dataset: +- name: dataset_online_infer # name + batch_size: 128 + type: DataLoader + data_path: "{workspace}/data/online_user/test" + sparse_slots: "logid time userid gender age occupation movieid title genres label" + dense_slots: "" + +# hyper parameters of user-defined network +hyper_parameters: + # optimizer config + optimizer: + class: Adam + learning_rate: 0.001 + strategy: async + # user-defined pairs + sparse_feature_number: 60000000 + sparse_feature_dim: 9 + dense_input_dim: 13 + fc_sizes: [512, 256, 128, 32] + +# train +mode: runner_infer + +## online or offline infer +#mode: runner_infer +runner: +- name: runner_infer + epochs: 1 + device: cpu + class: infer + print_interval: 10000 + runner_result_dump_path: "{workspace}/recall/infer_result" + init_model_path: "increment_recall/9" # load model path + +#offline infer +phase: +- name: phase1 + model: "{workspace}/recall/model.py" # user-defined model + dataset_name: dataset_online_infer # select dataset by name + thread_num: 1 + diff --git a/models/demo/movie_recommand/train.sh b/models/demo/movie_recommand/train.sh index 47756c1414030bf3cd5da0532198eedf19eff3e0..ad32edda3036e22ef9eee86946456d67e1f42f59 100644 --- a/models/demo/movie_recommand/train.sh +++ b/models/demo/movie_recommand/train.sh @@ -1,5 +1,8 @@ -cd recall -python -m paddlerec.run -m ./config.yaml &> log & -cd ../rank -python -m paddlerec.run -m ./config.yaml &> log & -cd .. +echo "Recall offline training ..." +echo "Model config at models/demo/movie_recommand/recall/config.yaml" +python -m paddlerec.run -m ./recall/config.yaml + +echo "----------------------------------------" +echo "Rank offline training ..." 
+echo "Model config at models/demo/movie_recommand/rank/config.yaml" +python -m paddlerec.run -m ./rank/config.yaml diff --git a/models/multitask/esmm/README.md b/models/multitask/esmm/README.md index 91a1df7644f0768885030fc8fd0343d891ba29d1..aecd9edaacf18204c2ed6199ee842285767bac48 100644 --- a/models/multitask/esmm/README.md +++ b/models/multitask/esmm/README.md @@ -50,11 +50,6 @@ ESMM是发表在 SIGIR’2018 的论文[《Entire Space Multi-Task Model: An E 数据地址:[Ali-CCP:Alibaba Click and Conversion Prediction]( https://tianchi.aliyun.com/datalab/dataSet.html?dataId=408 ) -``` -cd data -sh run.sh -``` - 数据格式参见demo数据:data/train @@ -108,11 +103,25 @@ CPU环境 ## 论文复现 -用原论文的完整数据复现论文效果需要在config.yaml中修改batch_size=1000, thread_num=8, epoch_num=4 +由于原论文的数据太大,我们选取了部分数据作为训练和测试数据, 建议使用gpu训练。 + +我们的测试ctr auc为0.79+,ctcvr auc为0.82+。 +``` +wget https://paddlerec.bj.bcebos.com/esmm/traindata_10w.csv +wget https://paddlerec.bj.bcebos.com/esmm/testdata_10w.csv +mkdir data/train_data data/test_data +mv traindata_10w.csv data/train_data +mv testdata_10w.csv data/test_data +``` -修改后运行方案:修改config.yaml中的'workspace'为config.yaml的目录位置,执行 +用原论文的完整数据复现论文效果需要在config.yaml中修改batch_size=1024, epoch=10, device=gpu, selected_gpus:"0" +具体配置可以下载config_10w.yaml文件 +``` +wget https://paddlerec.bj.bcebos.com/esmm/config_10w.yaml +``` +修改后运行 ``` python -m paddlerec.run -m /home/your/dir/config.yaml #调试模式 直接指定本地config的绝对路径 ``` diff --git a/models/multitask/esmm/config.yaml b/models/multitask/esmm/config.yaml index 2a4478baa2052d03d6dd3699bc13ebba90583176..25cfbe9168e1c3b5aaaf8adce4e5ed3b6f4bb8f0 100644 --- a/models/multitask/esmm/config.yaml +++ b/models/multitask/esmm/config.yaml @@ -17,19 +17,19 @@ workspace: "models/multitask/esmm" dataset: - name: dataset_train - batch_size: 1 + batch_size: 5 type: QueueDataset data_path: "{workspace}/data/train" data_converter: "{workspace}/esmm_reader.py" - name: dataset_infer - batch_size: 1 + batch_size: 5 type: QueueDataset data_path: "{workspace}/data/test" data_converter: 
"{workspace}/esmm_reader.py" hyper_parameters: - vocab_size: 10000 - embed_size: 128 + vocab_size: 737946 + embed_size: 12 optimizer: class: adam learning_rate: 0.001 @@ -43,15 +43,15 @@ runner: class: train device: cpu epochs: 3 - save_checkpoint_interval: 2 + save_checkpoint_interval: 1 save_inference_interval: 4 - save_checkpoint_path: "increment" + save_checkpoint_path: "increment_esmm" save_inference_path: "inference" print_interval: 10 phases: [train] - name: infer_runner class: infer - init_model_path: "increment/1" + init_model_path: "increment_esmm/1" device: cpu print_interval: 1 phases: [infer] diff --git a/models/multitask/mmoe/config.yaml b/models/multitask/mmoe/config.yaml index d22b78e4481be78787df5aa828961af411cbc17b..354bd218a9e63eeaa7657b2d77c94d9507a3e8bc 100644 --- a/models/multitask/mmoe/config.yaml +++ b/models/multitask/mmoe/config.yaml @@ -17,12 +17,12 @@ workspace: "models/multitask/mmoe" dataset: - name: dataset_train batch_size: 5 - type: QueueDataset + type: DataLoader # or QueueDataset data_path: "{workspace}/data/train" data_converter: "{workspace}/census_reader.py" - name: dataset_infer batch_size: 5 - type: QueueDataset + type: DataLoader # or QueueDataset data_path: "{workspace}/data/train" data_converter: "{workspace}/census_reader.py" @@ -37,7 +37,6 @@ hyper_parameters: learning_rate: 0.001 strategy: async -#use infer_runner mode and modify 'phase' below if infer mode: [train_runner, infer_runner] runner: @@ -49,10 +48,10 @@ runner: save_inference_interval: 4 save_checkpoint_path: "increment" save_inference_path: "inference" - print_interval: 10 + print_interval: 1 - name: infer_runner class: infer - init_model_path: "increment/0" + init_model_path: "increment/1" device: cpu phase: diff --git a/models/rank/dnn/README.md b/models/rank/dnn/README.md index 9656adc655d6e2d8931861a492af3583906f98f5..d4167777220a12fc3d59c87a01cdf8dcac7dae4d 100644 --- a/models/rank/dnn/README.md +++ b/models/rank/dnn/README.md @@ -259,3 +259,133 @@ 
auc_var, batch_auc_var, auc_states = fluid.layers.auc( ``` 完成上述组网后,我们最终可以通过训练拿到`avg_cost`与`auc`两个重要指标。 + + +## 流式训练(OnlineLearning)任务启动及配置流程 + +### 流式训练简介 +流式训练是按照一定顺序进行数据的接收和处理,每接收一个数据,模型会对它进行预测并对当前模型进行更新,然后处理下一个数据。 像信息流、小视频、电商等场景,每天都会新增大量的数据, 让每天(每一刻)新增的数据基于上一天(上一刻)的模型进行新的预测和模型更新。 + +在大规模流式训练场景下, 需要使用的深度学习框架有对应的能力支持, 即: +* 支持大规模分布式训练的能力, 数据量巨大, 需要有良好的分布式训练及扩展能力,才能满足训练的时效要求 +* 支持超大规模的Embedding, 能够支持十亿甚至千亿级别的Embedding, 拥有合理的参数输出的能力,能够快速输出模型参数并和线上其他系统进行对接 +* Embedding的特征ID需要支持HASH映射,不要求ID的编码,能够自动增长及控制特征的准入(原先不存在的特征可以以适当的条件创建), 能够定期淘汰(能够以一定的策略进行过期的特征的清理) 并拥有准入及淘汰策略 +* 最后就是要基于框架开发一套完备的流式训练的 trainer.py, 能够拥有完善的流式训练流程 + +### 使用ctr-dnn online learning 进行模型的训练 +目前,PaddleRec基于飞桨分布式训练框架的能力,实现了这套流式训练的流程。 供大家参考和使用。我们基于`models/rank/ctr-dnn`修改了一个online_training的版本,供大家更好的理解和参考。 + +**注意** +1. 使用online learning 需要安装目前Paddle最新的开发者版本, 你可以从 https://www.paddlepaddle.org.cn/documentation/docs/zh/install/Tables.html#whl-dev 此处获得它,需要先卸载当前已经安装的飞桨版本,根据自己的Python环境下载相应的安装包。 +2. 使用online learning 需要安装目前PaddleRec最新的开发者版本, 你可以通过 git clone https://github.com/PaddlePaddle/PaddleRec.git 得到最新版的PaddleRec并自行安装 + +### 启动方法 +1. 修改config.yaml中的 hyper_parameters.distributed_embedding=1,表示打开大规模稀疏的模式 +2. 修改config.yaml中的 mode: [single_cpu_train, single_cpu_infer] 中的 `single_cpu_train` 为online_learning_cluster,表示使用online learning对应的运行模式 +3. 
准备训练数据, ctr-dnn中使用的online learning对应的训练模式为 天级别训练, 每天又分为24个小时, 因此训练数据需要 天--小时的目录结构进行整理。 + 以 2020年08月10日 到 2020年08月11日 2天的训练数据举例, 用户需要准备的数据的目录结构如下: + ``` + train_data/ + |-- 20200810 + | |-- 00 + | | `-- train.txt + | |-- 01 + | | `-- train.txt + | |-- 02 + | | `-- train.txt + | |-- 03 + | | `-- train.txt + | |-- 04 + | | `-- train.txt + | |-- 05 + | | `-- train.txt + | |-- 06 + | | `-- train.txt + | |-- 07 + | | `-- train.txt + | |-- 08 + | | `-- train.txt + | |-- 09 + | | `-- train.txt + | |-- 10 + | | `-- train.txt + | |-- 11 + | | `-- train.txt + | |-- 12 + | | `-- train.txt + | |-- 13 + | | `-- train.txt + | |-- 14 + | | `-- train.txt + | |-- 15 + | | `-- train.txt + | |-- 16 + | | `-- train.txt + | |-- 17 + | | `-- train.txt + | |-- 18 + | | `-- train.txt + | |-- 19 + | | `-- train.txt + | |-- 20 + | | `-- train.txt + | |-- 21 + | | `-- train.txt + | |-- 22 + | | `-- train.txt + | `-- 23 + | `-- train.txt + `-- 20200811 + |-- 00 + | `-- train.txt + |-- 01 + | `-- train.txt + |-- 02 + | `-- train.txt + |-- 03 + | `-- train.txt + |-- 04 + | `-- train.txt + |-- 05 + | `-- train.txt + |-- 06 + | `-- train.txt + |-- 07 + | `-- train.txt + |-- 08 + | `-- train.txt + |-- 09 + | `-- train.txt + |-- 10 + | `-- train.txt + |-- 11 + | `-- train.txt + |-- 12 + | `-- train.txt + |-- 13 + | `-- train.txt + |-- 14 + | `-- train.txt + |-- 15 + | `-- train.txt + |-- 16 + | `-- train.txt + |-- 17 + | `-- train.txt + |-- 18 + | `-- train.txt + |-- 19 + | `-- train.txt + |-- 20 + | `-- train.txt + |-- 21 + | `-- train.txt + |-- 22 + | `-- train.txt + `-- 23 + `-- train.txt + ``` +4. 
准备好数据后, 即可按照标准的训练流程进行流式训练了 + ```shell + python -m paddlerec.run -m models/rank/dnn/config.yaml + ``` diff --git a/models/rank/dnn/config.yaml b/models/rank/dnn/config.yaml index aa84a5070470cba750f7832644a9ce676c1d4ddd..75826684dbc0734e4acf40983bbc837c7b97ac84 100755 --- a/models/rank/dnn/config.yaml +++ b/models/rank/dnn/config.yaml @@ -49,6 +49,7 @@ hyper_parameters: sparse_feature_dim: 9 dense_input_dim: 13 fc_sizes: [512, 256, 128, 32] + distributed_embedding: 0 # select runner by name mode: [single_cpu_train, single_cpu_infer] @@ -90,6 +91,18 @@ runner: print_interval: 1 phases: [phase1] +- name: online_learning_cluster + class: cluster_train + runner_class_path: "{workspace}/online_learning_runner.py" + epochs: 2 + device: cpu + fleet_mode: ps + save_checkpoint_interval: 1 # save model interval of epochs + save_checkpoint_path: "increment_dnn" # save checkpoint path + init_model_path: "" # load model path + print_interval: 1 + phases: [phase1] + - name: collective_cluster class: cluster_train epochs: 2 @@ -101,6 +114,23 @@ runner: print_interval: 1 phases: [phase1] +- name: single_multi_gpu_train + class: train + # num of epochs + epochs: 1 + # device to run training or infer + device: gpu + selected_gpus: "0,1" # 选择多卡执行训练 + save_checkpoint_interval: 1 # save model interval of epochs + save_inference_interval: 4 # save inference + save_step_interval: 1 + save_checkpoint_path: "increment_dnn" # save checkpoint path + save_inference_path: "inference" # save inference path + save_step_path: "step_save" + save_inference_feed_varnames: [] # feed vars of save inference + save_inference_fetch_varnames: [] # fetch vars of save inference + print_interval: 1 + phases: [phase1] # runner will run all the phase in each epoch phase: - name: phase1 diff --git a/models/rank/dnn/model.py b/models/rank/dnn/model.py index ac6f0b946ef4dfd753b79d50a6ec34d099298698..b614934bebcbec342ecd6e711d3864d6ad506faa 100755 --- a/models/rank/dnn/model.py +++ b/models/rank/dnn/model.py 
@@ -25,8 +25,16 @@ class Model(ModelBase): ModelBase.__init__(self, config) def _init_hyper_parameters(self): - self.is_distributed = True if envs.get_fleet_mode().upper( - ) == "PSLIB" else False + self.is_distributed = False + self.distributed_embedding = False + + if envs.get_fleet_mode().upper() == "PSLIB": + self.is_distributed = True + + if envs.get_global_env("hyper_parameters.distributed_embedding", + 0) == 1: + self.distributed_embedding = True + self.sparse_feature_number = envs.get_global_env( "hyper_parameters.sparse_feature_number") self.sparse_feature_dim = envs.get_global_env( @@ -40,14 +48,26 @@ class Model(ModelBase): self.label_input = self._sparse_data_var[0] def embedding_layer(input): - emb = fluid.layers.embedding( - input=input, - is_sparse=True, - is_distributed=self.is_distributed, - size=[self.sparse_feature_number, self.sparse_feature_dim], - param_attr=fluid.ParamAttr( - name="SparseFeatFactors", - initializer=fluid.initializer.Uniform()), ) + if self.distributed_embedding: + emb = fluid.contrib.layers.sparse_embedding( + input=input, + size=[ + self.sparse_feature_number, self.sparse_feature_dim + ], + param_attr=fluid.ParamAttr( + name="SparseFeatFactors", + initializer=fluid.initializer.Uniform())) + else: + emb = fluid.layers.embedding( + input=input, + is_sparse=True, + is_distributed=self.is_distributed, + size=[ + self.sparse_feature_number, self.sparse_feature_dim + ], + param_attr=fluid.ParamAttr( + name="SparseFeatFactors", + initializer=fluid.initializer.Uniform())) emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum') return emb_sum diff --git a/models/rank/dnn/online_learning_runner.py b/models/rank/dnn/online_learning_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..fa4e505ba5ff8b77c95a05c0a4f190ccb8b1909b --- /dev/null +++ b/models/rank/dnn/online_learning_runner.py @@ -0,0 +1,89 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import time
+import warnings
+import numpy as np
+import logging
+import paddle.fluid as fluid
+
+from paddlerec.core.utils import envs
+from paddlerec.core.metric import Metric
+from paddlerec.core.trainers.framework.runner import RunnerBase
+
+logging.basicConfig(
+    format='%(asctime)s - %(levelname)s: %(message)s', level=logging.INFO)
+
+
+class OnlineLearningRunner(RunnerBase):
+    """Runner that trains the first configured phase once per epoch."""
+
+    def __init__(self, context):
+        print("Running OnlineLearningRunner.")
+
+    def run(self, context):
+        """Train for the configured number of epochs, saving after each.
+
+        The earlier per-day dataset loop used names this module never defines
+        (days, path, fleet, hdfs_ls, create_dataset, use_var) and raised
+        NameError on entry; the loop now runs over `runner.<name>.epochs`.
+
+        Args:
+            context: mapping of trainer state; "runner_name", "env", "model"
+                and "fleet" are read, "status" is set to "terminal_pass".
+        """
+        epochs = int(
+            envs.get_global_env("runner." + context["runner_name"] +
+                                ".epochs"))
+        model_dict = context["env"]["phase"][0]
+        model_class = context["model"][model_dict["name"]]["model"]
+        metrics = model_class._metrics
+        # TODO, wait for PaddleCloudRoleMaker supports gloo
+        from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
+
+        for epoch in range(epochs):
+            begin_time = time.time()
+            result = self._run(context, model_dict)
+            end_time = time.time()
+            seconds = end_time - begin_time
+            message = "epoch {} done, use time: {}".format(epoch, seconds)
+
+            if context["fleet"] is not None and isinstance(context["fleet"],
+                                                           GeneralRoleMaker):
+                metrics_result = []
+                for key in metrics:
+                    if isinstance(metrics[key], Metric):
+                        _str = metrics[key].calc_global_metrics(
+                            context["fleet"],
+                            context["model"][model_dict["name"]]["scope"])
+                        metrics_result.append(_str)
+                    elif result is not None:
+                        _str = "{}={}".format(key, result[key])
+                        metrics_result.append(_str)
+                if len(metrics_result) > 0:
+                    message += ", global metrics: " + ", ".join(metrics_result)
+            print(message)
+            with fluid.scope_guard(context["model"][model_dict["name"]][
+                    "scope"]):
+                train_prog = context["model"][model_dict["name"]][
+                    "main_program"]
+                startup_prog = context["model"][model_dict["name"]][
+                    "startup_program"]
+                with fluid.program_guard(train_prog, startup_prog):
+                    self.save(epoch, context, True)
+
+        context["status"] = "terminal_pass"
diff --git a/models/rank/fibinet/config.yaml b/models/rank/fibinet/config.yaml
index 4f0951682e4e96c2fb7c4c373a56de4e6d6bc951..d9ae592f3321d46aa221648d309b8d2e2944f53a 100644
--- a/models/rank/fibinet/config.yaml
+++ b/models/rank/fibinet/config.yaml
@@
-102,9 +102,9 @@ phase: - name: phase1 model: "{workspace}/model.py" # user-defined model dataset_name: dataloader_train # select dataset by name - thread_num: 8 + thread_num: 1 - name: phase2 model: "{workspace}/model.py" # user-defined model dataset_name: dataset_infer # select dataset by name - thread_num: 8 + thread_num: 1 diff --git a/models/treebased/tdm/build_tree.md b/models/treebased/tdm/build_tree.md new file mode 100644 index 0000000000000000000000000000000000000000..37ecb68f8bd48e3583a843b68c64ee8c1bd08f38 --- /dev/null +++ b/models/treebased/tdm/build_tree.md @@ -0,0 +1,19 @@ + + +wget https://paddlerec.bj.bcebos.com/utils/tree_build_utils.tar.gz --no-check-certificate + +# input_path: embedding的路径 +# emb_shape: embedding中key-value,value的维度 +# emb格式要求: embedding_id(int64),embedding(float),embedding(float),......,embedding(float) +# cluster_threads: 建树聚类所用线程 +python_172_anytree/bin/python -u main.py --input_path=./gen_emb/item_emb.txt --output_path=./ --emb_shape=24 --cluster_threads=4 + +建树流程是:1、读取emb -> 2、kmeans聚类 -> 3、聚类结果整理为树 -> 4、基于树结构得到模型所需的4个文件 + 1 Layer_list:记录了每一层都有哪些节点。训练用 + 2 Travel_list:记录每个叶子节点的Travel路径。训练用 + 3 Tree_Info:记录了每个节点的信息,主要为:是否是item/item_id,所在层级,父节点,子节点。检索用 + 4 Tree_Embedding:记录所有节点的Embedding。训练及检索用 + +注意一下训练数据输入的item是建树之前用的item id,还是基于树的node id,还是基于叶子的leaf id,在tdm_reader.py中,可以加载字典,做映射。 +用厂内版建树得到的输出文件夹里,有名为id2nodeid.txt的映射文件,格式是『hash值』+ 『树节点ID』+『叶子节点ID(表示第几个叶子节点,tdm_sampler op 所需的输入)』 +在另一个id2bidword.txt中,也有映射关系,格式是『hash值』+『原始item ID』,这个文件中仅存储了叶子节点的信息。 diff --git a/tools/build_script.sh b/tools/build_script.sh index b39b97f829a23fd0e79b0d50aad5996bb76941f3..15e97f254a4042dd43d5973cec164bbd5198c2f1 100755 --- a/tools/build_script.sh +++ b/tools/build_script.sh @@ -49,7 +49,7 @@ function model_test() { root_dir=`pwd` all_model=$(find ${root_dir} -name config.yaml) - special_models=("demo" "pnn" "fgcnn" "gru4rec" "tagspace") + special_models=("demo" "pnn" "fgcnn" "gru4rec" "tagspace" "textcnn_pretrain") for model in ${all_model} do