From 5cd75923cb57037bf01b3ce616c540462168ee11 Mon Sep 17 00:00:00 2001 From: Chengmo Date: Fri, 12 Jun 2020 14:12:23 +0800 Subject: [PATCH] update Readme table & yaml doc (#70) * update table * update * update tdm origin * update link * update yaml * update doc & tdm * fix tdm multi gpu support --- README.md | 60 ++++++++++++++--------------- doc/yaml.md | 51 +++++++++++++++--------- models/treebased/tdm/README.md | 8 ++++ models/treebased/tdm/config.yaml | 2 + models/treebased/tdm/model.py | 58 +++++++++++++++++++++------- models/treebased/tdm/tdm_startup.py | 11 +++++- 6 files changed, 126 insertions(+), 64 deletions(-) diff --git a/README.md b/README.md index e121bd96..94a37e68 100644 --- a/README.md +++ b/README.md @@ -31,36 +31,36 @@ - 包含内容理解、匹配、召回、排序、 多任务、重排序等多个任务的完整推荐搜索算法库 - | 方向 | 模型 | 单机CPU训练 | 单机GPU训练 | 分布式CPU训练 | - | :------: | :-----------------------------------------------------------------------: | :---------: | :---------: | :-----------: | - | 内容理解 | [Text-Classifcation](models/contentunderstanding/classification/model.py) | ✓ | x | ✓ | - | 内容理解 | [TagSpace](models/contentunderstanding/tagspace/model.py) | ✓ | x | ✓ | - | 匹配 | [DSSM](models/match/dssm/model.py) | ✓ | x | ✓ | - | 匹配 | [MultiView-Simnet](models/match/multiview-simnet/model.py) | ✓ | x | ✓ | - | 召回 | [TDM](models/treebased/tdm/model.py) | ✓ | x | ✓ | - | 召回 | [fasttext](models/recall/fasttext/model.py) | ✓ | x | x | - | 召回 | [Word2Vec](models/recall/word2vec/model.py) | ✓ | x | ✓ | - | 召回 | [SSR](models/recall/ssr/model.py) | ✓ | ✓ | ✓ | - | 召回 | [Gru4Rec](models/recall/gru4rec/model.py) | ✓ | ✓ | ✓ | - | 召回 | [Youtube_dnn](models/recall/youtube_dnn/model.py) | ✓ | ✓ | ✓ | - | 召回 | [NCF](models/recall/ncf/model.py) | ✓ | ✓ | ✓ | - | 排序 | [Logistic Regression](models/rank/logistic_regression/model.py) | ✓ | x | ✓ | - | 排序 | [Dnn](models/rank/dnn/model.py) | ✓ | x | ✓ | - | 排序 | [FM](models/rank/fm/model.py) | ✓ | x | ✓ | - | 排序 | [FFM](models/rank/ffm/model.py) | ✓ | x | ✓ | - | 排序 | 
[Pnn](models/rank/pnn/model.py) | ✓ | x | ✓ | - | 排序 | [DCN](models/rank/dcn/model.py) | ✓ | x | ✓ | - | 排序 | [NFM](models/rank/nfm/model.py) | ✓ | x | ✓ | - | 排序 | [AFM](models/rank/afm/model.py) | ✓ | x | ✓ | - | 排序 | [DeepFM](models/rank/deepfm/model.py) | ✓ | x | ✓ | - | 排序 | [xDeepFM](models/rank/xdeepfm/model.py) | ✓ | x | ✓ | - | 排序 | [DIN](models/rank/din/model.py) | ✓ | x | ✓ | - | 排序 | [Wide&Deep](models/rank/wide_deep/model.py) | ✓ | x | ✓ | - | 排序 | [FGCNN](models/rank/fgcnn/model.py) | ✓ | x | ✓ | - | 多任务 | [ESMM](models/multitask/esmm/model.py) | ✓ | ✓ | ✓ | - | 多任务 | [MMOE](models/multitask/mmoe/model.py) | ✓ | ✓ | ✓ | - | 多任务 | [ShareBottom](models/multitask/share-bottom/model.py) | ✓ | ✓ | ✓ | - | 重排序 | [Listwise](models/rerank/listwise/model.py) | ✓ | x | ✓ | + | 方向 | 模型 | 单机CPU | 单机GPU | 分布式CPU | 分布式GPU | 模型来源 | + | :------: | :-----------------------------------------------------------------------: | :-----: | :-----: | :-------: | :-------: | :--------------------------------------------------------------------------------------------------------------------------------------: | + | 内容理解 | [Text-Classifcation](models/contentunderstanding/classification/model.py) | ✓ | x | ✓ | x | / | + | 内容理解 | [TagSpace](models/contentunderstanding/tagspace/model.py) | ✓ | x | ✓ | x | / | + | 匹配 | [DSSM](models/match/dssm/model.py) | ✓ | x | ✓ | x | / | + | 匹配 | [MultiView-Simnet](models/match/multiview-simnet/model.py) | ✓ | x | ✓ | x | / | + | 召回 | [TDM](models/treebased/tdm/model.py) | ✓ | >=1.8.0 | ✓ | >=1.8.0 | [[KDD 2018](https://www.kdd.org/kdd2018/)][Learning Tree-based Deep Model for Recommender Systems](https://arxiv.org/pdf/1801.02294.pdf) | + | 召回 | [fasttext](models/recall/fasttext/model.py) | ✓ | x | x | x | / | + | 召回 | [Word2Vec](models/recall/word2vec/model.py) | ✓ | x | ✓ | x | / | + | 召回 | [SSR](models/recall/ssr/model.py) | ✓ | ✓ | ✓ | ✓ | / | + | 召回 | [Gru4Rec](models/recall/gru4rec/model.py) | ✓ | ✓ | ✓ | ✓ | / | + | 召回 | 
[Youtube_dnn](models/recall/youtube_dnn/model.py) | ✓ | ✓ | ✓ | ✓ | / | + | 召回 | [NCF](models/recall/ncf/model.py) | ✓ | ✓ | ✓ | ✓ | / | + | 排序 | [Logistic Regression](models/rank/logistic_regression/model.py) | ✓ | x | ✓ | x | / | + | 排序 | [Dnn](models/rank/dnn/model.py) | ✓ | ✓ | ✓ | ✓ | / | + | 排序 | [FM](models/rank/fm/model.py) | ✓ | ✓ | ✓ | ✓ | / | + | 排序 | [FFM](models/rank/ffm/model.py) | ✓ | x | ✓ | x | / | + | 排序 | [Pnn](models/rank/pnn/model.py) | ✓ | x | ✓ | x | / | + | 排序 | [DCN](models/rank/dcn/model.py) | ✓ | x | ✓ | x | / | + | 排序 | [NFM](models/rank/nfm/model.py) | ✓ | x | ✓ | x | / | + | 排序 | [AFM](models/rank/afm/model.py) | ✓ | x | ✓ | x | / | + | 排序 | [DeepFM](models/rank/deepfm/model.py) | ✓ | x | ✓ | x | / | + | 排序 | [xDeepFM](models/rank/xdeepfm/model.py) | ✓ | x | ✓ | x | / | + | 排序 | [DIN](models/rank/din/model.py) | ✓ | x | ✓ | x | / | + | 排序 | [Wide&Deep](models/rank/wide_deep/model.py) | ✓ | x | ✓ | x | / | + | 排序 | [FGCNN](models/rank/fgcnn/model.py) | ✓ | ✓ | ✓ | ✓ | / | + | 多任务 | [ESMM](models/multitask/esmm/model.py) | ✓ | ✓ | ✓ | ✓ | / | + | 多任务 | [MMOE](models/multitask/mmoe/model.py) | ✓ | ✓ | ✓ | ✓ | / | + | 多任务 | [ShareBottom](models/multitask/share-bottom/model.py) | ✓ | ✓ | ✓ | ✓ | / | + | 重排序 | [Listwise](models/rerank/listwise/model.py) | ✓ | x | ✓ | x | / | diff --git a/doc/yaml.md b/doc/yaml.md index 256c1ec7..9ebbf719 100644 --- a/doc/yaml.md +++ b/doc/yaml.md @@ -2,30 +2,41 @@ ## 全局变量 - | 名称 | 类型 | 取值 | 是否必须 | 作用描述 | - | :-------: | :----: | :-----------------------------------: | :------: | :------------------------------------------------: | - | workspace | string | 路径 / paddlerec.models.{方向}.{模型} | 是 | 指定model/reader/data所在位置 | - | mode | string | runner名称 | 是 | 指定当次运行使用哪个runner | - | debug | bool | True / False | 否 | 当dataset.mode=QueueDataset时,开启op耗时debug功能 | + | 名称 | 类型 | 取值 | 是否必须 | 作用描述 | + | :-------: | :-------------------: | :---------------------------------------------------: | :------: | 
:------------------------------------------------: | + | workspace | string | 绝对路径 或 paddlerec.models.{方向}.{模型} | 是 | 指定model/reader/data所在位置 | + | mode | string / list[string] | string:单个runner的名称 / list:多个runner名称的列表 | 是 | 指定当次运行使用哪些runner | + | debug | bool | True / False | 否 | 当dataset.mode=QueueDataset时,开启op耗时debug功能 | ## runner变量 -| 名称 | 类型 | 取值 | 是否必须 | 作用描述 | -| :---------------------------: | :----------: | :-------------------------------: | :------: | :---------------------------------------------: | -| name | string | 任意 | 是 | 指定runner名称 | -| class | string | single_train(默认) / single_infer | 是 | 指定运行runner的类别(单机/分布式, 训练/预测) | -| device | string | cpu(默认) / gpu | 否 | 程序执行设备 | -| epochs | int | >= 1 | 否 | 模型训练迭代轮数 | -| init_model_path | string | 路径 | 否 | 初始化模型地址 | -| save_checkpoint_interval | int | >= 1 | 否 | Save参数的轮数间隔 | -| save_checkpoint_path | string | 路径 | 否 | Save参数的地址 | -| save_inference_interval | int | >= 1 | 否 | Save预测模型的轮数间隔 | -| save_inference_path | string | 路径 | 否 | Save预测模型的地址 | -| save_inference_feed_varnames | list[string] | 组网中指定Variable的name | 否 | 预测模型的入口变量name | -| save_inference_fetch_varnames | list[string] | 组网中指定Variable的name | 否 | 预测模型的出口变量name | -| print_interval | int | >= 1 | 否 | 训练指标打印batch间隔 | +| 名称 | 类型 | 取值 | 是否必须 | 作用描述 | +| :---------------------------: | :----------: | :-------------------------------------------: | :------: | :------------------------------------------------------------------: | +| name | string | 任意 | 是 | 指定runner名称 | +| class | string | train(默认) / infer / local_cluster / cluster | 是 | 指定运行runner的类别(单机/分布式, 训练/预测) | +| device | string | cpu(默认) / gpu | 否 | 程序执行设备 | +| fleet_mode | string | ps(默认) / pslib / collective | 否 | 分布式运行模式 | +| selected_gpus | string | "0"(默认) | 否 | 程序运行GPU卡号,若以"0,1"的方式指定多卡,则会默认启用collective模式 | +| worker_num | int | 1(默认) | 否 | 参数服务器模式下worker的数量 | +| server_num | int | 1(默认) | 否 | 参数服务器模式下server的数量 | +| distribute_strategy | string | async(默认)/sync/half_async/geo | 否 | 
参数服务器模式下训练模式的选择 | +| epochs | int | >= 1 | 否 | 模型训练迭代轮数 | +| phases | list[string] | 由phase name组成的list | 否 | 当前runner的训练过程列表,顺序执行 | +| init_model_path | string | 路径 | 否 | 初始化模型地址 | +| save_checkpoint_interval | int | >= 1 | 否 | Save参数的轮数间隔 | +| save_checkpoint_path | string | 路径 | 否 | Save参数的地址 | +| save_inference_interval | int | >= 1 | 否 | Save预测模型的轮数间隔 | +| save_inference_path | string | 路径 | 否 | Save预测模型的地址 | +| save_inference_feed_varnames | list[string] | 组网中指定Variable的name | 否 | 预测模型的入口变量name | +| save_inference_fetch_varnames | list[string] | 组网中指定Variable的name | 否 | 预测模型的出口变量name | +| print_interval | int | >= 1 | 否 | 训练指标打印batch间隔 | +| instance_class_path | string | 路径 | 否 | 自定义instance流程实现的地址 | +| network_class_path | string | 路径 | 否 | 自定义network流程实现的地址 | +| startup_class_path | string | 路径 | 否 | 自定义startup流程实现的地址 | +| runner_class_path | string | 路径 | 否 | 自定义runner流程实现的地址 | +| terminal_class_path | string | 路径 | 否 | 自定义terminal流程实现的地址 | @@ -38,6 +49,7 @@ | dataset_name | string | dataset名称 | 是 | 指定使用哪个Reader | | thread_num | int | >= 1 | 否 | 模型训练线程数 | + ## dataset变量 | 名称 | 类型 | 取值 | 是否必须 | 作用描述 | @@ -50,6 +62,7 @@ | sparse_slots | string | string | 否 | 指定稀疏参数选项 | | dense_slots | string | string | 否 | 指定稠密参数选项 | + ## hyper_parameters变量 | 名称 | 类型 | 取值 | 是否必须 | 作用描述 | | :---------------------: | :----: | :--------------: | :------: | :-------------------------: | diff --git a/models/treebased/tdm/README.md b/models/treebased/tdm/README.md index 220b9048..e70b8f80 100644 --- a/models/treebased/tdm/README.md +++ b/models/treebased/tdm/README.md @@ -2,6 +2,14 @@ 本代码仅作tdm组网示例,使用fake数据集,用于快速调研paddle-tdm。 +## 运行方法 + +1. 运行单机流程,得到init_model +2. 
基于单机模型,可以进行分布式的参数服务器训练 + +```shell +python -m paddlerec.run -m paddlerec.models.treebased.tdm +``` ## 树结构的准备 ### 名词概念 diff --git a/models/treebased/tdm/config.yaml b/models/treebased/tdm/config.yaml index 6e3d78e3..74e82bba 100755 --- a/models/treebased/tdm/config.yaml +++ b/models/treebased/tdm/config.yaml @@ -78,6 +78,7 @@ runner: save_inference_fetch_varnames: [] # fetch vars of save inference init_model_path: "" # load model path print_interval: 10 + - name: runner2 class: single_infer startup_class_path: "{workspace}/tdm_startup.py" @@ -87,6 +88,7 @@ runner: device: cpu init_model_path: "increment/0" # load model path print_interval: 1 + - name: runner3 class: local_cluster_train startup_class_path: "{workspace}/tdm_startup.py" diff --git a/models/treebased/tdm/model.py b/models/treebased/tdm/model.py index 06110e8c..6ad9edc9 100755 --- a/models/treebased/tdm/model.py +++ b/models/treebased/tdm/model.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" - +import paddle import paddle.fluid as fluid from paddlerec.core.utils import envs @@ -105,18 +105,34 @@ class Model(ModelBase): # sample_nodes 是采样的node_id的结果,包含正负样本 # sample_label 是采样的node_id对应的正负标签 # sample_mask 是为了保持tensor维度一致,padding部分的标签,若为0,则是padding的虚拟node_id - sample_nodes, sample_label, sample_mask = fluid.contrib.layers.tdm_sampler( - x=item_label, - neg_samples_num_list=self.neg_sampling_list, - layer_node_num_list=self.layer_node_num_list, - leaf_node_num=self.leaf_node_nums, - tree_travel_attr=fluid.ParamAttr(name="TDM_Tree_Travel"), - tree_layer_attr=fluid.ParamAttr(name="TDM_Tree_Layer"), - output_positive=self.output_positive, - output_list=True, - seed=0, - tree_dtype='int64', - dtype='int64') + + if self.check_version(): + with fluid.device_guard("cpu"): + sample_nodes, sample_label, sample_mask = fluid.contrib.layers.tdm_sampler( + x=item_label, + neg_samples_num_list=self.neg_sampling_list, + layer_node_num_list=self.layer_node_num_list, + leaf_node_num=self.leaf_node_nums, + tree_travel_attr=fluid.ParamAttr(name="TDM_Tree_Travel"), + tree_layer_attr=fluid.ParamAttr(name="TDM_Tree_Layer"), + output_positive=self.output_positive, + output_list=True, + seed=0, + tree_dtype='int64', + dtype='int64') + else: + sample_nodes, sample_label, sample_mask = fluid.contrib.layers.tdm_sampler( + x=item_label, + neg_samples_num_list=self.neg_sampling_list, + layer_node_num_list=self.layer_node_num_list, + leaf_node_num=self.leaf_node_nums, + tree_travel_attr=fluid.ParamAttr(name="TDM_Tree_Travel"), + tree_layer_attr=fluid.ParamAttr(name="TDM_Tree_Layer"), + output_positive=self.output_positive, + output_list=True, + seed=0, + tree_dtype='int64', + dtype='int64') # 查表得到每个节点的Embedding sample_nodes_emb = [ @@ -479,3 +495,19 @@ class Model(ModelBase): bias_attr=fluid.ParamAttr( name="cls.concat_fc.bias." 
+ str(layer_idx))) return hidden_states_fc + + def check_version(self): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "TDM-GPU need Paddle version 1.8 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." \ + + try: + fluid.require_version('1.8.0') + return True + except Exception as e: + print(err) + return False diff --git a/models/treebased/tdm/tdm_startup.py b/models/treebased/tdm/tdm_startup.py index 3f1b87db..f82c73d6 100644 --- a/models/treebased/tdm/tdm_startup.py +++ b/models/treebased/tdm/tdm_startup.py @@ -115,11 +115,18 @@ class Startup(StartupBase): res = var.name in special_param return res + if context["fleet_mode"].upper() == "PS": + program = context["model"][model_dict["name"]]["main_program"] + elif context["fleet_mode"].upper() == "COLLECTIVE": + program = context["model"][model_dict["name"]][ + "default_main_program"] + else: + raise ValueError("TDM not support PSLIB") + fluid.io.load_vars( context["exe"], dirname=warmup_model_path, - main_program=context["model"][model_dict["name"]][ - "main_program"], + main_program=program, predicate=is_tdm_tree_var) """ -------- tree file load detail --------- """ -- GitLab