diff --git a/models/contentunderstanding/classification/config.yaml b/models/contentunderstanding/classification/config.yaml index 4cf4d1bb7dae64865e2a2738e80672ac23934702..d1748137f0c4d994b3a566debf43dbdc2c3d66dc 100644 --- a/models/contentunderstanding/classification/config.yaml +++ b/models/contentunderstanding/classification/config.yaml @@ -18,7 +18,7 @@ train: strategy: "async" epochs: 10 - workspace: "paddlerec.models.contentunderstandin.classification" + workspace: "paddlerec.models.contentunderstanding.classification" reader: batch_size: 5 diff --git a/models/contentunderstanding/classification/model.py b/models/contentunderstanding/classification/model.py index 6254199c0fd4ceec48ba7f7d8bee3382d967fd02..e4630820c868af8334fc8edfd2b6c1f4d9e77503 100644 --- a/models/contentunderstanding/classification/model.py +++ b/models/contentunderstanding/classification/model.py @@ -40,9 +40,12 @@ class Model(ModelBase): data = fluid.data(name="input", shape=[None, self.max_len], dtype='int64') label = fluid.data(name="label", shape=[None, 1], dtype='int64') seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64') + + self._data_var = [data, label, seq_len] + # embedding layer emb = fluid.embedding(input=data, size=[self.dict_dim, self.emb_dim]) - emb = fluid.layers.sequence_unpad(emb, length=self.seq_len) + emb = fluid.layers.sequence_unpad(emb, length=seq_len) # convolution layer conv = fluid.nets.sequence_conv_pool( input=emb, @@ -52,7 +55,7 @@ class Model(ModelBase): pool_type="max") # full connect layer - fc_1 = fluid.layers.fc(input=[conv], size=hid_dim) + fc_1 = fluid.layers.fc(input=[conv], size=self.hid_dim) # softmax layer prediction = fluid.layers.fc(input=[fc_1], size=self.class_dim, act="softmax") cost = fluid.layers.cross_entropy(input=prediction, label=label) @@ -60,18 +63,18 @@ class Model(ModelBase): acc = fluid.layers.accuracy(input=prediction, label=label) self.cost = avg_cost - self.metrics["acc"] = cos_pos + self._metrics["acc"] = acc def get_cost_op(self): return self.cost def get_metrics(self): - return self.metrics + return self._metrics def optimizer(self): learning_rate = 0.01 sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate) return sgd_optimizer - def infer_net(self, parameter_list): + def infer_net(self): self.train_net() diff --git a/models/contentunderstanding/classification/reader.py b/models/contentunderstanding/classification/reader.py index 9a93211ed6997c342c412b7c2a043f89838332da..f90097d702df461d226443c32570e46ea3a0b093 100644 --- a/models/contentunderstanding/classification/reader.py +++ b/models/contentunderstanding/classification/reader.py @@ -44,5 +44,9 @@ class TrainReader(Reader): if data is None: yield None return + data = [int(i) for i in data] + label = [int(i) for i in label] + seq_len = [int(i) for i in seq_len] + print >>sys.stderr, str([('data', data), ('label', label), ('seq_len', seq_len)]) yield [('data', data), ('label', label), ('seq_len', seq_len)] return data_iter diff --git a/models/contentunderstanding/readme.md b/models/contentunderstanding/readme.md index 417fdde3a8c31a184db7d7dd0f372f769c9615c1..06be7106b287149d24b56773689dad08708a064f 100644 --- a/models/contentunderstanding/readme.md +++ b/models/contentunderstanding/readme.md @@ -71,13 +71,13 @@ python text2paddle.py raw_big_train_data/ raw_big_test_data/ train_big_data test ### 训练 ``` -python -m paddlerec.run -m paddlerec.models.rank.dnn -d cpu -e single +python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification -d cpu -e single ``` 
### 预测 ``` -python -m paddlerec.run -m paddlerec.models.rank.dnn -d cpu -e single +python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification -d cpu -e single ``` ## 效果对比 diff --git a/models/contentunderstanding/tagspace/config.yaml b/models/contentunderstanding/tagspace/config.yaml index e0232f3faf221be04a6f95a6b0d011d612d0e35c..70333fcbf7edf4b6b5f54145e29cb122ed3ae9c6 100644 --- a/models/contentunderstanding/tagspace/config.yaml +++ b/models/contentunderstanding/tagspace/config.yaml @@ -18,7 +18,7 @@ train: strategy: "async" epochs: 10 - workspace: "paddlerec.models.rank.tagspace" + workspace: "paddlerec.models.contentunderstanding.tagspace" reader: batch_size: 5 diff --git a/readme.md b/readme.md index 3dfbf8d3904d47092052b5d358dd5aeb4ce67b8a..f90176c5de9e90207209ed30a8b9cc53e4a50c4c 100644 --- a/readme.md +++ b/readme.md @@ -1,2 +1,171 @@ -# PaddleRec -推荐算法,大规模并行训练支持 +

## What is PaddleRec


- A **one-stop, out-of-the-box toolkit** of search and recommendation models built on the PaddlePaddle ecosystem
- A full-pipeline solution for beginners, developers, and researchers, from research and training through to prediction and deployment
- An algorithm library for recommendation and search covering semantic understanding, recall, pre-ranking, ranking, multi-task learning, fusion, and more
- Configured through **yaml** options, so single-machine training, large-scale distributed training, offline prediction, and online deployment are all quick to get started with

## PaddleRec Overview


## Recommender System Pipeline Overview


## Quick Installation

### Environment Requirements
* Python 2.7 / 3.5 / 3.6 / 3.7
* PaddlePaddle >= 1.7.2
* OS: Windows / Mac / Linux

### Installation

- Method 1: install with pip

  ```bash
  python -m pip install paddle-rec
  ```

- Method 2: build and install from source

  1. Install PaddlePaddle. **Note: a PaddlePaddle version > 1.7.2 is required.**

     ```shell
     python -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple
     ```

  2. Install PaddleRec from source

     ```shell
     git clone https://github.com/PaddlePaddle/PaddleRec/
     cd PaddleRec
     python setup.py install
     ```
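If the installation succeeded, both packages should import cleanly. A minimal sanity check, assuming `paddle.__version__` reports the installed version (as recent PaddlePaddle releases do) and that the pip package provides the `paddlerec` module used by `python -m paddlerec.run`:

```bash
# PaddlePaddle should report a version of at least 1.7.2
python -c "import paddle; print(paddle.__version__)"

# PaddleRec itself should be importable without errors
python -c "import paddlerec"
```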

## Quick Start

### Run a built-in model with its default configuration

The framework ships with a number of built-in models, and a single command is enough to start single-machine training or local 1*1 simulated distributed training with any of them. We use `dnn` below to walk through basic PaddleRec usage.

#### Single-machine training

```bash
# single-machine training on CPU
python -m paddlerec.run -m paddlerec.models.rank.dnn -d cpu -e single

# single-machine training on GPU
python -m paddlerec.run -m paddlerec.models.rank.dnn -d gpu -e single
```

#### Locally simulated distributed training

```bash
# simulate distributed training locally on CPU resources
python -m paddlerec.run -m paddlerec.models.rank.dnn -e local_cluster
```

#### Distributed training on a cluster

```bash
# after an mpi/k8s/paddlecloud cluster environment has been set up
python -m paddlerec.run -m paddlerec.models.rank.dnn -e cluster
```

### Run a built-in model with a custom configuration

If you reuse a built-in model but modify its **yaml** configuration file, for example to change hyperparameters or point it at reconfigured data, you can pass that yaml file to paddlerec directly.

Taking dnn as the example again: after editing the dnn model's `config.yaml` inside the paddlerec code directory, run the `dnn` model with:

```bash
python -m paddlerec.run -m ./models/rank/dnn/config.yaml -e single
```
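The same invocation patterns apply to the other built-in models, for instance the content-understanding classification model updated in this change. A sketch, with paths relative to a PaddleRec checkout:

```bash
# run the packaged default config by module name
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification -d cpu -e single

# or point paddlerec.run at a locally edited copy of its config.yaml
python -m paddlerec.run -m ./models/contentunderstanding/classification/config.yaml -e single
```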

## Supported Models

| Category | Model | Single-machine CPU training | Single-machine GPU training | Distributed CPU training |
| :------: | :---: | :-------------------------: | :-------------------------: | :----------------------: |
| Content Understanding | [Text-Classification](models/contentunderstanding/classification/model.py) | ✓ | x | ✓ |
| Content Understanding | [TagSpace](models/contentunderstanding/tagspace/model.py) | ✓ | x | ✓ |
| Recall | [TDM](models/treebased/tdm/model.py) | ✓ | x | ✓ |
| Recall | [Word2Vec](models/recall/word2vec/model.py) | ✓ | x | ✓ |
| Recall | [SSR](models/recall/ssr/model.py) | ✓ | ✓ | ✓ |
| Recall | [Gru4Rec](models/recall/gru4rec/model.py) | ✓ | ✓ | ✓ |
| Ranking | [Dnn](models/rank/dnn/model.py) | ✓ | x | ✓ |
| Ranking | [DeepFM](models/rank/deepfm/model.py) | ✓ | x | ✓ |
| Ranking | [xDeepFM](models/rank/xdeepfm/model.py) | ✓ | x | ✓ |
| Ranking | [DIN](models/rank/din/model.py) | ✓ | x | ✓ |
| Ranking | [Wide&Deep](models/rank/wide_deep/model.py) | ✓ | x | ✓ |
| Multi-task | [ESMM](models/multitask/esmm/model.py) | ✓ | ✓ | ✓ |
| Multi-task | [MMOE](models/multitask/mmoe/model.py) | ✓ | ✓ | ✓ |
| Multi-task | [ShareBottom](models/multitask/share-bottom/model.py) | ✓ | ✓ | ✓ |
| Matching | [DSSM](models/match/dssm/model.py) | ✓ | x | ✓ |
| Matching | [MultiView-Simnet](models/match/multiview-simnet/model.py) | ✓ | x | ✓ |
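Every model in the table is launched the same way; the sketch below assumes each model's module path mirrors its `models/<category>/<name>` location, as it does for the examples in the Quick Start section:

```bash
# module paths are assumed from the models/<category>/<name> layout in the table
# single-machine CPU training, supported by every row marked ✓ in the CPU column
python -m paddlerec.run -m paddlerec.models.rank.deepfm -d cpu -e single

# rows marked ✓ for single-machine GPU training (e.g. MMOE) also accept -d gpu
python -m paddlerec.run -m paddlerec.models.multitask.mmoe -d gpu -e single
```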

## Documentation

### Background
* [Introduction to recommender systems](doc/rec_background.md)
* [Introduction to distributed deep learning](doc/ps_background.md)

### Getting Started
* [Environment requirements](#environment-requirements)
* [Installation](#installation)
* [Quick start](#run-a-built-in-model-with-its-default-configuration)

### Advanced Tutorials
* [Custom datasets and Readers](doc/custom_dataset_reader.md)
* [Distributed training](doc/distributed_train.md)

### Developer Guide
* [PaddleRec design document](doc/design.md)

### PaddleRec Performance
* [Benchmark](doc/benchmark.md)

### FAQ
* [Frequently asked questions](doc/faq.md)

## Community

### Feedback
Comments, suggestions, and bug reports are welcome via `GitHub Issue`.

### Release History
- 2020.5.14 - PaddleRec v0.1

### License
This project is released under the [Apache 2.0 license](LICENSE).