提交 c68f2694 编写于 作者: T tangwei

fix code style

上级 66e1859f
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
trainer implement.
......@@ -22,5 +21,3 @@ Trainer
↘ (for online learning training) OnlineLearningTrainer
"""
......@@ -59,8 +59,10 @@ class CtrTrainer(Trainer):
reader_class = envs.get_global_env("class", None, namespace)
abs_dir = os.path.dirname(os.path.abspath(__file__))
reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
pipe_cmd = "python {} {} {} {}".format(reader, reader_class, "TRAIN", self._config_yaml)
train_data_path = envs.get_global_env("train_data_path", None, namespace)
pipe_cmd = "python {} {} {} {}".format(reader, reader_class, "TRAIN",
self._config_yaml)
train_data_path = envs.get_global_env("train_data_path", None,
namespace)
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_use_var(inputs)
......@@ -87,7 +89,8 @@ class CtrTrainer(Trainer):
self.model.train_net()
optimizer = self.model.optimizer()
optimizer = fleet.distributed_optimizer(optimizer, strategy={"use_cvm": False})
optimizer = fleet.distributed_optimizer(
optimizer, strategy={"use_cvm": False})
optimizer.minimize(self.model.get_avg_cost())
if fleet.is_server():
......@@ -118,16 +121,18 @@ class CtrTrainer(Trainer):
gs = shuf * 0
fleet._role_maker._node_type_comm.Allreduce(shuf, gs)
print("trainer id: {}, trainers: {}, gs: {}".format(fleet.worker_index(), fleet.worker_num(), gs))
print("trainer id: {}, trainers: {}, gs: {}".format(fleet.worker_index(
), fleet.worker_num(), gs))
epochs = envs.get_global_env("train.epochs")
for i in range(epochs):
self._exe.train_from_dataset(program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
self._exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
context['status'] = 'terminal_pass'
fleet.stop_worker()
......
此差异已折叠。
# PaddleRec Benchmark
> 占位
\ No newline at end of file
> 占位
# PaddleRec 贡献代码
> 占位
\ No newline at end of file
> 占位
......@@ -279,4 +279,4 @@ class Metric(object):
pass
```
全局指标的计算及输出,需要分别继承并实现以上四个成员函数。具体实现的例子,可以参考[auc_metric.py](../core/metrics/auc_metrics.py)
\ No newline at end of file
全局指标的计算及输出,需要分别继承并实现以上四个成员函数。具体实现的例子,可以参考[auc_metric.py](../core/metrics/auc_metrics.py)
......@@ -7,5 +7,3 @@
### K8S集群运行分布式
> 占位
# 常见问题FAQ
> 占位
\ No newline at end of file
> 占位
# PaddleRec 单机训练
> 占位
\ No newline at end of file
> 占位
......@@ -12,4 +12,3 @@
| 多任务 | [ESMM]() | ✓ | x | ✓ | x | ✓ | ✓ |
| 匹配 | [DSSM]() | ✓ | x | ✓ | x | ✓ | ✓ |
| 匹配 | [Multiview-Simnet]() | ✓ | x | ✓ | x | ✓ | ✓ |
# PaddleRec 模型调参
> 占位
\ No newline at end of file
> 占位
# PaddleRec 离线预测
\ No newline at end of file
# PaddleRec 离线预测
......@@ -5,4 +5,3 @@
## [参数服务器训练](https://www.paddlepaddle.org.cn/tutorials/projectdetail/464839)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -37,4 +37,3 @@ train:
dirname: "inference"
epoch_interval: 100
save_last: True
......@@ -31,7 +31,8 @@ class Model(ModelBase):
def train_net(self):
""" network definition """
data = fluid.data(name="input", shape=[None, self.max_len], dtype='int64')
data = fluid.data(
name="input", shape=[None, self.max_len], dtype='int64')
label = fluid.data(name="label", shape=[None, 1], dtype='int64')
seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64')
......@@ -51,7 +52,9 @@ class Model(ModelBase):
# full connect layer
fc_1 = fluid.layers.fc(input=[conv], size=self.hid_dim)
# softmax layer
prediction = fluid.layers.fc(input=[fc_1], size=self.class_dim, act="softmax")
prediction = fluid.layers.fc(input=[fc_1],
size=self.class_dim,
act="softmax")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)
acc = fluid.layers.accuracy(input=prediction, label=label)
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from paddlerec.core.reader import Reader
......@@ -38,7 +37,8 @@ class TrainReader(Reader):
data = [int(i) for i in data]
label = [int(i) for i in label]
seq_len = [int(i) for i in seq_len]
print >> sys.stderr, str([('data', data), ('label', label), ('seq_len', seq_len)])
print >> sys.stderr, str(
[('data', data), ('label', label), ('seq_len', seq_len)])
yield [('data', data), ('label', label), ('seq_len', seq_len)]
return data_iter
......@@ -87,4 +87,3 @@ python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
| :------------------: | :--------------------: | :---------: |:---------: | :---------: |:---------: |
| ag news dataset | TagSpace | -- | -- | -- | -- |
| -- | Classification | -- | -- | -- | -- |
......@@ -47,4 +47,3 @@ train:
dirname: "inference"
epoch_interval: 100
save_last: True
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -37,4 +37,3 @@
python -m paddlerec.run -m paddlerec.models.match.dssm # dssm
python -m paddlerec.run -m paddlerec.models.match.multiview-simnet # multiview-simnet
```
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -56,4 +56,3 @@ python -m paddlerec.run -m paddlerec.models.multitask.esmm # esmm
| Census-income Data | Share-Bottom | -- | 0.93120/0.99256 |
| Census-income Data | MMoE | -- | 0.94465/0.99324 |
| Ali-CCP | ESMM | -- | 0.97181/0.49967 |
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import io
......
......@@ -26,8 +26,8 @@ from collections import Counter
import os
import paddle.fluid.incubate.data_generator as dg
class TrainReader(dg.MultiSlotDataGenerator):
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
......@@ -83,11 +83,11 @@ class TrainReader(dg.MultiSlotDataGenerator):
if idx == 2 else math.log(1 + float(features[idx])))
for idx in self.cat_idx_:
if features[idx] == '' or features[
idx] not in self.cat_feat_idx_dict_list[idx - 14]:
idx] not in self.cat_feat_idx_dict_list[idx - 14]:
label_feat_list[idx].append(0)
else:
label_feat_list[idx].append(self.cat_feat_idx_dict_list[
idx - 14][features[idx]])
idx - 14][features[idx]])
label_feat_list[0].append(int(features[0]))
return label_feat_list
......@@ -109,6 +109,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function, absolute_import, division
import os
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import sys
......
......@@ -19,8 +19,9 @@ try:
import cPickle as pickle
except ImportError:
import pickle
class TrainReader(dg.MultiSlotDataGenerator):
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
......@@ -44,7 +45,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
self.categorical_range_ = range(14, 40)
# load preprocessed feature dict
self.feat_dict_name = "aid_data/feat_dict_10.pkl2"
self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb'))
self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb'))
def _process_line(self, line):
features = line.rstrip('\n').split('\t')
......@@ -77,15 +78,18 @@ class TrainReader(dg.MultiSlotDataGenerator):
def data_iter():
feat_idx, feat_value, label = self._process_line(line)
s = ""
for i in [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)]:
for i in [('feat_idx', feat_idx), ('feat_value', feat_value),
('label', label)]:
k = i[0]
v = i[1]
for j in v:
s += " " + k + ":" + str(j)
print s.strip()
yield None
return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy
from collections import Counter
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import random
import pickle
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import pickle
import pandas as pd
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import random
import pickle
......
......@@ -32,6 +32,7 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
"""
Read the data line by line and process it as a dictionary
"""
def reader():
"""
This function needs to be implemented by the user, based on data format
......@@ -57,11 +58,12 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
feature_name.append("label")
s = "click:" + str(label[0])
for i in dense_feature:
s += " dense_feature:" + str(i)
s += " dense_feature:" + str(i)
for i in range(1, 1 + len(categorical_range_)):
s += " " + str(i) + ":" + str(sparse_feature[i-1][0])
s += " " + str(i) + ":" + str(sparse_feature[i - 1][0])
print s.strip()
yield None
return reader
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import io
import args
import pandas as pd
from sklearn import preprocessing
from sklearn import preprocessing
def _clean_file(source_path,target_path):
def _clean_file(source_path, target_path):
"""makes changes to match the CSV format."""
with io.open(source_path, 'r') as temp_eval_file:
with io.open(target_path, 'w') as eval_file:
......@@ -17,15 +32,16 @@ def _clean_file(source_path,target_path):
line = line[:-1]
line += '\n'
eval_file.write(line)
def build_model_columns(train_data_path, test_data_path):
# The column names are from
# https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
column_names = [
'age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'race', 'gender',
'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
'income_bracket'
'age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'race', 'gender',
'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
'income_bracket'
]
# Load the dataset in Pandas
......@@ -44,61 +60,92 @@ def build_model_columns(train_data_path, test_data_path):
# First group of tasks according to the paper
#label_columns = ['income_50k', 'marital_stat']
categorical_columns = ['education','marital_status','relationship','workclass','occupation']
categorical_columns = [
'education', 'marital_status', 'relationship', 'workclass',
'occupation'
]
for col in categorical_columns:
label_train = preprocessing.LabelEncoder()
train_df[col]= label_train.fit_transform(train_df[col])
train_df[col] = label_train.fit_transform(train_df[col])
label_test = preprocessing.LabelEncoder()
test_df[col]= label_test.fit_transform(test_df[col])
bins = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
train_df['age_buckets'] = pd.cut(train_df['age'].values.tolist(), bins,labels=False)
test_df['age_buckets'] = pd.cut(test_df['age'].values.tolist(), bins,labels=False)
base_columns = ['education', 'marital_status', 'relationship', 'workclass', 'occupation', 'age_buckets']
train_df['education_occupation'] = train_df['education'].astype(str) + '_' + train_df['occupation'].astype(str)
test_df['education_occupation'] = test_df['education'].astype(str) + '_' + test_df['occupation'].astype(str)
train_df['age_buckets_education_occupation'] = train_df['age_buckets'].astype(str) + '_' + train_df['education'].astype(str) + '_' + train_df['occupation'].astype(str)
test_df['age_buckets_education_occupation'] = test_df['age_buckets'].astype(str) + '_' + test_df['education'].astype(str) + '_' + test_df['occupation'].astype(str)
crossed_columns = ['education_occupation','age_buckets_education_occupation']
test_df[col] = label_test.fit_transform(test_df[col])
bins = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
train_df['age_buckets'] = pd.cut(train_df['age'].values.tolist(),
bins,
labels=False)
test_df['age_buckets'] = pd.cut(test_df['age'].values.tolist(),
bins,
labels=False)
base_columns = [
'education', 'marital_status', 'relationship', 'workclass',
'occupation', 'age_buckets'
]
train_df['education_occupation'] = train_df['education'].astype(
str) + '_' + train_df['occupation'].astype(str)
test_df['education_occupation'] = test_df['education'].astype(
str) + '_' + test_df['occupation'].astype(str)
train_df['age_buckets_education_occupation'] = train_df[
'age_buckets'].astype(str) + '_' + train_df['education'].astype(
str) + '_' + train_df['occupation'].astype(str)
test_df['age_buckets_education_occupation'] = test_df[
'age_buckets'].astype(str) + '_' + test_df['education'].astype(
str) + '_' + test_df['occupation'].astype(str)
crossed_columns = [
'education_occupation', 'age_buckets_education_occupation'
]
for col in crossed_columns:
label_train = preprocessing.LabelEncoder()
train_df[col]= label_train.fit_transform(train_df[col])
train_df[col] = label_train.fit_transform(train_df[col])
label_test = preprocessing.LabelEncoder()
test_df[col]= label_test.fit_transform(test_df[col])
test_df[col] = label_test.fit_transform(test_df[col])
wide_columns = base_columns + crossed_columns
train_df_temp = pd.get_dummies(train_df[categorical_columns],columns=categorical_columns)
test_df_temp = pd.get_dummies(test_df[categorical_columns], columns=categorical_columns)
train_df_temp = pd.get_dummies(
train_df[categorical_columns], columns=categorical_columns)
test_df_temp = pd.get_dummies(
test_df[categorical_columns], columns=categorical_columns)
train_df = train_df.join(train_df_temp)
test_df = test_df.join(test_df_temp)
deep_columns = list(train_df_temp.columns)+ ['age','education_num','capital_gain','capital_loss','hours_per_week']
train_df['label'] = train_df['income_bracket'].apply(lambda x : 1 if x == '>50K' else 0)
test_df['label'] = test_df['income_bracket'].apply(lambda x : 1 if x == '>50K' else 0)
with io.open('train_data/columns.txt','w') as f:
write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n'
deep_columns = list(train_df_temp.columns) + [
'age', 'education_num', 'capital_gain', 'capital_loss',
'hours_per_week'
]
train_df['label'] = train_df['income_bracket'].apply(
lambda x: 1 if x == '>50K' else 0)
test_df['label'] = test_df['income_bracket'].apply(
lambda x: 1 if x == '>50K' else 0)
with io.open('train_data/columns.txt', 'w') as f:
write_str = str(len(wide_columns)) + '\n' + str(len(
deep_columns)) + '\n'
f.write(write_str)
f.close()
with io.open('test_data/columns.txt','w') as f:
write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n'
with io.open('test_data/columns.txt', 'w') as f:
write_str = str(len(wide_columns)) + '\n' + str(len(
deep_columns)) + '\n'
f.write(write_str)
f.close()
train_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(train_data_path,index=False)
test_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(test_data_path,index=False)
train_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(
train_data_path, index=False)
test_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(
test_data_path, index=False)
def clean_file(train_path, test_path, train_data_path, test_data_path):
_clean_file(train_path, train_data_path)
_clean_file(test_path, test_data_path)
if __name__ == '__main__':
args = args.parse_args()
clean_file(args.train_path, args.test_path, args.train_data_path, args.test_data_path)
clean_file(args.train_path, args.test_path, args.train_data_path,
args.test_data_path)
build_model_columns(args.train_data_path, args.test_data_path)
......@@ -20,6 +20,7 @@ except ImportError:
import pickle
import paddle.fluid.incubate.data_generator as dg
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
......@@ -50,7 +51,8 @@ class TrainReader(dg.MultiSlotDataGenerator):
wide_feat, deep_deat, label = self._process_line(line)
s = ""
for i in [('wide_input', wide_feat), ('deep_input', deep_deat), ('label', label)]:
for i in [('wide_input', wide_feat), ('deep_input', deep_deat),
('label', label)]:
k = i[0]
v = i[1]
for j in v:
......@@ -60,6 +62,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import sys
......
......@@ -21,6 +21,7 @@ except ImportError:
import pickle
import paddle.fluid.incubate.data_generator as dg
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
......@@ -48,7 +49,8 @@ class TrainReader(dg.MultiSlotDataGenerator):
feat_idx, feat_value, label = self._process_line(line)
s = ""
for i in [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)]:
for i in [('feat_idx', feat_idx), ('feat_value', feat_value),
('label', label)]:
k = i[0]
v = i[1]
for j in v:
......@@ -58,6 +60,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
......@@ -31,5 +31,3 @@ mv diginetica/train.txt train_data
mkdir test_data
mv diginetica/test.txt test_data
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import time
import pickle
......@@ -10,6 +24,7 @@ parser.add_argument(
help='dataset dir: diginetica/yoochoose1_4/yoochoose1_64/sample')
opt = parser.parse_args()
def process_data(file_type):
path = os.path.join(opt.data_dir, file_type)
output_path = os.path.splitext(path)[0] + ".txt"
......@@ -23,6 +38,7 @@ def process_data(file_type):
fout.write(str(data[i][1]))
fout.write("\n")
process_data("train")
process_data("test")
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import requests
import sys
import time
......
......@@ -78,4 +78,3 @@ python -m paddlerec.run -m paddlerec.models.recall.youtube_dnn # youtube_dnn
| MOVIELENS | NCF | 0.688 | -- |
| -- | Youtube | -- | -- |
| 1 Billion Word Language Model Benchmark | Word2Vec | -- | 0.54 |
......@@ -35,6 +35,3 @@ wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.ta
tar xzvf test_dir.tar -C raw_data
mv raw_data/data/test_dir test_data/
rm -rf raw_data
......@@ -24,4 +24,4 @@ TDM是为大规模推荐系统设计的、能承载任意先进模型来高效
- 如何组网?答:paddle封装了大量的深度学习OP,用户可以根据需求设计自己的网络结构。
- 训练数据如何组织?答:tdm的训练数据主要为:`user/query emb``item`的正样本,`item`需要映射到树的某个叶子节点。用户只需准备符合该构成的数据即可。负样本的生成,会基于用户提供的树结构,以及paddle提供的`tdm-sampler op`完成高效的负采样,并自动添加相应的label,参与tdm中深度学习模型的训练。
- 大规模的数据与模型训练如何实现?答:基于paddle优秀的大规模参数服务器分布式能力,可以实现高效的分布式训练。基于paddle-fleet api,学习门槛极低,且可以灵活的支持增量训练,流式训练等业务需求。
3. 训练好模型后,可以基于paddle,将检索与打分等流程都融入paddle的组网中,生成inference_model与参数文件,基于PaddlePaddle的预测库或者PaddleLite进行快速部署与高效检索。
\ No newline at end of file
3. 训练好模型后,可以基于paddle,将检索与打分等流程都融入paddle的组网中,生成inference_model与参数文件,基于PaddlePaddle的预测库或者PaddleLite进行快速部署与高效检索。
1,2
3,4,5,6
7,8,9,10,11,12,13
14,15,16,17,18,19,20,21,22,23,24,25
\ No newline at end of file
14,15,16,17,18,19,20,21,22,23,24,25
......@@ -26,8 +26,10 @@ from paddlerec.core.utils import util
engines = {}
device = ["CPU", "GPU"]
clusters = ["SINGLE", "LOCAL_CLUSTER", "CLUSTER"]
engine_choices = ["SINGLE", "LOCAL_CLUSTER", "CLUSTER",
"TDM_SINGLE", "TDM_LOCAL_CLUSTER", "TDM_CLUSTER"]
engine_choices = [
"SINGLE", "LOCAL_CLUSTER", "CLUSTER", "TDM_SINGLE", "TDM_LOCAL_CLUSTER",
"TDM_CLUSTER"
]
custom_model = ['TDM']
model_name = ""
......@@ -66,7 +68,8 @@ def get_engine(args):
engine = engine.upper()
if engine not in engine_choices:
raise ValueError("train.engin can not be chosen in {}".format(engine_choices))
raise ValueError("train.engin can not be chosen in {}".format(
engine_choices))
print("engines: \n{}".format(engines))
......@@ -77,8 +80,10 @@ def get_engine(args):
def get_transpiler():
FNULL = open(os.devnull, 'w')
cmd = ["python", "-c",
"import paddle.fluid as fluid; fleet_ptr = fluid.core.Fleet(); [fleet_ptr.copy_table_by_feasign(10, 10, [2020, 1010])];"]
cmd = [
"python", "-c",
"import paddle.fluid as fluid; fleet_ptr = fluid.core.Fleet(); [fleet_ptr.copy_table_by_feasign(10, 10, [2020, 1010])];"
]
proc = subprocess.Popen(cmd, stdout=FNULL, stderr=FNULL, cwd=os.getcwd())
ret = proc.wait()
if ret == -11:
......@@ -152,7 +157,8 @@ def cluster_engine(args):
update_workspace(flattens)
envs.set_runtime_environs(flattens)
print(envs.pretty_print_envs(flattens, ("Submit Runtime Envs", "Value")))
print(envs.pretty_print_envs(flattens, ("Submit Runtime Envs", "Value"
)))
launch = ClusterEngine(None, args.model)
return launch
......@@ -163,7 +169,8 @@ def cluster_engine(args):
cluster_envs = {}
cluster_envs["train.trainer.trainer"] = trainer
cluster_envs["train.trainer.engine"] = "cluster"
cluster_envs["train.trainer.threads"] = envs.get_runtime_environ("CPU_NUM")
cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
"CPU_NUM")
cluster_envs["train.trainer.platform"] = envs.get_platform()
print("launch {} engine with cluster to with model: {}".format(
trainer, args.model))
......@@ -181,7 +188,8 @@ def cluster_engine(args):
def cluster_mpi_engine(args):
print("launch cluster engine with cluster to run model: {}".format(args.model))
print("launch cluster engine with cluster to run model: {}".format(
args.model))
cluster_envs = {}
cluster_envs["train.trainer.trainer"] = "CtrCodingTrainer"
......@@ -209,7 +217,8 @@ def local_cluster_engine(args):
cluster_envs["train.trainer.platform"] = envs.get_platform()
cluster_envs["CPU_NUM"] = "2"
print("launch {} engine with cluster to run model: {}".format(trainer, args.model))
print("launch {} engine with cluster to run model: {}".format(trainer,
args.model))
set_runtime_envs(cluster_envs, args.model)
launch = LocalClusterEngine(cluster_envs, args.model)
......@@ -217,10 +226,12 @@ def local_cluster_engine(args):
def local_mpi_engine(args):
print("launch cluster engine with cluster to run model: {}".format(args.model))
print("launch cluster engine with cluster to run model: {}".format(
args.model))
from paddlerec.core.engine.local_mpi import LocalMPIEngine
print("use 1X1 MPI ClusterTraining at localhost to run model: {}".format(args.model))
print("use 1X1 MPI ClusterTraining at localhost to run model: {}".format(
args.model))
mpi = util.run_which("mpirun")
if not mpi:
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
setup for paddle-rec.
"""
......@@ -22,11 +21,7 @@ from setuptools import setup, find_packages
import shutil
import tempfile
requires = [
"paddlepaddle == 1.7.2",
"pyyaml >= 5.1.1"
]
requires = ["paddlepaddle == 1.7.2", "pyyaml >= 5.1.1"]
about = {}
about["__title__"] = "paddle-rec"
......@@ -48,18 +43,27 @@ def build(dirname):
package_dir = os.path.dirname(os.path.abspath(__file__))
run_cmd("cp -r {}/* {}".format(package_dir, dirname))
run_cmd("mkdir {}".format(os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "core"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "doc"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "models"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "tests"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "tools"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "*.py"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "core"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "doc"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "models"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "tests"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "tools"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "*.py"), os.path.join(dirname, "paddlerec")))
packages = find_packages(dirname, include=('paddlerec.*'))
package_dir = {'': dirname}
package_data = {}
models_copy = ['data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy', 'tree/*.txt']
models_copy = [
'data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy',
'tree/*.txt'
]
engine_copy = ['*/*.sh']
for package in packages:
if package.startswith("paddlerec.models."):
......@@ -80,8 +84,7 @@ def build(dirname):
package_data=package_data,
python_requires=">=2.7",
install_requires=requires,
zip_safe=False
)
zip_safe=False)
dirname = tempfile.mkdtemp()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册