提交 c782ddb7 编写于 作者: Z zhangwenhui03

Merge branch 'develop' into 'develop'

# Conflicts:
#   models/rank/din/reader.py
#!/bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###################################################
# Usage: submit.sh
......
......@@ -15,10 +15,9 @@
from __future__ import print_function
from __future__ import unicode_literals
import subprocess
import sys
import os
import copy
import os
import subprocess
from paddlerec.core.engine.engine import Engine
from paddlerec.core.factory import TrainerFactory
......
......@@ -29,4 +29,3 @@ class Engine:
@abc.abstractmethod
def run(self):
pass
......@@ -14,10 +14,11 @@
from __future__ import print_function
from __future__ import unicode_literals
import subprocess
import sys
import os
import copy
import os
import sys
import subprocess
from paddlerec.core.engine.engine import Engine
from paddlerec.core.utils import envs
......
......@@ -14,10 +14,11 @@
from __future__ import print_function
from __future__ import unicode_literals
import subprocess
import sys
import os
import copy
import os
import sys
import subprocess
from paddlerec.core.engine.engine import Engine
......
......@@ -25,17 +25,8 @@ class Layer(object):
"""
pass
def generate(self, mode, param):
"""R
"""
if mode == 'fluid':
return self.generate_fluid(param)
print('unsupport this mode: ' + mode)
return None, None
@abc.abstractmethod
def generate_fluid(self, param):
def generate(self, param):
"""R
"""
pass
......@@ -53,7 +53,7 @@ class Metric(object):
pass
@abc.abstractmethod
def get_result_to_string(self):
def __str__(self):
"""
Return:
result(string) : calculate result with string format, for output
......
......@@ -13,8 +13,10 @@
# limitations under the License.
import math
import numpy as np
import paddle.fluid as fluid
from paddlerec.core.metric import Metric
......@@ -198,7 +200,7 @@ class AUCMetric(Metric):
""" """
return self._result
def get_result_to_string(self):
def __str__(self):
""" """
result = self.get_result()
result_str = "%s AUC=%.6f BUCKET_ERROR=%.6f MAE=%.6f RMSE=%.6f " \
......
......@@ -47,7 +47,7 @@ class Model(object):
def get_infer_results(self):
return self._infer_results
def get_cost_op(self):
def get_avg_cost(self):
"""R
"""
return self._cost
......
......@@ -12,10 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import yaml
import copy
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
import yaml
from paddlerec.core.model import Model
from paddlerec.core.utils import table
......
......@@ -13,10 +13,11 @@
# limitations under the License.
import paddle.fluid as fluid
from paddlerec.core.layer import Layer
class EmbeddingInputLayer(Layer):
class EmbeddingFuseLayer(Layer):
"""R
"""
......@@ -31,7 +32,7 @@ class EmbeddingInputLayer(Layer):
self._emb_dim = self._mf_dim + 3 # append show ctr lr
self._emb_layers = []
def generate_fluid(self, param):
def generate(self, param):
"""R
"""
show_clk = fluid.layers.concat(
......@@ -63,7 +64,7 @@ class LabelInputLayer(Layer):
self._data_type = config.get('data_type', "int64")
self._label_idx = config['label_idx']
def generate_fluid(self, param):
def generate(self, param):
"""R
"""
label = fluid.layers.data(name=self._name, shape=[-1, self._dim], \
......@@ -85,7 +86,7 @@ class TagInputLayer(Layer):
self._dim = config.get('dim', 1)
self._data_type = config['data_type']
def generate_fluid(self, param):
def generate(self, param):
"""R
"""
output = fluid.layers.data(name=self._name, shape=[-1, self._dim], \
......@@ -107,7 +108,7 @@ class ParamLayer(Layer):
self._data_type = config.get('data_type', 'float32')
self._config = config
def generate_fluid(self, param):
def generate(self, param):
"""R
"""
return self._config, {'inference_param': {'name': 'param', 'params': [], 'table_id': self._table_id}}
......@@ -125,7 +126,7 @@ class SummaryLayer(Layer):
self._data_type = config.get('data_type', 'float32')
self._config = config
def generate_fluid(self, param):
def generate(self, param):
"""R
"""
return self._config, {'inference_param': {'name': 'summary', 'params': [], 'table_id': self._table_id}}
......@@ -143,7 +144,7 @@ class NormalizetionLayer(Layer):
self._summary = config['summary']
self._table_id = config.get('table_id', -1)
def generate_fluid(self, param):
def generate(self, param):
"""R
"""
input_layer = param['layer'][self._input[0]]
......@@ -158,7 +159,7 @@ class NormalizetionLayer(Layer):
'params': inference_param, 'table_id': summary_layer.get('table_id', -1)}}
class NeuralLayer(Layer):
class FCLayer(Layer):
"""R
"""
......@@ -171,7 +172,7 @@ class NeuralLayer(Layer):
self._bias = config.get('bias', True)
self._act_func = config.get('act_func', None)
def generate_fluid(self, param):
def generate(self, param):
"""R
"""
param_layer = param['layer'][self._param]
......@@ -199,7 +200,7 @@ class NeuralLayer(Layer):
'table_id': param_layer.get('table_id', -1)}}
class SigmoidLossLayer(Layer):
class LogLossLayer(Layer):
"""R
"""
......@@ -230,7 +231,7 @@ class SigmoidLossLayer(Layer):
}
}
def generate_fluid(self, param):
def generate(self, param):
"""R
"""
input_layer = param['layer'][self._input[0]]
......
......@@ -12,14 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import abc
import os
import time
import sys
import yaml
from paddle import fluid
from paddlerec.core.utils import envs
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
trainer implement.
↗ (single/cluster) CtrTrainer
Trainer
↗ (for single training) SingleTrainer/TDMSingleTrainer
↘ TranspilerTrainer → (for cluster training) ClusterTrainer/TDMClusterTrainer
↘ (for online learning training) OnlineLearningTrainer
"""
......@@ -25,7 +25,6 @@ import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
from paddle.fluid.incubate.fleet.base.role_maker import PaddleCloudRoleMaker
from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker
from paddlerec.core.utils import envs
from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer
......@@ -83,7 +82,7 @@ class ClusterTrainer(TranspileTrainer):
strategy = self.build_strategy()
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(self.model.get_cost_op())
optimizer.minimize(self.model.get_avg_cost())
if fleet.is_server():
context['status'] = 'server_pass'
......@@ -115,7 +114,7 @@ class ClusterTrainer(TranspileTrainer):
program = fluid.compiler.CompiledProgram(
fleet.main_program).with_data_parallel(
loss_name=self.model.get_cost_op().name,
loss_name=self.model.get_avg_cost().name,
build_strategy=self.strategy.get_build_strategy(),
exec_strategy=self.strategy.get_execute_strategy())
......
......@@ -11,9 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker
......@@ -22,7 +23,7 @@ from paddlerec.core.utils import envs
from paddlerec.core.trainer import Trainer
class CtrPaddleTrainer(Trainer):
class CtrTrainer(Trainer):
"""R
"""
......@@ -87,7 +88,7 @@ class CtrPaddleTrainer(Trainer):
optimizer = self.model.optimizer()
optimizer = fleet.distributed_optimizer(optimizer, strategy={"use_cvm": False})
optimizer.minimize(self.model.get_cost_op())
optimizer.minimize(self.model.get_avg_cost())
if fleet.is_server():
context['status'] = 'server_pass'
......
......@@ -13,12 +13,12 @@
# limitations under the License.
import datetime
import json
import sys
import time
import json
import datetime
import numpy as np
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
......@@ -72,7 +72,7 @@ def worker_numric_max(value, env="mpi"):
return wroker_numric_opt(value, env, "max")
class CtrPaddleTrainer(Trainer):
class CtrTrainer(Trainer):
"""R
"""
......@@ -129,7 +129,7 @@ class CtrPaddleTrainer(Trainer):
model = self._exector_context[executor['name']]['model']
self._metrics.update(model.get_metrics())
runnnable_scope.append(scope)
runnnable_cost_op.append(model.get_cost_op())
runnnable_cost_op.append(model.get_avg_cost())
for var in model._data_var:
if var.name in data_var_name_dict:
continue
......@@ -146,7 +146,7 @@ class CtrPaddleTrainer(Trainer):
model = self._exector_context[executor['name']]['model']
program = model._build_param['model']['train_program']
if not executor['is_update_sparse']:
program._fleet_opt["program_configs"][str(id(model.get_cost_op().block.program))]["push_sparse"] = []
program._fleet_opt["program_configs"][str(id(model.get_avg_cost().block.program))]["push_sparse"] = []
if 'train_thread_num' not in executor:
executor['train_thread_num'] = self.global_config['train_thread_num']
with fluid.scope_guard(scope):
......
......@@ -18,9 +18,9 @@ Training use fluid with one node only.
from __future__ import print_function
import datetime
import os
import time
import datetime
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
......@@ -31,7 +31,7 @@ from paddlerec.core.utils import envs
from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer
class ClusterTrainer(TranspileTrainer):
class OnlineLearningTrainer(TranspileTrainer):
def processor_register(self):
role = PaddleCloudRoleMaker()
fleet.init(role)
......@@ -78,7 +78,7 @@ class ClusterTrainer(TranspileTrainer):
optimizer = self.model.optimizer()
strategy = self.build_strategy()
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(self.model.get_cost_op())
optimizer.minimize(self.model.get_avg_cost())
if fleet.is_server():
context['status'] = 'server_pass'
......
......@@ -17,14 +17,14 @@ Training use fluid with one node only.
"""
from __future__ import print_function
import logging
import time
import logging
import paddle.fluid as fluid
from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer
from paddlerec.core.utils import envs
import numpy as np
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("fluid")
......@@ -36,7 +36,8 @@ class SingleTrainer(TranspileTrainer):
self.regist_context_processor('uninit', self.instance)
self.regist_context_processor('init_pass', self.init)
self.regist_context_processor('startup_pass', self.startup)
if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader":
if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None,
"train.reader") != "DataLoader":
self.regist_context_processor('train_pass', self.dataset_train)
else:
self.regist_context_processor('train_pass', self.dataloader_train)
......@@ -47,7 +48,7 @@ class SingleTrainer(TranspileTrainer):
def init(self, context):
self.model.train_net()
optimizer = self.model.optimizer()
optimizer.minimize((self.model.get_cost_op()))
optimizer.minimize((self.model.get_avg_cost()))
self.fetch_vars = []
self.fetch_alias = []
......@@ -74,7 +75,7 @@ class SingleTrainer(TranspileTrainer):
program = fluid.compiler.CompiledProgram(
fluid.default_main_program()).with_data_parallel(
loss_name=self.model.get_cost_op().name)
loss_name=self.model.get_avg_cost().name)
metrics_varnames = []
metrics_format = []
......@@ -122,8 +123,8 @@ class SingleTrainer(TranspileTrainer):
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
end_time = time.time()
times = end_time-begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins/times))
times = end_time - begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins / times))
self.save(i, "train", is_fleet=False)
context['status'] = 'infer_pass'
......
......@@ -17,17 +17,16 @@ Training use fluid with one node only.
"""
from __future__ import print_function
import logging
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
from paddle.fluid.incubate.fleet.base.role_maker import PaddleCloudRoleMaker
from paddlerec.core.utils import envs
from paddlerec.core.trainers.cluster_trainer import ClusterTrainer
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
......
......@@ -18,12 +18,11 @@ Training use fluid with one node only.
from __future__ import print_function
import logging
import paddle.fluid as fluid
from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer
import numpy as np
import paddle.fluid as fluid
from paddlerec.core.trainers.single_trainer import SingleTrainer
from paddlerec.core.utils import envs
import numpy as np
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("fluid")
......
......@@ -14,7 +14,6 @@
from __future__ import print_function
import os
import sys
from paddlerec.core.utils.envs import lazy_instance_by_fliename
from paddlerec.core.utils.envs import get_global_env
......
......@@ -13,8 +13,8 @@
# limitations under the License.
import abc
import time
import datetime
import time
import paddle.fluid as fluid
......@@ -22,7 +22,7 @@ from paddlerec.core.utils import fs as fs
from paddlerec.core.utils import util as util
class Dataset(object):
class DatasetHolder(object):
"""
Dataset Base
"""
......@@ -62,7 +62,7 @@ class Dataset(object):
pass
class TimeSplitDataset(Dataset):
class TimeSplitDatasetHolder(DatasetHolder):
"""
Dataset with time split dir. root_path/$DAY/$HOUR
"""
......@@ -142,16 +142,6 @@ class TimeSplitDataset(Dataset):
data_time = data_time + datetime.timedelta(minutes=self._split_interval)
return data_file_list
class FluidTimeSplitDataset(TimeSplitDataset):
"""
A Dataset with time split for PaddleFluid
"""
def __init__(self, config):
""" """
TimeSplitDataset.__init__(self, config)
def _alloc_dataset(self, file_list):
""" """
dataset = fluid.DatasetFactory().create_dataset(self._config['dataset_type'])
......
......@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import sys
......
......@@ -12,11 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from contextlib import closing
import copy
import sys
import os
import socket
from contextlib import closing
import sys
global_envs = {}
......
......@@ -13,6 +13,7 @@
# limitations under the License.
import os
from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
......@@ -94,11 +95,12 @@ class FileHandler(object):
"""
A Smart file handler. auto judge local/afs by path
"""
def __init__(self, config):
"""R
"""
if 'fs_name' in config:
hadoop_home="$HADOOP_HOME"
hadoop_home = "$HADOOP_HOME"
hdfs_configs = {
"hadoop.job.ugi": config['fs_ugi'],
"fs.default.name": config['fs_name']
......@@ -131,7 +133,8 @@ class FileHandler(object):
if mode.find('a') >= 0:
org_content = self._hdfs_client.cat(dest_path)
content = content + org_content
self._local_fs_client.write(content, temp_local_file, mode) #fleet hdfs_client only support upload, so write tmp file
self._local_fs_client.write(content, temp_local_file,
mode) # fleet hdfs_client only support upload, so write tmp file
self._hdfs_client.delete(dest_path + ".tmp")
self._hdfs_client.upload(dest_path + ".tmp", temp_local_file)
self._hdfs_client.delete(dest_path + ".bak")
......
......@@ -12,9 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import yaml
class TableMeta(object):
"""
......
......@@ -12,11 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import os
import time
import datetime
from paddle import fluid
from paddlerec.core.utils import fs as fs
......
......@@ -153,7 +153,7 @@ class Model(object):
def infer_net(self):
pass
def get_cost_op(self):
def get_avg_cost(self):
return self._cost
```
......
......@@ -13,15 +13,9 @@
# limitations under the License.
import paddle.fluid as fluid
import math
from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase
import paddle.fluid as fluid
import paddle.fluid.layers.nn as nn
import paddle.fluid.layers.tensor as tensor
import paddle.fluid.layers.control_flow as cf
class Model(ModelBase):
def __init__(self, config):
......@@ -65,7 +59,7 @@ class Model(ModelBase):
self.cost = avg_cost
self._metrics["acc"] = acc
def get_cost_op(self):
def get_avg_cost(self):
return self.cost
def get_metrics(self):
......
......@@ -12,20 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import sys
import collections
import os
import six
import time
import numpy as np
import paddle.fluid as fluid
import paddle
import csv
import io
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
......@@ -47,6 +38,7 @@ class TrainReader(Reader):
data = [int(i) for i in data]
label = [int(i) for i in label]
seq_len = [int(i) for i in seq_len]
print >>sys.stderr, str([('data', data), ('label', label), ('seq_len', seq_len)])
print >> sys.stderr, str([('data', data), ('label', label), ('seq_len', seq_len)])
yield [('data', data), ('label', label), ('seq_len', seq_len)]
return data_iter
......@@ -12,30 +12,27 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import math
from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase
import paddle.fluid as fluid
import paddle.fluid.layers.nn as nn
import paddle.fluid.layers.tensor as tensor
import paddle.fluid.layers.control_flow as cf
from paddlerec.core.model import Model as ModelBase
from paddlerec.core.utils import envs
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
self.cost = None
self.metrics = {}
self.vocab_text_size = 11447#envs.get_global_env("vocab_text_size", None, self._namespace)
self.vocab_tag_size = 4#envs.get_global_env("vocab_tag_size", None, self._namespace)
self.emb_dim = 10#envs.get_global_env("emb_dim", None, self._namespace)
self.hid_dim = 1000#envs.get_global_env("hid_dim", None, self._namespace)
self.win_size = 5#envs.get_global_env("win_size", None, self._namespace)
self.margin = 0.1#envs.get_global_env("margin", None, self._namespace)
self.neg_size = 3#envs.get_global_env("neg_size", None, self._namespace)
print self.emb_dim
self.vocab_text_size = envs.get_global_env("vocab_text_size", None, self._namespace)
self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None, self._namespace)
self.emb_dim = envs.get_global_env("emb_dim", None, self._namespace)
self.hid_dim = envs.get_global_env("hid_dim", None, self._namespace)
self.win_size = envs.get_global_env("win_size", None, self._namespace)
self.margin = envs.get_global_env("margin", None, self._namespace)
self.neg_size = envs.get_global_env("neg_size", None, self._namespace)
def train_net(self):
""" network definition """
......@@ -92,18 +89,16 @@ class Model(ModelBase):
self.metrics["correct"] = correct
self.metrics["cos_pos"] = cos_pos
def get_cost_op(self):
def get_avg_cost(self):
return self.cost
def get_metrics(self):
return self.metrics
def optimizer(self):
learning_rate = 0.01#envs.get_global_env("hyper_parameters.base_lr", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.base_lr", None, self._namespace)
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate)
#sgd_optimizer.minimize(avg_cost)
return sgd_optimizer
def infer_net(self, parameter_list):
self.train_net()
......@@ -12,20 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import sys
import collections
import os
import six
import time
import numpy as np
import paddle.fluid as fluid
import paddle
import csv
import io
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
......@@ -54,9 +47,6 @@ class TrainReader(Reader):
neg_index = rand_i
neg_tag.append(neg_index)
sum_n += 1
# if n > 0 and len(text) > n:
# #yield None
# return None, None, None
return text, pos_tag, neg_tag
def generate_sample(self, line):
......@@ -66,4 +56,5 @@ class TrainReader(Reader):
yield None
return
yield [('text', text), ('pos_tag', pos_tag), ('neg_tag', neg_tag)]
return data_iter
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid as fluid
from paddlerec.core.utils import envs
......@@ -29,7 +28,8 @@ class Model(ModelBase):
self.query = fluid.data(name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_pos = fluid.data(name="doc_pos", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_negs = [fluid.data(name="doc_neg_" + str(i), shape=[-1, TRIGRAM_D], dtype="float32", lod_level=0) for i in range(Neg)]
self.doc_negs = [fluid.data(name="doc_neg_" + str(i), shape=[-1, TRIGRAM_D], dtype="float32", lod_level=0) for i
in range(Neg)]
self._data_var.append(self.query)
self._data_var.append(self.doc_pos)
for input in self.doc_negs:
......@@ -39,7 +39,6 @@ class Model(ModelBase):
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False)
def net(self, is_infer=False):
hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace)
hidden_acts = envs.get_global_env("hyper_parameters.fc_acts", None, self._namespace)
......@@ -47,7 +46,7 @@ class Model(ModelBase):
def fc(data, hidden_layers, hidden_acts, names):
fc_inputs = [data]
for i in range(len(hidden_layers)):
xavier=fluid.initializer.Xavier(uniform=True, fan_in=fc_inputs[-1].shape[1], fan_out=hidden_layers[i])
xavier = fluid.initializer.Xavier(uniform=True, fan_in=fc_inputs[-1].shape[1], fan_out=hidden_layers[i])
out = fluid.layers.fc(input=fc_inputs[-1],
size=hidden_layers[i],
act=hidden_acts[i],
......@@ -66,12 +65,13 @@ class Model(ModelBase):
R_Q_D_ns = []
for i, doc_neg in enumerate(self.doc_negs):
doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts, ['doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i), 'doc_neg_l3_' + str(i)])
doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts,
['doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i), 'doc_neg_l3_' + str(i)])
R_Q_D_ns.append(fluid.layers.cos_sim(query_fc, doc_neg_fc_i))
concat_Rs = fluid.layers.concat(input=[self.R_Q_D_p] + R_Q_D_ns, axis=-1)
prob = fluid.layers.softmax(concat_Rs, axis=1)
hit_prob = fluid.layers.slice(prob, axes=[0,1], starts=[0,0], ends=[4, 1])
hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[4, 1])
loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob))
self.avg_cost = fluid.layers.mean(x=loss)
......
......@@ -14,7 +14,6 @@
from __future__ import print_function
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class EvaluateReader(Reader):
......
......@@ -11,10 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class TrainReader(Reader):
......@@ -37,7 +37,7 @@ class TrainReader(Reader):
neg_docs = []
for i in range(len(features) - 2):
feature_names.append('doc_neg_' + str(i))
neg_docs.append(map(float, features[i+2].split(',')))
neg_docs.append(map(float, features[i + 2].split(',')))
yield zip(feature_names, [query] + [pos_doc] + neg_docs)
......
#! /bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
echo "begin to prepare data"
......
......@@ -11,10 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import io
import copy
import random
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
......@@ -54,4 +51,5 @@ class EvaluateReader(Reader):
else:
output[index][1].append(padding)
yield output
return data_iter
......@@ -14,10 +14,12 @@
import random
class Dataset:
def __init__(self):
pass
class SyntheticDataset(Dataset):
def __init__(self, sparse_feature_dim, query_slot_num, title_slot_num, dataset_size=10000):
# ids are randomly generated
......@@ -50,7 +52,8 @@ class SyntheticDataset(Dataset):
for i in range(self.title_slot_num):
nt_slot = generate_ids(self.ids_per_slot,
self.sparse_feature_dim)
nt_slot = [str(fea) + ':' + str(i + self.query_slot_num + self.title_slot_num) for fea in nt_slot]
nt_slot = [str(fea) + ':' + str(i + self.query_slot_num + self.title_slot_num) for fea in
nt_slot]
neg_title_slots += nt_slot
yield query_slots + pos_title_slots + neg_title_slots
else:
......@@ -67,6 +70,7 @@ class SyntheticDataset(Dataset):
def test(self):
return self._reader_creator(False)
if __name__ == '__main__':
sparse_feature_dim = 1000001
query_slots = 1
......
......@@ -12,16 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import math
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle.fluid.layers.tensor as tensor
import paddle.fluid.layers.control_flow as cf
from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase
class BowEncoder(object):
""" bow-encoder """
......@@ -97,6 +95,7 @@ class SimpleEncoderFactory(object):
rnn_encode = GrnnEncoder(hidden_size=enc_hid_size)
return rnn_encode
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
......@@ -143,7 +142,8 @@ class Model(ModelBase):
self.nt_slots = [
fluid.data(
name="%d" % (i + len(self.query_encoders) + len(self.title_encoders)), shape=[None, 1], lod_level=1, dtype='int64')
name="%d" % (i + len(self.query_encoders) + len(self.title_encoders)), shape=[None, 1], lod_level=1,
dtype='int64')
for i in range(len(self.title_encoders))
]
......
......@@ -11,10 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import io
import copy
import random
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
......@@ -57,4 +54,5 @@ class TrainReader(Reader):
else:
output[index][1].append(padding)
yield output
return data_iter
......@@ -13,19 +13,19 @@
# limitations under the License.
from __future__ import print_function
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
from collections import defaultdict
import numpy as np
from paddlerec.core.reader import Reader
class EvaluateReader(Reader):
def init(self):
all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128', '129',
all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128',
'129',
'205', '206', '207', '210', '216', '508', '509', '702', '853', '301']
self.all_field_id_dict = defaultdict(int)
for i,field_id in enumerate(all_field_id):
self.all_field_id_dict[field_id] = [False,i]
for i, field_id in enumerate(all_field_id):
self.all_field_id_dict[field_id] = [False, i]
def generate_sample(self, line):
"""
......@@ -41,10 +41,10 @@ class EvaluateReader(Reader):
cvr = int(features[2])
padding = 0
output = [(field_id,[]) for field_id in self.all_field_id_dict]
output = [(field_id, []) for field_id in self.all_field_id_dict]
for elem in features[4:]:
field_id,feat_id = elem.strip().split(':')
field_id, feat_id = elem.strip().split(':')
if field_id not in self.all_field_id_dict:
continue
self.all_field_id_dict[field_id][0] = True
......@@ -52,7 +52,7 @@ class EvaluateReader(Reader):
output[index][1].append(int(feat_id))
for field_id in self.all_field_id_dict:
visited,index = self.all_field_id_dict[field_id]
visited, index = self.all_field_id_dict[field_id]
if visited:
self.all_field_id_dict[field_id][0] = False
else:
......@@ -60,4 +60,5 @@ class EvaluateReader(Reader):
output.append(('ctr', [ctr]))
output.append(('cvr', [cvr]))
yield output
return reader
......@@ -11,21 +11,22 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
from collections import defaultdict
import numpy as np
from paddlerec.core.reader import Reader
class TrainReader(Reader):
def init(self):
all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128', '129',
all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128',
'129',
'205', '206', '207', '210', '216', '508', '509', '702', '853', '301']
self.all_field_id_dict = defaultdict(int)
for i,field_id in enumerate(all_field_id):
self.all_field_id_dict[field_id] = [False,i]
for i, field_id in enumerate(all_field_id):
self.all_field_id_dict[field_id] = [False, i]
def generate_sample(self, line):
"""
......@@ -37,25 +38,25 @@ class TrainReader(Reader):
This function needs to be implemented by the user, based on data format
"""
features = line.strip().split(',')
#ctr = list(map(int, features[1]))
#cvr = list(map(int, features[2]))
# ctr = list(map(int, features[1]))
# cvr = list(map(int, features[2]))
ctr = int(features[1])
cvr = int(features[2])
padding = 0
output = [(field_id,[]) for field_id in self.all_field_id_dict]
output = [(field_id, []) for field_id in self.all_field_id_dict]
for elem in features[4:]:
field_id,feat_id = elem.strip().split(':')
field_id, feat_id = elem.strip().split(':')
if field_id not in self.all_field_id_dict:
continue
self.all_field_id_dict[field_id][0] = True
index = self.all_field_id_dict[field_id][1]
#feat_id = list(map(int, feat_id))
# feat_id = list(map(int, feat_id))
output[index][1].append(int(feat_id))
for field_id in self.all_field_id_dict:
visited,index = self.all_field_id_dict[field_id]
visited, index = self.all_field_id_dict[field_id]
if visited:
self.all_field_id_dict[field_id][0] = False
else:
......@@ -63,4 +64,5 @@ class TrainReader(Reader):
output.append(('ctr', [ctr]))
output.append(('cvr', [cvr]))
yield output
return reader
......@@ -12,25 +12,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import paddle.fluid as fluid
from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase
import numpy as np
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
def fc(self,tag, data, out_dim, active='prelu'):
def fc(self, tag, data, out_dim, active='prelu'):
init_stddev = 1.0
scales = 1.0 / np.sqrt(data.shape[1])
p_attr = fluid.param_attr.ParamAttr(name='%s_weight' % tag,
initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=init_stddev * scales))
initializer=fluid.initializer.NormalInitializer(loc=0.0,
scale=init_stddev * scales))
b_attr = fluid.ParamAttr(name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1))
......@@ -38,13 +38,13 @@ class Model(ModelBase):
size=out_dim,
act=active,
param_attr=p_attr,
bias_attr =b_attr,
bias_attr=b_attr,
name=tag)
return out
def input_data(self):
sparse_input_ids = [
fluid.data(name="field_" + str(i), shape=[-1, 1], dtype="int64", lod_level=1) for i in range(0,23)
fluid.data(name="field_" + str(i), shape=[-1, 1], dtype="int64", lod_level=1) for i in range(0, 23)
]
label_ctr = fluid.data(name="ctr", shape=[-1, 1], dtype="int64")
label_cvr = fluid.data(name="cvr", shape=[-1, 1], dtype="int64")
......@@ -63,10 +63,11 @@ class Model(ModelBase):
size=[vocab_size, embed_size],
param_attr=fluid.ParamAttr(name='dis_emb',
learning_rate=5,
initializer=fluid.initializer.Xavier(fan_in=embed_size,fan_out=embed_size)
initializer=fluid.initializer.Xavier(
fan_in=embed_size, fan_out=embed_size)
),
is_sparse=True)
field_emb = fluid.layers.sequence_pool(input=feat_emb,pool_type='sum')
field_emb = fluid.layers.sequence_pool(input=feat_emb, pool_type='sum')
emb.append(field_emb)
concat_emb = fluid.layers.concat(emb, axis=1)
......@@ -79,7 +80,7 @@ class Model(ModelBase):
# cvr
cvr_fc1 = self.fc('cvr_fc1', concat_emb, 200, active)
cvr_fc2 = self.fc('cvr_fc2', cvr_fc1, 80, active)
cvr_out = self.fc('cvr_out', cvr_fc2, 2,'softmax')
cvr_out = self.fc('cvr_out', cvr_fc2, 2, 'softmax')
ctr_clk = inputs[-2]
ctcvr_buy = inputs[-1]
......@@ -88,7 +89,7 @@ class Model(ModelBase):
cvr_prop_one = fluid.layers.slice(cvr_out, axes=[1], starts=[1], ends=[2])
ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one, cvr_prop_one)
ctcvr_prop = fluid.layers.concat(input=[1-ctcvr_prop_one,ctcvr_prop_one], axis = 1)
ctcvr_prop = fluid.layers.concat(input=[1 - ctcvr_prop_one, ctcvr_prop_one], axis=1)
auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(input=ctr_out, label=ctr_clk)
auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(input=ctcvr_prop, label=ctcvr_buy)
......@@ -98,25 +99,21 @@ class Model(ModelBase):
self._infer_results["AUC_ctcvr"] = auc_ctcvr
return
loss_ctr = fluid.layers.cross_entropy(input=ctr_out, label=ctr_clk)
loss_ctcvr = fluid.layers.cross_entropy(input=ctcvr_prop, label=ctcvr_buy)
cost = loss_ctr + loss_ctcvr
avg_cost = fluid.layers.mean(cost)
self._cost = avg_cost
self._metrics["AUC_ctr"] = auc_ctr
self._metrics["BATCH_AUC_ctr"] = batch_auc_ctr
self._metrics["AUC_ctcvr"] = auc_ctcvr
self._metrics["BATCH_AUC_ctcvr"] = batch_auc_ctcvr
def train_net(self):
input_data = self.input_data()
self.net(input_data)
def infer_net(self):
self._infer_data_var = self.input_data()
self._infer_data_loader = fluid.io.DataLoader.from_generator(
......
......@@ -11,11 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
import numpy as np
class EvaluateReader(Reader):
......
......@@ -11,11 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
import numpy as np
class TrainReader(Reader):
......@@ -44,8 +43,8 @@ class TrainReader(Reader):
label_marital = [1, 0]
elif int(l[0]) == 1:
label_marital = [0, 1]
#label_income = np.array(label_income)
#label_marital = np.array(label_marital)
# label_income = np.array(label_income)
# label_marital = np.array(label_marital)
feature_name = ["input", "label_income", "label_marital"]
yield zip(feature_name, [data] + [label_income] + [label_marital])
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid as fluid
from paddlerec.core.utils import envs
......@@ -50,8 +49,7 @@ class Model(ModelBase):
name='expert_' + str(i))
expert_outputs.append(expert_output)
expert_concat = fluid.layers.concat(expert_outputs, axis=1)
expert_concat = fluid.layers.reshape(expert_concat,[-1, expert_num, expert_size])
expert_concat = fluid.layers.reshape(expert_concat, [-1, expert_num, expert_size])
# g^{k}(x) = activation(W_{gk} * x + b), where activation is softmax according to the paper
output_layers = []
......@@ -79,19 +77,22 @@ class Model(ModelBase):
pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15)
pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15)
label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2])
label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2])
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income, label=fluid.layers.cast(x=label_income_1, dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital, label=fluid.layers.cast(x=label_marital_1, dtype='int64'))
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income,
label=fluid.layers.cast(x=label_income_1,
dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital,
label=fluid.layers.cast(x=label_marital_1,
dtype='int64'))
if is_infer:
self._infer_results["AUC_income"] = auc_income
self._infer_results["AUC_marital"] = auc_marital
return
cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income,soft_label = True)
cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital,soft_label = True)
cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income, soft_label=True)
cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital, soft_label=True)
avg_cost_income = fluid.layers.mean(x=cost_income)
avg_cost_marital = fluid.layers.mean(x=cost_marital)
......@@ -104,10 +105,8 @@ class Model(ModelBase):
self._metrics["AUC_marital"] = auc_marital
self._metrics["BATCH_AUC_marital"] = batch_auc_2
def train_net(self):
self.MMOE()
def infer_net(self):
self.MMOE(is_infer=True)
......@@ -11,11 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
import numpy as np
class EvaluateReader(Reader):
......
......@@ -11,11 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
import numpy as np
class TrainReader(Reader):
......@@ -44,8 +43,8 @@ class TrainReader(Reader):
label_marital = [1, 0]
elif int(l[0]) == 1:
label_marital = [0, 1]
#label_income = np.array(label_income)
#label_marital = np.array(label_marital)
# label_income = np.array(label_income)
# label_marital = np.array(label_marital)
feature_name = ["input", "label_income", "label_marital"]
yield zip(feature_name, [data] + [label_income] + [label_marital])
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid as fluid
from paddlerec.core.utils import envs
......@@ -47,7 +46,6 @@ class Model(ModelBase):
bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='bottom_output')
# Build tower layer from bottom layer
output_layers = []
for index in range(tower_nums):
......@@ -61,23 +59,26 @@ class Model(ModelBase):
name='output_layer_' + str(index))
output_layers.append(output_layer)
pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15)
pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15)
label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2])
label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2])
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income, label=fluid.layers.cast(x=label_income_1, dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital, label=fluid.layers.cast(x=label_marital_1, dtype='int64'))
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income,
label=fluid.layers.cast(x=label_income_1,
dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital,
label=fluid.layers.cast(x=label_marital_1,
dtype='int64'))
if is_infer:
self._infer_results["AUC_income"] = auc_income
self._infer_results["AUC_marital"] = auc_marital
return
cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income,soft_label = True)
cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital,soft_label = True)
cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income, soft_label=True)
cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital, soft_label=True)
cost = fluid.layers.elementwise_add(cost_income, cost_marital, axis=1)
avg_cost = fluid.layers.mean(x=cost)
......@@ -88,10 +89,8 @@ class Model(ModelBase):
self._metrics["AUC_marital"] = auc_marital
self._metrics["BATCH_AUC_marital"] = batch_auc_2
def train_net(self):
self.model()
def infer_net(self):
self.model(is_infer=True)
......@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddlerec.core.reader import Reader
......
......@@ -11,18 +11,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import math
import sys
import os
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
try:
import cPickle as pickle
except ImportError:
import pickle
from collections import Counter
import os
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
......@@ -83,6 +84,7 @@ class TrainReader(Reader):
"""
Read the data line by line and process it as a dictionary
"""
def data_iter():
label_feat_list = self._process_line(line)
yield list(zip(self.label_feat_names, label_feat_list))
......
......@@ -12,12 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict
import paddle.fluid as fluid
import math
from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase
from collections import OrderedDict
class Model(ModelBase):
def __init__(self, config):
......@@ -141,7 +142,6 @@ class Model(ModelBase):
self._metrics["AUC"] = auc_var
self._metrics["BATCH_AUC"] = batch_auc_var
# logloss
logloss = fluid.layers.log_loss(self.prob, self.target_input)
self.avg_logloss = fluid.layers.reduce_mean(logloss)
......
......@@ -11,15 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
try:
import cPickle as pickle
except ImportError:
import pickle
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
......@@ -64,6 +67,7 @@ class TrainReader(Reader):
"""
Read the data line by line and process it as a dictionary
"""
def data_iter():
feat_idx, feat_value, label = self._process_line(line)
yield [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)]
......
......@@ -12,9 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import math
import paddle.fluid as fluid
from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase
......@@ -32,10 +33,11 @@ class Model(ModelBase):
# ------------------------- network input --------------------------
num_field = envs.get_global_env("hyper_parameters.num_field", None, self._namespace)
raw_feat_idx = fluid.data(name='feat_idx', shape=[None, num_field], dtype='int64') # None * num_field(defalut:39)
raw_feat_idx = fluid.data(name='feat_idx', shape=[None, num_field],
dtype='int64') # None * num_field(defalut:39)
raw_feat_value = fluid.data(name='feat_value', shape=[None, num_field], dtype='float32') # None * num_field
self.label = fluid.data(name='label', shape=[None, 1], dtype='float32') # None * 1
feat_idx = fluid.layers.reshape(raw_feat_idx,[-1, 1]) # (None * num_field) * 1
feat_idx = fluid.layers.reshape(raw_feat_idx, [-1, 1]) # (None * num_field) * 1
feat_value = fluid.layers.reshape(raw_feat_value, [-1, num_field, 1]) # None * num_field * 1
# ------------------------- set _data_var --------------------------
......@@ -47,7 +49,7 @@ class Model(ModelBase):
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False)
#------------------------- first order term --------------------------
# ------------------------- first order term --------------------------
reg = envs.get_global_env("hyper_parameters.reg", 1e-4, self._namespace)
first_weights_re = fluid.embedding(
......@@ -65,7 +67,7 @@ class Model(ModelBase):
first_weights_re, shape=[-1, num_field, 1]) # None * num_field * 1
y_first_order = fluid.layers.reduce_sum((first_weights * feat_value), 1)
#------------------------- second order term --------------------------
# ------------------------- second order term --------------------------
feat_embeddings_re = fluid.embedding(
input=feat_idx,
......@@ -99,8 +101,7 @@ class Model(ModelBase):
summed_features_emb_square - squared_sum_features_emb, 1,
keep_dim=True) # None * 1
#------------------------- DNN --------------------------
# ------------------------- DNN --------------------------
layer_sizes = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace)
act = envs.get_global_env("hyper_parameters.act", None, self._namespace)
......@@ -128,21 +129,21 @@ class Model(ModelBase):
initializer=fluid.initializer.TruncatedNormalInitializer(
loc=0.0, scale=init_value_)))
#------------------------- DeepFM --------------------------
# ------------------------- DeepFM --------------------------
self.predict = fluid.layers.sigmoid(y_first_order + y_second_order + y_dnn)
def train_net(self):
self.deepfm_net()
#------------------------- Cost(logloss) --------------------------
# ------------------------- Cost(logloss) --------------------------
cost = fluid.layers.log_loss(input=self.predict, label=self.label)
avg_cost = fluid.layers.reduce_sum(cost)
self._cost = avg_cost
#------------------------- Metric(Auc) --------------------------
# ------------------------- Metric(Auc) --------------------------
predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1)
label_int = fluid.layers.cast(self.label, 'int64')
......
......@@ -13,7 +13,6 @@
# limitations under the License.
import paddle.fluid as fluid
import math
from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase
......
......@@ -13,16 +13,20 @@
# limitations under the License.
from __future__ import print_function
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
import numpy as np
import os
import random
try:
import cPickle as pickle
except ImportError:
import pickle
import numpy as np
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
self.train_data_path = envs.get_global_env("train_data_path", None, "train.reader")
......@@ -43,9 +47,6 @@ class TrainReader(Reader):
self.batch_size = envs.get_global_env("batch_size", 32, "train.reader")
self.group_size = self.batch_size * 20
def _process_line(self, line):
line = line.strip().split(';')
hist = line[0].split()
......@@ -54,13 +55,13 @@ class TrainReader(Reader):
cate = [int(i) for i in cate]
return [hist, cate, [int(line[2])], [int(line[3])], [float(line[4])]]
def generate_sample(self, line):
"""
Read the data line by line and process it as a dictionary
"""
def data_iter():
#feat_idx, feat_value, label = self._process_line(line)
# feat_idx, feat_value, label = self._process_line(line)
yield self._process_line(line)
return data_iter
......@@ -127,5 +128,3 @@ class TrainReader(Reader):
data_set = self.base_read(files)
random.shuffle(data_set)
return self.batch_reader(data_set, self.batch_size, self.batch_size * 20)
......@@ -13,6 +13,7 @@
# limitations under the License.
import math
import paddle.fluid as fluid
from paddlerec.core.utils import envs
......
......@@ -12,9 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import math
import paddle.fluid as fluid
from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase
......@@ -26,8 +27,12 @@ class Model(ModelBase):
def wide_part(self, data):
out = fluid.layers.fc(input=data,
size=1,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0 / math.sqrt(data.shape[1])),
regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)),
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0,
scale=1.0 / math.sqrt(
data.shape[
1])),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)),
act=None,
name='wide')
return out
......@@ -35,7 +40,10 @@ class Model(ModelBase):
def fc(self, data, hidden_units, active, tag):
output = fluid.layers.fc(input=data,
size=hidden_units,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0 / math.sqrt(data.shape[1]))),
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0,
scale=1.0 / math.sqrt(
data.shape[
1]))),
act=active,
name=tag)
......@@ -64,13 +72,15 @@ class Model(ModelBase):
wide_model = fluid.layers.fc(input=wide_output,
size=1,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)),
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)),
act=None,
name='w_wide')
deep_model = fluid.layers.fc(input=deep_output,
size=1,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)),
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)),
act=None,
name='w_deep')
......
......@@ -11,15 +11,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
try:
import cPickle as pickle
except ImportError:
import pickle
from paddlerec.core.reader import Reader
class TrainReader(Reader):
def init(self):
pass
......@@ -28,7 +30,7 @@ class TrainReader(Reader):
line = line.strip().split(',')
features = list(map(float, line))
wide_feat = features[0:8]
deep_feat = features[8:58+8]
deep_feat = features[8:58 + 8]
label = features[-1]
return wide_feat, deep_feat, [label]
......@@ -36,6 +38,7 @@ class TrainReader(Reader):
"""
Read the data line by line and process it as a dictionary
"""
def data_iter():
wide_feat, deep_deat, label = self._process_line(line)
yield [('wide_input', wide_feat), ('deep_input', deep_deat), ('label', label)]
......
......@@ -11,15 +11,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
try:
import cPickle as pickle
except ImportError:
import pickle
from paddlerec.core.reader import Reader
class TrainReader(Reader):
def init(self):
pass
......
......@@ -13,7 +13,6 @@
# limitations under the License.
import paddle.fluid as fluid
import math
from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase
......
#! /bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
echo "begin to download data"
......
......@@ -11,10 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import io
import copy
import random
import numpy as np
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
......@@ -120,8 +122,9 @@ class EvaluateReader(Reader):
else:
# Due to fixed batch_size, discard the remaining ins
return
#cur_batch = remain_data[i:]
#yield self.make_data(cur_batch, group_remain % batch_size)
# cur_batch = remain_data[i:]
# yield self.make_data(cur_batch, group_remain % batch_size)
return _reader
def generate_batch_from_trainfiles(self, files):
......@@ -132,4 +135,5 @@ class EvaluateReader(Reader):
def generate_sample(self, line):
def data_iter():
yield []
return data_iter
......@@ -12,8 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import math
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers
......@@ -28,13 +29,13 @@ class Model(ModelBase):
def init_config(self):
self._fetch_interval = 1
self.items_num, self.ins_num = self.config_read(envs.get_global_env("hyper_parameters.config_path", None, self._namespace))
self.items_num, self.ins_num = self.config_read(
envs.get_global_env("hyper_parameters.config_path", None, self._namespace))
self.train_batch_size = envs.get_global_env("batch_size", None, "train.reader")
self.evaluate_batch_size = envs.get_global_env("batch_size", None, "evaluate.reader")
self.hidden_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
self.step = envs.get_global_env("hyper_parameters.gnn_propogation_steps", None, self._namespace)
def config_read(self, config_path=None):
if config_path is None:
raise ValueError("please set train.model.hyper_parameters.config_path at first")
......@@ -47,31 +48,31 @@ class Model(ModelBase):
self.items = fluid.data(
name="items",
shape=[bs, -1],
dtype="int64") #[batch_size, uniq_max]
dtype="int64") # [batch_size, uniq_max]
self.seq_index = fluid.data(
name="seq_index",
shape=[bs, -1, 2],
dtype="int32") #[batch_size, seq_max, 2]
dtype="int32") # [batch_size, seq_max, 2]
self.last_index = fluid.data(
name="last_index",
shape=[bs, 2],
dtype="int32") #[batch_size, 2]
dtype="int32") # [batch_size, 2]
self.adj_in = fluid.data(
name="adj_in",
shape=[bs, -1, -1],
dtype="float32") #[batch_size, seq_max, seq_max]
dtype="float32") # [batch_size, seq_max, seq_max]
self.adj_out = fluid.data(
name="adj_out",
shape=[bs, -1, -1],
dtype="float32") #[batch_size, seq_max, seq_max]
dtype="float32") # [batch_size, seq_max, seq_max]
self.mask = fluid.data(
name="mask",
shape=[bs, -1, 1],
dtype="float32") #[batch_size, seq_max, 1]
dtype="float32") # [batch_size, seq_max, 1]
self.label = fluid.data(
name="label",
shape=[bs, 1],
dtype="int64") #[batch_size, 1]
dtype="int64") # [batch_size, 1]
res = [self.items, self.seq_index, self.last_index, self.adj_in, self.adj_out, self.mask, self.label]
return res
......@@ -113,7 +114,7 @@ class Model(ModelBase):
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) #[batch_size, uniq_max, h]
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
state_out = layers.fc(
input=pre_state,
name="state_out",
......@@ -123,10 +124,10 @@ class Model(ModelBase):
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) #[batch_size, uniq_max, h]
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
state_adj_in = layers.matmul(self.adj_in, state_in) #[batch_size, uniq_max, h]
state_adj_out = layers.matmul(self.adj_out, state_out) #[batch_size, uniq_max, h]
state_adj_in = layers.matmul(self.adj_in, state_in) # [batch_size, uniq_max, h]
state_adj_out = layers.matmul(self.adj_out, state_out) # [batch_size, uniq_max, h]
gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)
......@@ -154,7 +155,7 @@ class Model(ModelBase):
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) #[batch_size, seq_max, h]
low=-stdv, high=stdv))) # [batch_size, seq_max, h]
last_fc = layers.fc(
input=last,
name="last_fc",
......@@ -164,21 +165,21 @@ class Model(ModelBase):
num_flatten_dims=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) #[bathc_size, h]
low=-stdv, high=stdv))) # [bathc_size, h]
seq_fc_t = layers.transpose(
seq_fc, perm=[1, 0, 2]) #[seq_max, batch_size, h]
seq_fc, perm=[1, 0, 2]) # [seq_max, batch_size, h]
add = layers.elementwise_add(
seq_fc_t, last_fc) #[seq_max, batch_size, h]
seq_fc_t, last_fc) # [seq_max, batch_size, h]
b = layers.create_parameter(
shape=[hidden_size],
dtype='float32',
default_initializer=fluid.initializer.Constant(value=0.0)) #[h]
add = layers.elementwise_add(add, b) #[seq_max, batch_size, h]
default_initializer=fluid.initializer.Constant(value=0.0)) # [h]
add = layers.elementwise_add(add, b) # [seq_max, batch_size, h]
add_sigmoid = layers.sigmoid(add) #[seq_max, batch_size, h]
add_sigmoid = layers.sigmoid(add) # [seq_max, batch_size, h]
add_sigmoid = layers.transpose(
add_sigmoid, perm=[1, 0, 2]) #[batch_size, seq_max, h]
add_sigmoid, perm=[1, 0, 2]) # [batch_size, seq_max, h]
weight = layers.fc(
input=add_sigmoid,
......@@ -189,13 +190,13 @@ class Model(ModelBase):
bias_attr=False,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) #[batch_size, seq_max, 1]
low=-stdv, high=stdv))) # [batch_size, seq_max, 1]
weight *= self.mask
weight_mask = layers.elementwise_mul(seq, weight, axis=0) #[batch_size, seq_max, h]
global_attention = layers.reduce_sum(weight_mask, dim=1) #[batch_size, h]
weight_mask = layers.elementwise_mul(seq, weight, axis=0) # [batch_size, seq_max, h]
global_attention = layers.reduce_sum(weight_mask, dim=1) # [batch_size, h]
final_attention = layers.concat(
[global_attention, last], axis=1) #[batch_size, 2*h]
[global_attention, last], axis=1) # [batch_size, 2*h]
final_attention_fc = layers.fc(
input=final_attention,
name="final_attention_fc",
......@@ -203,7 +204,7 @@ class Model(ModelBase):
bias_attr=False,
act=None,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) #[batch_size, h]
low=-stdv, high=stdv))) # [batch_size, h]
# all_vocab = layers.create_global_var(
# shape=[items_num - 1],
......@@ -220,13 +221,13 @@ class Model(ModelBase):
name="emb",
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
size=[items_num, hidden_size]) #[all_vocab, h]
size=[items_num, hidden_size]) # [all_vocab, h]
logits = layers.matmul(
x=final_attention_fc, y=all_emb,
transpose_y=True) #[batch_size, all_vocab]
transpose_y=True) # [batch_size, all_vocab]
softmax = layers.softmax_with_cross_entropy(
logits=logits, label=self.label) #[batch_size, 1]
logits=logits, label=self.label) # [batch_size, 1]
self.loss = layers.reduce_mean(softmax) # [1]
self.acc = layers.accuracy(input=logits, label=self.label, k=20)
......
......@@ -11,10 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import io
import copy
import random
import numpy as np
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
......@@ -120,8 +122,9 @@ class TrainReader(Reader):
else:
# Due to fixed batch_size, discard the remaining ins
return
#cur_batch = remain_data[i:]
#yield self.make_data(cur_batch, group_remain % batch_size)
# cur_batch = remain_data[i:]
# yield self.make_data(cur_batch, group_remain % batch_size)
return _reader
def generate_batch_from_trainfiles(self, files):
......@@ -132,4 +135,5 @@ class TrainReader(Reader):
def generate_sample(self, line):
def data_iter():
yield []
return data_iter
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid as fluid
from paddlerec.core.utils import envs
......@@ -87,10 +86,8 @@ class Model(ModelBase):
self._metrics["cost"] = avg_cost
self._metrics["acc"] = acc
def train_net(self):
self.all_vocab_network()
def infer_net(self):
self.all_vocab_network(is_infer=True)
......@@ -11,10 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class EvaluateReader(Reader):
......
......@@ -11,10 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class TrainReader(Reader):
......
......@@ -12,15 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid as fluid
from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase
import paddle.fluid.layers.tensor as tensor
import paddle.fluid.layers.io as io
import paddle.fluid.layers.control_flow as cf
from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase
class BowEncoder(object):
......@@ -54,6 +51,7 @@ class GrnnEncoder(object):
bias_attr=self.param_name + ".bias")
return fluid.layers.sequence_pool(input=gru_h, pool_type='max')
class PairwiseHingeLoss(object):
def __init__(self, margin=0.8):
self.margin = margin
......@@ -70,6 +68,7 @@ class PairwiseHingeLoss(object):
loss_part2)
return loss_part3
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
......@@ -80,7 +79,6 @@ class Model(ModelBase):
return correct
def train(self):
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace)
emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None, self._namespace)
hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace)
......@@ -129,11 +127,9 @@ class Model(ModelBase):
self._metrics["correct"] = correct
self._metrics["hinge_loss"] = hinge_loss
def train_net(self):
self.train()
def infer(self):
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace)
emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None, self._namespace)
......@@ -173,6 +169,5 @@ class Model(ModelBase):
self._infer_results['recall20'] = acc
def infer_net(self):
self.infer()
......@@ -11,19 +11,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
import random
import numpy as np
class EvaluateReader(Reader):
def init(self):
self.vocab_size = envs.get_global_env("vocab_size", 10, "train.model.hyper_parameters")
def generate_sample(self, line):
"""
Read the data line by line and process it as a dictionary
......@@ -39,6 +39,6 @@ class EvaluateReader(Reader):
src = conv_ids[:boundary]
pos_tgt = [conv_ids[boundary]]
feature_name = ["user", "all_item", "p_item"]
yield zip(feature_name, [src] + [np.arange(self.vocab_size).astype("int64").tolist()]+ [pos_tgt])
yield zip(feature_name, [src] + [np.arange(self.vocab_size).astype("int64").tolist()] + [pos_tgt])
return reader
......@@ -11,12 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
import random
from paddlerec.core.reader import Reader
class TrainReader(Reader):
def init(self):
......@@ -25,7 +26,6 @@ class TrainReader(Reader):
def sample_neg_from_seq(self, seq):
return seq[random.randint(0, len(seq) - 1)]
def generate_sample(self, line):
"""
Read the data line by line and process it as a dictionary
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import paddle.fluid as fluid
......
#! /bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# download train_data
mkdir raw_data
wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/1-billion-word-language-modeling-benchmark-r13output.tar
......
......@@ -13,13 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import math
import os
import random
import re
import six
import argparse
import io
import math
prog = re.compile("[^a-z ]", flags=0)
......@@ -73,7 +75,7 @@ def parse_args():
def text_strip(text):
#English Preprocess Rule
# English Preprocess Rule
return prog.sub("", text.lower())
......@@ -115,7 +117,7 @@ def filter_corpus(args):
word_all_count = 0
id_counts = []
word_id = 0
#read dict
# read dict
with io.open(args.dict_path, 'r', encoding='utf-8') as f:
for line in f:
word, count = line.split()[0], int(line.split()[1])
......@@ -125,13 +127,13 @@ def filter_corpus(args):
id_counts.append(count)
word_all_count += count
#write word2id file
# write word2id file
print("write word2id file to : " + args.dict_path + "_word_to_id_")
with io.open(
args.dict_path + "_word_to_id_", 'w+', encoding='utf-8') as fid:
for k, v in word_to_id_.items():
fid.write(k + " " + str(v) + '\n')
#filter corpus and convert id
# filter corpus and convert id
if not os.path.exists(args.output_corpus_dir):
os.makedirs(args.output_corpus_dir)
for file in os.listdir(args.input_corpus_dir):
......@@ -200,7 +202,7 @@ def build_dict(args):
for item in item_to_remove:
unk_sum += word_count[item]
del word_count[item]
#sort by count
# sort by count
word_count[native_to_unicode('<UNK>')] = unk_sum
word_count = sorted(
word_count.items(), key=lambda word_count: -word_count[1])
......@@ -228,12 +230,13 @@ def data_split(args):
print("contents: ", str(len(contents)))
print("lines_per_file: ", str(lines_per_file))
for i in range(1, num+1):
for i in range(1, num + 1):
with open(os.path.join(new_data_dir, "part_" + str(i)), 'w') as fout:
data = contents[(i-1)*lines_per_file:min(i*lines_per_file,len(contents))]
data = contents[(i - 1) * lines_per_file:min(i * lines_per_file, len(contents))]
for line in data:
fout.write(line)
if __name__ == "__main__":
args = parse_args()
if args.build_dict:
......
......@@ -11,9 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import io
import six
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
......@@ -47,18 +49,15 @@ class EvaluateReader(Reader):
return True
return False
def _to_unicode(self, s, ignore_errors=False):
if self._is_unicode(s):
return s
error_mode = "ignore" if ignore_errors else "strict"
return s.decode("utf-8", errors=error_mode)
def strip_lines(self, line, vocab):
return self._replace_oov(vocab, self.native_to_unicode(line))
def _replace_oov(self, original_vocab, line):
"""Replace out-of-vocab words with "<UNK>".
This maintains compatibility with published results.
......@@ -76,5 +75,7 @@ class EvaluateReader(Reader):
def reader():
features = self.strip_lines(line.lower(), self.word_to_id)
features = features.split()
yield [('analogy_a', [self.word_to_id[features[0]]]), ('analogy_b', [self.word_to_id[features[1]]]), ('analogy_c', [self.word_to_id[features[2]]]), ('analogy_d', [self.word_to_id[features[3]]])]
yield [('analogy_a', [self.word_to_id[features[0]]]), ('analogy_b', [self.word_to_id[features[1]]]),
('analogy_c', [self.word_to_id[features[2]]]), ('analogy_d', [self.word_to_id[features[3]]])]
return reader
......@@ -11,8 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import io
import numpy as np
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
......@@ -84,7 +87,7 @@ class TrainReader(Reader):
output = [('input_word', [int(target_id)]), ('true_label', [int(context_id)])]
if not self.with_shuffle_batch:
neg_array = self.cs.searchsorted(np.random.sample(self.neg_num))
output += [('neg_label', [int(str(i)) for i in neg_array ])]
output += [('neg_label', [int(str(i)) for i in neg_array])]
yield output
return reader
return reader
......@@ -14,8 +14,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle.fluid as fluid
import math
from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase
......@@ -229,7 +229,7 @@ class Model(ModelBase):
act=self.act,
param_attr=fluid.ParamAttr(
name="trans.layer_fc.weight." + str(i)),
bias_attr=fluid.ParamAttr(name="trans.layer_fc.bias."+str(i)),
bias_attr=fluid.ParamAttr(name="trans.layer_fc.bias." + str(i)),
) for i in range(self.max_layers)
]
......@@ -268,8 +268,8 @@ class Model(ModelBase):
num_flatten_dims=2,
act=self.act,
param_attr=fluid.ParamAttr(
name="cls.concat_fc.weight."+str(i)),
bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias."+str(i))
name="cls.concat_fc.weight." + str(i)),
bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(i))
) for i in range(self.max_layers)
]
......@@ -458,7 +458,7 @@ class Model(ModelBase):
param_attr=fluid.ParamAttr(
name="trans.layer_fc.weight." + str(layer_idx)),
bias_attr=fluid.ParamAttr(
name="trans.layer_fc.bias."+str(layer_idx)),
name="trans.layer_fc.bias." + str(layer_idx)),
)
return input_layer_fc_out
......@@ -479,6 +479,6 @@ class Model(ModelBase):
num_flatten_dims=2,
act=self.act,
param_attr=fluid.ParamAttr(
name="cls.concat_fc.weight."+str(layer_idx)),
bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias."+str(layer_idx)))
name="cls.concat_fc.weight." + str(layer_idx)),
bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(layer_idx)))
return hidden_states_fc
......@@ -28,6 +28,7 @@ class EvaluateReader(Reader):
"""
Read the data line by line and process it as a dictionary
"""
def reader():
"""
This function needs to be implemented by the user, based on data format
......
......@@ -28,6 +28,7 @@ class TrainReader(Reader):
"""
Read the data line by line and process it as a dictionary
"""
def reader():
"""
This function needs to be implemented by the user, based on data format
......
......@@ -12,11 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import subprocess
import tempfile
import argparse
import tempfile
import yaml
from paddlerec.core.factory import TrainerFactory
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
setup for paddle-rec.
"""
import os
from setuptools import setup, find_packages
import tempfile
import shutil
import tempfile
requires = [
"paddlepaddle == 1.7.2",
......@@ -19,7 +36,7 @@ about["__author__"] = "paddle-dev"
about["__author_email__"] = "paddle-dev@baidu.com"
about["__url__"] = "https://github.com/PaddlePaddle/PaddleRec"
readme = "..."
readme = ""
def run_cmd(command):
......
......@@ -12,15 +12,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
import os
import time
import platform
import sys
import shutil
import time
import requests
import sys
import tarfile
import zipfile
import platform
import functools
lasttime = time.time()
FLUSH_INTERVAL = 0.1
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册