提交 aa5a08e0 编写于 作者: T tangwei

add ctr-dnn demo

上级 154e5da2
{
"sparse_inputs_slots": 27,
"sparse_feature_number": 1000001,
"sparse_feature_dim": 8,
"dense_input_dim": 13,
"fc_sizes": [400, 400, 40],
"learning_rate": 0.001
}
\ No newline at end of file
class TrainModel(object):
def input(self):
pass
import math
import paddle.fluid as fluid
def net(self):
pass
from ...utils import envs
def net(self):
pass
def loss(self):
pass
class Train(object):
    """CTR-DNN training network assembled from ``paddle.fluid`` layers.

    Typical call order, as implied by the attributes each step reads:
    ``input()`` -> ``net()`` -> ``loss(self.predict)`` -> ``metric()`` ->
    ``optimizer()``. All hyper-parameters are fetched by name through
    ``envs.get_global_env``.
    """

    def __init__(self):
        """Create empty placeholders that ``input()`` fills in later."""
        # Data-layer variables.
        self.sparse_inputs = []
        self.dense_input = None
        self.label_input = None

        # Names of the variables above.
        self.sparse_input_varnames = []
        self.dense_input_varname = None
        self.label_input_varname = None

    def input(self):
        """Declare the sparse, dense and label data layers.

        Stores the created variables and their names on the instance
        (``sparse_inputs``/``sparse_input_varnames``, ``dense_input``/
        ``dense_input_varname``, ``label_input``/``label_input_varname``).
        """

        def sparse_inputs():
            # The configuration defines "sparse_inputs_slots"; the previous
            # key ("sparse_inputs_counts") does not exist there.
            slots = envs.get_global_env("sparse_inputs_slots")

            # Slots are numbered from 1, so ``slots`` = 27 yields C1..C26,
            # one int64 LoD input per categorical column the reader emits.
            sparse_input_ids = [
                fluid.layers.data(name="C" + str(i),
                                  shape=[1],
                                  lod_level=1,
                                  dtype="int64") for i in range(1, slots)
            ]
            return sparse_input_ids, [var.name for var in sparse_input_ids]

        def dense_input():
            dense_input_dim = envs.get_global_env("dense_input_dim")

            # ``shape`` must be a list/tuple, not a bare integer.
            dense_input_var = fluid.layers.data(name="dense_input",
                                                shape=[dense_input_dim],
                                                dtype="float32")
            return dense_input_var, dense_input_var.name

        def label_input():
            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
            return label, label.name

        self.sparse_inputs, self.sparse_input_varnames = sparse_inputs()
        self.dense_input, self.dense_input_varname = dense_input()
        self.label_input, self.label_input_varname = label_input()

    def net(self):
        """Build the forward network and store its output in ``self.predict``.

        Every sparse input is embedded and sum-pooled, the pooled embeddings
        are concatenated with the dense input, passed through one ReLU
        fully-connected layer per entry of ``fc_sizes`` and finally through a
        2-unit softmax layer.
        """
        # Looked up once; the values are identical for every sparse slot.
        sparse_feature_number = envs.get_global_env("sparse_feature_number")
        sparse_feature_dim = envs.get_global_env("sparse_feature_dim")

        def embedding_layer(input):
            # All slots share one table through the fixed parameter name.
            emb = fluid.layers.embedding(
                input=input,
                is_sparse=True,
                # Plain integers: the earlier ``{...}`` wrappers built sets.
                size=[sparse_feature_number, sparse_feature_dim],
                param_attr=fluid.ParamAttr(
                    name="SparseFeatFactors",
                    initializer=fluid.initializer.Uniform()),
            )
            emb_sum = fluid.layers.sequence_pool(
                input=emb, pool_type='sum')
            return emb_sum

        def fc(input, output_size):
            # Normal initialiser scaled by 1/sqrt(fan_in).
            output = fluid.layers.fc(
                input=input, size=output_size,
                act='relu', param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Normal(
                        scale=1.0 / math.sqrt(input.shape[1]))))
            return output

        sparse_embed_seq = list(map(embedding_layer, self.sparse_inputs))
        concated = fluid.layers.concat(
            sparse_embed_seq + [self.dense_input], axis=1)

        fcs = [concated]
        hidden_layers = envs.get_global_env("fc_sizes")

        for size in hidden_layers:
            fcs.append(fc(fcs[-1], size))

        predict = fluid.layers.fc(
            input=fcs[-1],
            size=2,
            act="softmax",
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
                scale=1 / math.sqrt(fcs[-1].shape[1]))),
        )

        self.predict = predict

    def loss(self, predict):
        """Compute the cross-entropy cost of ``predict`` against the label.

        The per-sample costs are reduced with ``reduce_sum`` (a sum, despite
        the local name ``avg_cost``). The result is stored in ``self.loss``,
        which from then on shadows this method on the instance.
        """
        cost = fluid.layers.cross_entropy(
            input=predict, label=self.label_input)
        avg_cost = fluid.layers.reduce_sum(cost)
        self.loss = avg_cost

    def metric(self):
        """Add AUC evaluation ops for ``self.predict`` to the program.

        NOTE(review): the returned AUC variables are not stored or returned;
        confirm whether callers need them.
        """
        auc, batch_auc, _ = fluid.layers.auc(input=self.predict,
                                             label=self.label_input,
                                             num_thresholds=2 ** 12,
                                             slide_steps=20)

    def optimizer(self):
        """Return an Adam optimizer (lazy mode) using the ``learning_rate`` setting."""
        learning_rate = envs.get_global_env("learning_rate")
        optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
        return optimizer
class Evaluate(object):
    """Placeholder for the evaluation network; both hooks are no-ops."""

    def input(self):
        """Declare evaluation inputs (not implemented; returns ``None``)."""
        return None

    def net(self):
        """Build the evaluation network (not implemented; returns ``None``)."""
        return None
def TrainReader():
pass
from ...utils import envs
# There are 13 integer features and 26 categorical features.
# NOTE(review): these module-level values are not referenced by the code
# visible here; CriteoDataset keeps its own copies of the same ranges.
# Presumably column indices of the integer features (column 0 is the label).
continous_features = range(1, 14)
# Presumably column indices of the categorical features.
categorial_features = range(14, 40)
# One value per integer feature; identical to CriteoDataset.cont_max_.
continous_clip = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
class CriteoDataset(object):
    """Reader factory for tab-separated Criteo click-log files.

    Every line is split on tabs: column 0 is the label, columns 1-13 are
    numeric features and columns 14-39 are categorical features.
    """

    def __init__(self, sparse_feature_dim):
        """Set up normalisation constants and the hashing space.

        Args:
            sparse_feature_dim: size of the hash space for categorical
                features. Accepts an int or a numeric string, because
                settings read from the process environment are strings.
        """
        # Per-column minimum, maximum and divisor for the 13 numeric columns.
        self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        self.cont_max_ = [
            20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
        ]
        self.cont_diff_ = [
            20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
        ]
        # Coerce so that ``hash(...) % self.hash_dim_`` also works when the
        # caller passes the value as a string.
        self.hash_dim_ = int(sparse_feature_dim)
        # here, training data are lines with line_index < train_idx_
        # NOTE(review): the reader below does not apply this threshold.
        self.train_idx_ = 41256555
        self.continuous_range_ = range(1, 14)
        self.categorical_range_ = range(14, 40)

    def _parse_line(self, line):
        """Turn one raw line into ``[dense] + [[id], ...] + [label]``."""
        features = line.rstrip('\n').split('\t')

        # Empty numeric fields become 0.0; others are shifted by the column
        # minimum and divided by the column divisor.
        dense_feature = [
            0.0 if features[idx] == '' else
            (float(features[idx]) - self.cont_min_[idx - 1]) /
            self.cont_diff_[idx - 1]
            for idx in self.continuous_range_
        ]

        # The column index is prepended so equal strings in different columns
        # hash differently.
        # NOTE(review): on Python 3 the built-in ``hash`` of a str varies
        # between processes unless PYTHONHASHSEED is fixed.
        sparse_feature = [
            [hash(str(idx) + features[idx]) % self.hash_dim_]
            for idx in self.categorical_range_
        ]

        label = [int(features[0])]
        return [dense_feature] + sparse_feature + [label]

    def _reader_creator(self, file_list, is_train, trainer_num, trainer_id):
        """Return a generator function over all samples in ``file_list``.

        Args:
            file_list: iterable of paths to tab-separated text files.
            is_train: accepted for interface compatibility; currently unused.
            trainer_num: accepted for interface compatibility; currently unused.
            trainer_id: accepted for interface compatibility; currently unused.

        Returns:
            A zero-argument callable yielding one parsed sample per line.
        """
        def reader():
            for file in file_list:
                with open(file, 'r') as f:
                    for line in f:
                        yield self._parse_line(line)

        return reader

    def train(self, file_list, trainer_num, trainer_id):
        """Return a reader over ``file_list`` created with ``is_train=True``."""
        return self._reader_creator(file_list, True, trainer_num, trainer_id)

    def test(self, file_list):
        """Return a reader over ``file_list`` created with ``is_train=False``."""
        return self._reader_creator(file_list, False, 1, 0)
def Train():
    """Return the ``train`` reader factory of a freshly built CriteoDataset.

    The dataset's hash space is taken from the ``sparse_feature_number``
    setting.
    """
    hash_space = envs.get_global_env("sparse_feature_number")
    return CriteoDataset(hash_space).train
def Evaluate():
    """Return the ``test`` reader factory of a freshly built CriteoDataset.

    The dataset's hash space is taken from the ``sparse_feature_number``
    setting.
    """
    hash_space = envs.get_global_env("sparse_feature_number")
    return CriteoDataset(hash_space).test
def InferReader():
pass
此差异已折叠。
此差异已折叠。
import os
def encode_value(v):
    """Return ``v`` unchanged; hook for encoding a setting before storage."""
    return v
def decode_value(v):
    """Return ``v`` unchanged; hook for decoding a stored setting."""
    return v
def set_global_envs(yaml, envs):
    """Copy every item of ``yaml`` into ``envs``, encoding each value.

    Args:
        yaml: mapping of setting names to values.
        envs: mutable mapping that receives the encoded settings.
    """
    for key, value in yaml.items():
        envs[key] = encode_value(value)
def get_global_env(env_name):
    """
    Fetch a setting from the process environment.

    Args:
        env_name: name of the environment variable to read.

    Returns:
        The variable's value passed through ``decode_value``.

    Raises:
        ValueError: if ``env_name`` is not present in ``os.environ``.
    """
    raw = os.environ.get(env_name)
    # Values in os.environ are never None, so None means "absent".
    if raw is None:
        raise ValueError("can not find config of {}".format(env_name))
    return decode_value(raw)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册