Commit be78260d authored by O overlord

Merge branch 'develop' of https://github.com/PaddlePaddle/models into listwise_05182140

......@@ -53,8 +53,8 @@ def three_nn(input, known, eps=1e-10, name=None):
.. code-block:: python
import paddle.fluid as fluid
x = fluid.layers.data(name='x', shape=[16, 3], dtype='float32')
known = fluid.layers.data(name='known', shape=[32, 3], dtype='float32')
x = fluid.data(name='x', shape=[None, 16, 3], dtype='float32')
known = fluid.data(name='known', shape=[None, 32, 3], dtype='float32')
distance, idx = fluid.layers.three_nn(x, known)
"""
helper = LayerHelper('three_nn', **locals())
......@@ -97,9 +97,9 @@ def three_interp(input, weight, idx, name=None):
.. code-block:: python
import paddle.fluid as fluid
x = fluid.layers.data(name='x', shape=[16, 3], dtype='float32')
weight = fluid.layers.data(name='weight', shape=[32, 3], dtype='float32')
index = fluid.layers.data(name='index', shape=[32, 3], dtype='int32')
x = fluid.data(name='x', shape=[None, 16, 3], dtype='float32')
weight = fluid.data(name='weight', shape=[None, 32, 3], dtype='float32')
index = fluid.data(name='index', shape=[None, 32, 3], dtype='int32')
out = fluid.layers.three_interp(x, weight, index)
"""
helper = LayerHelper('three_interp', **locals())
......@@ -132,8 +132,8 @@ def query_ball(input, new_points, radius, n_sample):
.. code-block:: python
import paddle.fluid as fluid
x = fluid.layers.data(name='points',shape=[-1,5,3],dtype='float32')
new_points = fluid.layers.data(name='new_points', shape=[-1,2,3], dtype='float32')
x = fluid.data(name='points',shape=[None,5,3],dtype='float32')
new_points = fluid.data(name='new_points', shape=[None,2,3], dtype='float32')
output = fluid.layers.query_ball(x,new_points,radius=4.0,n_sample=5)
......@@ -167,7 +167,7 @@ def farthest_point_sampling(input, sampled_point_num):
Examples:
.. code-block:: python
x = fluid.layers.data(name='data', shape=(2,100,3), dtype='float32')
x = fluid.data(name='data', shape=(None, 100, 3), dtype='float32')
sampled_points = fluid.layers.farthest_point_sampling(
x, 50
)
......@@ -210,8 +210,8 @@ def gather_point(input, index):
Examples:
.. code-block:: python
import paddle.fluid as fluid
x = fluid.layers.data(name='x', shape=[-1, 5, 3], dtype='float32')
index = fluid.layers.data(name='index', shape=[-1, 1], dtype='int32')
x = fluid.data(name='x', shape=[None, 5, 3], dtype='float32')
index = fluid.data(name='index', shape=[None, 1], dtype='int32')
output = fluid.layers.gather_point(x, index)
"""
......@@ -249,8 +249,8 @@ def group_points(input, idx, name=None):
.. code-block:: python
import paddle.fluid as fluid
x = fluid.layers.data(name='x', shape=[16, 3], dtype='float32')
index = fluid.layers.data(name='index', shape=[32, 3], dtype='int32')
x = fluid.data(name='x', shape=[None, 16, 3], dtype='float32')
index = fluid.data(name='index', shape=[None, 32, 3], dtype='int32')
out = fluid.layers.group_points(x, index)
"""
helper = LayerHelper('group_points', **locals())
......
......@@ -44,8 +44,8 @@ class TestFarthestPointSamplingOp(unittest.TestCase):
x_type = 'float32'
sampled_point_num = 256
x = fluid.layers.data(
name='x', shape=x_shape, dtype=x_type, append_batch_size=False)
x = fluid.data(
name='x', shape=x_shape, dtype=x_type)
y = pointnet_lib.farthest_point_sampling(x, sampled_point_num)
x_np = np.random.randint(1, 100, (x_shape[0] * x_shape[1] *
......
......@@ -35,10 +35,10 @@ class TestGatherPointOp(unittest.TestCase):
idx_shape = (1, 32)
idx_type = 'int32'
x = fluid.layers.data(
name='x', shape=x_shape, dtype=x_type, append_batch_size=False)
idx = fluid.layers.data(
name='idx', shape=idx_shape, dtype=idx_type, append_batch_size=False)
x = fluid.data(
name='x', shape=x_shape, dtype=x_type)
idx = fluid.data(
name='idx', shape=idx_shape, dtype=idx_type)
y = pointnet_lib.gather_point(x, idx)
x_np = np.random.uniform(-10, 10, x_shape).astype(x_type)
......
......@@ -39,10 +39,10 @@ class TestGroupPointsOp(unittest.TestCase):
idx_shape = [8, 37, 41]
idx_type = 'int32'
x = fluid.layers.data(
name='x', shape=x_shape, dtype=x_type, append_batch_size=False)
idx = fluid.layers.data(
name='idx', shape=idx_shape, dtype=idx_type, append_batch_size=False)
x = fluid.data(
name='x', shape=x_shape, dtype=x_type)
idx = fluid.data(
name='idx', shape=idx_shape, dtype=idx_type)
y = pointnet_lib.group_points(x, idx)
x_np = np.random.uniform(-10, 10, x_shape).astype(x_type)
......
......@@ -48,10 +48,10 @@ class TestQueryBallOp(unittest.TestCase):
radius = 6
nsample = 5
points = fluid.layers.data(
name='points', shape=points_shape, dtype=points_type, append_batch_size=False)
new_points = fluid.layers.data(
name='new_points', shape=new_points_shape, dtype=points_type, append_batch_size=False)
points = fluid.data(
name='points', shape=points_shape, dtype=points_type)
new_points = fluid.data(
name='new_points', shape=new_points_shape, dtype=points_type)
y = pointnet_lib.query_ball(points, new_points, radius, nsample)
points_np = np.random.randint(1, 5, points_shape).astype(points_type)
......
......@@ -42,12 +42,12 @@ class TestThreeInterpOp(unittest.TestCase):
weight_shape = [8, 37, 3]
weight_type = 'float32'
x = fluid.layers.data(
name='x', shape=input_shape, dtype=input_type, append_batch_size=False)
weight = fluid.layers.data(
name='weight', shape=weight_shape, dtype=weight_type, append_batch_size=False)
idx = fluid.layers.data(
name='idx', shape=weight_shape, dtype="int32", append_batch_size=False)
x = fluid.data(
name='x', shape=input_shape, dtype=input_type)
weight = fluid.data(
name='weight', shape=weight_shape, dtype=weight_type)
idx = fluid.data(
name='idx', shape=weight_shape, dtype="int32")
y = pointnet_lib.three_interp(x, weight, idx)
x_np = np.random.random(input_shape).astype(input_type)
......
......@@ -57,10 +57,10 @@ class TestThreeNNOp(unittest.TestCase):
input_type = 'float32'
eps = 1e-10
x = fluid.layers.data(
name='x', shape=input_shape, dtype=input_type, append_batch_size=False)
known = fluid.layers.data(
name='known', shape=known_shape, dtype=input_type, append_batch_size=False)
x = fluid.data(
name='x', shape=input_shape, dtype=input_type)
known = fluid.data(
name='known', shape=known_shape, dtype=input_type)
dist, idx = pointnet_lib.three_nn(x, known, eps)
x_np = np.random.random(input_shape).astype(input_type)
......
......@@ -257,7 +257,7 @@ if __name__ == "__main__":
# cfg.RPN.NMS_TYPE = 'rotate'
proposal_func = get_proposal_func(cfg)
x = fluid.layers.data(name="x", shape=[256, 84], dtype='float32')
x = fluid.data(name="x", shape=[None, 256, 84], dtype='float32')
proposal = fluid.default_main_program().current_block().create_var(
name="proposal", dtype='float32', shape=[256, 7])
fluid.layers.py_func(proposal_func, x, proposal)
......
......@@ -61,18 +61,18 @@ def train(args):
dg_program = fluid.Program()
with fluid.program_guard(d_program):
conditions = fluid.layers.data(
name='conditions', shape=[1], dtype='float32')
img = fluid.layers.data(name='img', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='float32')
conditions = fluid.data(
name='conditions', shape=[None, 1], dtype='float32')
img = fluid.data(name='img', shape=[None, 784], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='float32')
d_logit = D_cond(img, conditions)
d_loss = loss(d_logit, label)
with fluid.program_guard(dg_program):
conditions = fluid.layers.data(
name='conditions', shape=[1], dtype='float32')
noise = fluid.layers.data(
name='noise', shape=[NOISE_SIZE], dtype='float32')
conditions = fluid.data(
name='conditions', shape=[None, 1], dtype='float32')
noise = fluid.data(
name='noise', shape=[None, NOISE_SIZE], dtype='float32')
g_img = G_cond(z=noise, y=conditions)
g_program = dg_program.clone()
......
......@@ -60,14 +60,14 @@ def train(args):
dg_program = fluid.Program()
with fluid.program_guard(d_program):
img = fluid.layers.data(name='img', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='float32')
img = fluid.data(name='img', shape=[None, 784], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='float32')
d_logit = D(img)
d_loss = loss(d_logit, label)
with fluid.program_guard(dg_program):
noise = fluid.layers.data(
name='noise', shape=[NOISE_SIZE], dtype='float32')
noise = fluid.data(
name='noise', shape=[None, NOISE_SIZE], dtype='float32')
g_img = G(x=noise)
g_program = dg_program.clone()
......
......@@ -508,9 +508,9 @@ def conv2d_with_filter(input,
groups mismatch.
Examples:
.. code-block:: python
data = fluid.layers.data(name='data', shape=[3, 32, 32], \
data = fluid.data(name='data', shape=[None, 3, 32, 32], \
dtype='float32')
filter = fluid.layers.data(name='filter',shape=[10,3,3,3], \
filter = fluid.data(name='filter', shape=[10, 3, 3, 3], \
dtype='float32')
conv2d = fluid.layers.conv2d(input=data,
filter=filter,
......
......@@ -52,8 +52,9 @@ def eval(args):
assert model_name in model_list, "{} is not in lists: {}".format(args.model,
model_list)
image = fluid.layers.data(name='image', shape=[None] + image_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[None, 1], dtype='int64')
image = fluid.data(name='image', shape=[None] + image_shape, dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
test_loader = fluid.io.DataLoader.from_generator(
feed_list=[image, label],
capacity=64,
......@@ -75,7 +76,7 @@ def eval(args):
def if_exist(var):
return os.path.exists(os.path.join(pretrained_model, var.name))
fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
fluid.load(program=test_program, model_path=pretrained_model, executor=exe)
test_loader.set_sample_generator(
reader.test(args),
......
......@@ -51,7 +51,7 @@ def infer(args):
assert model_name in model_list, "{} is not in lists: {}".format(args.model,
model_list)
image = fluid.layers.data(name='image', shape=[None] + image_shape, dtype='float32')
image = fluid.data(name='image', shape=[None] + image_shape, dtype='float32')
infer_loader = fluid.io.DataLoader.from_generator(
feed_list=[image],
......@@ -74,7 +74,7 @@ def infer(args):
def if_exist(var):
return os.path.exists(os.path.join(pretrained_model, var.name))
fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
fluid.load(model_path=pretrained_model, program=test_program, executor=exe)
infer_loader.set_sample_generator(
reader.test(args),
......
......@@ -108,9 +108,9 @@ def build_program(is_train, main_prog, startup_prog, args):
model = models.__dict__[args.model]()
with fluid.program_guard(main_prog, startup_prog):
queue_capacity = 64
image = fluid.layers.data(
image = fluid.data(
name='image', shape=[None] + image_shape, dtype='float32')
label = fluid.layers.data(
label = fluid.data(
name='label', shape=[None, 1], dtype='int64')
loader = fluid.io.DataLoader.from_generator(
feed_list=[image, label],
......@@ -190,15 +190,14 @@ def train_async(args):
logging.debug('after run startup program')
if checkpoint is not None:
fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)
fluid.load(program=train_prog, model_path=checkpoint, executor=exe)
if pretrained_model:
def if_exist(var):
return os.path.exists(os.path.join(pretrained_model, var.name))
fluid.io.load_vars(
exe, pretrained_model, main_program=train_prog, predicate=if_exist)
fluid.load(program=train_prog, model_path=pretrained_model, executor=exe)
if args.use_gpu:
devicenum = get_gpu_num()
......@@ -287,7 +286,7 @@ def train_async(args):
str(iter_no))
if not os.path.isdir(model_path):
os.makedirs(model_path)
fluid.io.save_persistables(exe, model_path, main_program=train_prog)
fluid.save(program=train_prog, model_path=model_path)
iter_no += 1
......
......@@ -115,9 +115,9 @@ def build_program(is_train, main_prog, startup_prog, args):
model = models.__dict__[args.model]()
with fluid.program_guard(main_prog, startup_prog):
queue_capacity = 64
image = fluid.layers.data(
image = fluid.data(
name='image', shape=[None] + image_shape, dtype='float32')
label = fluid.layers.data(
label = fluid.data(
name='label', shape=[None, 1], dtype='int64')
loader = fluid.io.DataLoader.from_generator(
feed_list=[image, label],
......@@ -188,15 +188,15 @@ def train_async(args):
logging.debug('after run startup program')
if checkpoint is not None:
fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)
fluid.load(program=train_prog, model_path=checkpoint, executor=exe)
if pretrained_model:
def if_exist(var):
return os.path.exists(os.path.join(pretrained_model, var.name))
fluid.io.load_vars(
exe, pretrained_model, main_program=train_prog, predicate=if_exist)
fluid.load(program=train_prog, model_path=pretrained_model, executor=exe)
if args.use_gpu:
devicenum = get_gpu_num()
......@@ -283,7 +283,7 @@ def train_async(args):
str(iter_no))
if not os.path.isdir(model_path):
os.makedirs(model_path)
fluid.io.save_persistables(exe, model_path, main_program=train_prog)
fluid.save(program=train_prog, model_path=model_path)
iter_no += 1
......
## Note
Note: this OCR library has been migrated to a new GitHub repository: https://github.com/PaddlePaddle/PaddleOCR
The new repository provides an ultra-lightweight Chinese OCR whose models total only 8.6 MB; a single model supports recognition of mixed Chinese, English, and digits, as well as vertical text and long text. It also supports several training algorithms for text detection and text recognition. You are welcome to visit the new repository for a more detailed introduction to OCR and its new features.
## Code structure
```
├── data_reader.py # Download, read, and process the data.
......
......@@ -72,10 +72,10 @@ def Fconv2d(
groups mismatch.
Examples:
.. code-block:: python
data = fluid.layers.data(name='data', shape=[3, 32, 32], \
data = fluid.data(name='data', shape=[None, 3, 32, 32], \
dtype='float32')
filter = fluid.layers.data(name='filter',shape=[10,3,3,3], \
dtype='float32',append_batch_size=False)
filter = fluid.data(name='filter',shape=[10,3,3,3], \
dtype='float32')
conv2d = fluid.layers.conv2d(input=data,
filter=filter,
act="relu")
......
......@@ -60,9 +60,9 @@ def Fconv2d(input,
groups mismatch.
Examples:
.. code-block:: python
data = fluid.layers.data(name='data', shape=[3, 32, 32], \
data = fluid.data(name='data', shape=[None, 3, 32, 32], \
dtype='float32')
filter = fluid.layers.data(name='filter',shape=[10,3,3,3], \
filter = fluid.data(name='filter', shape=[10, 3, 3, 3], \
dtype='float32')
conv2d = fluid.layers.conv2d(input=data,
filter=filter,
......@@ -112,62 +112,4 @@ def Fconv2d(input,
return pre_bias
def test_conv2d_with_filter():
exemplar = np.random.random((8, 4, 6, 6)).astype(np.float32)
instance = np.random.random((8, 4, 22, 22)).astype(np.float32)
# fluid.layers.data(append_batch_size=)
use_gpu = False
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
train_program = fluid.Program()
start_program = fluid.Program()
with fluid.program_guard(train_program, start_program):
x = fluid.layers.data(
name="inst", shape=[8, 4, 22, 22], append_batch_size=False)
y = fluid.layers.data(
name="exem", shape=[8, 4, 6, 6], append_batch_size=False)
bias_att = fluid.ParamAttr(
name="bias_", initializer=fluid.initializer.ConstantInitializer(1.))
out = conv2d_with_filter(x, y, groups=1)
weight_att = fluid.ParamAttr(
name='weight',
initializer=fluid.initializer.NumpyArrayInitializer(exemplar))
bias_att = fluid.ParamAttr(
name="bias", initializer=fluid.initializer.ConstantInitializer(0.))
res = fluid.layers.conv2d(
x,
8,
6,
param_attr=weight_att,
bias_attr=bias_att,
stride=1,
padding=0,
dilation=1)
exe = fluid.Executor(place)
exe.run(program=fluid.default_startup_program())
print(out.shape)
compiled_prog = fluid.compiler.CompiledProgram(train_program)
out, res = exe.run(compiled_prog,
feed={"inst": instance,
"exem": exemplar},
fetch_list=[out.name, res.name])
print(np.sum(out - res))
np.testing.assert_allclose(out, res, rtol=1e-5, atol=0)
with fluid.dygraph.guard():
exem = fluid.dygraph.to_variable(exemplar)
inst = fluid.dygraph.to_variable(instance)
out = conv2d_with_filter(inst, exem, groups=1)
print(np.sum(out.numpy() - res))
np.testing.assert_allclose(out.numpy(), res, rtol=1e-5, atol=0)
if __name__ == '__main__':
test_conv2d_with_filter()
......@@ -3,7 +3,7 @@ from paddle.fluid import layers
from paddle import fluid
from pytracking.libs.tensorlist import TensorList
from pytracking.utils.plotting import plot_graph
from pytracking.libs.paddle_utils import n2p, clone, static_clone
from pytracking.libs.paddle_utils import n2p, clone, static_clone, create_var_list
class L2Problem:
......@@ -243,20 +243,9 @@ class ConjugateGradient(ConjugateGradientBase):
start_program = fluid.Program()
with fluid.program_guard(train_program, start_program):
scope = 'first/'
self.x_ph = TensorList([
fluid.layers.data(
'{}x_{}'.format(scope, idx),
v.shape,
append_batch_size=False,
stop_gradient=False) for idx, v in enumerate(self.x)
])
self.p_ph = TensorList([
fluid.layers.data(
'{}p_{}'.format(scope, idx),
v.shape,
append_batch_size=False,
stop_gradient=False) for idx, v in enumerate(self.x)
])
self.x_ph = TensorList(create_var_list(scope+"x", self.x, None))
self.p_ph = TensorList(create_var_list(scope+"p", self.x, None))
# problem forward
self.f0 = self.problem(self.x_ph, scope)
......@@ -277,20 +266,10 @@ class ConjugateGradient(ConjugateGradientBase):
start_program2 = fluid.Program()
with fluid.program_guard(train_program2, start_program2):
scope = 'second/'
self.x_ph_2 = TensorList([
fluid.layers.data(
'{}x_{}'.format(scope, idx),
v.shape,
append_batch_size=False,
stop_gradient=False) for idx, v in enumerate(self.x)
])
self.dfdx_x_ph = TensorList([
fluid.layers.data(
'{}dfdx_x_{}'.format(scope, idx),
v.shape,
append_batch_size=False,
stop_gradient=False) for idx, v in enumerate(self.g)
])
self.x_ph_2 = TensorList(create_var_list(scope+"x", self.x, None))
self.dfdx_x_ph = TensorList(create_var_list(scope+"dfdx_x", self.g, None))
self.f0_2 = self.problem(self.x_ph_2, scope)
self.dfdx_dfdx = TensorList(
......@@ -444,20 +423,9 @@ class GaussNewtonCG(ConjugateGradientBase):
start_program = fluid.Program()
with fluid.program_guard(train_program, start_program):
scope = 'first/'
self.x_ph = TensorList([
fluid.layers.data(
'{}x_{}'.format(scope, idx),
v.shape,
append_batch_size=False,
stop_gradient=False) for idx, v in enumerate(self.x)
])
self.p_ph = TensorList([
fluid.layers.data(
'{}p_{}'.format(scope, idx),
v.shape,
append_batch_size=False,
stop_gradient=False) for idx, v in enumerate(self.x)
])
self.x_ph = TensorList(create_var_list(scope+"x", self.x, None))
self.p_ph = TensorList(create_var_list(scope+"p", self.x, None))
# problem forward
self.f0 = self.problem(self.x_ph, scope)
......@@ -477,20 +445,9 @@ class GaussNewtonCG(ConjugateGradientBase):
start_program2 = fluid.Program()
with fluid.program_guard(train_program2, start_program2):
scope = 'second/'
self.x_ph_2 = TensorList([
fluid.layers.data(
'{}x_{}'.format(scope, idx),
v.shape,
append_batch_size=False,
stop_gradient=False) for idx, v in enumerate(self.x)
])
self.dfdx_x_ph = TensorList([
fluid.layers.data(
'{}dfdx_x_{}'.format(scope, idx),
v.shape,
append_batch_size=False,
stop_gradient=False) for idx, v in enumerate(self.g)
])
self.x_ph_2 = TensorList(create_var_list(scope+"x", self.x, None))
self.dfdx_x_ph = TensorList(create_var_list(scope+"dfdx_x", self.g, None))
self.f0_2 = self.problem(self.x_ph_2, scope)
self.dfdx_dfdx = TensorList(
......@@ -654,13 +611,7 @@ class GradientDescentL2:
train_program = fluid.Program()
start_program = fluid.Program()
with fluid.program_guard(train_program, start_program):
self.x_ph = TensorList([
fluid.layers.data(
'x_{}'.format(idx),
v.shape,
append_batch_size=False,
stop_gradient=False) for idx, v in enumerate(self.x)
])
self.x_ph = TensorList(create_var_list("x", self.x, None))
# problem forward
self.f0 = self.problem(self.x_ph)
......
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid import dygraph
from paddle.fluid import layers
from paddle.fluid.framework import Variable
......@@ -216,3 +217,17 @@ def dropout2d(input, prob, is_train=False):
binary_tensor = layers.floor(random_tensor)
output = input / keep_prob * binary_tensor
return output
def create_var_list(scope, var_lists, shape):
    """Create one fluid.data variable per tensor in var_lists.

    If shape is None, each variable takes the corresponding tensor's own
    shape; otherwise shape is used as a batch prefix (e.g. [None]) followed
    by the per-sample shape. All variables have stop_gradient set to False.
    """
vars = []
for idx, v in enumerate(var_lists):
name = "{}_{}".format(scope, idx)
if shape is None:
var = fluid.data(name, shape=v.shape)
else:
var = fluid.data(name, shape=shape + list(v[0].shape))
var.stop_gradient = False
vars.append(var)
return vars
......@@ -5,6 +5,7 @@ from paddle import fluid
from pytracking.libs import optimization, TensorList, operation
from pytracking.libs.paddle_utils import PTensor, broadcast_op, n2p, static_identity
import math
from pytracking.libs.paddle_utils import create_var_list
def stack_input(e):
......@@ -50,29 +51,18 @@ class FactorizedConvProblem(optimization.L2Problem):
def get_inputs(self, scope=''):
if scope not in self.inputs_dict:
training_samples_p = TensorList([
fluid.layers.data(
'{}training_samples_{}'.format(scope, idx),
shape=[None] + list(v[0].shape),
stop_gradient=False,
append_batch_size=False)
for idx, v in enumerate(self.training_samples)
])
y_p = TensorList([
fluid.layers.data(
'{}y_{}'.format(scope, idx),
shape=[None] + list(v[0].shape),
stop_gradient=False,
append_batch_size=False) for idx, v in enumerate(self.y)
])
sample_weights_p = TensorList([
fluid.layers.data(
'{}sample_weights_{}'.format(scope, idx),
shape=[None, 1],
stop_gradient=False,
append_batch_size=False)
for idx, v in enumerate(self.sample_weights)
])
name = scope + "training_samples"
vars = create_var_list(name, self.training_samples, [None])
training_samples_p = TensorList(vars)
name = scope + "y"
vars = create_var_list(name, self.y, [None])
y_p = TensorList(vars)
name = scope + "sample_weights"
vars = create_var_list(name, self.sample_weights, [None, 1])
sample_weights_p = TensorList(vars)
self.inputs_dict[scope] = (training_samples_p, y_p,
sample_weights_p)
......@@ -189,29 +179,18 @@ class ConvProblem(optimization.L2Problem):
def get_inputs(self, scope=''):
if scope not in self.inputs_dict:
training_samples_p = TensorList([
fluid.layers.data(
'{}training_samples_{}'.format(scope, idx),
shape=[None] + list(v[0].shape),
stop_gradient=False,
append_batch_size=False)
for idx, v in enumerate(self.training_samples)
])
y_p = TensorList([
fluid.layers.data(
'{}y_{}'.format(scope, idx),
shape=[None] + list(v[0].shape),
stop_gradient=False,
append_batch_size=False) for idx, v in enumerate(self.y)
])
sample_weights_p = TensorList([
fluid.layers.data(
'{}sample_weights_{}'.format(scope, idx),
shape=[None] + list(v[0].shape),
stop_gradient=False,
append_batch_size=False)
for idx, v in enumerate(self.sample_weights)
])
name = scope + "training_samples"
vars = create_var_list(name, self.training_samples, [None])
training_samples_p = TensorList(vars)
name = scope + "y"
vars = create_var_list(name, self.y, [None])
y_p = TensorList(vars)
name = scope + "sample_weights"
vars = create_var_list(name, self.sample_weights, [None])
sample_weights_p = TensorList(vars)
self.inputs_dict[scope] = (training_samples_p, y_p,
sample_weights_p)
......
......@@ -25,6 +25,13 @@
- Provides general skeleton code for video classification and action localization tasks, so users can efficiently configure models for training and evaluation in one step.
### Recommended usage
- Seven video classification models are open-sourced, falling into two groups: end-to-end models and sequence models. End-to-end models: TSN is recommended for temporally insensitive video scenarios (e.g. internet videos); TSM and StNet are recommended for temporally sensitive scenarios (e.g. the Kinetics dataset); the Non-local model is computationally heavy and recommended for research settings. Sequence models: Attention LSTM, Attention Cluster, and NeXtVLAD have similar overall performance but different network structures, so ensembling several of them is recommended.
- Three video action localization models are open-sourced. CTCN is recommended for action localization, and BMN is recommended for temporal proposal generation.
## Installation
Running the sample code in this model zoo requires PaddlePaddle Fluid v1.6.0 or above. If the PaddlePaddle version in your environment is lower, please update it following the instructions in the [installation documentation](http://www.paddlepaddle.org/documentation/docs/zh/1.6/beginners_guide/install/index_cn.html).
......
......@@ -37,7 +37,7 @@ def calculate_hit_at_one(predictions, actuals):
float: The average hit at one across the entire batch.
"""
top_prediction = numpy.argmax(predictions, 1)
hits = actuals[numpy.arange(actuals.shape[0]), top_prediction]
hits = actuals[:, top_prediction]
return numpy.average(hits)
......
......@@ -32,8 +32,7 @@ class ShiftingAttentionModel(object):
x_shape.stop_gradient = True
flat_x = fluid.layers.reshape(x, shape=(-1, self.seg_num))
flat_softmax = fluid.layers.softmax(flat_x)
return fluid.layers.reshape(
flat_softmax, shape=x.shape, actual_shape=x_shape)
return fluid.layers.reshape(flat_softmax, shape=x_shape)
def glorot(self, n):
return np.sqrt(1.0 / np.sqrt(n))
......
......@@ -21,6 +21,7 @@ import json
import logging
import functools
import paddle
import paddle.fluid as fluid
logger = logging.getLogger(__name__)
......@@ -228,8 +229,8 @@ class BMNReader(DataReader):
mapper = functools.partial(process_data, mode=self.mode)
def batch_reader():
xreader = paddle.reader.xmap_readers(mapper, reader,
self.num_threads, 1024)
xreader = fluid.io.xmap_readers(mapper, reader, self.num_threads,
1024)
batch = []
for item in xreader():
batch.append(item)
......
......@@ -22,6 +22,7 @@ import json
import logging
import functools
import paddle
import paddle.fluid as fluid
logger = logging.getLogger(__name__)
from .reader_utils import DataReader
......@@ -214,8 +215,8 @@ class BSNVideoReader(DataReader):
mapper = functools.partial(process_data, mode=self.mode)
def batch_reader():
xreader = paddle.reader.xmap_readers(mapper, reader,
self.num_threads, 1024)
xreader = fluid.io.xmap_readers(mapper, reader, self.num_threads,
1024)
batch = []
for item in xreader():
batch.append(item)
......@@ -444,8 +445,8 @@ class BSNProposalReader(DataReader):
mapper = functools.partial(process_data, mode=self.mode)
def batch_reader():
xreader = paddle.reader.xmap_readers(mapper, reader,
self.num_threads, 1024)
xreader = fluid.io.xmap_readers(mapper, reader, self.num_threads,
1024)
batch = []
for item in xreader():
batch.append(item)
......
......@@ -18,6 +18,7 @@ import sys
import numpy as np
import functools
import paddle
import paddle.fluid as fluid
import logging
logger = logging.getLogger(__name__)
......@@ -154,8 +155,8 @@ class ETSReader(DataReader):
mapper = functools.partial(process_data)
return paddle.reader.xmap_readers(mapper, reader, self.num_threads,
self.buffer_size)
return fluid.io.xmap_readers(mapper, reader, self.num_threads,
self.buffer_size)
def batch_reader():
batch_out = []
......
......@@ -26,7 +26,7 @@ except ImportError:
from io import BytesIO
import numpy as np
import paddle
import paddle.fluid as fluid
try:
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
......@@ -34,6 +34,7 @@ try:
import tempfile
from nvidia.dali.plugin.paddle import DALIGenericIterator
except:
Pipeline = object
print("DALI is not installed, you can improve performance if use DALI")
from PIL import Image, ImageEnhance
......@@ -272,8 +273,7 @@ class KineticsReader(DataReader):
img_mean=img_mean,
img_std=img_std)
return paddle.reader.xmap_readers(mapper, reader_, num_threads,
buf_size)
return fluid.io.xmap_readers(mapper, reader_, num_threads, buf_size)
def build_dali_reader(self):
"""
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""save or load model api"""
import os
import sys
import paddle
import paddle.fluid as fluid
def init_from_pretrain_model(args, exe, program):
assert isinstance(args.init_from_pretrain_model, str)
if not os.path.exists(args.init_from_pretrain_model):
raise Warning("The pretrained params do not exist.")
return False
def existed_params(var):
if not isinstance(var, fluid.framework.Parameter):
return False
return os.path.exists(
os.path.join(args.init_from_pretrain_model, var.name))
fluid.io.load_vars(
exe,
args.init_from_pretrain_model,
main_program=program,
predicate=existed_params)
print("finish initing model from pretrained params from %s" %
(args.init_from_pretrain_model))
return True
def init_from_checkpoint(args, exe, program):
assert isinstance(args.init_from_checkpoint, str)
if not os.path.exists(args.init_from_checkpoint):
raise Warning("the checkpoint path does not exist.")
return False
fluid.io.load_persistables(
executor=exe,
dirname=args.init_from_checkpoint,
main_program=program,
filename="checkpoint.pdckpt")
print("finish initing model from checkpoint from %s" %
(args.init_from_checkpoint))
return True
def init_from_params(args, exe, program):
assert isinstance(args.init_from_params, str)
if not os.path.exists(args.init_from_params):
raise Warning("the params path does not exist.")
return False
fluid.io.load_params(
executor=exe,
dirname=args.init_from_params,
main_program=program,
filename="params.pdparams")
print("finish init model from params from %s" % (args.init_from_params))
return True
def save_checkpoint(args, exe, program, dirname):
assert isinstance(args.save_model_path, str)
checkpoint_dir = os.path.join(args.save_model_path, args.save_checkpoint)
if not os.path.exists(checkpoint_dir):
os.mkdir(checkpoint_dir)
fluid.io.save_persistables(
exe,
os.path.join(checkpoint_dir, dirname),
main_program=program,
filename="checkpoint.pdckpt")
print("save checkpoint at %s" % (os.path.join(checkpoint_dir, dirname)))
return True
def save_param(args, exe, program, dirname):
assert isinstance(args.save_model_path, str)
param_dir = os.path.join(args.save_model_path, args.save_param)
if not os.path.exists(param_dir):
os.makedirs(param_dir)
fluid.io.save_params(
exe,
os.path.join(param_dir, dirname),
main_program=program,
filename="params.pdparams")
print("save parameters at %s" % (os.path.join(param_dir, dirname)))
return True
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""save or load model api"""
import os
import sys
import paddle
import paddle.fluid as fluid
def init_from_pretrain_model(args, exe, program):
assert isinstance(args.init_from_pretrain_model, str)
if not os.path.exists(args.init_from_pretrain_model):
raise Warning("The pretrained params do not exist.")
return False
def existed_params(var):
if not isinstance(var, fluid.framework.Parameter):
return False
return os.path.exists(
os.path.join(args.init_from_pretrain_model, var.name))
fluid.io.load_vars(
exe,
args.init_from_pretrain_model,
main_program=program,
predicate=existed_params)
print("finish initing model from pretrained params from %s" %
(args.init_from_pretrain_model))
return True
def init_from_checkpoint(args, exe, program):
assert isinstance(args.init_from_checkpoint, str)
if not os.path.exists(args.init_from_checkpoint):
raise Warning("the checkpoint path does not exist.")
return False
fluid.io.load_persistables(
executor=exe,
dirname=args.init_from_checkpoint,
main_program=program,
filename="checkpoint.pdckpt")
print("finish initing model from checkpoint from %s" %
(args.init_from_checkpoint))
return True
def init_from_params(args, exe, program):
assert isinstance(args.init_from_params, str)
if not os.path.exists(args.init_from_params):
raise Warning("the params path does not exist.")
return False
fluid.io.load_params(
executor=exe,
dirname=args.init_from_params,
main_program=program,
filename="params.pdparams")
print("finish init model from params from %s" % (args.init_from_params))
return True
def save_checkpoint(args, exe, program, dirname):
assert isinstance(args.save_model_path, str)
checkpoint_dir = os.path.join(args.save_model_path, args.save_checkpoint)
if not os.path.exists(checkpoint_dir):
os.mkdir(checkpoint_dir)
fluid.io.save_persistables(
exe,
os.path.join(checkpoint_dir, dirname),
main_program=program,
filename="checkpoint.pdckpt")
print("save checkpoint at %s" % (os.path.join(checkpoint_dir, dirname)))
return True
def save_param(args, exe, program, dirname):
assert isinstance(args.save_model_path, str)
param_dir = os.path.join(args.save_model_path, args.save_param)
if not os.path.exists(param_dir):
os.makedirs(param_dir)
fluid.io.save_params(
exe,
os.path.join(param_dir, dirname),
main_program=program,
filename="params.pdparams")
print("save parameters at %s" % (os.path.join(param_dir, dirname)))
return True
......@@ -105,15 +105,15 @@ def create_pyreader(args,
# create lac pyreader
if mode == 'train':
pyreader.set_sample_list_generator(
paddle.batch(
paddle.reader.shuffle(
fluid.io.batch(
fluid.io.shuffle(
reader.file_reader(file_name),
buf_size=args.traindata_shuffle_buffer),
batch_size=args.batch_size / device_count),
places=place)
else:
pyreader.set_sample_list_generator(
paddle.batch(
fluid.io.batch(
reader.file_reader(
file_name, mode=mode),
batch_size=args.batch_size / device_count),
......
......@@ -32,7 +32,7 @@
1. Install PaddlePaddle
This project depends on PaddlePaddle 1.6 or above (or an appropriate develop version); please install it following the [installation guide](http://www.paddlepaddle.org/#quick-start)
This project depends on PaddlePaddle 1.8 or above (or an appropriate develop version); please install it following the [installation guide](https://www.paddlepaddle.org.cn/install/quick)
2. Download the code
......@@ -44,7 +44,7 @@
3. Environment dependencies
Please refer to the PaddlePaddle [installation instructions](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html)
Please refer to the PaddlePaddle [installation instructions](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/index_cn.html)
### Data preparation
......
......@@ -752,18 +752,17 @@ def fast_decode(model_input, src_vocab_size, trg_vocab_size, max_in_len,
# caches contains states of history steps in decoder self-attention
# and static encoder output projections in encoder-decoder attention
# to reduce redundant computation.
batch_size = layers.shape(start_tokens)[0]
caches = [
{
"k": # for self attention
layers.fill_constant_batch_size_like(
input=start_tokens,
shape=[-1, n_head, 0, d_key],
layers.fill_constant(
shape=[batch_size, n_head, 0, d_key],
dtype=enc_output.dtype,
value=0),
"v": # for self attention
layers.fill_constant_batch_size_like(
input=start_tokens,
shape=[-1, n_head, 0, d_value],
layers.fill_constant(
shape=[batch_size, n_head, 0, d_value],
dtype=enc_output.dtype,
value=0),
"static_k": # for encoder-decoder attention
......@@ -792,12 +791,10 @@ def fast_decode(model_input, src_vocab_size, trg_vocab_size, max_in_len,
lambda x: layers.gather(x, index=gather_idx), caches)
pre_src_attn_bias = layers.gather(
trg_src_attn_bias, index=gather_idx)
bias_batch_size = layers.shape(pre_src_attn_bias)[0]
pre_pos = layers.elementwise_mul(
x=layers.fill_constant_batch_size_like(
input=pre_src_attn_bias, # cann't use lod tensor here
value=1,
shape=[-1, 1],
dtype=pre_ids.dtype),
x=layers.fill_constant(
value=1, shape=[bias_batch_size, 1], dtype=pre_ids.dtype),
y=step_idx,
axis=0)
logits = wrap_decoder(
......
......@@ -210,7 +210,7 @@ class DataLayer(object):
"""
operation
"""
data = fluid.layers.data(
data = fluid.data(
name=name, shape=shape, dtype=dtype, lod_level=lod_level)
return data
......@@ -383,8 +383,10 @@ class ConstantLayer(object):
"""
operation
"""
constant = fluid.layers.fill_constant_batch_size_like(input, shape,
dtype, value)
shape = list(shape)
input_shape = fluid.layers.shape(input)
shape[0] = input_shape[0]
constant = fluid.layers.fill_constant(shape, dtype, value)
return constant
......
......@@ -22,7 +22,7 @@
|UNICOM|China Unicom customer service|Customer service|
## Quick start
#### Version dependencies
This project depends on PaddlePaddle Fluid 1.6; please install it following the [installation guide](http://www.paddlepaddle.org/#quick-start).
This project depends on PaddlePaddle Fluid 1.8; please install it following the [installation guide](https://www.paddlepaddle.org.cn/install/quick).
The Python version required is Python 2.7
#### Install the code
......
......@@ -47,46 +47,51 @@ from models.model_check import check_version
from models.model_check import check_cuda
def create_model(args, pyreader_name, is_inference=False, is_pointwise=False):
def create_model(args, is_inference=False, is_pointwise=False):
"""
Create Model for simnet
"""
if is_inference:
inf_pyreader = fluid.layers.py_reader(
capacity=16,
shapes=([-1], [-1]),
dtypes=('int64', 'int64'),
lod_levels=(1, 1),
name=pyreader_name,
use_double_buffer=False)
left = fluid.data(name='left', shape=[None], dtype='int64', lod_level=1)
pos_right = fluid.data(
name='pos_right', shape=[None], dtype='int64', lod_level=1)
inf_loader = fluid.io.DataLoader.from_generator(
capacity=16,
feed_list=[left, pos_right],
iterable=False,
use_double_buffer=False)
left, pos_right = fluid.layers.read_file(inf_pyreader)
return inf_pyreader, left, pos_right
return inf_loader, left, pos_right
else:
if is_pointwise:
pointwise_pyreader = fluid.layers.py_reader(
capacity=16,
shapes=([-1], [-1], [-1]),
dtypes=('int64', 'int64', 'int64'),
lod_levels=(1, 1, 0),
name=pyreader_name,
use_double_buffer=False)
left, right, label = fluid.layers.read_file(pointwise_pyreader)
return pointwise_pyreader, left, right, label
left = fluid.data(
name='left', shape=[None], dtype='int64', lod_level=1)
right = fluid.data(
name='right', shape=[None], dtype='int64', lod_level=1)
label = fluid.data(name='label', shape=[None], dtype='int64')
pointwise_loader = fluid.io.DataLoader.from_generator(
capacity=16,
feed_list=[left, right, label],
iterable=False,
use_double_buffer=False)
return pointwise_loader, left, right, label
else:
pairwise_pyreader = fluid.layers.py_reader(
capacity=16,
shapes=([-1], [-1], [-1]),
dtypes=('int64', 'int64', 'int64'),
lod_levels=(1, 1, 1),
name=pyreader_name,
use_double_buffer=False)
left = fluid.data(
name='left', shape=[None], dtype='int64', lod_level=1)
pos_right = fluid.data(
name='pos_right', shape=[None], dtype='int64', lod_level=1)
neg_right = fluid.data(
name='neg_right', shape=[None], dtype='int64', lod_level=1)
pairwise_loader = fluid.io.DataLoader.from_generator(
capacity=16,
feed_list=[left, pos_right, neg_right],
iterable=False,
use_double_buffer=False)
left, pos_right, neg_right = fluid.layers.read_file(pairwise_pyreader)
return pairwise_pyreader, left, pos_right, neg_right
return pairwise_loader, left, pos_right, neg_right
def train(conf_dict, args):
......@@ -131,8 +136,7 @@ def train(conf_dict, args):
# Build network
with fluid.program_guard(train_program, startup_prog):
with fluid.unique_name.guard():
train_pyreader, left, pos_right, neg_right = create_model(
args, pyreader_name='train_reader')
train_loader, left, pos_right, neg_right = create_model(args)
left_feat, pos_score = net.predict(left, pos_right)
pred = pos_score
_, neg_score = net.predict(left, neg_right)
......@@ -147,8 +151,8 @@ def train(conf_dict, args):
test_prog = fluid.Program()
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, left, pos_right = create_model(
args, pyreader_name='test_reader', is_inference=True)
test_loader, left, pos_right = create_model(
args, is_inference=True)
left_feat, pos_score = net.predict(left, pos_right)
pred = pos_score
test_prog = test_prog.clone(for_test=True)
......@@ -157,8 +161,8 @@ def train(conf_dict, args):
# Build network
with fluid.program_guard(train_program, startup_prog):
with fluid.unique_name.guard():
train_pyreader, left, right, label = create_model(
args, pyreader_name='train_reader', is_pointwise=True)
train_loader, left, right, label = create_model(
args, is_pointwise=True)
left_feat, pred = net.predict(left, right)
avg_cost = loss.compute(pred, label)
avg_cost.persistable = True
......@@ -171,15 +175,15 @@ def train(conf_dict, args):
test_prog = fluid.Program()
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, left, right = create_model(
args, pyreader_name='test_reader', is_inference=True)
test_loader, left, right = create_model(
args, is_inference=True)
left_feat, pred = net.predict(left, right)
test_prog = test_prog.clone(for_test=True)
if args.init_checkpoint != "":
utils.init_checkpoint(exe, args.init_checkpoint, startup_prog)
def valid_and_test(test_program, test_pyreader, get_valid_examples, process,
def valid_and_test(test_program, test_loader, get_valid_examples, process,
mode, exe, fetch_list):
"""
return auc and acc
......@@ -187,15 +191,15 @@ def train(conf_dict, args):
# Get Batch Data
batch_data = fluid.io.batch(
get_valid_examples, args.batch_size, drop_last=False)
test_pyreader.decorate_paddle_reader(batch_data)
test_pyreader.start()
test_loader.set_sample_list_generator(batch_data)
test_loader.start()
pred_list = []
while True:
try:
_pred = exe.run(program=test_program, fetch_list=[pred.name])
pred_list += list(_pred)
except fluid.core.EOFException:
test_pyreader.reset()
test_loader.reset()
break
pred_list = np.vstack(pred_list)
if mode == "test":
......@@ -233,8 +237,8 @@ def train(conf_dict, args):
get_train_examples, buf_size=10000),
args.batch_size,
drop_last=False)
train_pyreader.decorate_paddle_reader(train_batch_data)
train_pyreader.start()
train_loader.set_sample_list_generator(train_batch_data)
train_loader.start()
exe.run(startup_prog)
losses = []
start_time = time.time()
......@@ -248,8 +252,8 @@ def train(conf_dict, args):
if args.do_valid and global_step % args.validation_steps == 0:
get_valid_examples = simnet_process.get_reader("valid")
valid_result = valid_and_test(
test_prog, test_pyreader, get_valid_examples,
simnet_process, "valid", exe, [pred.name])
test_prog, test_loader, get_valid_examples, simnet_process,
"valid", exe, [pred.name])
if args.compute_accuracy:
valid_auc, valid_acc = valid_result
logging.info(
......@@ -281,7 +285,7 @@ def train(conf_dict, args):
logging.info("saving infer model in %s" % model_path)
except fluid.core.EOFException:
train_pyreader.reset()
train_loader.reset()
break
end_time = time.time()
#logging.info("epoch: %d, loss: %f, used time: %d sec" %
......@@ -327,9 +331,8 @@ def train(conf_dict, args):
else:
# Get Feeder and Reader
get_test_examples = simnet_process.get_reader("test")
test_result = valid_and_test(test_prog, test_pyreader,
get_test_examples, simnet_process, "test",
exe, [pred.name])
test_result = valid_and_test(test_prog, test_loader, get_test_examples,
simnet_process, "test", exe, [pred.name])
if args.compute_accuracy:
test_auc, test_acc = test_result
logging.info("AUC of test is %f, Accuracy of test is %f" %
......@@ -371,8 +374,8 @@ def test(conf_dict, args):
if args.task_mode == "pairwise":
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, left, pos_right = create_model(
args, pyreader_name='test_reader', is_inference=True)
test_loader, left, pos_right = create_model(
args, is_inference=True)
left_feat, pos_score = net.predict(left, pos_right)
pred = pos_score
test_prog = test_prog.clone(for_test=True)
......@@ -380,8 +383,8 @@ def test(conf_dict, args):
else:
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, left, right = create_model(
args, pyreader_name='test_reader', is_inference=True)
test_loader, left, right = create_model(
args, is_inference=True)
left_feat, pred = net.predict(left, right)
test_prog = test_prog.clone(for_test=True)
......@@ -390,10 +393,10 @@ def test(conf_dict, args):
utils.init_checkpoint(exe, args.init_checkpoint, main_program=test_prog)
test_exe = exe
test_pyreader.decorate_paddle_reader(batch_data)
test_loader.set_sample_list_generator(batch_data)
logging.info("start test process ...")
test_pyreader.start()
test_loader.start()
pred_list = []
fetch_list = [pred.name]
output = []
......@@ -412,7 +415,7 @@ def test(conf_dict, args):
map(lambda item: str(np.argmax(item)), output[0])) +
"\n")
except fluid.core.EOFException:
test_pyreader.reset()
test_loader.reset()
break
if args.task_mode == "pairwise":
pred_list = np.array(pred_list).reshape((-1, 1))
......@@ -468,16 +471,16 @@ def infer(conf_dict, args):
if args.task_mode == "pairwise":
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
infer_pyreader, left, pos_right = create_model(
args, pyreader_name='infer_reader', is_inference=True)
infer_loader, left, pos_right = create_model(
args, is_inference=True)
left_feat, pos_score = net.predict(left, pos_right)
pred = pos_score
test_prog = test_prog.clone(for_test=True)
else:
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
infer_pyreader, left, right = create_model(
args, pyreader_name='infer_reader', is_inference=True)
infer_loader, left, right = create_model(
args, is_inference=True)
left_feat, pred = net.predict(left, right)
test_prog = test_prog.clone(for_test=True)
......@@ -486,13 +489,13 @@ def infer(conf_dict, args):
utils.init_checkpoint(exe, args.init_checkpoint, main_program=test_prog)
test_exe = exe
infer_pyreader.decorate_sample_list_generator(batch_data)
infer_loader.set_sample_list_generator(batch_data)
logging.info("start test process ...")
preds_list = []
fetch_list = [pred.name]
output = []
infer_pyreader.start()
infer_loader.start()
while True:
try:
output = test_exe.run(program=test_prog, fetch_list=fetch_list)
......@@ -502,7 +505,7 @@ def infer(conf_dict, args):
else:
preds_list += map(lambda item: str(np.argmax(item)), output[0])
except fluid.core.EOFException:
infer_pyreader.reset()
infer_loader.reset()
break
with io.open(args.infer_result_path, "w", encoding="utf8") as infer_file:
for _data, _pred in zip(simnet_process.get_infer_data(), preds_list):
......
# Paddle_baseline_KDD2019
Paddle baseline for KDD2019 "Context-Aware Multi-Modal Transportation Recommendation" (https://dianshi.baidu.com/competition/29/question)
This repository contains the demo code for the KDD2019 "Context-Aware Multi-Modal Transportation Recommendation" competition, written in Python with PaddlePaddle. Note that this repository is under development, and everyone is welcome to contribute. The current baseline solution scores about 0.68 - 0.69 on the online submission; as an example, my submission based on these networks reached 0.6898.
This baseline is published to encourage the use of PaddlePaddle and the construction of powerful recommendation models with it.
The example code runs on Linux with Python 2.7 on a single CPU machine. Distributed training options are not provided here; to learn more about them, please check the other examples at https://github.com/PaddlePaddle/models. Regarding training speed, one epoch with a batch size of 1000 takes about 8 minutes over the full set of training instances generated from the raw data with the SGD optimizer (somewhat longer with the Adam optimizer).
The configuration and pipeline of all the networks are basic; many optimizations can be built on top of them to achieve better results, e.g. a better cost function, more powerful feature engineering, dedicated model validation, and NN optimization tricks.
The code is rough and comes from my daily use; it will be cleaned up over time.
## Install PaddlePaddle
Please visit the official PaddlePaddle site (http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html)
## preprocess feature
```python
python preprocess_dense.py # change for different feature strategy
python pre_test_dense.py
```
preprocess.py and preprocess_dense.py preprocess the raw data. Two versions are provided, one for all-sparse features and one for sparse plus dense features. Correspondingly, pre_process_test.py and pre_test_dense.py preprocess the raw test data. The training instances are saved as JSON, so it is easy to add new features. In our demo, all features are generated from the provided raw data except for the weather feature, which is generated from open weather records.
Note that the features generated in this step need to match the model's input; make sure to use the matching version. In the demo code, the sparse plus dense features are used with network_confv6. A small sketch of reading the JSON instances follows.
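For illustration only, here is a minimal sketch of iterating over JSON-line training instances such as those written by the preprocessing step; the field names ("dense_feature", "label") and the file path are assumptions for this sketch, not necessarily the exact keys or paths produced by preprocess_dense.py.
```python
import json

def read_instances(path):
    """Yield (dense_features, label) pairs from a JSON-lines instance file.

    The key names below are illustrative; check the actual output of
    preprocess_dense.py for the real schema.
    """
    with open(path, 'r') as f:
        for line in f:
            instance = json.loads(line)
            yield instance.get("dense_feature", []), instance.get("label", 0)

if __name__ == '__main__':
    for dense, label in read_instances("./out/normed_train.txt"):
        print(dense, label)
        break
```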
## build the network
The main network logic is in network_confv?.py. The networks are based on FM and deep-learning-related algorithms. I tried several networks and published some of them. There may be some defects in the networks, but all of them are functional; a rough sketch of the FM-plus-deep idea is shown below.
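As an illustrative sketch only (not the actual contents of network_confv?.py; the layer sizes, slot layout, and variable names here are assumptions), an FM-plus-deep network in Fluid can look like this:
```python
import paddle.fluid as fluid

def fm_deep_net(sparse_ids, dense_input, sparse_dim=1000001, emb_dim=16):
    """Hypothetical FM + deep network sketch; the real networks differ."""
    # first-order term over sparse ids (a lod_level=1 int64 input)
    first_order = fluid.layers.embedding(
        input=sparse_ids, size=[sparse_dim, 1], is_sparse=True)
    first_order = fluid.layers.sequence_pool(first_order, pool_type='sum')

    # second-order FM term: 0.5 * ((sum v)^2 - sum(v^2))
    emb = fluid.layers.embedding(
        input=sparse_ids, size=[sparse_dim, emb_dim], is_sparse=True)
    sum_emb = fluid.layers.sequence_pool(emb, pool_type='sum')
    sum_sq = fluid.layers.square(sum_emb)
    sq_sum = fluid.layers.sequence_pool(
        fluid.layers.square(emb), pool_type='sum')
    second_order = 0.5 * fluid.layers.reduce_sum(
        sum_sq - sq_sum, dim=1, keep_dim=True)

    # deep part over dense features
    hidden = fluid.layers.fc(input=dense_input, size=64, act='relu')
    hidden = fluid.layers.fc(input=hidden, size=32, act='relu')

    # combine all parts and predict the selection probability
    concat = fluid.layers.concat([first_order, second_order, hidden], axis=1)
    return fluid.layers.fc(input=concat, size=2, act='softmax')
```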
## train the network
```python
python local_train.py
```
local_train.py and map_reader.py use the dataset API, so you need to download the corresponding .whl package or build from the develop branch of PaddlePaddle. The dataset API is used because it feeds data much faster.
Note that the input format fed into the network is self-defined; make sure the same format is used for training and test. A minimal wiring sketch of the dataset API follows.
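For orientation, a minimal sketch of how the Fluid dataset API is typically wired up; the placeholder variables, file list, and pipe command below are assumptions for illustration, not the exact setup used in local_train.py.
```python
import paddle.fluid as fluid

# Hypothetical placeholders; the real model defines its own feature slots.
dense = fluid.layers.data(name='dense_feature', shape=[3], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# (network construction and optimizer omitted; see local_train.py)

dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset.set_use_var([dense, label])
# Each worker pipes raw lines through this command to parse them.
dataset.set_pipe_command("python map_reader.py")
dataset.set_batch_size(1000)
dataset.set_thread(10)
dataset.set_filelist(["./out/normed_train.txt"])  # illustrative path

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
# train_from_dataset avoids the Python-side feeding overhead.
exe.train_from_dataset(
    program=fluid.default_main_program(), dataset=dataset)
```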
## test results
```python
python generate_test.py
python build_submit.py
```
In generate_test.py and build_submit.py, for convenience, the network is trained on the whole training data and then evaluated on the provided unlabeled test data.
import argparse
def parse_args():
parser = argparse.ArgumentParser(description="PaddlePaddle CTR example")
parser.add_argument(
'--train_data_path',
type=str,
default='./data/raw/train.txt',
help="The path of training dataset")
parser.add_argument(
'--test_data_path',
type=str,
default='./data/raw/valid.txt',
help="The path of testing dataset")
parser.add_argument(
'--batch_size',
type=int,
default=1000,
help="The size of mini-batch (default:1000)")
parser.add_argument(
'--embedding_size',
type=int,
default=16,
help="The size for embedding layer (default:10)")
parser.add_argument(
'--num_passes',
type=int,
default=10,
help="The number of passes to train (default: 10)")
parser.add_argument(
'--model_output_dir',
type=str,
default='models',
help='The path for model to store (default: models)')
parser.add_argument(
'--sparse_feature_dim',
type=int,
default=1000001,
help='sparse feature hashing space for index processing')
parser.add_argument(
'--is_local',
type=int,
default=1,
help='Local train or distributed train (default: 1)')
parser.add_argument(
'--cloud_train',
type=int,
default=0,
help='Local train or distributed train on paddlecloud (default: 0)')
parser.add_argument(
'--async_mode',
action='store_true',
default=False,
help='Whether start pserver in async mode to support ASGD')
parser.add_argument(
'--no_split_var',
action='store_true',
default=False,
help='Whether split variables into blocks when update_method is pserver')
parser.add_argument(
'--role',
type=str,
default='pserver', # trainer or pserver
help='The path for model to store (default: models)')
parser.add_argument(
'--endpoints',
type=str,
default='127.0.0.1:6000',
help='The pserver endpoints, like: 127.0.0.1:6000,127.0.0.1:6001')
parser.add_argument(
'--current_endpoint',
type=str,
default='127.0.0.1:6000',
help='The path for model to store (default: 127.0.0.1:6000)')
parser.add_argument(
'--trainer_id',
type=int,
default=0,
help='The path for model to store (default: models)')
parser.add_argument(
'--trainers',
type=int,
default=1,
help='The num of trianers, (default: 1)')
return parser.parse_args()
import json
import csv
import io
def build():
submit_map = {}
with io.open('./submit/submit.csv', 'wb') as csv_file:
writer = csv.writer(csv_file, delimiter=',')
writer.writerow(['sid', 'recommend_mode'])
with open('./out/normed_test_session.txt', 'r') as f1:
with open('./testres/res8', 'r') as f2:
cur_session =''
for x, y in zip(f1.readlines(), f2.readlines()):
m1 = json.loads(x)
session_id = m1["session_id"]
if cur_session == '':
cur_session = session_id
transport_mode = m1["plan"]["transport_mode"]
if cur_session != session_id:
writer.writerow([str(cur_session), str(submit_map[cur_session]["transport_mode"])])
cur_session = session_id
if session_id not in submit_map:
submit_map[session_id] = {}
submit_map[session_id]["transport_mode"] = transport_mode
submit_map[session_id]["probability"] = y
#if int(submit_map[session_id]["transport_mode"]) == 0 and submit_map[session_id]["probability"] > 0.02:
#submit_map[session_id]["probability"] = 0.99
else:
if float(y) > float(submit_map[session_id]["probability"]):
submit_map[session_id]["transport_mode"] = transport_mode
submit_map[session_id]["probability"] = y
#if int(submit_map[session_id]["transport_mode"]) == 0 and submit_map[session_id]["probability"] > 0.02:
#submit_map[session_id]["transport_mode"] = 0
#submit_map[session_id]["probability"] = 0.99
writer.writerow([cur_session, submit_map[cur_session]["transport_mode"]])
if __name__ == "__main__":
build()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import numpy as np
# disable gpu training for this example
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""
import paddle
import paddle.fluid as fluid
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
num_context_feature = 22
def parse_args():
parser = argparse.ArgumentParser(description="PaddlePaddle DeepFM example")
parser.add_argument(
'--model_path',
type=str,
#required=True,
default='models',
help="The path of model parameters gz file")
parser.add_argument(
'--data_path',
type=str,
required=False,
help="The path of the dataset to infer")
parser.add_argument(
'--embedding_size',
type=int,
default=16,
help="The size for embedding layer (default:10)")
parser.add_argument(
'--sparse_feature_dim',
type=int,
default=1000001,
help="The size for embedding layer (default:1000001)")
parser.add_argument(
'--batch_size',
type=int,
default=1000,
help="The size of mini-batch (default:1000)")
return parser.parse_args()
def to_lodtensor(data, place):
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = fluid.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
def data2tensor(data, place):
feed_dict = {}
dense = data[0]
sparse = data[1:-1]
y = data[-1]
#user_data = np.array([x[0] for x in data]).astype("float32")
#user_data = user_data.reshape([-1, 10])
#feed_dict["user_profile"] = user_data
dense_data = np.array([x[0] for x in data]).astype("float32")
dense_data = dense_data.reshape([-1, 3])
feed_dict["dense_feature"] = dense_data
for i in range(num_context_feature):
sparse_data = to_lodtensor([x[1 + i] for x in data], place)
feed_dict["context" + str(i)] = sparse_data
context_fm = to_lodtensor(
np.array([x[-2] for x in data]).astype("float32"), place)
feed_dict["context_fm"] = context_fm
y_data = np.array([x[-1] for x in data]).astype("int64")
y_data = y_data.reshape([-1, 1])
feed_dict["label"] = y_data
return feed_dict
def test():
args = parse_args()
place = fluid.CPUPlace()
test_scope = fluid.core.Scope()
# filelist = ["%s/%s" % (args.data_path, x) for x in os.listdir(args.data_path)]
from map_reader import MapDataset
map_dataset = MapDataset()
map_dataset.setup(args.sparse_feature_dim)
exe = fluid.Executor(place)
whole_filelist = ["./out/normed_test_session.txt"]
test_files = whole_filelist[int(0.0 * len(whole_filelist)):int(1.0 * len(
whole_filelist))]
epochs = 1
for i in range(epochs):
cur_model_path = os.path.join(args.model_path,
"epoch" + str(1) + ".model")
with open("./testres/res" + str(i), 'w') as r:
with fluid.scope_guard(test_scope):
[inference_program, feed_target_names, fetch_targets] = \
fluid.io.load_inference_model(cur_model_path, exe)
test_reader = map_dataset.test_reader(test_files, 1000, 100000)
k = 0
for batch_id, data in enumerate(test_reader()):
print(len(data[0]))
feed_dict = data2tensor(data, place)
loss_val, auc_val, accuracy, predict, _ = exe.run(
inference_program,
feed=feed_dict,
fetch_list=fetch_targets,
return_numpy=False)
x = np.array(predict)
for j in range(x.shape[0]):
r.write(str(x[j][1]))
r.write("\n")
if __name__ == '__main__':
test()
import argparse
import logging
import numpy as np
# disable gpu training for this example
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""
import paddle
import paddle.fluid as fluid
import map_reader
from network_conf import ctr_deepfm_dataset
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
def parse_args():
parser = argparse.ArgumentParser(description="PaddlePaddle DeepFM example")
parser.add_argument(
'--model_path',
type=str,
#required=True,
default='models',
help="The path of model parameters gz file")
parser.add_argument(
'--data_path',
type=str,
required=False,
help="The path of the dataset to infer")
parser.add_argument(
'--embedding_size',
type=int,
default=16,
help="The size for embedding layer (default:10)")
parser.add_argument(
'--sparse_feature_dim',
type=int,
default=1000001,
help="The size for embedding layer (default:1000001)")
parser.add_argument(
'--batch_size',
type=int,
default=1000,
help="The size of mini-batch (default:1000)")
return parser.parse_args()
def to_lodtensor(data, place):
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = fluid.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
def data2tensor(data, place):
feed_dict = {}
test_dict = {}
dense = data[0]
sparse = data[1:-1]
y = data[-1]
dense_data = np.array([x[0] for x in data]).astype("float32")
dense_data = dense_data.reshape([-1, 65])
feed_dict["user_profile"] = dense_data
for i in range(10):
sparse_data = to_lodtensor([x[1 + i] for x in data], place)
feed_dict["context" + str(i)] = sparse_data
y_data = np.array([x[-1] for x in data]).astype("int64")
y_data = y_data.reshape([-1, 1])
feed_dict["label"] = y_data
test_dict["test"] = [1]
return feed_dict, test_dict
def infer():
args = parse_args()
place = fluid.CPUPlace()
inference_scope = fluid.core.Scope()
filelist = [
"%s/%s" % (args.data_path, x) for x in os.listdir(args.data_path)
]
from map_reader import MapDataset
map_dataset = MapDataset()
map_dataset.setup(args.sparse_feature_dim)
exe = fluid.Executor(place)
whole_filelist = [
"raw_data/part-%d" % x for x in range(len(os.listdir("raw_data")))
]
#whole_filelist = ["./out/normed_train09", "./out/normed_train10", "./out/normed_train11"]
test_files = whole_filelist[int(0.0 * len(whole_filelist)):int(1.0 * len(
whole_filelist))]
# file_groups = [whole_filelist[i:i+train_thread_num] for i in range(0, len(whole_filelist), train_thread_num)]
def set_zero(var_name):
param = inference_scope.var(var_name).get_tensor()
param_array = np.zeros(param._get_dims()).astype("int64")
param.set(param_array, place)
epochs = 2
for i in range(epochs):
cur_model_path = os.path.join(args.model_path,
"epoch" + str(i + 1) + ".model")
with fluid.scope_guard(inference_scope):
[inference_program, feed_target_names, fetch_targets] = \
fluid.io.load_inference_model(cur_model_path, exe)
auc_states_names = ['_generated_var_2', '_generated_var_3']
for name in auc_states_names:
set_zero(name)
test_reader = map_dataset.infer_reader(test_files, 1000, 100000)
for batch_id, data in enumerate(test_reader()):
loss_val, auc_val, accuracy, predict, label = exe.run(
inference_program,
feed=data2tensor(data, place),
fetch_list=fetch_targets,
return_numpy=False)
#print(np.array(predict))
#x = np.array(predict)
#print(x.shape)
#print("train_pass_%d, test_pass_%d\t%f\t" % (i - 1, i, auc_val))
if __name__ == '__main__':
infer()
from __future__ import print_function
from args import parse_args
import os
import paddle.fluid as fluid
import sys
from network_confv6 import ctr_deepfm_dataset
NUM_CONTEXT_FEATURE = 22
DIM_USER_PROFILE = 10
DIM_DENSE_FEATURE = 3
PYTHON_PATH = "/home/yaoxuefeng/whls/paddle_release_home/python/bin/python" # this is mine change yours
def train():
args = parse_args()
if not os.path.isdir(args.model_output_dir):
os.mkdir(args.model_output_dir)
#set the input format for our model; these inputs need to be modified carefully when you define a new network
#user_profile = fluid.layers.data(
#name="user_profile", shape=[DIM_USER_PROFILE], dtype='int64', lod_level=1)
dense_feature = fluid.layers.data(
name="dense_feature", shape=[DIM_DENSE_FEATURE], dtype='float32')
context_feature = [
fluid.layers.data(
name="context" + str(i), shape=[1], lod_level=1, dtype="int64")
for i in range(0, NUM_CONTEXT_FEATURE)
]
context_feature_fm = fluid.layers.data(
name="context_fm", shape=[1], dtype='int64', lod_level=1)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
print("ready to network")
#build the user-defined network
loss, auc_var, batch_auc_var, accuracy, predict = ctr_deepfm_dataset(
dense_feature, context_feature, context_feature_fm, label,
args.embedding_size, args.sparse_feature_dim)
print("ready to optimize")
optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
optimizer.minimize(loss)
#single machine CPU training; for more training options please visit the PaddlePaddle site
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
#use the dataset API for much faster data feeding
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_use_var([dense_feature] + context_feature +
[context_feature_fm] + [label])
#map_reader.py defines how the generated training instances are processed
pipe_command = PYTHON_PATH + " map_reader.py %d" % args.sparse_feature_dim
dataset.set_pipe_command(pipe_command)
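# Each dataset worker pipes the raw text lines of its files through this command;
# map_reader.py (MapDataset.run_from_stdin) is expected to turn every line into the slots
# declared via dataset.set_use_var above: dense_feature, context0..context21, context_fm, label.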
dataset.set_batch_size(args.batch_size)
thread_num = 1
dataset.set_thread(thread_num)
#split the training files yourself, for example: "split -a 2 -d -l 200000 normed_train.txt normed_train"
whole_filelist = [
"./out/normed_train%d" % x for x in range(len(os.listdir("out")))
]
whole_filelist = [
"./out/normed_train00", "./out/normed_train01", "./out/normed_train02",
"./out/normed_train03", "./out/normed_train04", "./out/normed_train05",
"./out/normed_train06", "./out/normed_train07", "./out/normed_train08",
"./out/normed_train09", "./out/normed_train10", "./out/normed_train11"
]
print("ready to epochs")
epochs = 10
for i in range(epochs):
print("start %dth epoch" % i)
dataset.set_filelist(whole_filelist[:int(len(whole_filelist))])
#print the information you want by setting fetch_list and fetch_info
exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=dataset,
fetch_list=[auc_var, accuracy, predict, label],
fetch_info=["auc", "accuracy", "predict", "label"],
debug=False)
model_dir = os.path.join(args.model_output_dir,
'epoch' + str(i + 1) + ".model")
sys.stderr.write("epoch%d finished" % (i + 1))
#save model
fluid.io.save_inference_model(
model_dir,
[dense_feature.name] + [x.name for x in context_feature] +
[context_feature_fm.name] + [label.name],
[loss, auc_var, accuracy, predict, label], exe)
if __name__ == '__main__':
train()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import json
import paddle.fluid.incubate.data_generator as dg
class MapDataset(dg.MultiSlotDataGenerator):
def setup(self, sparse_feature_dim):
self.profile_length = 65
self.dense_length = 3
#feature names
self.dense_feature_list = ["distance", "price", "eta"]
self.pid_list = ["pid"]
self.query_feature_list = ["weekday", "hour", "o1", "o2", "d1", "d2"]
self.plan_feature_list = ["transport_mode"]
self.rank_feature_list = ["plan_rank", "whole_rank", "price_rank", "eta_rank", "distance_rank"]
self.rank_whole_pic_list = ["mode_rank1", "mode_rank2", "mode_rank3", "mode_rank4",
"mode_rank5"]
self.weather_feature_list = ["max_temp", "min_temp", "wea", "wind"]
self.hash_dim = 1000001
self.train_idx_ = 2000000
#set this range carefully if you change the features
self.categorical_range_ = range(0, 22)
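#the 22 categorical slots come from: pid (1) + query features (6) + transport_mode (1)
#+ rank features (5) + whole-picture mode ranks (5) + weather features (4)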
#process one instance
def _process_line(self, line):
instance = json.loads(line)
"""
profile = instance["profile"]
len_profile = len(profile)
if len_profile >= 10:
user_profile_feature = profile[0:10]
else:
profile.extend([0]*(10-len_profile))
user_profile_feature = profile
if len(profile) > 1 or (len(profile) == 1 and profile[0] != 0):
for p in profile:
if p >= 1 and p <= 65:
user_profile_feature[p - 1] = 1
"""
context_feature = []
context_feature_fm = []
dense_feature = [0] * self.dense_length
plan = instance["plan"]
for i, val in enumerate(self.dense_feature_list):
dense_feature[i] = plan[val]
if (instance["pid"] == ""):
instance["pid"] = 0
query = instance["query"]
weather_dic = instance["weather"]
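#each categorical value is hashed together with its feature name and folded into the
#hash_dim space, e.g. hash("weekday" + "3") % 1000001; all such ids share one embedding
#dictionary, so unrelated features may occasionally collide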
for fea in self.pid_list:
context_feature.append([hash(fea + str(instance[fea])) % self.hash_dim])
context_feature_fm.append(hash(fea + str(instance[fea])) % self.hash_dim)
for fea in self.query_feature_list:
context_feature.append([hash(fea + str(query[fea])) % self.hash_dim])
context_feature_fm.append(hash(fea + str(query[fea])) % self.hash_dim)
for fea in self.plan_feature_list:
context_feature.append([hash(fea + str(plan[fea])) % self.hash_dim])
context_feature_fm.append(hash(fea + str(plan[fea])) % self.hash_dim)
for fea in self.rank_feature_list:
context_feature.append([hash(fea + str(instance[fea])) % self.hash_dim])
context_feature_fm.append(hash(fea + str(instance[fea])) % self.hash_dim)
for fea in self.rank_whole_pic_list:
context_feature.append([hash(fea + str(instance[fea])) % self.hash_dim])
context_feature_fm.append(hash(fea + str(instance[fea])) % self.hash_dim)
for fea in self.weather_feature_list:
context_feature.append([hash(fea + str(weather_dic[fea])) % self.hash_dim])
context_feature_fm.append(hash(fea + str(weather_dic[fea])) % self.hash_dim)
label = [int(instance["label"])]
return dense_feature, context_feature, context_feature_fm, label
def infer_reader(self, filelist, batch, buf_size):
print(filelist)
def local_iter():
for fname in filelist:
with open(fname.strip(), "r") as fin:
for line in fin:
dense_feature, sparse_feature, sparse_feature_fm, label = self._process_line(line)
yield [dense_feature] + sparse_feature + [sparse_feature_fm] + [label]
import paddle
batch_iter = paddle.batch(
paddle.reader.shuffle(
local_iter, buf_size=buf_size),
batch_size=batch)
return batch_iter
#generate inputs for testing
def test_reader(self, filelist, batch, buf_size):
print(filelist)
def local_iter():
for fname in filelist:
with open(fname.strip(), "r") as fin:
for line in fin:
dense_feature, sparse_feature, sparse_feature_fm, label = self._process_line(line)
yield [dense_feature] + sparse_feature + [sparse_feature_fm] + [label]
import paddle
batch_iter = paddle.batch(
paddle.reader.buffered(
local_iter, size=buf_size),
batch_size=batch)
return batch_iter
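#generate_sample below follows the MultiSlotDataGenerator protocol: for every raw line it
#yields (slot_name, value) pairs in the same order as the variables passed to
#dataset.set_use_var in the training script (dense_feature, context0..context21, context_fm, label)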
#generate inputs for training
def generate_sample(self, line):
def data_iter():
dense_feature, sparse_feature, sparse_feature_fm, label = self._process_line(line)
#feature_name = ["user_profile"]
feature_name = []
feature_name.append("dense_feature")
for idx in self.categorical_range_:
feature_name.append("context" + str(idx))
feature_name.append("context_fm")
feature_name.append("label")
yield zip(feature_name, [dense_feature] + sparse_feature + [sparse_feature_fm] + [label])
return data_iter
if __name__ == "__main__":
map_dataset = MapDataset()
map_dataset.setup(int(sys.argv[1]))
map_dataset.run_from_stdin()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import math
user_profile_dim = 65
dense_feature_dim = 3
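# Both dense_fm_layer and sparse_fm_layer below rely on the standard FM identity for the
# pairwise interaction term: per factor dimension f,
#   0.5 * ((sum_i x_i * v_if)^2 - sum_i x_i^2 * v_if^2) = sum_{i<j} v_if * v_jf * x_i * x_j,
# so second_order is the per-factor interaction vector; summing it over f would give the
# classical FM term sum_{i<j} <v_i, v_j> * x_i * x_j. Here the vector is kept as-is and fed,
# together with the first-order terms and the deep part, into the final fc layer.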
def ctr_deepfm_dataset(dense_feature, context_feature, context_feature_fm, label,
embedding_size, sparse_feature_dim):
def dense_fm_layer(input, emb_dict_size, factor_size, fm_param_attr):
first_order = fluid.layers.fc(input=input, size=1)
emb_table = fluid.layers.create_parameter(shape=[emb_dict_size, factor_size],
dtype='float32', attr=fm_param_attr)
input_mul_factor = fluid.layers.matmul(input, emb_table)
input_mul_factor_square = fluid.layers.square(input_mul_factor)
input_square = fluid.layers.square(input)
factor_square = fluid.layers.square(emb_table)
input_square_mul_factor_square = fluid.layers.matmul(input_square, factor_square)
second_order = 0.5 * (input_mul_factor_square - input_square_mul_factor_square)
return first_order, second_order
dense_fm_param_attr = fluid.param_attr.ParamAttr(name="DenseFeatFactors",
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(dense_feature_dim)))
dense_fm_first, dense_fm_second = dense_fm_layer(
dense_feature, dense_feature_dim, 16, dense_fm_param_attr)
def sparse_fm_layer(input, emb_dict_size, factor_size, fm_param_attr):
first_embeddings = fluid.layers.embedding(
input=input, dtype='float32', size=[emb_dict_size, 1], is_sparse=True)
first_order = fluid.layers.sequence_pool(input=first_embeddings, pool_type='sum')
nonzero_embeddings = fluid.layers.embedding(
input=input, dtype='float32', size=[emb_dict_size, factor_size],
param_attr=fm_param_attr, is_sparse=True)
summed_features_emb = fluid.layers.sequence_pool(input=nonzero_embeddings, pool_type='sum')
summed_features_emb_square = fluid.layers.square(summed_features_emb)
squared_features_emb = fluid.layers.square(nonzero_embeddings)
squared_sum_features_emb = fluid.layers.sequence_pool(
input=squared_features_emb, pool_type='sum')
second_order = 0.5 * (summed_features_emb_square - squared_sum_features_emb)
return first_order, second_order
sparse_fm_param_attr = fluid.param_attr.ParamAttr(name="SparseFeatFactors",
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(sparse_feature_dim)))
#data = fluid.layers.data(name='ids', shape=[1], dtype='float32')
sparse_fm_first, sparse_fm_second = sparse_fm_layer(
context_feature_fm, sparse_feature_dim, 16, sparse_fm_param_attr)
def embedding_layer(input):
return fluid.layers.embedding(
input=input,
is_sparse=True,
# you need to patch https://github.com/PaddlePaddle/Paddle/pull/14190
# if you want to set is_distributed to True
is_distributed=False,
size=[sparse_feature_dim, embedding_size],
param_attr=fluid.ParamAttr(name="SparseFeatFactors",
initializer=fluid.initializer.Uniform()))
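# Note: this embedding reuses the parameter name "SparseFeatFactors", so it appears to share
# its table with the factor embedding created in sparse_fm_layer (parameters with the same
# name are shared in fluid); that sharing requires embedding_size to equal the FM factor size of 16.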
sparse_embed_seq = list(map(embedding_layer, context_feature))
concated_ori = fluid.layers.concat(sparse_embed_seq + [dense_feature], axis=1)
concated = fluid.layers.batch_norm(input=concated_ori, name="bn", epsilon=1e-4)
deep = deep_net(concated)
predict = fluid.layers.fc(input=[deep, sparse_fm_first, sparse_fm_second, dense_fm_first, dense_fm_second], size=2, act="softmax",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(deep.shape[1])), learning_rate=0.01))
#similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(predict, min=-15.0, max=15.0), name="similarity_norm")
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.reduce_sum(cost)
accuracy = fluid.layers.accuracy(input=predict, label=label)
auc_var, batch_auc_var, auc_states = \
fluid.layers.auc(input=predict, label=label, num_thresholds=2 ** 12, slide_steps=20)
return avg_cost, auc_var, batch_auc_var, accuracy, predict
def deep_net(concated, lr_x=0.0001):
fc_layers_input = [concated]
fc_layers_size = [400, 400, 400]
fc_layers_act = ["relu"] * (len(fc_layers_size))
for i in range(len(fc_layers_size)):
fc = fluid.layers.fc(
input=fc_layers_input[-1],
size=fc_layers_size[i],
act=fc_layers_act[i],
param_attr=fluid.ParamAttr(learning_rate=lr_x * 0.5))
fc_layers_input.append(fc)
#w_res = fluid.layers.create_parameter(shape=[353, 16], dtype='float32', name="w_res")
#high_path = fluid.layers.matmul(concated, w_res)
#return fluid.layers.elementwise_add(high_path, fc_layers_input[-1])
return fc_layers_input[-1]
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import math
user_profile_dim = 65
num_context = 25
dim_fm_vector = 16
dim_concated = user_profile_dim + dim_fm_vector * (num_context)
def ctr_deepfm_dataset(user_profile, context_feature, label,
embedding_size, sparse_feature_dim):
def embedding_layer(input):
return fluid.layers.embedding(
input=input,
is_sparse=True,
# you need to patch https://github.com/PaddlePaddle/Paddle/pull/14190
# if you want to set is_distributed to True
is_distributed=False,
size=[sparse_feature_dim, embedding_size],
param_attr=fluid.ParamAttr(name="SparseFeatFactors",
initializer=fluid.initializer.Uniform()))
sparse_embed_seq = list(map(embedding_layer, context_feature))
w = fluid.layers.create_parameter(
shape=[65, 65], dtype='float32',
name="w_fm")
user_profile_emb = fluid.layers.matmul(user_profile, w)
concated_ori = fluid.layers.concat(sparse_embed_seq + [user_profile_emb], axis=1)
concated = fluid.layers.batch_norm(input=concated_ori, name="bn", epsilon=1e-4)
deep = deep_net(concated)
linear_term, second_term = fm(concated, dim_concated, 8) #depend on the number of context feature
predict = fluid.layers.fc(input=[deep, linear_term, second_term], size=2, act="softmax",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(deep.shape[1])), learning_rate=0.01))
#similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(predict, min=-15.0, max=15.0), name="similarity_norm")
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.reduce_sum(cost)
accuracy = fluid.layers.accuracy(input=predict, label=label)
auc_var, batch_auc_var, auc_states = \
fluid.layers.auc(input=predict, label=label, num_thresholds=2 ** 12, slide_steps=20)
return avg_cost, auc_var, batch_auc_var, accuracy, predict
def deep_net(concated, lr_x=0.0001):
fc_layers_input = [concated]
fc_layers_size = [128, 64, 32, 16]
fc_layers_act = ["relu"] * (len(fc_layers_size))
for i in range(len(fc_layers_size)):
fc = fluid.layers.fc(
input=fc_layers_input[-1],
size=fc_layers_size[i],
act=fc_layers_act[i],
param_attr=fluid.ParamAttr(learning_rate=lr_x * 0.5))
fc_layers_input.append(fc)
return fc_layers_input[-1]
def fm(concated, emb_dict_size, factor_size, lr_x=0.0001):
linear_term = fluid.layers.fc(input=concated, size=8, act=None, param_attr=fluid.ParamAttr(learning_rate=lr_x))
emb_table = fluid.layers.create_parameter(shape=[emb_dict_size, factor_size],
dtype='float32')
input_mul_factor = fluid.layers.matmul(concated, emb_table)
input_mul_factor_square = fluid.layers.square(input_mul_factor)
input_square = fluid.layers.square(concated)
factor_square = fluid.layers.square(emb_table)
input_square_mul_factor_square = fluid.layers.matmul(input_square, factor_square)
second_term = 0.5 * (input_mul_factor_square - input_square_mul_factor_square)
return linear_term, second_term
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import math
user_profile_dim = 65
slot_1 = [0, 1, 2, 3, 4, 5]
slot_2 = [6]
slot_3 = [7, 8, 9, 10, 11]
slot_4 = [12, 13, 14, 15, 16]
slot_5 = [17, 18, 19, 20]
num_context = 25
num_slots_pair = 5
dim_fm_vector = 16
dim_concated = user_profile_dim + dim_fm_vector * (num_context + num_slots_pair)
def ctr_deepfm_dataset(user_profile, dense_feature, context_feature, label,
embedding_size, sparse_feature_dim):
def embedding_layer(input):
return fluid.layers.embedding(
input=input,
is_sparse=True,
# you need to patch https://github.com/PaddlePaddle/Paddle/pull/14190
# if you want to set is_distributed to True
is_distributed=False,
size=[sparse_feature_dim, embedding_size],
param_attr=fluid.ParamAttr(name="SparseFeatFactors",
initializer=fluid.initializer.Uniform()))
sparse_embed_seq = list(map(embedding_layer, context_feature))
w = fluid.layers.create_parameter(
shape=[65, 65], dtype='float32',
name="w_fm")
user_emb_list = []
user_profile_emb = fluid.layers.matmul(user_profile, w)
user_emb_list.append(user_profile_emb)
user_emb_list.append(dense_feature)
w1 = fluid.layers.create_parameter(shape=[65, dim_fm_vector], dtype='float32', name="w_1")
w2 = fluid.layers.create_parameter(shape=[65, dim_fm_vector], dtype='float32', name="w_2")
w3 = fluid.layers.create_parameter(shape=[65, dim_fm_vector], dtype='float32', name="w_3")
w4 = fluid.layers.create_parameter(shape=[65, dim_fm_vector], dtype='float32', name="w_4")
w5 = fluid.layers.create_parameter(shape=[65, dim_fm_vector], dtype='float32', name="w_5")
user_profile_emb_1 = fluid.layers.matmul(user_profile, w1)
user_profile_emb_2 = fluid.layers.matmul(user_profile, w2)
user_profile_emb_3 = fluid.layers.matmul(user_profile, w3)
user_profile_emb_4 = fluid.layers.matmul(user_profile, w4)
user_profile_emb_5 = fluid.layers.matmul(user_profile, w5)
sparse_embed_seq_1 = embedding_layer(context_feature[slot_1[0]])
sparse_embed_seq_2 = embedding_layer(context_feature[slot_2[0]])
sparse_embed_seq_3 = embedding_layer(context_feature[slot_3[0]])
sparse_embed_seq_4 = embedding_layer(context_feature[slot_4[0]])
sparse_embed_seq_5 = embedding_layer(context_feature[slot_5[0]])
for i in slot_1[1:-1]:
sparse_embed_seq_1 = fluid.layers.elementwise_add(sparse_embed_seq_1, embedding_layer(context_feature[i]))
for i in slot_2[1:-1]:
sparse_embed_seq_2 = fluid.layers.elementwise_add(sparse_embed_seq_2, embedding_layer(context_feature[i]))
for i in slot_3[1:-1]:
sparse_embed_seq_3 = fluid.layers.elementwise_add(sparse_embed_seq_3, embedding_layer(context_feature[i]))
for i in slot_4[1:-1]:
sparse_embed_seq_4 = fluid.layers.elementwise_add(sparse_embed_seq_4, embedding_layer(context_feature[i]))
for i in slot_5[1:-1]:
sparse_embed_seq_5 = fluid.layers.elementwise_add(sparse_embed_seq_5, embedding_layer(context_feature[i]))
ele_product_1 = fluid.layers.elementwise_mul(user_profile_emb_1, sparse_embed_seq_1)
user_emb_list.append(ele_product_1)
ele_product_2 = fluid.layers.elementwise_mul(user_profile_emb_2, sparse_embed_seq_2)
user_emb_list.append(ele_product_2)
ele_product_3 = fluid.layers.elementwise_mul(user_profile_emb_3, sparse_embed_seq_3)
user_emb_list.append(ele_product_3)
ele_product_4 = fluid.layers.elementwise_mul(user_profile_emb_4, sparse_embed_seq_4)
user_emb_list.append(ele_product_4)
ele_product_5 = fluid.layers.elementwise_mul(user_profile_emb_5, sparse_embed_seq_5)
user_emb_list.append(ele_product_5)
ffm_1 = fluid.layers.reduce_sum(ele_product_1, dim=1, keep_dim=True)
ffm_2 = fluid.layers.reduce_sum(ele_product_2, dim=1, keep_dim=True)
ffm_3 = fluid.layers.reduce_sum(ele_product_3, dim=1, keep_dim=True)
ffm_4 = fluid.layers.reduce_sum(ele_product_4, dim=1, keep_dim=True)
ffm_5 = fluid.layers.reduce_sum(ele_product_5, dim=1, keep_dim=True)
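# Each ffm_i above is the inner product between a projection of the user profile
# (user_profile * w_i) and the summed embeddings of the context features in slot i,
# i.e. a field-aware user-context interaction reduced to a single scalar per instance.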
concated_ori = fluid.layers.concat(sparse_embed_seq + user_emb_list, axis=1)
concated = fluid.layers.batch_norm(input=concated_ori, name="bn", epsilon=1e-4)
deep = deep_net(concated)
linear_term, second_term = fm(concated, dim_concated, 8) #depend on the number of context feature
predict = fluid.layers.fc(input=[deep, linear_term, second_term, ffm_1, ffm_2, ffm_3, ffm_4, ffm_5], size=2, act="softmax",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(deep.shape[1])), learning_rate=0.01))
#similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(predict, min=-15.0, max=15.0), name="similarity_norm")
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.reduce_sum(cost)
accuracy = fluid.layers.accuracy(input=predict, label=label)
auc_var, batch_auc_var, auc_states = \
fluid.layers.auc(input=predict, label=label, num_thresholds=2 ** 12, slide_steps=20)
return avg_cost, auc_var, batch_auc_var, accuracy, predict
def deep_net(concated, lr_x=0.0001):
fc_layers_input = [concated]
fc_layers_size = [256, 128, 64, 32, 16]
fc_layers_act = ["relu"] * (len(fc_layers_size))
for i in range(len(fc_layers_size)):
fc = fluid.layers.fc(
input=fc_layers_input[-1],
size=fc_layers_size[i],
act=fc_layers_act[i],
param_attr=fluid.ParamAttr(learning_rate=lr_x * 0.5))
fc_layers_input.append(fc)
w_res = fluid.layers.create_parameter(shape=[dim_concated, 16], dtype='float32', name="w_res")
high_path = fluid.layers.matmul(concated, w_res)
return fluid.layers.elementwise_add(high_path, fc_layers_input[-1])
#return fc_layers_input[-1]
def fm(concated, emb_dict_size, factor_size, lr_x=0.0001):
linear_term = fluid.layers.fc(input=concated, size=8, act=None, param_attr=fluid.ParamAttr(learning_rate=lr_x))
emb_table = fluid.layers.create_parameter(shape=[emb_dict_size, factor_size],
dtype='float32')
input_mul_factor = fluid.layers.matmul(concated, emb_table)
input_mul_factor_square = fluid.layers.square(input_mul_factor)
input_square = fluid.layers.square(concated)
factor_square = fluid.layers.square(emb_table)
input_square_mul_factor_square = fluid.layers.matmul(input_square, factor_square)
second_term = 0.5 * (input_mul_factor_square - input_square_mul_factor_square)
return linear_term, second_term
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import math
user_profile_dim = 65
dense_feature_dim = 3
def ctr_deepfm_dataset(dense_feature, context_feature, context_feature_fm, label,
embedding_size, sparse_feature_dim):
def dense_fm_layer(input, emb_dict_size, factor_size, fm_param_attr):
first_order = fluid.layers.fc(input=input, size=1)
emb_table = fluid.layers.create_parameter(shape=[emb_dict_size, factor_size],
dtype='float32', attr=fm_param_attr)
input_mul_factor = fluid.layers.matmul(input, emb_table)
input_mul_factor_square = fluid.layers.square(input_mul_factor)
input_square = fluid.layers.square(input)
factor_square = fluid.layers.square(emb_table)
input_square_mul_factor_square = fluid.layers.matmul(input_square, factor_square)
second_order = 0.5 * (input_mul_factor_square - input_square_mul_factor_square)
return first_order, second_order
dense_fm_param_attr = fluid.param_attr.ParamAttr(name="DenseFeatFactors",
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(dense_feature_dim)))
dense_fm_first, dense_fm_second = dense_fm_layer(
dense_feature, dense_feature_dim, 16, dense_fm_param_attr)
def sparse_fm_layer(input, emb_dict_size, factor_size, fm_param_attr):
first_embeddings = fluid.layers.embedding(
input=input, dtype='float32', size=[emb_dict_size, 1], is_sparse=True)
first_order = fluid.layers.sequence_pool(input=first_embeddings, pool_type='sum')
nonzero_embeddings = fluid.layers.embedding(
input=input, dtype='float32', size=[emb_dict_size, factor_size],
param_attr=fm_param_attr, is_sparse=True)
summed_features_emb = fluid.layers.sequence_pool(input=nonzero_embeddings, pool_type='sum')
summed_features_emb_square = fluid.layers.square(summed_features_emb)
squared_features_emb = fluid.layers.square(nonzero_embeddings)
squared_sum_features_emb = fluid.layers.sequence_pool(
input=squared_features_emb, pool_type='sum')
second_order = 0.5 * (summed_features_emb_square - squared_sum_features_emb)
return first_order, second_order
sparse_fm_param_attr = fluid.param_attr.ParamAttr(name="SparseFeatFactors",
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(sparse_feature_dim)))
#data = fluid.layers.data(name='ids', shape=[1], dtype='float32')
sparse_fm_first, sparse_fm_second = sparse_fm_layer(
context_feature_fm, sparse_feature_dim, 16, sparse_fm_param_attr)
def embedding_layer(input):
return fluid.layers.embedding(
input=input,
is_sparse=True,
# you need to patch https://github.com/PaddlePaddle/Paddle/pull/14190
# if you want to set is_distributed to True
is_distributed=False,
size=[sparse_feature_dim, embedding_size],
param_attr=fluid.ParamAttr(name="SparseFeatFactors",
initializer=fluid.initializer.Uniform()))
sparse_embed_seq = list(map(embedding_layer, context_feature))
concated_ori = fluid.layers.concat(sparse_embed_seq + [dense_feature], axis=1)
concated = fluid.layers.batch_norm(input=concated_ori, name="bn", epsilon=1e-4)
deep = deep_net(concated)
predict = fluid.layers.fc(input=[deep, sparse_fm_first, sparse_fm_second, dense_fm_first, dense_fm_second], size=2, act="softmax",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(deep.shape[1])), learning_rate=0.01))
#similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(predict, min=-15.0, max=15.0), name="similarity_norm")
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.reduce_sum(cost)
accuracy = fluid.layers.accuracy(input=predict, label=label)
auc_var, batch_auc_var, auc_states = \
fluid.layers.auc(input=predict, label=label, num_thresholds=2 ** 12, slide_steps=20)
return avg_cost, auc_var, batch_auc_var, accuracy, predict
def deep_net(concated, lr_x=0.0001):
fc_layers_input = [concated]
fc_layers_size = [400, 400, 400]
fc_layers_act = ["relu"] * (len(fc_layers_size))
for i in range(len(fc_layers_size)):
fc = fluid.layers.fc(
input=fc_layers_input[-1],
size=fc_layers_size[i],
act=fc_layers_act[i],
param_attr=fluid.ParamAttr(learning_rate=lr_x * 0.5))
fc_layers_input.append(fc)
#w_res = fluid.layers.create_parameter(shape=[353, 16], dtype='float32', name="w_res")
#high_path = fluid.layers.matmul(concated, w_res)
#return fluid.layers.elementwise_add(high_path, fc_layers_input[-1])
return fc_layers_input[-1]
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os, sys, time, random, csv, datetime, json
import pandas as pd
import numpy as np
import argparse
import logging
import time
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("preprocess")
logger.setLevel(logging.INFO)
TEST_QUERIES_PATH = "./data_set_phase1/test_queries.csv"
TEST_PLANS_PATH = "./data_set_phase1/test_plans.csv"
TRAIN_CLICK_PATH = "./data_set_phase1/train_clicks.csv"
PROFILES_PATH = "./data_set_phase1/profiles.csv"
OUT_NORM_TEST_PATH = "./out/normed_test_session.txt"
OUT_RAW_TEST_PATH = "./out/test_session.txt"
O1_MIN = 115.47
O1_MAX = 117.29
O2_MIN = 39.46
O2_MAX = 40.97
D1_MIN = 115.44
D1_MAX = 117.37
D2_MIN = 39.46
D2_MAX = 40.96
SCALE_OD = 0.02
DISTANCE_MIN = 1.0
DISTANCE_MAX = 225864.0
THRESHOLD_DIS = 40000.0
SCALE_DIS = 500
PRICE_MIN = 200.0
PRICE_MAX = 92300.0
THRESHOLD_PRICE = 20000
SCALE_PRICE = 100
ETA_MIN = 1.0
ETA_MAX = 72992.0
THRESHOLD_ETA = 10800.0
SCALE_ETA = 120
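# build_norm_feature bucketizes the continuous features: plan values (distance, price, eta)
# are clipped at their thresholds and divided by their scales, e.g. a distance of 12000
# becomes 12000 // 500 = 24; the query coordinates are shifted by their minima and divided
# by SCALE_OD to get integer bucket ids.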
def build_norm_feature():
with open(OUT_NORM_TEST_PATH, 'w') as nf:
with open(OUT_RAW_TEST_PATH, 'r') as f:
for line in f:
cur_map = json.loads(line)
if cur_map["plan"]["distance"] > THRESHOLD_DIS:
cur_map["plan"]["distance"] = int(THRESHOLD_DIS)
elif cur_map["plan"]["distance"] > 0:
cur_map["plan"]["distance"] = int(cur_map["plan"]["distance"] / SCALE_DIS)
if cur_map["plan"]["price"] and cur_map["plan"]["price"] > THRESHOLD_PRICE:
cur_map["plan"]["price"] = int(THRESHOLD_PRICE)
elif not cur_map["plan"]["price"] or cur_map["plan"]["price"] < 0:
cur_map["plan"]["price"] = 0
else:
cur_map["plan"]["price"] = int(cur_map["plan"]["price"] / SCALE_PRICE)
if cur_map["plan"]["eta"] > THRESHOLD_ETA:
cur_map["plan"]["eta"] = int(THRESHOLD_ETA)
elif cur_map["plan"]["eta"] > 0:
cur_map["plan"]["eta"] = int(cur_map["plan"]["eta"] / SCALE_ETA)
# o1
if cur_map["query"]["o1"] > O1_MAX:
cur_map["query"]["o1"] = int((O1_MAX - O1_MIN) / SCALE_OD + 1)
elif cur_map["query"]["o1"] < O1_MIN:
cur_map["query"]["o1"] = 0
else:
cur_map["query"]["o1"] = int((cur_map["query"]["o1"] - O1_MIN) / 0.02)
# o2
if cur_map["query"]["o2"] > O2_MAX:
cur_map["query"]["o2"] = int((O2_MAX - O2_MIN) / SCALE_OD + 1)
elif cur_map["query"]["o2"] < O2_MIN:
cur_map["query"]["o2"] = 0
else:
cur_map["query"]["o2"] = int((cur_map["query"]["o2"] - O2_MIN) / 0.02)
# d1
if cur_map["query"]["d1"] > D1_MAX:
cur_map["query"]["d1"] = int((D1_MAX - D1_MIN) / SCALE_OD + 1)
elif cur_map["query"]["d1"] < D1_MIN:
cur_map["query"]["d1"] = 0
else:
cur_map["query"]["d1"] = int((cur_map["query"]["d1"] - D1_MIN) / SCALE_OD)
# d2
if cur_map["query"]["d2"] > D2_MAX:
cur_map["query"]["d2"] = int((D2_MAX - D2_MIN) / SCALE_OD + 1)
elif cur_map["query"]["d2"] < D2_MIN:
cur_map["query"]["d2"] = 0
else:
cur_map["query"]["d2"] = int((cur_map["query"]["d2"] - D2_MIN) / SCALE_OD)
cur_json_instance = json.dumps(cur_map)
nf.write(cur_json_instance + '\n')
def preprocess():
"""
Construct the data indexed jointly by session id and mode id. Convert some of the raw features (user profile,
od pair, req time, click time, eta, price, distance, transport mode) into one-hot ids used for
embedding. We split the one-hot features into two categories, user features and context features, to
make the FM algorithm easier to understand.
Note that the user profile is already provided in one-hot encoded form; we convert it back to
ids so that it is consistent with the context features and easy to feed into the PaddlePaddle embedding layer. Given the
train clicks data, we label each instance with 1 or 0 depending on whether it was clicked.
:return:
"""
train_data_dict = {}
with open("./weather.json", 'r') as f:
weather_dict = json.load(f)
with open(TEST_QUERIES_PATH, 'r') as f:
csv_reader = csv.reader(f, delimiter=',')
train_index_list = []
for k, line in enumerate(csv_reader):
if k == 0: continue
if line[0] == "": continue
if line[1] == "":
train_index_list.append(line[0] + "_0")
else:
train_index_list.append(line[0] + "_" + line[1])
train_index = line[0]
train_data_dict[train_index] = {}
train_data_dict[train_index]["pid"] = line[1]
train_data_dict[train_index]["query"] = {}
reqweekday = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%w")
reqhour = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%H")
date_key = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%m-%d")
train_data_dict[train_index]["weather"] = {}
train_data_dict[train_index]["weather"].update({"max_temp": weather_dict[date_key]["max_temp"]})
train_data_dict[train_index]["weather"].update({"min_temp": weather_dict[date_key]["min_temp"]})
train_data_dict[train_index]["weather"].update({"wea": weather_dict[date_key]["weather"]})
train_data_dict[train_index]["weather"].update({"wind": weather_dict[date_key]["wind"]})
train_data_dict[train_index]["query"].update({"weekday":reqweekday})
train_data_dict[train_index]["query"].update({"hour":reqhour})
o = line[3].split(',')
o_first = o[0]
o_second = o[1]
train_data_dict[train_index]["query"].update({"o1":float(o_first)})
train_data_dict[train_index]["query"].update({"o2":float(o_second)})
d = line[4].split(',')
d_first = d[0]
d_second = d[1]
train_data_dict[train_index]["query"].update({"d1":float(d_first)})
train_data_dict[train_index]["query"].update({"d2":float(d_second)})
plan_map = {}
plan_data = pd.read_csv(TEST_PLANS_PATH)
for index, row in plan_data.iterrows():
plans_str = row['plans']
plans_list = json.loads(plans_str)
session_id = str(row['sid'])
# train_data_dict[session_id]["plans"] = []
plan_map[session_id] = plans_list
profile_map = {}
with open(PROFILES_PATH, 'r') as f:
csv_reader = csv.reader(f, delimiter=',')
for k, line in enumerate(csv_reader):
if k == 0: continue
profile_map[line[0]] = [i for i in range(len(line)) if line[i] == "1.0"]
session_click_map = {}
with open(TRAIN_CLICK_PATH, 'r') as f:
csv_reader = csv.reader(f, delimiter=',')
for k, line in enumerate(csv_reader):
if k == 0: continue
if line[0] == "" or line[1] == "" or line[2] == "":
continue
session_click_map[line[0]] = line[2]
#return train_data_dict, profile_map, session_click_map, plan_map
generate_sparse_features(train_data_dict, profile_map, session_click_map, plan_map)
def generate_sparse_features(train_data_dict, profile_map, session_click_map, plan_map):
if not os.path.isdir("./out/"):
os.mkdir("./out/")
with open(os.path.join("./out/", "test_session.txt"), 'w') as f_train:
for session_id, plan_list in plan_map.items():
if session_id not in train_data_dict:
continue
cur_map = train_data_dict[session_id]
cur_map["session_id"] = session_id
if cur_map["pid"] != "":
cur_map["profile"] = profile_map[cur_map["pid"]]
else:
cur_map["profile"] = [0]
del cur_map["pid"]
whole_rank = 0
for plan in plan_list:
whole_rank += 1
cur_map["mode_rank" + str(whole_rank)] = plan["transport_mode"]
if whole_rank < 5:
for r in range(whole_rank + 1, 6):
cur_map["mode_rank" + str(r)] = -1
cur_map["whole_rank"] = whole_rank
flag_click = False
rank = 1
price_list = []
eta_list = []
distance_list = []
for plan in plan_list:
if not plan["price"]:
price_list.append(0)
else:
price_list.append(int(plan["price"]))
eta_list.append(int(plan["eta"]))
distance_list.append(int(plan["distance"]))
price_list.sort(reverse=False)
eta_list.sort(reverse=False)
distance_list.sort(reverse=False)
for plan in plan_list:
if plan["price"] and int(plan["price"]) == price_list[0]:
cur_map["mode_min_price"] = plan["transport_mode"]
if plan["price"] and int(plan["price"]) == price_list[-1]:
cur_map["mode_max_price"] = plan["transport_mode"]
if int(plan["eta"]) == eta_list[0]:
cur_map["mode_min_eta"] = plan["transport_mode"]
if int(plan["eta"]) == eta_list[-1]:
cur_map["mode_max_eta"] = plan["transport_mode"]
if int(plan["distance"]) == distance_list[0]:
cur_map["mode_min_distance"] = plan["transport_mode"]
if int(plan["distance"]) == distance_list[-1]:
cur_map["mode_max_distance"] = plan["transport_mode"]
if "mode_min_price" not in cur_map:
cur_map["mode_min_price"] = -1
if "mode_max_price" not in cur_map:
cur_map["mode_max_price"] = -1
for plan in plan_list:
cur_price = int(plan["price"]) if plan["price"] else 0
cur_eta = int(plan["eta"])
cur_distance = int(plan["distance"])
cur_map["price_rank"] = price_list.index(cur_price) + 1
cur_map["eta_rank"] = eta_list.index(cur_eta) + 1
cur_map["distance_rank"] = distance_list.index(cur_distance) + 1
if ("transport_mode" in plan) and (session_id in session_click_map) and (
int(plan["transport_mode"]) == int(session_click_map[session_id])):
cur_map["plan"] = plan
cur_map["label"] = 1
flag_click = True
# print("label is 1")
else:
cur_map["plan"] = plan
cur_map["label"] = 0
cur_map["plan_rank"] = rank
rank += 1
cur_json_instance = json.dumps(cur_map)
f_train.write(cur_json_instance + '\n')
cur_map["plan"]["distance"] = -1
cur_map["plan"]["price"] = -1
cur_map["plan"]["eta"] = -1
cur_map["plan"]["transport_mode"] = 0
cur_map["plan_rank"] = 0
cur_map["price_rank"] = 0
cur_map["eta_rank"] = 0
cur_map["plan_rank"] = 0
cur_map["label"] = 1
cur_json_instance = json.dumps(cur_map)
f_train.write(cur_json_instance + '\n')
build_norm_feature()
if __name__ == "__main__":
preprocess()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os, sys, time, random, csv, datetime, json
import pandas as pd
import numpy as np
import argparse
import logging
import time
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("preprocess")
logger.setLevel(logging.INFO)
TRAIN_QUERIES_PATH = "./data_set_phase1/test_queries.csv"
TRAIN_PLANS_PATH = "./data_set_phase1/test_plans.csv"
TRAIN_CLICK_PATH = "./data_set_phase1/train_clicks.csv"
PROFILES_PATH = "./data_set_phase1/profiles.csv"
O1_MIN = 115.47
O1_MAX = 117.29
O2_MIN = 39.46
O2_MAX = 40.97
D1_MIN = 115.44
D1_MAX = 117.37
D2_MIN = 39.46
D2_MAX = 40.96
DISTANCE_MIN = 1.0
DISTANCE_MAX = 225864.0
THRESHOLD_DIS = 200000.0
PRICE_MIN = 200.0
PRICE_MAX = 92300.0
THRESHOLD_PRICE = 20000
ETA_MIN = 1.0
ETA_MAX = 72992.0
THRESHOLD_ETA = 10800.0
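# Unlike the bucketized variant, this build_norm_feature min-max scales distance, price and
# eta into [0, 1] using the global ranges above, e.g. a price of 4800 maps to
# (4800 - 200) / (92300 - 200), roughly 0.05.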
def build_norm_feature():
with open("./out/normed_test_session.txt", 'w') as nf:
with open("./out/test_session.txt", 'r') as f:
for line in f:
cur_map = json.loads(line)
cur_map["plan"]["distance"] = (cur_map["plan"]["distance"] - DISTANCE_MIN) / (DISTANCE_MAX - DISTANCE_MIN)
if cur_map["plan"]["price"]:
cur_map["plan"]["price"] = (cur_map["plan"]["price"] - PRICE_MIN) / (PRICE_MAX - PRICE_MIN)
else:
cur_map["plan"]["price"] = 0.0
cur_map["plan"]["eta"] = (cur_map["plan"]["eta"] - ETA_MIN) / (ETA_MAX - ETA_MIN)
cur_json_instance = json.dumps(cur_map)
nf.write(cur_json_instance + '\n')
def preprocess():
"""
Construct the data indexed jointly by session id and mode id. Convert all the raw features (user profile,
od pair, req time, click time, eta, price, distance, transport mode) into one-hot ids used for
embedding. We split the one-hot features into two categories, user features and context features, to
make the FFM algorithm easier to understand.
Note that the user profile is already provided in one-hot encoded form; we convert it back to
ids so that it is consistent with the context features and easy to feed into the PaddlePaddle embedding layer. Given the
train clicks data, we label each instance with 1 or 0 depending on whether it was clicked.
:return:
"""
#args = parse_args()
train_data_dict = {}
with open("./weather.json", 'r') as f:
weather_dict = json.load(f)
with open(TRAIN_QUERIES_PATH, 'r') as f:
csv_reader = csv.reader(f, delimiter=',')
train_index_list = []
for k, line in enumerate(csv_reader):
if k == 0: continue
if line[0] == "": continue
if line[1] == "":
train_index_list.append(line[0] + "_0")
else:
train_index_list.append(line[0] + "_" + line[1])
train_index = line[0]
train_data_dict[train_index] = {}
train_data_dict[train_index]["pid"] = line[1]
train_data_dict[train_index]["query"] = {}
reqweekday = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%w")
reqhour = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%H")
date_key = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%m-%d")
train_data_dict[train_index]["weather"] = {}
train_data_dict[train_index]["weather"].update({"max_temp": weather_dict[date_key]["max_temp"]})
train_data_dict[train_index]["weather"].update({"min_temp": weather_dict[date_key]["min_temp"]})
train_data_dict[train_index]["weather"].update({"wea": weather_dict[date_key]["weather"]})
train_data_dict[train_index]["weather"].update({"wind": weather_dict[date_key]["wind"]})
train_data_dict[train_index]["query"].update({"weekday":reqweekday})
train_data_dict[train_index]["query"].update({"hour":reqhour})
o = line[3].split(',')
o_first = o[0]
o_second = o[1]
train_data_dict[train_index]["query"].update({"o1":float(o_first)})
train_data_dict[train_index]["query"].update({"o2":float(o_second)})
d = line[4].split(',')
d_first = d[0]
d_second = d[1]
train_data_dict[train_index]["query"].update({"d1":float(d_first)})
train_data_dict[train_index]["query"].update({"d2":float(d_second)})
plan_map = {}
plan_data = pd.read_csv(TRAIN_PLANS_PATH)
for index, row in plan_data.iterrows():
plans_str = row['plans']
plans_list = json.loads(plans_str)
session_id = str(row['sid'])
# train_data_dict[session_id]["plans"] = []
plan_map[session_id] = plans_list
profile_map = {}
with open(PROFILES_PATH, 'r') as f:
csv_reader = csv.reader(f, delimiter=',')
for k, line in enumerate(csv_reader):
if k == 0: continue
profile_map[line[0]] = [i for i in range(len(line)) if line[i] == "1.0"]
session_click_map = {}
with open(TRAIN_CLICK_PATH, 'r') as f:
csv_reader = csv.reader(f, delimiter=',')
for k, line in enumerate(csv_reader):
if k == 0: continue
if line[0] == "" or line[1] == "" or line[2] == "":
continue
session_click_map[line[0]] = line[2]
#return train_data_dict, profile_map, session_click_map, plan_map
generate_sparse_features(train_data_dict, profile_map, session_click_map, plan_map)
def generate_sparse_features(train_data_dict, profile_map, session_click_map, plan_map):
if not os.path.isdir("./out/"):
os.mkdir("./out/")
with open(os.path.join("./out/", "test_session.txt"), 'w') as f_train:
for session_id, plan_list in plan_map.items():
if session_id not in train_data_dict:
continue
cur_map = train_data_dict[session_id]
cur_map["session_id"] = session_id
if cur_map["pid"] != "":
cur_map["profile"] = profile_map[cur_map["pid"]]
else:
cur_map["profile"] = [0]
# del cur_map["pid"]
whole_rank = 0
for plan in plan_list:
whole_rank += 1
cur_map["mode_rank" + str(whole_rank)] = plan["transport_mode"]
if whole_rank < 5:
for r in range(whole_rank + 1, 6):
cur_map["mode_rank" + str(r)] = -1
cur_map["whole_rank"] = whole_rank
rank = 1
price_list = []
eta_list = []
distance_list = []
for plan in plan_list:
if not plan["price"]:
price_list.append(0)
else:
price_list.append(int(plan["price"]))
eta_list.append(int(plan["eta"]))
distance_list.append(int(plan["distance"]))
price_list.sort(reverse=False)
eta_list.sort(reverse=False)
distance_list.sort(reverse=False)
for plan in plan_list:
if plan["price"] and int(plan["price"]) == price_list[0]:
cur_map["mode_min_price"] = plan["transport_mode"]
if plan["price"] and int(plan["price"]) == price_list[-1]:
cur_map["mode_max_price"] = plan["transport_mode"]
if int(plan["eta"]) == eta_list[0]:
cur_map["mode_min_eta"] = plan["transport_mode"]
if int(plan["eta"]) == eta_list[-1]:
cur_map["mode_max_eta"] = plan["transport_mode"]
if int(plan["distance"]) == distance_list[0]:
cur_map["mode_min_distance"] = plan["transport_mode"]
if int(plan["distance"]) == distance_list[-1]:
cur_map["mode_max_distance"] = plan["transport_mode"]
if "mode_min_price" not in cur_map:
cur_map["mode_min_price"] = -1
if "mode_max_price" not in cur_map:
cur_map["mode_max_price"] = -1
for plan in plan_list:
cur_price = int(plan["price"]) if plan["price"] else 0
cur_eta = int(plan["eta"])
cur_distance = int(plan["distance"])
cur_map["price_rank"] = price_list.index(cur_price) + 1
cur_map["eta_rank"] = eta_list.index(cur_eta) + 1
cur_map["distance_rank"] = distance_list.index(cur_distance) + 1
if ("transport_mode" in plan) and (session_id in session_click_map) and (
int(plan["transport_mode"]) == int(session_click_map[session_id])):
cur_map["plan"] = plan
cur_map["label"] = 1
else:
cur_map["plan"] = plan
cur_map["label"] = 0
cur_map["plan_rank"] = rank
rank += 1
cur_json_instance = json.dumps(cur_map)
f_train.write(cur_json_instance + '\n')
cur_map["plan"]["distance"] = -1
cur_map["plan"]["price"] = -1
cur_map["plan"]["eta"] = -1
cur_map["plan"]["transport_mode"] = 0
cur_map["plan_rank"] = 0
cur_map["price_rank"] = 0
cur_map["eta_rank"] = 0
cur_map["plan_rank"] = 0
cur_map["label"] = 1
cur_json_instance = json.dumps(cur_map)
f_train.write(cur_json_instance + '\n')
build_norm_feature()
if __name__ == "__main__":
preprocess()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os, sys, time, random, csv, datetime, json
import pandas as pd
import numpy as np
import argparse
import logging
import time
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("preprocess")
logger.setLevel(logging.INFO)
TRAIN_QUERIES_PATH = "./data_set_phase1/train_queries.csv"
TRAIN_PLANS_PATH = "./data_set_phase1/train_plans.csv"
TRAIN_CLICK_PATH = "./data_set_phase1/train_clicks.csv"
PROFILES_PATH = "./data_set_phase1/profiles.csv"
OUT_NORM_TRAIN_PATH = "./out/normed_train.txt"
OUT_RAW_TRAIN_PATH = "./out/train.txt"
OUT_DIR = "./out"
O1_MIN = 115.47
O1_MAX = 117.29
O2_MIN = 39.46
O2_MAX = 40.97
D1_MIN = 115.44
D1_MAX = 117.37
D2_MIN = 39.46
D2_MAX = 40.96
SCALE_OD = 0.02
DISTANCE_MIN = 1.0
DISTANCE_MAX = 225864.0
THRESHOLD_DIS = 40000.0
SCALE_DIS = 500
PRICE_MIN = 200.0
PRICE_MAX = 92300.0
THRESHOLD_PRICE = 20000
SCALE_PRICE = 100
ETA_MIN = 1.0
ETA_MAX = 72992.0
THRESHOLD_ETA = 10800.0
SCALE_ETA = 120
def build_norm_feature():
with open(OUT_NORM_TRAIN_PATH, 'w') as nf:
with open(OUT_RAW_TRAIN_PATH, 'r') as f:
for line in f:
cur_map = json.loads(line)
if cur_map["plan"]["distance"] > THRESHOLD_DIS:
cur_map["plan"]["distance"] = int(THRESHOLD_DIS)
elif cur_map["plan"]["distance"] > 0:
cur_map["plan"]["distance"] = int(cur_map["plan"]["distance"] / SCALE_DIS)
if cur_map["plan"]["price"] and cur_map["plan"]["price"] > THRESHOLD_PRICE:
cur_map["plan"]["price"] = int(THRESHOLD_PRICE)
elif not cur_map["plan"]["price"] or cur_map["plan"]["price"] < 0:
cur_map["plan"]["price"] = 0
else:
cur_map["plan"]["price"] = int(cur_map["plan"]["price"] / SCALE_PRICE)
if cur_map["plan"]["eta"] > THRESHOLD_ETA:
cur_map["plan"]["eta"] = int(THRESHOLD_ETA)
elif cur_map["plan"]["eta"] > 0:
cur_map["plan"]["eta"] = int(cur_map["plan"]["eta"] / SCALE_ETA)
# o1
if cur_map["query"]["o1"] > O1_MAX:
cur_map["query"]["o1"] = int((O1_MAX - O1_MIN) / SCALE_OD + 1)
elif cur_map["query"]["o1"] < O1_MIN:
cur_map["query"]["o1"] = 0
else:
cur_map["query"]["o1"] = int((cur_map["query"]["o1"] - O1_MIN) / 0.02)
# o2
if cur_map["query"]["o2"] > O2_MAX:
cur_map["query"]["o2"] = int((O2_MAX - O2_MIN) / SCALE_OD + 1)
elif cur_map["query"]["o2"] < O2_MIN:
cur_map["query"]["o2"] = 0
else:
cur_map["query"]["o2"] = int((cur_map["query"]["o2"] - O2_MIN) / 0.02)
# d1
if cur_map["query"]["d1"] > D1_MAX:
cur_map["query"]["d1"] = int((D1_MAX - D1_MIN) / SCALE_OD + 1)
elif cur_map["query"]["d1"] < D1_MIN:
cur_map["query"]["d1"] = 0
else:
cur_map["query"]["d1"] = int((cur_map["query"]["d1"] - D1_MIN) / SCALE_OD)
# d2
if cur_map["query"]["d2"] > D2_MAX:
cur_map["query"]["d2"] = int((D2_MAX - D2_MIN) / SCALE_OD + 1)
elif cur_map["query"]["d2"] < D2_MIN:
cur_map["query"]["d2"] = 0
else:
cur_map["query"]["d2"] = int((cur_map["query"]["d2"] - D2_MIN) / SCALE_OD)
cur_json_instance = json.dumps(cur_map)
nf.write(cur_json_instance + '\n')
def preprocess():
"""
Construct the train data indexed jointly by session id and mode id. Convert all the raw features (user profile,
od pair, req time, click time, eta, price, distance, transport mode) into one-hot ids used for
embedding. We split the one-hot features into two categories, user features and context features, to
make the FM algorithm easier to understand.
Note that the user profile is already provided in one-hot encoded form; we treat it as an embedded vector
so that it is consistent with the context features and easy to feed into the PaddlePaddle embedding layer. Given the
train clicks data, we label each train instance with 1 or 0 depending on whether it was clicked, and we
also include the non-click case.
:return:
"""
train_data_dict = {}
with open(TRAIN_QUERIES_PATH, 'r') as f:
csv_reader = csv.reader(f, delimiter=',')
train_index_list = []
for k, line in enumerate(csv_reader):
if k == 0: continue
if line[0] == "": continue
if line[1] == "":
train_index_list.append(line[0] + "_0")
else:
train_index_list.append(line[0] + "_" + line[1])
train_index = line[0]
train_data_dict[train_index] = {}
train_data_dict[train_index]["pid"] = line[1]
train_data_dict[train_index]["query"] = {}
reqweekday = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%w")
reqhour = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%H")
train_data_dict[train_index]["query"].update({"weekday":reqweekday})
train_data_dict[train_index]["query"].update({"hour":reqhour})
o = line[3].split(',')
o_first = o[0]
o_second = o[1]
train_data_dict[train_index]["query"].update({"o1":float(o_first)})
train_data_dict[train_index]["query"].update({"o2":float(o_second)})
d = line[4].split(',')
d_first = d[0]
d_second = d[1]
train_data_dict[train_index]["query"].update({"d1":float(d_first)})
train_data_dict[train_index]["query"].update({"d2":float(d_second)})
plan_map = {}
plan_data = pd.read_csv(TRAIN_PLANS_PATH)
for index, row in plan_data.iterrows():
plans_str = row['plans']
plans_list = json.loads(plans_str)
session_id = str(row['sid'])
# train_data_dict[session_id]["plans"] = []
plan_map[session_id] = plans_list
profile_map = {}
with open(PROFILES_PATH, 'r') as f:
csv_reader = csv.reader(f, delimiter=',')
for k, line in enumerate(csv_reader):
if k == 0: continue
profile_map[line[0]] = [i for i in range(len(line)) if line[i] == "1.0"]
session_click_map = {}
with open(TRAIN_CLICK_PATH, 'r') as f:
csv_reader = csv.reader(f, delimiter=',')
for k, line in enumerate(csv_reader):
if k == 0: continue
if line[0] == "" or line[1] == "" or line[2] == "":
continue
session_click_map[line[0]] = line[2]
#return train_data_dict, profile_map, session_click_map, plan_map
generate_sparse_features(train_data_dict, profile_map, session_click_map, plan_map)
def generate_sparse_features(train_data_dict, profile_map, session_click_map, plan_map):
if not os.path.isdir(OUT_DIR):
os.mkdir(OUT_DIR)
with open(os.path.join("./out/", "train.txt"), 'w') as f_train:
for session_id, plan_list in plan_map.items():
if session_id not in train_data_dict:
continue
cur_map = train_data_dict[session_id]
if cur_map["pid"] != "":
cur_map["profile"] = profile_map[cur_map["pid"]]
else:
cur_map["profile"] = [0]
del cur_map["pid"]
whole_rank = 0
for plan in plan_list:
whole_rank += 1
cur_map["whole_rank"] = whole_rank
flag_click = False
rank = 1
for plan in plan_list:
if ("transport_mode" in plan) and (session_id in session_click_map) and (
int(plan["transport_mode"]) == int(session_click_map[session_id])):
cur_map["plan"] = plan
cur_map["label"] = 1
flag_click = True
# print("label is 1")
else:
cur_map["plan"] = plan
cur_map["label"] = 0
cur_map["rank"] = rank
rank += 1
cur_json_instance = json.dumps(cur_map)
f_train.write(cur_json_instance + '\n')
if not flag_click:
cur_map["plan"]["distance"] = -1
cur_map["plan"]["price"] = -1
cur_map["plan"]["eta"] = -1
cur_map["plan"]["transport_mode"] = 0
cur_map["rank"] = 0
cur_map["label"] = 1
cur_json_instance = json.dumps(cur_map)
f_train.write(cur_json_instance + '\n')
else:
cur_map["plan"]["distance"] = -1
cur_map["plan"]["price"] = -1
cur_map["plan"]["eta"] = -1
cur_map["plan"]["transport_mode"] = 0
cur_map["rank"] = 0
cur_map["label"] = 0
cur_json_instance = json.dumps(cur_map)
f_train.write(cur_json_instance + '\n')
build_norm_feature()
if __name__ == "__main__":
preprocess()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os, random, csv, datetime, json
import pandas as pd
import numpy as np
import argparse
import logging
import time
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("preprocess")
logger.setLevel(logging.INFO)
TRAIN_QUERIES_PATH = "./data_set_phase1/train_queries.csv"
TRAIN_PLANS_PATH = "./data_set_phase1/train_plans.csv"
TRAIN_CLICK_PATH = "./data_set_phase1/train_clicks.csv"
PROFILES_PATH = "./data_set_phase1/profiles.csv"
OUT_DIR = "./out"
ORI_TRAIN_PATH = "train.txt"
NORM_TRAIN_PATH = "normed_train.txt"
# controls the ratio of positive to negative instances for transport mode 0, the original label for sessions with no click
THRESHOLD_LABEL = 0.5
O1_MIN = 115.47
O1_MAX = 117.29
O2_MIN = 39.46
O2_MAX = 40.97
D1_MIN = 115.44
D1_MAX = 117.37
D2_MIN = 39.46
D2_MAX = 40.96
DISTANCE_MIN = 1.0
DISTANCE_MAX = 225864.0
THRESHOLD_DIS = 200000.0
PRICE_MIN = 200.0
PRICE_MAX = 92300.0
THRESHOLD_PRICE = 20000
ETA_MIN = 1.0
ETA_MAX = 72992.0
THRESHOLD_ETA = 10800.0
def build_norm_feature():
with open(os.path.join(OUT_DIR, NORM_TRAIN_PATH), 'w') as nf:
with open(os.path.join(OUT_DIR, ORI_TRAIN_PATH), 'r') as f:
for line in f:
cur_map = json.loads(line)
cur_map["plan"]["distance"] = (cur_map["plan"]["distance"] - DISTANCE_MIN) / (DISTANCE_MAX - DISTANCE_MIN)
if cur_map["plan"]["price"]:
cur_map["plan"]["price"] = (cur_map["plan"]["price"] - PRICE_MIN) / (PRICE_MAX - PRICE_MIN)
else:
cur_map["plan"]["price"] = 0.0
cur_map["plan"]["eta"] = (cur_map["plan"]["eta"] - ETA_MIN) / (ETA_MAX - ETA_MIN)
cur_json_instance = json.dumps(cur_map)
nf.write(cur_json_instance + '\n')
def preprocess():
"""
Construct the train data indexed by session id and mode id jointly. Convert all the raw features (user profile,
od pair, req time, click time, eta, price, distance, transport mode) to one-hot ids used for
embedding. We split the one-hot features into two categories: user feature and context feature for
better understanding of FM algorithm.
Note that the user profile is already provided by one-hot encoded form, we treat it as embedded vector
for unity with the context feature and easily using of PaddlePaddle embedding layer. Given the
train clicks data, we label each train instance with 1 or 0 depend on if this instance is clicked or
not include non-click case. To Be Changed
:return:
"""
train_data_dict = {}
with open("./weather.json", 'r') as f:
weather_dict = json.load(f)
with open(TRAIN_QUERIES_PATH, 'r') as f:
csv_reader = csv.reader(f, delimiter=',')
train_index_list = []
for k, line in enumerate(csv_reader):
if k == 0: continue
if line[0] == "": continue
if line[1] == "":
train_index_list.append(line[0] + "_0")
else:
train_index_list.append(line[0] + "_" + line[1])
train_index = line[0]
train_data_dict[train_index] = {}
train_data_dict[train_index]["pid"] = line[1]
train_data_dict[train_index]["query"] = {}
train_data_dict[train_index]["weather"] = {}
reqweekday = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%w")
reqhour = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%H")
# weather-related features; of limited use, more detailed weather information might help
date_key = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%m-%d")
train_data_dict[train_index]["weather"] = {}
train_data_dict[train_index]["weather"].update({"max_temp": weather_dict[date_key]["max_temp"]})
train_data_dict[train_index]["weather"].update({"min_temp": weather_dict[date_key]["min_temp"]})
train_data_dict[train_index]["weather"].update({"wea": weather_dict[date_key]["weather"]})
train_data_dict[train_index]["weather"].update({"wind": weather_dict[date_key]["wind"]})
train_data_dict[train_index]["query"].update({"weekday":reqweekday})
train_data_dict[train_index]["query"].update({"hour":reqhour})
o = line[3].split(',')
o_first = o[0]
o_second = o[1]
train_data_dict[train_index]["query"].update({"o1":float(o_first)})
train_data_dict[train_index]["query"].update({"o2":float(o_second)})
d = line[4].split(',')
d_first = d[0]
d_second = d[1]
train_data_dict[train_index]["query"].update({"d1":float(d_first)})
train_data_dict[train_index]["query"].update({"d2":float(d_second)})
plan_map = {}
plan_data = pd.read_csv(TRAIN_PLANS_PATH)
for index, row in plan_data.iterrows():
plans_str = row['plans']
plans_list = json.loads(plans_str)
session_id = str(row['sid'])
# train_data_dict[session_id]["plans"] = []
plan_map[session_id] = plans_list
profile_map = {}
with open(PROFILES_PATH, 'r') as f:
csv_reader = csv.reader(f, delimiter=',')
for k, line in enumerate(csv_reader):
if k == 0: continue
profile_map[line[0]] = [i for i in range(len(line)) if line[i] == "1.0"]
session_click_map = {}
with open(TRAIN_CLICK_PATH, 'r') as f:
csv_reader = csv.reader(f, delimiter=',')
for k, line in enumerate(csv_reader):
if k == 0: continue
if line[0] == "" or line[1] == "" or line[2] == "":
continue
session_click_map[line[0]] = line[2]
#return train_data_dict, profile_map, session_click_map, plan_map
generate_sparse_features(train_data_dict, profile_map, session_click_map, plan_map)
def generate_sparse_features(train_data_dict, profile_map, session_click_map, plan_map):
if not os.path.isdir(OUT_DIR):
os.mkdir(OUT_DIR)
with open(os.path.join(OUT_DIR, ORI_TRAIN_PATH), 'w') as f_train:
for session_id, plan_list in plan_map.items():
if session_id not in train_data_dict:
continue
cur_map = train_data_dict[session_id]
if cur_map["pid"] != "":
cur_map["profile"] = profile_map[cur_map["pid"]]
else:
cur_map["profile"] = [0]
# rank-related features
whole_rank = 0
for plan in plan_list:
whole_rank += 1
cur_map["mode_rank" + str(whole_rank)] = plan["transport_mode"]
if whole_rank < 5:
for r in range(whole_rank + 1, 6):
cur_map["mode_rank" + str(r)] = -1
cur_map["whole_rank"] = whole_rank
flag_click = False
rank = 1
price_list = []
eta_list = []
distance_list = []
for plan in plan_list:
if not plan["price"]:
price_list.append(0)
else:
price_list.append(int(plan["price"]))
eta_list.append(int(plan["eta"]))
distance_list.append(int(plan["distance"]))
price_list.sort(reverse=False)
eta_list.sort(reverse=False)
distance_list.sort(reverse=False)
for plan in plan_list:
if plan["price"] and int(plan["price"]) == price_list[0]:
cur_map["mode_min_price"] = plan["transport_mode"]
if plan["price"] and int(plan["price"]) == price_list[-1]:
cur_map["mode_max_price"] = plan["transport_mode"]
if int(plan["eta"]) == eta_list[0]:
cur_map["mode_min_eta"] = plan["transport_mode"]
if int(plan["eta"]) == eta_list[-1]:
cur_map["mode_max_eta"] = plan["transport_mode"]
if int(plan["distance"]) == distance_list[0]:
cur_map["mode_min_distance"] = plan["transport_mode"]
if int(plan["distance"]) == distance_list[-1]:
cur_map["mode_max_distance"] = plan["transport_mode"]
if "mode_min_price" not in cur_map:
cur_map["mode_min_price"] = -1
if "mode_max_price" not in cur_map:
cur_map["mode_max_price"] = -1
for plan in plan_list:
if ("transport_mode" in plan) and (session_id in session_click_map) and (
int(plan["transport_mode"]) == int(session_click_map[session_id])):
flag_click = True
if flag_click:
for plan in plan_list:
cur_price = int(plan["price"]) if plan["price"] else 0
cur_eta = int(plan["eta"])
cur_distance = int(plan["distance"])
cur_map["price_rank"] = price_list.index(cur_price) + 1
cur_map["eta_rank"] = eta_list.index(cur_eta) + 1
cur_map["distance_rank"] = distance_list.index(cur_distance) + 1
if ("transport_mode" in plan) and (session_id in session_click_map) and (
int(plan["transport_mode"]) == int(session_click_map[session_id])):
cur_map["plan"] = plan
cur_map["label"] = 1
else:
cur_map["plan"] = plan
cur_map["label"] = 0
cur_map["plan_rank"] = rank
rank += 1
cur_json_instance = json.dumps(cur_map)
f_train.write(cur_json_instance + '\n')
cur_map["plan"] = {}
# since we derive a new CTR task from the original task, we use a basic strategy to generate instances of transport mode 0;
# a better strategy for generating transport mode 0 instances is left for future work
if not flag_click:
cur_map["plan"]["distance"] = -1
cur_map["plan"]["price"] = -1
cur_map["plan"]["eta"] = -1
cur_map["plan"]["transport_mode"] = 0
cur_map["plan_rank"] = 0
cur_map["price_rank"] = 0
cur_map["eta_rank"] = 0
cur_map["distance_rank"] = 0
cur_map["label"] = 1
cur_json_instance = json.dumps(cur_map)
f_train.write(cur_json_instance + '\n')
else:
if random.random() < THRESHOLD_LABEL:
cur_map["plan"]["distance"] = -1
cur_map["plan"]["price"] = -1
cur_map["plan"]["eta"] = -1
cur_map["plan"]["transport_mode"] = 0
cur_map["plan_rank"] = 0
cur_map["price_rank"] = 0
cur_map["eta_rank"] = 0
cur_map["distance_rank"] = 0
cur_map["label"] = 0
cur_json_instance = json.dumps(cur_map)
f_train.write(cur_json_instance + '\n')
build_norm_feature()
if __name__ == "__main__":
preprocess()
{"10-01": {"max_temp": "24", "min_temp": "12", "weather": "q", "wind": "45"}, "10-02": {"max_temp": "24", "min_temp": "11", "weather": "q", "wind": "12"}, "10-03": {"max_temp": "25", "min_temp": "10", "weather": "q", "wind": "12"}, "10-04": {"max_temp": "25", "min_temp": "12", "weather": "q", "wind": "12"}, "10-05": {"max_temp": "24", "min_temp": "14", "weather": "dy", "wind": "12"}, "10-06": {"max_temp": "20", "min_temp": "8", "weather": "q", "wind": "45"}, "10-07": {"max_temp": "21", "min_temp": "7", "weather": "q", "wind": "12"}, "10-08": {"max_temp": "21", "min_temp": "8", "weather": "dy", "wind": "12"}, "10-09": {"max_temp": "15", "min_temp": "4", "weather": "dyq", "wind": "45"}, "10-10": {"max_temp": "17", "min_temp": "4", "weather": "dyq", "wind": "12"}, "10-11": {"max_temp": "18", "min_temp": "5", "weather": "qdy", "wind": "12"}, "10-12": {"max_temp": "20", "min_temp": "5", "weather": "dyq", "wind": "12"}, "10-13": {"max_temp": "20", "min_temp": "8", "weather": "dy", "wind": "12"}, "10-14": {"max_temp": "21", "min_temp": "10", "weather": "dy", "wind": "12"}, "10-15": {"max_temp": "17", "min_temp": "11", "weather": "xq", "wind": "12"}, "10-16": {"max_temp": "17", "min_temp": "7", "weather": "dyq", "wind": "12"}, "10-17": {"max_temp": "17", "min_temp": "5", "weather": "q", "wind": "12"}, "10-18": {"max_temp": "18", "min_temp": "5", "weather": "q", "wind": "12"}, "10-19": {"max_temp": "19", "min_temp": "7", "weather": "dy", "wind": "12"}, "10-20": {"max_temp": "18", "min_temp": "7", "weather": "dy", "wind": "12"}, "10-21": {"max_temp": "18", "min_temp": "7", "weather": "dy", "wind": "12"}, "10-22": {"max_temp": "19", "min_temp": "5", "weather": "dyq", "wind": "12"}, "10-23": {"max_temp": "19", "min_temp": "4", "weather": "q", "wind": "34"}, "10-24": {"max_temp": "20", "min_temp": "6", "weather": "qdy", "wind": "12"}, "10-25": {"max_temp": "15", "min_temp": "8", "weather": "dy", "wind": "12"}, "10-26": {"max_temp": "14", "min_temp": "3", "weather": "q", "wind": "45"}, "10-27": {"max_temp": "17", "min_temp": "5", "weather": "dy", "wind": "12"}, "10-28": {"max_temp": "17", "min_temp": "4", "weather": "dyq", "wind": "45"}, "10-29": {"max_temp": "15", "min_temp": "3", "weather": "q", "wind": "34"}, "10-30": {"max_temp": "16", "min_temp": "1", "weather": "q", "wind": "12"}, "10-31": {"max_temp": "17", "min_temp": "3", "weather": "q", "wind": "12"}, "11-01": {"max_temp": "17", "min_temp": "3", "weather": "q", "wind": "12"}, "11-02": {"max_temp": "18", "min_temp": "4", "weather": "q", "wind": "12"}, "11-03": {"max_temp": "16", "min_temp": "6", "weather": "dy", "wind": "12"}, "11-04": {"max_temp": "10", "min_temp": "2", "weather": "xydy", "wind": "34"}, "11-05": {"max_temp": "10", "min_temp": "2", "weather": "dy", "wind": "12"}, "11-06": {"max_temp": "12", "min_temp": "0", "weather": "dy", "wind": "12"}, "11-07": {"max_temp": "13", "min_temp": "3", "weather": "dy", "wind": "12"}, "11-08": {"max_temp": "14", "min_temp": "2", "weather": "dy", "wind": "12"}, "11-09": {"max_temp": "15", "min_temp": "1", "weather": "qdy", "wind": "34"}, "11-10": {"max_temp": "11", "min_temp": "0", "weather": "dy", "wind": "12"}, "11-11": {"max_temp": "13", "min_temp": "1", "weather": "dyq", "wind": "12"}, "11-12": {"max_temp": "14", "min_temp": "2", "weather": "q", "wind": "12"}, "11-13": {"max_temp": "13", "min_temp": "5", "weather": "dy", "wind": "12"}, "11-14": {"max_temp": "13", "min_temp": "5", "weather": "dy", "wind": "12"}, "11-15": {"max_temp": "8", "min_temp": "1", "weather": "xydy", "wind": "34"}, 
"11-16": {"max_temp": "8", "min_temp": "-1", "weather": "q", "wind": "12"}, "11-17": {"max_temp": "9", "min_temp": "-2", "weather": "dyq", "wind": "12"}, "11-18": {"max_temp": "11", "min_temp": "-3", "weather": "q", "wind": "34"}, "11-19": {"max_temp": "10", "min_temp": "-2", "weather": "qdy", "wind": "12"}, "11-20": {"max_temp": "9", "min_temp": "-1", "weather": "dy", "wind": "12"}, "11-21": {"max_temp": "9", "min_temp": "-3", "weather": "q", "wind": "2"}, "11-22": {"max_temp": "8", "min_temp": "-3", "weather": "qdy", "wind": "1"}, "11-23": {"max_temp": "7", "min_temp": "0", "weather": "dy", "wind": "2"}, "11-24": {"max_temp": "9", "min_temp": "-3", "weather": "qdy", "wind": "2"}, "11-25": {"max_temp": "10", "min_temp": "-3", "weather": "q", "wind": "1"}, "11-26": {"max_temp": "10", "min_temp": "0", "weather": "dy", "wind": "1"}, "11-27": {"max_temp": "9", "min_temp": "-3", "weather": "qdy", "wind": "2"}, "11-28": {"max_temp": "8", "min_temp": "-3", "weather": "q", "wind": "1"}, "11-29": {"max_temp": "7", "min_temp": "-4", "weather": "q", "wind": "1"}, "11-30": {"max_temp": "8", "min_temp": "-3", "weather": "q", "wind": "1"}, "12-01": {"max_temp": "7", "min_temp": "0", "weather": "dy", "wind": "1"}, "12-02": {"max_temp": "9", "min_temp": "2", "weather": "dy", "wind": "1"}, "12-03": {"max_temp": "8", "min_temp": "-3", "weather": "dyq", "wind": "3"}, "12-04": {"max_temp": "4", "min_temp": "-6", "weather": "qdy", "wind": "2"}, "12-05": {"max_temp": "1", "min_temp": "-4", "weather": "dy", "wind": "1"}, "12-06": {"max_temp": "-2", "min_temp": "-9", "weather": "q", "wind": "3"}, "12-07": {"max_temp": "-4", "min_temp": "-10", "weather": "q", "wind": "3"}, "12-08": {"max_temp": "-2", "min_temp": "-10", "weather": "qdy", "wind": "2"}, "12-09": {"max_temp": "-1", "min_temp": "-10", "weather": "dyq", "wind": "1"}}
\ No newline at end of file
import argparse
import os
import sys
import time
from collections import OrderedDict
import paddle.fluid as fluid
from network import DCN
import utils
def boolean_string(s):
if s.lower() not in {'false', 'true'}:
raise ValueError('Not a valid boolean string')
return s.lower() == 'true'
def parse_args():
parser = argparse.ArgumentParser("dcn cluster train.")
parser.add_argument(
'--train_data_dir',
type=str,
default='dist_data/dist_train_data',
help='The path of train data')
parser.add_argument(
'--test_valid_data_dir',
type=str,
default='dist_data/dist_test_valid_data',
help='The path of test and valid data')
parser.add_argument(
'--vocab_dir',
type=str,
default='dist_data/vocab',
help='The path of generated vocabs')
parser.add_argument(
'--cat_feat_num',
type=str,
default='dist_data/cat_feature_num.txt',
help='The path of generated cat_feature_num.txt')
parser.add_argument(
'--batch_size', type=int, default=512, help="Batch size")
parser.add_argument('--num_epoch', type=int, default=10, help="train epoch")
parser.add_argument(
'--model_output_dir',
type=str,
default='models',
help='The path for model to store')
parser.add_argument(
'--num_thread', type=int, default=1, help='The number of threads')
parser.add_argument('--test_epoch', type=str, default='1')
parser.add_argument(
'--dnn_hidden_units',
nargs='+',
type=int,
default=[1024, 1024],
help='DNN layers and hidden units')
parser.add_argument(
'--cross_num',
type=int,
default=6,
help='The number of Cross network layers')
parser.add_argument('--lr', type=float, default=1e-4, help='Learning rate')
parser.add_argument(
'--l2_reg_cross',
type=float,
default=1e-5,
help='Cross net l2 regularizer coefficient')
parser.add_argument(
'--use_bn',
type=boolean_string,
default=True,
help='Whether use batch norm in dnn part')
parser.add_argument(
'--is_sparse',
action='store_true',
required=False,
default=False,
help='embedding will use sparse or not, (default: False)')
parser.add_argument(
'--clip_by_norm', type=float, default=100.0, help="gradient clip norm")
parser.add_argument('--print_steps', type=int, default=5)
parser.add_argument('--use_gpu', type=int, default=1)
# dist params
parser.add_argument('--is_local', type=int, default=1, help='whether local')
parser.add_argument(
'--num_devices', type=int, default=1, help='Number of GPU devices')
parser.add_argument(
'--role', type=str, default='pserver', help='trainer or pserver')
parser.add_argument(
'--endpoints',
type=str,
default='127.0.0.1:6000',
help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001')
parser.add_argument(
'--current_endpoint',
type=str,
default='127.0.0.1:6000',
help='The current_endpoint')
parser.add_argument(
'--trainer_id',
type=int,
default=0,
help='trainer id, only trainer_id=0 saves the model')
parser.add_argument(
'--trainers',
type=int,
default=1,
help='The number of trainers (default: 1)')
args = parser.parse_args()
return args
def train():
""" do training """
args = parse_args()
print(args)
if args.trainer_id == 0 and not os.path.isdir(args.model_output_dir):
os.mkdir(args.model_output_dir)
cat_feat_dims_dict = OrderedDict()
for line in open(args.cat_feat_num):
spls = line.strip().split()
assert len(spls) == 2
cat_feat_dims_dict[spls[0]] = int(spls[1])
dcn_model = DCN(args.cross_num, args.dnn_hidden_units, args.l2_reg_cross,
args.use_bn, args.clip_by_norm, cat_feat_dims_dict,
args.is_sparse)
dcn_model.build_network()
optimizer = fluid.optimizer.Adam(learning_rate=args.lr)
optimizer.minimize(dcn_model.loss)
def train_loop(main_program):
""" train network """
start_time = time.time()
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_use_var(dcn_model.data_list)
pipe_command = 'python reader.py {}'.format(args.vocab_dir)
dataset.set_pipe_command(pipe_command)
dataset.set_batch_size(args.batch_size)
dataset.set_thread(args.num_thread)
train_filelist = [
os.path.join(args.train_data_dir, fname)
for fname in next(os.walk(args.train_data_dir))[2]
]
dataset.set_filelist(train_filelist)
if args.use_gpu == 1:
exe = fluid.Executor(fluid.CUDAPlace(0))
dataset.set_thread(1)
else:
exe = fluid.Executor(fluid.CPUPlace())
dataset.set_thread(args.num_thread)
exe.run(fluid.default_startup_program())
for epoch_id in range(args.num_epoch):
start = time.time()
sys.stderr.write('\nepoch%d start ...\n' % (epoch_id + 1))
exe.train_from_dataset(
program=main_program,
dataset=dataset,
fetch_list=[
dcn_model.loss, dcn_model.avg_logloss, dcn_model.auc_var
],
fetch_info=['total_loss', 'avg_logloss', 'auc'],
debug=False,
print_period=args.print_steps)
model_dir = os.path.join(args.model_output_dir,
'epoch_' + str(epoch_id + 1), "checkpoint")
sys.stderr.write('epoch%d is finished and takes %f s\n' % (
(epoch_id + 1), time.time() - start))
if args.trainer_id == 0: # only trainer 0 save model
print("save model in {}".format(model_dir))
fluid.save(main_program, model_dir)
print("train time cost {:.4f}".format(time.time() - start_time))
print("finish training")
if args.is_local:
print("run local training")
train_loop(fluid.default_main_program())
else:
print("run distribute training")
t = fluid.DistributeTranspiler()
t.transpile(
args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
if args.role == "pserver":
print("run psever")
pserver_prog, pserver_startup = t.get_pserver_programs(
args.current_endpoint)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(pserver_startup)
exe.run(pserver_prog)
elif args.role == "trainer":
print("run trainer")
train_loop(t.get_trainer_program())
if __name__ == "__main__":
utils.check_version()
train()
#!/bin/bash
#export GLOG_v=30
#export GLOG_logtostderr=1
# start pserver0
python -u cluster_train.py \
--train_data_dir dist_data/dist_train_data \
--model_output_dir cluster_model \
--is_local 0 \
--is_sparse \
--role pserver \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--current_endpoint 127.0.0.1:6000 \
--trainers 2 \
> pserver0.log 2>&1 &
# start pserver1
python -u cluster_train.py \
--train_data_dir dist_data/dist_train_data \
--model_output_dir cluster_model \
--is_local 0 \
--is_sparse \
--role pserver \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--current_endpoint 127.0.0.1:6001 \
--trainers 2 \
> pserver1.log 2>&1 &
# start trainer0
#CUDA_VISIBLE_DEVICES=1 python cluster_train.py \
python -u cluster_train.py \
--train_data_dir dist_data/dist_train_data \
--model_output_dir cluster_model \
--use_gpu 0 \
--is_local 0 \
--is_sparse \
--role trainer \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--trainers 2 \
--trainer_id 0 \
> trainer0.log 2>&1 &
# start trainer1
#CUDA_VISIBLE_DEVICES=2 python cluster_train.py \
python -u cluster_train.py \
--train_data_dir dist_data/dist_train_data \
--model_output_dir cluster_model \
--use_gpu 0 \
--is_local 0 \
--is_sparse \
--role trainer \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--trainers 2 \
--trainer_id 1 \
> trainer1.log 2>&1 &
echo "2 pservers and 2 trainers started."
\ No newline at end of file
......@@ -76,11 +76,10 @@ class DCN(object):
def backward(self, lr):
p_g_clip = fluid.backward.append_backward(loss=self.loss)
fluid.clip.set_gradient_clip(
fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_by_norm))
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_by_norm)
p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
optimizer = fluid.optimizer.Adam(learning_rate=lr)
optimizer = fluid.optimizer.Adam(learning_rate=lr, grad_clip=clip)
# params_grads = optimizer.backward(self.loss)
optimizer.apply_gradients(p_g_clip)
......
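The hunk above replaces the global `fluid.clip.set_gradient_clip` call with a clip object passed directly to the optimizer. As a point of reference, here is a minimal, self-contained sketch of the new style (assuming Fluid 1.7+; the toy network and names are illustrative, not from this repository):

```python
import paddle.fluid as fluid

# a toy regression network just to have a loss to minimize
x = fluid.data(name='x', shape=[None, 13], dtype='float32')
y = fluid.data(name='y', shape=[None, 1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.reduce_mean(fluid.layers.square_error_cost(input=pred, label=y))

# attach gradient clipping to the optimizer instead of setting it globally
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=100.0)
optimizer = fluid.optimizer.Adam(learning_rate=1e-4, grad_clip=clip)
optimizer.minimize(loss)
```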
......@@ -72,8 +72,8 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
yield label_feat_list
import paddle
batch_iter = paddle.batch(
paddle.reader.buffered(
batch_iter = fluid.io.batch(
fluid.io.buffered(
local_iter, size=buf_size), batch_size=batch)
return batch_iter
......
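Likewise, the reader utilities move from `paddle.batch`/`paddle.reader.buffered` to their `fluid.io` counterparts. A small sketch of the buffered-batching pattern, with a made-up sample generator:

```python
import paddle.fluid as fluid

def sample_reader():
    # hypothetical generator yielding one sample at a time
    for i in range(100):
        yield [float(i)]

# buffer up to 16 samples ahead, then group them into mini-batches of 4
batch_reader = fluid.io.batch(fluid.io.buffered(sample_reader, size=16), batch_size=4)
for mini_batch in batch_reader():
    pass  # each mini_batch is a list of 4 samples
```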
import argparse
import os
import sys
import time
from network_conf import ctr_deepfm_model
import paddle.fluid as fluid
import utils
def parse_args():
parser = argparse.ArgumentParser("deepfm cluster train.")
parser.add_argument(
'--train_data_dir',
type=str,
default='dist_data/dist_train_data',
help='The path of train data (default: dist_data/dist_train_data)')
parser.add_argument(
'--test_data_dir',
type=str,
default='dist_data/dist_test_data',
help='The path of test data (default: dist_data/dist_test_data)')
parser.add_argument(
'--feat_dict',
type=str,
default='dist_data/aid_data/feat_dict_10.pkl2',
help='The path of feat_dict')
parser.add_argument(
'--batch_size',
type=int,
default=100,
help="The size of mini-batch (default:100)")
parser.add_argument(
'--embedding_size',
type=int,
default=10,
help="The size for embedding layer (default:10)")
parser.add_argument(
'--num_epoch',
type=int,
default=10,
help="The number of epochs to train (default: 50)")
parser.add_argument(
'--model_output_dir',
type=str,
required=True,
help='The path for the model to be stored')
parser.add_argument(
'--num_thread',
type=int,
default=1,
help='The number of threads (default: 1)')
parser.add_argument('--test_epoch', type=str, default='1')
parser.add_argument(
'--layer_sizes',
nargs='+',
type=int,
default=[400, 400, 400],
help='The size of each layer (default: [400, 400, 400])')
parser.add_argument(
'--act',
type=str,
default='relu',
help='The activation of each layer (default: relu)')
parser.add_argument(
'--is_sparse',
action='store_true',
required=False,
default=False,
help='embedding will use sparse or not, (default: False)')
parser.add_argument(
'--lr', type=float, default=1e-4, help='Learning rate (default: 1e-4)')
parser.add_argument(
'--reg', type=float, default=1e-4, help=' (default: 1e-4)')
parser.add_argument('--num_field', type=int, default=39)
parser.add_argument('--num_feat', type=int, default=141443)
parser.add_argument('--use_gpu', type=int, default=1)
# dist params
parser.add_argument('--is_local', type=int, default=1, help='whether local')
parser.add_argument(
'--num_devices', type=int, default=1, help='Number of GPU devices')
parser.add_argument(
'--role', type=str, default='pserver', help='trainer or pserver')
parser.add_argument(
'--endpoints',
type=str,
default='127.0.0.1:6000',
help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001')
parser.add_argument(
'--current_endpoint',
type=str,
default='127.0.0.1:6000',
help='The current_endpoint')
parser.add_argument(
'--trainer_id',
type=int,
default=0,
help='trainer id, only trainer_id=0 saves the model')
parser.add_argument(
'--trainers',
type=int,
default=1,
help='The number of trainers (default: 1)')
args = parser.parse_args()
return args
def train():
""" do training """
args = parse_args()
print(args)
if args.trainer_id == 0 and not os.path.isdir(args.model_output_dir):
os.mkdir(args.model_output_dir)
loss, auc, data_list, auc_states = ctr_deepfm_model(
args.embedding_size, args.num_field, args.num_feat, args.layer_sizes,
args.act, args.reg, args.is_sparse)
optimizer = fluid.optimizer.SGD(
learning_rate=args.lr,
regularization=fluid.regularizer.L2DecayRegularizer(args.reg))
optimizer.minimize(loss)
def train_loop(main_program):
""" train network """
start_time = time.time()
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_use_var(data_list)
pipe_command = 'python criteo_reader.py {}'.format(args.feat_dict)
dataset.set_pipe_command(pipe_command)
dataset.set_batch_size(args.batch_size)
dataset.set_thread(args.num_thread)
train_filelist = [
os.path.join(args.train_data_dir, x)
for x in os.listdir(args.train_data_dir)
]
if args.use_gpu == 1:
exe = fluid.Executor(fluid.CUDAPlace(0))
dataset.set_thread(1)
else:
exe = fluid.Executor(fluid.CPUPlace())
dataset.set_thread(args.num_thread)
exe.run(fluid.default_startup_program())
for epoch_id in range(args.num_epoch):
start = time.time()
sys.stderr.write('\nepoch%d start ...\n' % (epoch_id + 1))
dataset.set_filelist(train_filelist)
exe.train_from_dataset(
program=main_program,
dataset=dataset,
fetch_list=[loss, auc],
fetch_info=['epoch %d batch loss' % (epoch_id + 1), "auc"],
print_period=5,
debug=False)
model_dir = os.path.join(args.model_output_dir,
'epoch_' + str(epoch_id + 1))
sys.stderr.write('epoch%d is finished and takes %f s\n' % (
(epoch_id + 1), time.time() - start))
if args.trainer_id == 0: # only trainer 0 save model
print("save model in {}".format(model_dir))
fluid.save(main_program, model_dir)
print("train time cost {:.4f}".format(time.time() - start_time))
print("finish training")
if args.is_local:
print("run local training")
train_loop(fluid.default_main_program())
else:
print("run distribute training")
t = fluid.DistributeTranspiler()
t.transpile(
args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
if args.role == "pserver":
print("run psever")
pserver_prog, pserver_startup = t.get_pserver_programs(
args.current_endpoint)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(pserver_startup)
exe.run(pserver_prog)
elif args.role == "trainer":
print("run trainer")
train_loop(t.get_trainer_program())
if __name__ == "__main__":
utils.check_version()
train()
#!/bin/bash
#export GLOG_v=30
#export GLOG_logtostderr=1
# start pserver0
python -u cluster_train.py \
--train_data_dir dist_data/dist_train_data \
--model_output_dir cluster_model \
--is_local 0 \
--is_sparse \
--role pserver \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--current_endpoint 127.0.0.1:6000 \
--trainers 2 \
> pserver0.log 2>&1 &
# start pserver1
python -u cluster_train.py \
--train_data_dir dist_data/dist_train_data \
--model_output_dir cluster_model \
--is_local 0 \
--is_sparse \
--role pserver \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--current_endpoint 127.0.0.1:6001 \
--trainers 2 \
> pserver1.log 2>&1 &
# start trainer0
#CUDA_VISIBLE_DEVICES=1 python cluster_train.py \
python -u cluster_train.py \
--train_data_dir dist_data/dist_train_data \
--model_output_dir cluster_model \
--use_gpu 0 \
--is_local 0 \
--is_sparse \
--role trainer \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--trainers 2 \
--trainer_id 0 \
> trainer0.log 2>&1 &
# start trainer1
#CUDA_VISIBLE_DEVICES=2 python cluster_train.py \
python -u cluster_train.py \
--train_data_dir dist_data/dist_train_data \
--model_output_dir cluster_model \
--use_gpu 0 \
--is_local 0 \
--is_sparse \
--role trainer \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--trainers 2 \
--trainer_id 1 \
> trainer1.log 2>&1 &
echo "2 pservers and 2 trainers started."
\ No newline at end of file
......@@ -30,7 +30,7 @@ def infer():
]
criteo_dataset = CriteoDataset()
criteo_dataset.setup(args.feat_dict)
test_reader = paddle.batch(
test_reader = fluid.io.batch(
criteo_dataset.test(test_files), batch_size=args.batch_size)
startup_program = fluid.framework.Program()
......
......@@ -6,6 +6,7 @@ import pickle
import random
import paddle
import paddle.fluid as fluid
class DataGenerator(object):
......@@ -58,7 +59,7 @@ class DataGenerator(object):
if not cycle:
break
return paddle.batch(_reader, batch_size=batch_size)
return fluid.io.batch(_reader, batch_size=batch_size)
def data_reader(batch_size,
......
......@@ -8,8 +8,6 @@
├── train.py # training script
├── infer.py # inference script
├── network.py # network definition
├── cluster_train.py # multi-machine (distributed) training
├── cluster_train.sh # multi-machine training launch script
├── reader.py # data loading helpers
├── data/
├── build_dataset.py # converts the raw text data into Paddle format
......@@ -129,12 +127,3 @@ CUDA_VISIBLE_DEVICES=3 python infer.py --model_path 'din_amazon/global_step_4000
```text
2019-02-22 11:22:58,804 - INFO - TEST --> loss: [0.47005194] auc:0.863794952818
```
## Distributed training
Refer to cluster_train.py to configure the multi-machine environment.
Run the following command to simulate the multi-machine setup locally:
```
sh cluster_train.sh
```
import sys
import logging
import time
import numpy as np
import argparse
import paddle.fluid as fluid
import paddle
import time
import network
import reader
import random
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
def parse_args():
parser = argparse.ArgumentParser("din")
parser.add_argument(
'--config_path',
type=str,
default='data/config.txt',
help='dir of config')
parser.add_argument(
'--train_dir',
type=str,
default='data/paddle_train.txt',
help='dir of train file')
parser.add_argument(
'--model_dir',
type=str,
default='din_amazon/',
help='dir of saved model')
parser.add_argument(
'--batch_size', type=int, default=16, help='number of batch size')
parser.add_argument(
'--epoch_num', type=int, default=200, help='number of epoch')
parser.add_argument(
'--use_cuda', type=int, default=0, help='whether to use gpu')
parser.add_argument(
'--parallel',
type=int,
default=0,
help='whether to use parallel executor')
parser.add_argument(
'--base_lr', type=float, default=0.85, help='base learning rate')
parser.add_argument(
'--role', type=str, default='pserver', help='trainer or pserver')
parser.add_argument(
'--endpoints',
type=str,
default='127.0.0.1:6000',
help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001')
parser.add_argument(
'--current_endpoint',
type=str,
default='127.0.0.1:6000',
help='The current_endpoint')
parser.add_argument(
'--trainer_id',
type=int,
default=0,
help='trainer id, only trainer_id=0 saves the model')
parser.add_argument(
'--trainers',
type=int,
default=1,
help='The number of trainers (default: 1)')
args = parser.parse_args()
return args
def train():
args = parse_args()
config_path = args.config_path
train_path = args.train_dir
epoch_num = args.epoch_num
use_cuda = True if args.use_cuda else False
use_parallel = True if args.parallel else False
logger.info("reading data begins")
user_count, item_count, cat_count = reader.config_read(config_path)
#data_reader, max_len = reader.prepare_reader(train_path, args.batch_size)
logger.info("reading data completes")
avg_cost, pred = network.network(item_count, cat_count, 433)
#fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0))
base_lr = args.base_lr
boundaries = [410000]
values = [base_lr, 0.2]
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.piecewise_decay(
boundaries=boundaries, values=values))
sgd_optimizer.minimize(avg_cost)
def train_loop(main_program):
data_reader, max_len = reader.prepare_reader(train_path,
args.batch_size)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
feeder = fluid.DataFeeder(
feed_list=[
"hist_item_seq", "hist_cat_seq", "target_item", "target_cat",
"label", "mask", "target_item_seq", "target_cat_seq"
],
place=place)
if use_parallel:
train_exe = fluid.ParallelExecutor(
use_cuda=use_cuda,
loss_name=avg_cost.name,
main_program=main_program)
else:
train_exe = exe
logger.info("train begins")
global_step = 0
PRINT_STEP = 1000
start_time = time.time()
loss_sum = 0.0
for id in range(epoch_num):
epoch = id + 1
for data in data_reader():
global_step += 1
results = train_exe.run(main_program,
feed=feeder.feed(data),
fetch_list=[avg_cost.name, pred.name],
return_numpy=True)
loss_sum += results[0].mean()
if global_step % PRINT_STEP == 0:
logger.info(
"epoch: %d\tglobal_step: %d\ttrain_loss: %.4f\t\ttime: %.2f"
% (epoch, global_step, loss_sum / PRINT_STEP,
time.time() - start_time))
start_time = time.time()
loss_sum = 0.0
if (global_step > 400000 and
global_step % PRINT_STEP == 0) or (
global_step < 400000 and
global_step % 50000 == 0):
save_dir = args.model_dir + "/global_step_" + str(
global_step)
feed_var_name = [
"hist_item_seq", "hist_cat_seq", "target_item",
"target_cat", "label", "mask", "target_item_seq",
"target_cat_seq"
]
fetch_vars = [avg_cost, pred]
fluid.io.save_inference_model(save_dir, feed_var_name,
fetch_vars, exe)
train_exe.close()
t = fluid.DistributeTranspiler()
t.transpile(
args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
if args.role == "pserver":
logger.info("run psever")
prog, startup = t.get_pserver_programs(args.current_endpoint)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup)
exe.run(prog)
elif args.role == "trainer":
logger.info("run trainer")
train_loop(t.get_trainer_program())
if __name__ == "__main__":
train()
#!/bin/bash
#export GLOG_v=30
#export GLOG_logtostderr=1
python -u cluster_train.py \
--config_path 'data/config.txt' \
--train_dir 'data/paddle_train.txt' \
--batch_size 32 \
--epoch_num 100 \
--use_cuda 0 \
--parallel 0 \
--role pserver \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--current_endpoint 127.0.0.1:6000 \
--trainers 2 \
> pserver0.log 2>&1 &
python -u cluster_train.py \
--config_path 'data/config.txt' \
--train_dir 'data/paddle_train.txt' \
--batch_size 32 \
--epoch_num 100 \
--use_cuda 0 \
--parallel 0 \
--role pserver \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--current_endpoint 127.0.0.1:6001 \
--trainers 2 \
> pserver1.log 2>&1 &
python -u cluster_train.py \
--config_path 'data/config.txt' \
--train_dir 'data/paddle_train.txt' \
--batch_size 32 \
--epoch_num 100 \
--use_cuda 0 \
--parallel 0 \
--role trainer \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--trainers 2 \
--trainer_id 0 \
> trainer0.log 2>&1 &
python -u cluster_train.py \
--config_path 'data/config.txt' \
--train_dir 'data/paddle_train.txt' \
--batch_size 32 \
--epoch_num 100 \
--use_cuda 0 \
--parallel 0 \
--role trainer \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--trainers 2 \
--trainer_id 1 \
> trainer1.log 2>&1 &
......@@ -92,14 +92,15 @@ def train():
logger.info("reading data completes")
avg_cost, pred, feed_list = network.network(item_count, cat_count)
fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
clip_norm=5.0))
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
base_lr = args.base_lr
boundaries = [410000]
values = [base_lr, 0.2]
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.piecewise_decay(
boundaries=boundaries, values=values))
boundaries=boundaries, values=values),
grad_clip=clip)
sgd_optimizer.minimize(avg_cost)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
......
......@@ -154,7 +154,7 @@ def embedding_layer(input):
return fluid.layers.embedding(
input=input,
is_sparse=True,
size=[args.sparse_feature_dim,
size=[args.sparse_feature_dim,
args.embedding_size],
param_attr=fluid.ParamAttr(
name="SparseFeatFactors",
......@@ -168,7 +168,7 @@ sparse_embed_seq = list(map(embedding_layer, inputs[1:-1])) # [C1~C26]
The values looked up from the embedding tables for the sparse features are `concat`enated with the dense-feature inputs into a single tensor, which serves as the raw input to the fully connected layers. We stack 3 FC layers, each with an output dimension of 400 and followed by a `relu` activation; each FC layer is randomly initialized from a normal distribution whose standard deviation is inversely proportional to the square root of the previous layer's output dimension.
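Since the diff excerpt below only shows the first FC call truncated, here is a self-contained sketch of the stack just described (a hedged illustration: the input tensors, their widths, and the helper name are made up for the example):

```python
import math
import paddle.fluid as fluid

# illustrative inputs: 13 dense features plus one 10-dim embedding
dense = fluid.data(name='dense_input', shape=[None, 13], dtype='float32')
embed = fluid.data(name='embed_input', shape=[None, 10], dtype='float32')
concated = fluid.layers.concat([dense, embed], axis=1)

def fc_with_scaled_init(x, prev_dim, width=400):
    # std of the Normal initializer is inversely proportional to sqrt(prev_dim)
    return fluid.layers.fc(
        input=x,
        size=width,
        act='relu',
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Normal(scale=1.0 / math.sqrt(prev_dim))))

fc1 = fc_with_scaled_init(concated, 23)   # 23 = 13 dense + 10 embedded
fc2 = fc_with_scaled_init(fc1, 400)
fc3 = fc_with_scaled_init(fc2, 400)
```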
```python
concated = fluid.layers.concat(sparse_embed_seq + inputs[0:1], axis=1)
fc1 = fluid.layers.fc(
input=concated,
size=400,
......@@ -271,9 +271,9 @@ continuous_range_ = range(1, 14)
categorical_range_ = range(14, 40)
class CriteoDataset(dg.MultiSlotDataGenerator):
def generate_sample(self, line):
def reader():
features = line.rstrip('\n').split('\t')
dense_feature = []
......@@ -363,12 +363,12 @@ fleet.init(role) #必不可少的步骤,初始化节点!
> How does PaddleCloudRoleMaker() determine the role played by the current node?
>
>
> In Paddle's parameter-server mode, the environment variables on each machine are used to determine the role of the current node. To assign roles correctly, the following environment variables must be set on every node:
> #### Environment variables shared by all nodes
> - export PADDLE_TRAINERS_NUM=2 # number of trainer nodes
> - export PADDLE_PSERVERS_IP_PORT_LIST="127.0.0.1:36011,127.0.0.1:36012" # comma-separated ip:port list of all pservers
>
>
> #### Environment variables specific to pservers
> - export TRAINING_ROLE=PSERVER # the current node's role is PSERVER
> - export PADDLE_PORT=36011 # communication port of this PSERVER
......@@ -376,7 +376,7 @@ fleet.init(role) #必不可少的步骤,初始化节点!
> #### Environment variables specific to trainers
> - export TRAINING_ROLE=TRAINER # the current node's role is TRAINER
> - export PADDLE_TRAINER_ID=0 # index of the current trainer node, in the range [0, PADDLE_TRAINERS_NUM)
>
>
> Once these environment variables are set, `PaddleCloudRoleMaker()` can run correctly and decide the role of the current node.
......@@ -388,7 +388,7 @@ Paddle的`参数服务器`模式分布式训练有很多种类型,根据通信
ctr_model = CTR()
inputs = ctr_model.input_data(args)
avg_cost, auc_var, batch_auc_var = ctr_model.net(inputs,args)
# choose the optimizer for the backward update
optimizer = fluid.optimizer.Adam(args.learning_rate)
optimizer.minimize(avg_cost)
......@@ -431,7 +431,7 @@ if fleet.is_server():
fleet.run_server()
```
- Start the workers
To start a training node, the worker first calls `init_worker()` to finish node initialization, then runs `fleet.startup_program` to synchronize the initial parameter values from the servers. After that, exactly as in local training, it runs `fleet.main_program` to carry out the whole training process and save the model. Finally it calls `fleet.stop_worker()` to shut down the training node.
```python
elif fleet.is_worker():
......@@ -441,7 +441,7 @@ elif fleet.is_worker():
# initialize fleet.startup_program, which contains the distributed startup logic
exe.run(fleet.startup_program)
# set up the dataset for data loading
dataset = get_dataset(inputs,params)
......@@ -458,10 +458,10 @@ elif fleet.is_worker():
# by default, only worker 0 saves the model
if params.test and fleet.is_first_worker():
model_path = (str(params.model_path) + "/"+"epoch_" + str(epoch))
fluid.io.save_persistables(executor=exe, dirname=model_path)
fleet.save_persistables(executor=exe, dirname=model_path)
# training is done; call stop_worker() to notify the pservers
fleet.stop_worker()
fleet.stop_worker()
logger.info("Distribute Train Success!")
return train_result
```
......@@ -504,7 +504,7 @@ sh local_cluster.sh
This starts a simulated distributed training run, using a 2x2 (2 trainers, 2 pservers) setup by default. The trainer and pserver logs are stored in the `./log/` folder and the saved models under `./models/`. With the default configuration, the expected output is:
- pserver.0.log
```bash
get_pserver_program() is deprecated, call get_pserver_programs() to get pserver main and startup in a single call.
I1126 07:37:49.952580 15056 grpc_server.cc:477] Server listening on 127.0.0.1:36011 successful, selected port: 36011
```
......@@ -558,9 +558,9 @@ I1126 07:38:28.947571 14715 communicator.cc:363] Communicator stop done
2. In many applications the model trained in a distributed fashion differs from the model actually deployed online; only the parameter values from distributed training are reused for inference in another network. In such scenarios there is even less need to save the model structure.
> What are persistable (long-term) variables?
>
>
> In Paddle Fluid, model variables fall into the following categories:
>
>
> 1. Model parameters: the quantities that are trained and learned in a deep learning model. They are created by `fluid.framework.Parameter()`, a subclass of `fluid.framework.Variable()`.
> 2. Persistable (long-term) variables: variables that live for the whole training process and are not destroyed when an iteration ends. All model parameters are persistable variables, but not all persistable variables are model parameters. A persistable variable is declared by setting the `persistable` attribute of a `fluid.framework.Variable()` to `True`. Persistable variables are the core parameters of a model.
> 3. Temporary variables: every variable that does not belong to the two categories above. They exist only within one training iteration; at the end of each iteration all temporary variables are destroyed, and new ones are created when the next iteration starts. Examples include the input training data and the outputs of intermediate layers (a small sketch follows this list).
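As a small illustration of the distinction (a hedged sketch; the variable names are made up for the example):

```python
import paddle.fluid as fluid

x = fluid.data(name='x', shape=[None, 13], dtype='float32')
# the fc weights created below are model parameters, hence persistable by definition
pred = fluid.layers.fc(input=x, size=1)

# a persistable variable that is not a parameter, e.g. a global step counter
global_step = fluid.layers.create_global_var(
    shape=[1], value=0.0, dtype='float32', persistable=True, name='global_step')

# `pred` itself is a temporary variable: it is recreated in every iteration
```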
......@@ -632,7 +632,7 @@ with fluid.framework.program_guard(test_program, startup_program):
```
This is easy to understand: at test time we want to start from scratch and keep the inference program clean, free of any other influence.
- When building the inference network we add `with fluid.unique_name.guard():`, which restarts the automatic numbering of newly created parameters from zero. Paddle distinguishes parameter `Variable`s by name, so as long as the names match, the corresponding parameters can be found in the saved model.
Temporary variables created by Paddle get consecutive automatic numbers; if no variable name is specified you can observe this, e.g. `fc_1.w_0` -> `fc_2.w_0`. To share the same parameters, the numbering must line up, as sketched below.
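A minimal sketch of this pattern (the single FC layer is illustrative only):

```python
import paddle.fluid as fluid

test_program = fluid.Program()
startup_program = fluid.Program()
# build the inference network in clean programs and restart parameter numbering,
# so saved parameters such as fc_0.w_0 can be matched by name when loading
with fluid.program_guard(test_program, startup_program):
    with fluid.unique_name.guard():
        x = fluid.data(name='x', shape=[None, 13], dtype='float32')
        pred = fluid.layers.fc(input=x, size=1)
```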
### Reading the test data
......@@ -774,14 +774,14 @@ python -u train.py --is_cloud=1
When running this command, if the pservers are not yet ready, you will see the following message in the log output:
> server not ready, wait 3 sec to retry...
>
>
> not ready endpoints:['10.89.176.11:36000', '10.89.176.12:36000']
The worker process keeps waiting until the server starts listening, or until the wait times out.
Once all pservers are ready, the log output shows:
> I0317 11:38:48.099179 16719 communicator.cc:271] Communicator start
>
>
> I0317 11:38:49.838711 16719 rpc_client.h:107] init rpc client with trainer_id 0
At this point distributed training has been launched and training will begin. Good luck.
......@@ -30,8 +30,7 @@ logger.setLevel(logging.INFO)
def parse_args():
parser = argparse.ArgumentParser(
description="PaddlePaddle CTR-DNN example")
parser = argparse.ArgumentParser(description="PaddlePaddle CTR-DNN example")
# -------------Data & Model Path-------------
parser.add_argument(
'--test_files_path',
......@@ -54,8 +53,7 @@ def parse_args():
'--infer_epoch',
type=int,
default=0,
help='Specify which epoch to run infer'
)
help='Specify which epoch to run infer')
# -------------Network parameter-------------
parser.add_argument(
'--embedding_size',
......@@ -68,10 +66,7 @@ def parse_args():
default=1000001,
help='sparse feature hashing space for index processing')
parser.add_argument(
'--dense_feature_dim',
type=int,
default=13,
help='dense feature shape')
'--dense_feature_dim', type=int, default=13, help='dense feature shape')
# -------------device parameter-------------
parser.add_argument(
......@@ -102,10 +97,11 @@ def run_infer(args, model_path):
place = fluid.CPUPlace()
train_generator = generator.CriteoDataset(args.sparse_feature_dim)
file_list = [
os.path.join(args.test_files_path, x) for x in os.listdir(args.test_files_path)
os.path.join(args.test_files_path, x)
for x in os.listdir(args.test_files_path)
]
test_reader = paddle.batch(train_generator.test(file_list),
batch_size=args.batch_size)
test_reader = fluid.io.batch(
train_generator.test(file_list), batch_size=args.batch_size)
startup_program = fluid.framework.Program()
test_program = fluid.framework.Program()
ctr_model = CTR()
......@@ -171,13 +167,15 @@ if __name__ == "__main__":
model_list = []
for _, dir, _ in os.walk(args.model_path):
for model in dir:
if "epoch" in model and args.infer_epoch == int(model.split('_')[-1]):
if "epoch" in model and args.infer_epoch == int(
model.split('_')[-1]):
path = os.path.join(args.model_path, model)
model_list.append(path)
if len(model_list) == 0:
logger.info("There is no satisfactory model {} at path {}, please check your start command & env. ".format(
str("epoch_")+str(args.infer_epoch), args.model_path))
logger.info(
"There is no satisfactory model {} at path {}, please check your start command & env. ".
format(str("epoch_") + str(args.infer_epoch), args.model_path))
for model in model_list:
logger.info("Test model {}".format(model))
......
import argparse
import os
import sys
import time
import network_conf
import paddle.fluid as fluid
import utils
def parse_args():
parser = argparse.ArgumentParser("xdeepfm cluster train.")
parser.add_argument(
'--train_data_dir',
type=str,
default='data/train_data',
help='The path of train data (default: data/train_data)')
parser.add_argument(
'--test_data_dir',
type=str,
default='data/test_data',
help='The path of test data (default: data/test_data)')
parser.add_argument(
'--batch_size',
type=int,
default=100,
help="The size of mini-batch (default:100)")
parser.add_argument(
'--embedding_size',
type=int,
default=10,
help="The size for embedding layer (default:10)")
parser.add_argument(
'--num_epoch',
type=int,
default=10,
help="The number of epochs to train (default: 10)")
parser.add_argument(
'--model_output_dir',
type=str,
required=True,
help='The path for the model to be stored')
parser.add_argument(
'--num_thread',
type=int,
default=1,
help='The number of threads (default: 1)')
parser.add_argument('--test_epoch', type=str, default='1')
parser.add_argument(
'--layer_sizes_dnn',
nargs='+',
type=int,
default=[10, 10, 10],
help='The size of each DNN layer')
parser.add_argument(
'--layer_sizes_cin',
nargs='+',
type=int,
default=[10, 10],
help='The size of each CIN layer')
parser.add_argument(
'--act',
type=str,
default='relu',
help='The activation of each layer (default: relu)')
parser.add_argument(
'--lr', type=float, default=1e-1, help='Learning rate (default: 1e-1)')
parser.add_argument(
'--reg', type=float, default=1e-4, help=' (default: 1e-4)')
parser.add_argument('--num_field', type=int, default=39)
parser.add_argument('--num_feat', type=int, default=28651)
parser.add_argument(
'--model_name',
type=str,
default='ctr_xdeepfm_model',
help='The name of model (default: ctr_xdeepfm_model)')
parser.add_argument('--use_gpu', type=int, default=1)
parser.add_argument('--print_steps', type=int, default=50)
parser.add_argument('--is_local', type=int, default=1, help='whether local')
parser.add_argument(
'--is_sparse',
action='store_true',
required=False,
default=False,
help='embedding will use sparse or not, (default: False)')
# dist params
parser.add_argument(
'--num_devices', type=int, default=1, help='Number of GPU devices')
parser.add_argument(
'--role', type=str, default='pserver', help='trainer or pserver')
parser.add_argument(
'--endpoints',
type=str,
default='127.0.0.1:6000',
help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001')
parser.add_argument(
'--current_endpoint',
type=str,
default='127.0.0.1:6000',
help='The current_endpoint')
parser.add_argument(
'--trainer_id',
type=int,
default=0,
help='trainer id, only trainer_id=0 saves the model')
parser.add_argument(
'--trainers',
type=int,
default=1,
help='The number of trainers (default: 1)')
args = parser.parse_args()
return args
def train():
""" do training """
args = parse_args()
print(args)
if not os.path.isdir(args.model_output_dir):
os.mkdir(args.model_output_dir)
loss, auc, data_list, auc_states = eval('network_conf.' + args.model_name)(
args.embedding_size, args.num_field, args.num_feat,
args.layer_sizes_dnn, args.act, args.reg, args.layer_sizes_cin,
args.is_sparse)
optimizer = fluid.optimizer.SGD(
learning_rate=args.lr,
regularization=fluid.regularizer.L2DecayRegularizer(args.reg))
optimizer.minimize(loss)
def train_loop(main_program):
""" train network """
start_time = time.time()
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_use_var(data_list)
dataset.set_pipe_command('python criteo_reader.py')
dataset.set_batch_size(args.batch_size)
dataset.set_filelist([
os.path.join(args.train_data_dir, x)
for x in os.listdir(args.train_data_dir)
])
if args.use_gpu == 1:
exe = fluid.Executor(fluid.CUDAPlace(0))
dataset.set_thread(1)
else:
exe = fluid.Executor(fluid.CPUPlace())
dataset.set_thread(args.num_thread)
exe.run(fluid.default_startup_program())
for epoch_id in range(args.num_epoch):
start = time.time()
sys.stderr.write('\nepoch%d start ...\n' % (epoch_id + 1))
exe.train_from_dataset(
program=main_program,
dataset=dataset,
fetch_list=[loss, auc],
fetch_info=['loss', 'auc'],
debug=False,
print_period=args.print_steps)
model_dir = os.path.join(args.model_output_dir,
'epoch_' + str(epoch_id + 1), "checkpoint")
sys.stderr.write('epoch%d is finished and takes %f s\n' % (
(epoch_id + 1), time.time() - start))
if args.trainer_id == 0: # only trainer 0 save model
print("save model in {}".format(model_dir))
fluid.save(main_program, model_dir)
print("train time cost {:.4f}".format(time.time() - start_time))
print("finish training")
if args.is_local:
print("run local training")
train_loop(fluid.default_main_program())
else:
print("run distribute training")
t = fluid.DistributeTranspiler()
t.transpile(
args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
if args.role == "pserver":
print("run psever")
pserver_prog, pserver_startup = t.get_pserver_programs(
args.current_endpoint)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(pserver_startup)
exe.run(pserver_prog)
elif args.role == "trainer":
print("run trainer")
train_loop(t.get_trainer_program())
if __name__ == "__main__":
utils.check_version()
train()
#!/bin/bash
#export GLOG_v=30
#export GLOG_logtostderr=1
# start pserver0
python -u cluster_train.py \
--train_data_dir data/train_data \
--model_output_dir cluster_model \
--is_local 0 \
--is_sparse \
--role pserver \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--current_endpoint 127.0.0.1:6000 \
--trainers 2 \
> pserver0.log 2>&1 &
# start pserver1
python -u cluster_train.py \
--train_data_dir data/train_data \
--model_output_dir cluster_model \
--is_local 0 \
--is_sparse \
--role pserver \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--current_endpoint 127.0.0.1:6001 \
--trainers 2 \
> pserver1.log 2>&1 &
# start trainer0
#CUDA_VISIBLE_DEVICES=1 python cluster_train.py \
python -u cluster_train.py \
--train_data_dir data/train_data \
--model_output_dir cluster_model \
--use_gpu 0 \
--is_local 0 \
--is_sparse \
--role trainer \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--trainers 2 \
--trainer_id 0 \
> trainer0.log 2>&1 &
# start trainer1
#CUDA_VISIBLE_DEVICES=2 python cluster_train.py \
python -u cluster_train.py \
--train_data_dir data/train_data \
--model_output_dir cluster_model \
--use_gpu 0 \
--is_local 0 \
--is_sparse \
--role trainer \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--trainers 2 \
--trainer_id 1 \
> trainer1.log 2>&1 &
echo "2 pservers and 2 trainers started."
\ No newline at end of file
......@@ -30,7 +30,7 @@ def infer():
for x in os.listdir(args.test_data_dir)
]
criteo_dataset = CriteoDataset()
test_reader = paddle.batch(
test_reader = fluid.io.batch(
criteo_dataset.test(test_files), batch_size=args.batch_size)
startup_program = fluid.framework.Program()
......
......@@ -62,10 +62,9 @@ def infer(args):
for epoch_num in range(args.start_index, args.last_index + 1):
model_path = os.path.join(args.model_path, "epoch_" + str(epoch_num))
try:
if not os.path.exists(model_path):
if not os.path.exists(model_path + ".pdmodel"):
raise ValueError()
fluid.io.load_persistables(executor=exe, dirname=model_path,
main_program=infer_program)
fluid.io.load(infer_program, model_path+".pdmodel", exe)
loss_sum = 0.0
acc_sum = 0.0
......