# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing

from __future__ import print_function

from args import parse_args
import os
import paddle.fluid as fluid
import sys
from network_conf import dnn_model

# Size of the feature dimension of the `dense_input` data layer built in train().
dense_feature_dim = 13

def train():
    """Train the CTR network locally on CPU and export it for serving.

    Steps performed:
      1. Parse command-line options with ``parse_args()``.
      2. Declare the input layers (one dense input, 26 sparse id inputs
         named ``C1`` .. ``C26``, and the label) and build the network with
         ``dnn_model``.
      3. Train with SGD for one epoch over an ``InMemoryDataset`` whose
         samples are produced by piping files through ``criteo_reader.py``.
      4. Save the serving model to ``ctr_serving_model`` and the client
         configuration to ``ctr_client_conf``.
    """
    args = parse_args()

    # makedirs (instead of mkdir) so that a nested output path also works.
    # NOTE(review): nothing in this function writes into model_output_dir;
    # the serving model below goes to a fixed directory -- confirm intent.
    if not os.path.isdir(args.model_output_dir):
        os.makedirs(args.model_output_dir)

    dense_input = fluid.layers.data(
        name="dense_input", shape=[dense_feature_dim], dtype='float32')
    sparse_input_ids = [
        fluid.layers.data(
            name="C" + str(i), shape=[1], lod_level=1, dtype="int64")
        for i in range(1, 27)
    ]
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # NOTE: args.sparse_only is not honoured here; the dense input is always
    # handed to the network.
    predict_y, loss, auc_var, batch_auc_var = dnn_model(
        dense_input, sparse_input_ids, label, args.embedding_size,
        args.sparse_feature_dim)

    optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
    optimizer.minimize(loss)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
    dataset.set_use_var([dense_input] + sparse_input_ids + [label])

    # Each input file is piped through this command to produce samples.
    python_executable = "python"
    pipe_command = "{} criteo_reader.py {}".format(python_executable,
                                                   args.sparse_feature_dim)
    dataset.set_pipe_command(pipe_command)
    dataset.set_batch_size(128)
    thread_num = 10
    dataset.set_thread(thread_num)

    # File names are derived from the number of entries in raw_data/, so
    # this assumes the directory holds exactly part-0 .. part-(N-1).
    whole_filelist = [
        "raw_data/part-%d" % x for x in range(len(os.listdir("raw_data")))
    ]

    # Only the first `thread_num` files are loaded for training.
    dataset.set_filelist(whole_filelist[:thread_num])
    dataset.load_into_memory()

    epochs = 1
    for i in range(epochs):
        exe.train_from_dataset(
            program=fluid.default_main_program(), dataset=dataset, debug=True)
        print("epoch {} finished".format(i))

    # Imported here so the serving client package is only needed at export
    # time.
    import paddle_serving_client.io as server_io

    # Only the sparse inputs are exposed as feed variables, keyed
    # sparse_0 .. sparse_25; the prediction is fetched as "prob".
    feed_var_dict = {}
    for i, sparse in enumerate(sparse_input_ids):
        feed_var_dict["sparse_{}".format(i)] = sparse
    fetch_var_dict = {"prob": predict_y}

    server_io.save_model("ctr_serving_model", "ctr_client_conf", feed_var_dict,
                         fetch_var_dict, fluid.default_main_program())


# Script entry point: run the training routine when executed directly.
if __name__ == '__main__':
    train()