# classify.py

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle
import paddle.fluid as fluid


def build_lr_model(args):
    """
    Assemble the classification graph inside the current fluid program.

    A single fully connected layer (no activation) maps the ``emb_x`` input
    of width ``args.w2v_emb_size`` to ``args.num_class`` logits.

    Args:
        args: object exposing ``w2v_emb_size`` and ``num_class``.

    Returns:
        A ``(loss, acc)`` pair: the mean softmax cross-entropy between the
        logits and the ``label_y`` input, and the top-1 accuracy computed
        from the softmax probabilities.
    """
    layers = fluid.layers

    # Input placeholders; the names are the feed keys callers must use.
    features = layers.data(
        name="emb_x", dtype='float32', shape=[args.w2v_emb_size])
    target = layers.data(name="label_y", dtype='int64', shape=[1])

    scores = layers.fc(
        input=features,
        size=args.num_class,
        act=None,
        name='classification_layer')

    # Probabilities are only consumed by the accuracy metric; the loss is
    # computed straight from the raw logits.
    probabilities = layers.softmax(scores)
    per_sample_loss = layers.softmax_with_cross_entropy(scores, target)
    mean_loss = layers.mean(per_sample_loss)
    top1_acc = layers.accuracy(input=probabilities, label=target, k=1)
    return mean_loss, top1_acc


def construct_feed_data(data, batch_size=16):
    """
    Construct the data to feed model, one mini-batch at a time.

    For every sample the last element is used as the label and the elements
    ``sample[1:-1]`` as the feature vector. Element 0 is skipped (presumably
    an id column -- confirm against the caller that builds ``data``).

    Args:
        data: iterable of indexable samples.
        batch_size: maximum number of samples per yielded batch. Defaults to
            16, the value that used to be hard-coded.

    Yields:
        ``(features, labels)`` where ``features`` is a float32 array with one
        row per sample and ``labels`` is an int64 array with one
        single-element row per sample. The final batch may be smaller than
        ``batch_size``.

    Raises:
        ValueError: if ``batch_size`` is not positive (raised when the
            generator is first advanced).
    """
    if batch_size <= 0:
        raise ValueError(
            "batch_size must be positive, got {}".format(batch_size))

    datas = []
    labels = []
    for sample in data:
        labels.append([sample[-1]])
        datas.append(sample[1:-1])
        # Flush as soon as the batch is full. Appending *before* the check
        # guarantees that the sample which completes a batch is never lost.
        if len(datas) == batch_size:
            yield np.array(datas).astype(np.float32), np.array(labels).astype(
                np.int64)
            datas = []
            labels = []
    # Emit the trailing partial batch, if any.
    if datas:
        yield np.array(datas).astype(np.float32), np.array(labels).astype(
            np.int64)

def run_epoch(exe, data, program, stage, epoch, loss, acc):
    """
    Run ``program`` once over ``data`` and print the epoch-level metrics.

    The data is split into mini-batches by ``construct_feed_data``; each
    batch is fed under the keys ``"emb_x"`` and ``"label_y"`` and the
    ``loss`` and ``acc`` variables are fetched. The reported loss and
    accuracy are averages over all samples of the epoch, weighting each
    batch by its size so a smaller final batch does not skew the result.

    Args:
        exe: executor whose ``run`` method executes ``program``.
        data: samples accepted by ``construct_feed_data``.
        program: the program to execute for every batch.
        stage: label printed in the log lines (e.g. "train" or "valid").
        epoch: epoch index printed in the log lines.
        loss: loss variable to fetch.
        acc: accuracy variable to fetch.

    Returns:
        None. Results are printed only.
    """
    print('start {} epoch of {}'.format(stage, epoch))
    all_loss = 0.0
    all_acc = 0.0
    all_samples = 0
    for datas, labels in construct_feed_data(data):
        batch_loss, batch_acc = exe.run(
            program,
            fetch_list=[loss, acc],
            feed={"emb_x": datas,
                  "label_y": labels})
        len_samples = len(datas)
        # Accumulate sample-weighted sums; the fetched values are per-batch
        # means, so multiply back by the batch size before summing.
        all_loss += batch_loss * len_samples
        all_acc += batch_acc * len_samples
        all_samples += len_samples

    if all_samples == 0:
        # Nothing was executed: avoid dividing by zero and say so instead.
        print("pass:{}, epoch:{}, no samples to run".format(stage, epoch))
        return

    print("pass:{}, epoch:{}, loss:{}, acc:{}".format(
        stage, epoch, all_loss / all_samples, all_acc / all_samples))


def train_lr_model(args, data):
    """
    Train the LR classifier on ``data`` and validate it after every epoch.

    The first 80% of ``data`` (in the given order) is used for training and
    the remainder for validation. Everything runs on CPU.

    Args:
        args: object exposing ``lr`` and ``epoch``, plus whatever
            ``build_lr_model`` reads from it.
        data: sliceable sequence of samples understood by ``run_epoch``.
    """
    split_at = int(0.8 * len(data))
    train_split, valid_split = data[:split_at], data[split_at:]

    main_prog = fluid.Program()
    init_prog = fluid.Program()

    with fluid.program_guard(main_prog, init_prog):
        loss, acc = build_lr_model(args)

    # Clone for evaluation *before* the optimizer is attached, so the test
    # program is a copy of the graph as it stands without the Adam ops.
    eval_prog = main_prog.clone(for_test=True)

    with fluid.program_guard(main_prog, init_prog):
        optimizer = fluid.optimizer.Adam(learning_rate=args.lr)
        optimizer.minimize(loss)

    executor = fluid.Executor(fluid.CPUPlace())
    executor.run(init_prog)

    for epoch in range(args.epoch):
        run_epoch(executor, train_split, main_prog, "train", epoch, loss, acc)
        print('-------------------')
        run_epoch(executor, valid_split, eval_prog, "valid", epoch, loss, acc)