test_export_n_load_module.py

#   Copyright (c) 2019  PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
from __future__ import division
from __future__ import print_function

import numpy as np
import paddle.fluid as fluid
import paddle
import paddle_hub as hub
import unittest
import os

from collections import defaultdict

EMBED_SIZE = 16
HIDDEN_SIZE = 256
N = 5
BATCH_SIZE = 64
PASS_NUM = 1000

word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)
data = paddle.dataset.imikolov.train(word_dict, N)

_MOCK_DATA = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]


def mock_data():
    for d in _MOCK_DATA:
        yield d


batch_reader = paddle.batch(mock_data, BATCH_SIZE)
#batch_reader = paddle.batch(data, BATCH_SIZE)
batch_size = 0
for d in batch_reader():
    batch_size += 1


def word2vec(words, is_sparse, trainable=True):
    emb_param_attr = fluid.ParamAttr(name="embedding", trainable=trainable)
    embed_first = fluid.layers.embedding(
        input=words[0],
        size=[dict_size, EMBED_SIZE],
        dtype='float32',
        is_sparse=is_sparse,
        param_attr=emb_param_attr)
    embed_second = fluid.layers.embedding(
        input=words[1],
        size=[dict_size, EMBED_SIZE],
        dtype='float32',
        is_sparse=is_sparse,
        param_attr=emb_param_attr)
    embed_third = fluid.layers.embedding(
        input=words[2],
        size=[dict_size, EMBED_SIZE],
        dtype='float32',
        is_sparse=is_sparse,
        param_attr=emb_param_attr)
    embed_fourth = fluid.layers.embedding(
        input=words[3],
        size=[dict_size, EMBED_SIZE],
        dtype='float32',
        is_sparse=is_sparse,
        param_attr=emb_param_attr)

    concat_emb = fluid.layers.concat(
        input=[embed_first, embed_second, embed_third, embed_fourth], axis=1)
    hidden1 = fluid.layers.fc(input=concat_emb, size=HIDDEN_SIZE, act='sigmoid')
    pred_prob = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax')

    # declare later than predict word
    next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')

    cost = fluid.layers.cross_entropy(input=pred_prob, label=next_word)
    avg_cost = fluid.layers.mean(cost)

    return pred_prob, avg_cost


def get_dictionary(word_dict):
    dictionary = defaultdict(int)
    w_id = 0
    for w in word_dict:
        if isinstance(w, bytes):
            w = w.decode("ascii")
        dictionary[w] = w_id
        w_id += 1

    return dictionary


def test_create_w2v_module(use_gpu=False):
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()

    first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
    second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
    third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
    forth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64')
    next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')

    word_list = [first_word, second_word, third_word, forth_word, next_word]
    pred_prob, avg_cost = word2vec(word_list, is_sparse=True)

    main_program = fluid.default_main_program()
    startup_program = fluid.default_startup_program()

    sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=1e-2)

    sgd_optimizer.minimize(avg_cost)
    exe = fluid.Executor(place)
    exe.run(startup_program)  # initialization

    step = 0
    for epoch in range(0, PASS_NUM):
        for mini_batch in batch_reader():
            feed_var_list = [
                main_program.global_block().var("firstw"),
                main_program.global_block().var("secondw"),
                main_program.global_block().var("thirdw"),
                main_program.global_block().var("fourthw"),
                main_program.global_block().var("nextw")
            ]
            feeder = fluid.DataFeeder(feed_list=feed_var_list, place=place)
            cost = exe.run(
                main_program,
                feed=feeder.feed(mini_batch),
                fetch_list=[avg_cost])
            step += 1
            if step % 100 == 0:
                print("Epoch={} Step={} Cost={}".format(epoch, step, cost[0]))

    saved_module_dir = "./tmp/word2vec_test_module"
    # save inference model including feed and fetch variable info
    dictionary = get_dictionary(word_dict)

    module_inputs = [
        main_program.global_block().var("firstw"),
        main_program.global_block().var("secondw"),
        main_program.global_block().var("thirdw"),
        main_program.global_block().var("fourthw"),
    ]
    signature = hub.create_signature(
        "default",
        inputs=module_inputs,
        outputs=[pred_prob],
        feed_names=["firstw", "secondw", "thirdw", "fourthw"],
        fetch_names=["pred_prob"])
    hub.create_module(
        sign_arr=signature, module_dir=saved_module_dir, word_dict=dictionary)


def test_load_w2v_module(use_gpu=False):
    saved_module_dir = "./tmp/word2vec_test_module"
    w2v_module = hub.Module(module_dir=saved_module_dir)
    feed_dict, fetch_dict, program = w2v_module(
        sign_name="default", trainable=False)
    with fluid.program_guard(main_program=program):
        pred_prob = fetch_dict["pred_prob"]
        pred_word = fluid.layers.argmax(x=pred_prob, axis=1)
        # set place, executor, datafeeder
        place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
        exe = fluid.Executor(place)
        feed_vars = [
            feed_dict["firstw"], feed_dict["secondw"], feed_dict["thirdw"],
            feed_dict["fourthw"]
        ]
        feeder = fluid.DataFeeder(place=place, feed_list=feed_vars)

        word_ids = [[1, 2, 3, 4]]
        result = exe.run(
            fluid.default_main_program(),
            feed=feeder.feed(word_ids),
            fetch_list=[pred_word],
            return_numpy=True)

        print(result)


if __name__ == "__main__":
    use_gpu = False
    print("test create word2vec module")
    test_create_w2v_module(use_gpu)
    print("test load word2vec module")
    test_load_w2v_module(use_gpu=False)