Skip to content

  • 体验新版
    • 正在加载...
  • 登录
  • PaddlePaddle
  • book
  • Issue
  • #783

B
book
  • 项目概览

PaddlePaddle / book

通知 17
Star 4
Fork 0
  • 代码
    • 文件
    • 提交
    • 分支
    • Tags
    • 贡献者
    • 分支图
    • Diff
  • Issue 40
    • 列表
    • 看板
    • 标记
    • 里程碑
  • 合并请求 37
  • Wiki 5
    • Wiki
  • 分析
    • 仓库
    • DevOps
  • 项目成员
  • Pages
B
book
  • 项目概览
    • 项目概览
    • 详情
    • 发布
  • 仓库
    • 仓库
    • 文件
    • 提交
    • 分支
    • 标签
    • 贡献者
    • 分支图
    • 比较
  • Issue 40
    • Issue 40
    • 列表
    • 看板
    • 标记
    • 里程碑
  • 合并请求 37
    • 合并请求 37
  • Pages
  • 分析
    • 分析
    • 仓库分析
    • DevOps
  • Wiki 5
    • Wiki
  • 成员
    • 成员
  • 收起侧边栏
  • 动态
  • 分支图
  • 创建新Issue
  • 提交
  • Issue看板
已关闭
开放中
Opened 7月 24, 2019 by saxon_zh@saxon_zhGuest

如何提取训练好的embedding?

Created by: hflyzju

#coding:utf-8
from __future__ import print_function
import paddle as paddle
import paddle.fluid as fluid
import six
import numpy
import math
import sys




EMBED_SIZE = 32      # embedding维度
HIDDEN_SIZE = 256    # 隐层大小
N = 5                # ngram大小,这里固定取5
BATCH_SIZE = 100     # batch大小
PASS_NUM = 100       # 训练轮数

use_cuda = False  # 如果用GPU训练,则设置为True

word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)


def inference_program(words, is_sparse):

    embed_first = fluid.layers.embedding(
        input=words[0],
        size=[dict_size, EMBED_SIZE],
        dtype='float32',
        is_sparse=is_sparse,
        param_attr='shared_w')
    embed_second = fluid.layers.embedding(
        input=words[1],
        size=[dict_size, EMBED_SIZE],
        dtype='float32',
        is_sparse=is_sparse,
        param_attr='shared_w')
    embed_third = fluid.layers.embedding(
        input=words[2],
        size=[dict_size, EMBED_SIZE],
        dtype='float32',
        is_sparse=is_sparse,
        param_attr='shared_w')
    embed_fourth = fluid.layers.embedding(
        input=words[3],
        size=[dict_size, EMBED_SIZE],
        dtype='float32',
        is_sparse=is_sparse,
        param_attr='shared_w')

    concat_embed = fluid.layers.concat(
        input=[embed_first, embed_second, embed_third, embed_fourth], axis=1)
    hidden1 = fluid.layers.fc(input=concat_embed,
                              size=HIDDEN_SIZE,
                              act='sigmoid')
    predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax')
    return predict_word,embed_first


def train_program(predict_word):
    # 'next_word'的定义必须要在inference_program的声明之后,
    # 否则train program输入数据的顺序就变成了[next_word, firstw, secondw,
    # thirdw, fourthw], 这是不正确的.
    next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
    cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
    avg_cost = fluid.layers.mean(cost)
    return avg_cost

def optimizer_func():
    return fluid.optimizer.AdagradOptimizer(
        learning_rate=3e-3,
        regularization=fluid.regularizer.L2DecayRegularizer(8e-4))


def train(if_use_cuda, params_dirname, embedding_params_dirname, is_sparse=True):
    place = fluid.CUDAPlace(0) if if_use_cuda else fluid.CPUPlace()

    train_reader = paddle.batch(
        paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
    test_reader = paddle.batch(
        paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)

    first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
    second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
    third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
    forth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64')
    next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')

    word_list = [first_word, second_word, third_word, forth_word, next_word]
    feed_order = ['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']

    main_program = fluid.default_main_program()
    star_program = fluid.default_startup_program()

    predict_word, embed_first = inference_program(word_list, is_sparse)
    avg_cost = train_program(predict_word)
    test_program = main_program.clone(for_test=True)

    sgd_optimizer = optimizer_func()
    sgd_optimizer.minimize(avg_cost)

    exe = fluid.Executor(place)

    def train_test(program, reader):
        count = 0
        feed_var_list = [
            program.global_block().var(var_name) for var_name in feed_order
        ]
        feeder_test = fluid.DataFeeder(feed_list=feed_var_list, place=place)
        test_exe = fluid.Executor(place)
        accumulated = len([avg_cost]) * [0]
        for test_data in reader():
            avg_cost_np = test_exe.run(
                program=program,
                feed=feeder_test.feed(test_data),
                fetch_list=[avg_cost])
            accumulated = [
                x[0] + x[1][0] for x in zip(accumulated, avg_cost_np)
            ]
            count += 1
        return [x / count for x in accumulated]

    def train_loop():
        step = 0
        feed_var_list_loop = [
            main_program.global_block().var(var_name) for var_name in feed_order
        ]
        feeder = fluid.DataFeeder(feed_list=feed_var_list_loop, place=place)
        exe.run(star_program)
        for pass_id in range(PASS_NUM):
            for data in train_reader():
                avg_cost_np = exe.run(
                    main_program, feed=feeder.feed(data), fetch_list=[avg_cost])

                if step % 10 == 0:
                    outs = train_test(test_program, test_reader)

                    print("Step %d: Average Cost %f" % (step, outs[0]))

                    # 整个训练过程要花费几个小时,如果平均损失低于5.8,
                    # 我们就认为模型已经达到很好的效果可以停止训练了。
                    # 注意5.8是一个相对较高的值,为了获取更好的模型,可以将
                    # 这里的阈值设为3.5,但训练时间也会更长。
                    if outs[0] < 5.8:
                        if params_dirname is not None:
                            fluid.io.save_inference_model(params_dirname, [
                                'firstw', 'secondw', 'thirdw', 'fourthw'
                            ], [predict_word], exe)
                        # 保存embedding参数
                        if embedding_params_dirname is not None:
                            fluid.io.save_inference_model(embedding_params_dirname, [
                                'firstw', 'secondw', 'thirdw', 'fourthw'
                            ], [embed_first], exe)
                        return
                step += 1
                if math.isnan(float(avg_cost_np[0])):
                    sys.exit("got NaN loss, training failed.")

        raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0]))

    train_loop()

def infer(use_cuda, params_dirname=None):
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    exe = fluid.Executor(place)

    inference_scope = fluid.core.Scope()
    with fluid.scope_guard(inference_scope):
        # 使用fluid.io.load_inference_model获取inference program,
        # feed变量的名称feed_target_names和从scope中fetch的对象fetch_targets
        [inferencer, feed_target_names,
         fetch_targets] = fluid.io.load_inference_model(params_dirname, exe)

        # 设置输入,用四个LoDTensor来表示4个词语。这里每个词都是一个id,
        # 用来查询embedding表获取对应的词向量,因此其形状大小是[1]。
        # recursive_sequence_lengths设置的是基于长度的LoD,因此都应该设为[[1]]
        # 注意recursive_sequence_lengths是列表的列表
        data1 = numpy.asarray([[211]], dtype=numpy.int64)  # 'among'
        data2 = numpy.asarray([[6]], dtype=numpy.int64)  # 'a'
        data3 = numpy.asarray([[96]], dtype=numpy.int64)  # 'group'
        data4 = numpy.asarray([[4]], dtype=numpy.int64)  # 'of'
        lod = numpy.asarray([[1]], dtype=numpy.int64)

        first_word = fluid.create_lod_tensor(data1, lod, place)
        second_word = fluid.create_lod_tensor(data2, lod, place)
        third_word = fluid.create_lod_tensor(data3, lod, place)
        fourth_word = fluid.create_lod_tensor(data4, lod, place)

        assert feed_target_names[0] == 'firstw'
        assert feed_target_names[1] == 'secondw'
        assert feed_target_names[2] == 'thirdw'
        assert feed_target_names[3] == 'fourthw'

        # 构造feed词典 {feed_target_name: feed_target_data}
        # 预测结果包含在results之中
        results = exe.run(
            inferencer,
            feed={
                feed_target_names[0]: first_word,
                feed_target_names[1]: second_word,
                feed_target_names[2]: third_word,
                feed_target_names[3]: fourth_word
            },
            fetch_list=fetch_targets,
            return_numpy=False)

        print(numpy.array(results[0]))
        print('next word results[0]:',numpy.array(results[0]))
        print('next word results[0].shape:',numpy.array(results[0]).shape)
        most_possible_word_index = numpy.argmax(results[0])
        print(most_possible_word_index)
        print([
            key for key, value in six.iteritems(word_dict)
            if value == most_possible_word_index
        ][0])


def embedding_infer(use_cuda, embedding_params_dirname=None):
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    exe = fluid.Executor(place)

    inference_scope = fluid.core.Scope()
    with fluid.scope_guard(inference_scope):
        # 使用fluid.io.load_inference_model获取inference program,
        # feed变量的名称feed_target_names和从scope中fetch的对象fetch_targets
        [inferencer, feed_target_names,
         fetch_targets] = fluid.io.load_inference_model(embedding_params_dirname, exe)

        # 设置输入,用四个LoDTensor来表示4个词语。这里每个词都是一个id,
        # 用来查询embedding表获取对应的词向量,因此其形状大小是[1]。
        # recursive_sequence_lengths设置的是基于长度的LoD,因此都应该设为[[1]]
        # 注意recursive_sequence_lengths是列表的列表
        data1 = numpy.asarray([[211]], dtype=numpy.int64)  # 'among'
        data2 = numpy.asarray([[6]], dtype=numpy.int64)  # 'a'
        data3 = numpy.asarray([[96]], dtype=numpy.int64)  # 'group'
        data4 = numpy.asarray([[4]], dtype=numpy.int64)  # 'of'
        lod = numpy.asarray([[1]], dtype=numpy.int64)

        first_word = fluid.create_lod_tensor(data1, lod, place)
        second_word = fluid.create_lod_tensor(data2, lod, place)
        third_word = fluid.create_lod_tensor(data3, lod, place)
        fourth_word = fluid.create_lod_tensor(data4, lod, place)

        assert feed_target_names[0] == 'firstw'
        assert feed_target_names[1] == 'secondw'
        assert feed_target_names[2] == 'thirdw'
        assert feed_target_names[3] == 'fourthw'

        # 构造feed词典 {feed_target_name: feed_target_data}
        # 预测结果包含在results之中
        print(feed_target_names)
        results = exe.run(
            inferencer,
            feed={
                feed_target_names[0]: first_word,
                feed_target_names[1]: second_word,
                feed_target_names[2]: third_word,
                feed_target_names[3]: fourth_word
            },
            # feed={
            #     feed_target_names[0]: data1,
            #     feed_target_names[1]: data2,
            #     feed_target_names[2]: data3,
            #     feed_target_names[3]: data4
            # },
            fetch_list=fetch_targets,
            return_numpy=False)

        print( word embedding results[0]:',numpy.array(results[0]))
        print( word embedding results[0].shape:',numpy.array(results[0]).shape)


def main(use_cuda, is_sparse):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return

    params_dirname = "word2vec.inference.model"
    embedding_params_dirname = "word2vec.embedding.inference.model"

    train(
        if_use_cuda=use_cuda,
        params_dirname=params_dirname,
        embedding_params_dirname=embedding_params_dirname,
        is_sparse=is_sparse)

    infer(use_cuda=use_cuda, params_dirname=params_dirname)

    embedding_infer(use_cuda=use_cuda,embedding_params_dirname=embedding_params_dirname)


main(use_cuda=use_cuda, is_sparse=True)


output:

Step 0: Average Cost 7.377010

Step 10: Average Cost 6.168406

Step 20: Average Cost 5.790505
[[0.02989654 0.03431715 0.00012712 ... 0.00019843 0.00016911 0.02760296]]
next word results[0]: [[0.02989654 0.03431715 0.00012712 ... 0.00019843 0.00016911 0.02760296]]
next word results[0].shape: (1, 2073)
1
<e>
[u'firstw', u'secondw', u'thirdw', u'fourthw'] 
word embedding results[0]: [[0.02989654 0.03431715 0.00012712 ... 0.00019843 0.00016911 0.02760296]] 
word embedding results[0].shape: (1, 2073)
指派人
分配到
无
里程碑
无
分配里程碑
工时统计
无
截止日期
无
标识: paddlepaddle/book#783
渝ICP备2023009037号

京公网安备11010502055752号

网络110报警服务 Powered by GitLab CE v13.7
开源知识
Git 入门 Pro Git 电子书 在线学 Git
Markdown 基础入门 IT 技术知识开源图谱
帮助
使用手册 反馈建议 博客
《GitCode 隐私声明》 《GitCode 服务条款》 关于GitCode
Powered by GitLab CE v13.7