embedder.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
################################################################################
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

from __future__ import division

import numpy as np
import paddle.fluid as fluid

import mmpms.layers as layers


class Embedder(layers.Embedding):
    """Embedding layer that records its vocabulary size and embedding
    dimension and can be initialized from pretrained word vectors."""

    def __init__(self,
                 num_embeddings,
                 embedding_dim,
                 is_sparse=False,
                 is_distributed=False,
                 padding_idx=None,
                 param_attr=None,
                 dtype='float32',
                 name=None):
        super(Embedder, self).__init__(
            size=[num_embeddings, embedding_dim],
            is_sparse=is_sparse,
            is_distributed=is_distributed,
            padding_idx=padding_idx,
            param_attr=param_attr,
            dtype=dtype,
            name=name)
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim

    def from_pretrained(self, embeds, place, scale=0.05):
        """Initialize the embedding table from pretrained vectors.

        Rows of ``embeds`` that are all zeros are treated as words without
        a pretrained vector and are re-initialized from U(-scale, scale).
        """
        assert len(embeds) == self.num_embeddings
        assert len(embeds[0]) == self.embedding_dim

        embeds = np.array(embeds, dtype='float32')
        num_known = 0
        for i in range(len(embeds)):
            if np.all(embeds[i] == 0):
                # No pretrained vector for this word; initialize it randomly.
                embeds[i] = np.random.uniform(
                    low=-scale, high=scale, size=self.embedding_dim)
            else:
                num_known += 1
        if self.padding_idx is not None:
            # Keep the padding embedding fixed at zero.
            embeds[self.padding_idx] = 0

        # Overwrite the parameter tensor in the global scope with the
        # prepared embedding matrix.
        embedding_param = fluid.global_scope().find_var(
            self.param_attr.name).get_tensor()
        embedding_param.set(embeds, place)

        print("{} words have pretrained embeddings ".format(num_known) +
              "(coverage: {:.3f})".format(num_known / self.num_embeddings))