# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module provides the embedding operation for paddle_mpc.
"""

from __future__ import print_function

import warnings

import six
import numpy as np
from paddle import fluid
from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype

from .framework import MpcVariable
from .mpc_layer_helper import MpcLayerHelper
from .data_utils import aby3

__all__ = ['embedding']


def embedding(input,
              size,
              is_sparse=False,
              is_distributed=False,
              padding_idx=None,
              param_attr=None,
              dtype='int64'):
    """
    This operator looks up the embedding vectors of the ids provided by :attr:`input` .
    It automatically constructs a 2D embedding matrix based on the input
    :attr:`size` (vocab_size, emb_size) and :attr:`dtype` .

    The :attr:`input` is an mpc one-hot tensor of ids: its last dimension equals
    the vocabulary size (vocab_size), and its shape must have three dimensions,
    i.e., (2, x, vocab_size).

    The shape of the output Tensor is obtained by replacing the last dimension of
    the input Tensor shape with emb_size.

    **Note:** The id in :attr:`input` must satisfy :math:`0 <= id < size[0]` ,
    otherwise the program will throw an exception and exit.
    Params of `is_sparse`, `is_distributed` and `padding_idx` have not been implemented.

    .. code-block:: text

        Case 1: input is a Tensor.
            input.data = aby3.make_share([[1, 0, 0], [0, 1, 0]])
            input.shape = [2, 2, 3]
            w.data = aby3.make_share([[1, 2], [2, 3], [3, 4]])
            Given size = [2, 3, 2]

            output is a Tensor:
                out.shape = [2, 2, 2]
                out.data.reconstruct = [[1, 2], [2, 3]]

    Args:
        input(MpcVariable): A Tensor or LoDTensor with type int64, which contains
            the id information. The value of the input id should satisfy
            :math:`0 <= id < size[0]` .
        size(tuple|list): The shape of the lookup table parameter. It should have
            two elements which indicate the size of the dictionary of embeddings
            and the size of each embedding vector, respectively.
        is_sparse(bool, not implemented): The flag indicating whether to use sparse
            update. This parameter only affects the performance of the backward
            gradient update. It is recommended to set it to True because sparse
            update is faster. However, some optimizers do not support sparse update,
            such as :ref:`api_fluid_optimizer_AdadeltaOptimizer` ,
            :ref:`api_fluid_optimizer_AdamaxOptimizer` ,
            :ref:`api_fluid_optimizer_DecayedAdagradOptimizer` ,
            :ref:`api_fluid_optimizer_FtrlOptimizer` ,
            :ref:`api_fluid_optimizer_LambOptimizer` and
            :ref:`api_fluid_optimizer_LarsMomentumOptimizer` .
            In these cases, is_sparse must be False. Default: False.
        is_distributed(bool, not implemented): Whether to store the embedding matrix
            in a distributed manner. Only used in multi-machine distributed CPU
            training. Default: False.
        padding_idx(int|long|None, not implemented): padding_idx needs to be in the
            interval [-vocab_size, vocab_size). If :math:`padding\_idx < 0`, the
            :math:`padding\_idx` will automatically be converted to
            :math:`vocab\_size + padding\_idx` .
            It will output all-zero padding data whenever the lookup encounters
            :math:`padding\_idx` in id, and the padding data will not be updated
            during training. If set to None, it has no effect on the output.
            Default: None.
        param_attr(ParamAttr): To specify the weight parameter property. Default: None,
            which means the default weight parameter property is used. See usage for
            details in :ref:`api_fluid_ParamAttr` . In addition, user-defined or
            pre-trained word vectors can be loaded with the :attr:`param_attr`
            parameter. The local word vectors need to be transformed into numpy
            format, and their shape should be consistent with :attr:`size` .
            Then :ref:`api_fluid_initializer_NumpyArrayInitializer` is used to load
            custom or pre-trained word vectors.
        dtype(str|core.VarDesc.VarType.INT64): It refers to the data type of the
            output Tensor. It must be int64.

    Returns:
        MpcVariable: Embedding Tensor or LoDTensor mapped by input. The data type is
        the same as :attr:`dtype` .

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            import paddle_fl.mpc as pfl
            import numpy as np

            # data should be an mpc one-hot tensor
            data = pfl.data(name='x', shape=[4, 3], dtype='int64')

            # example 1
            emb_1 = pfl.embedding(input=data, size=[3, 4])

            # example 2: load custom or pre-trained word vectors
            weight_data = np.random.random(size=(2, 3, 4))  # mpc word vectors in numpy format
            w_param_attrs = fluid.ParamAttr(
                name="emb_weight",
                learning_rate=0.5,
                initializer=fluid.initializer.NumpyArrayInitializer(weight_data),
                trainable=True)
            emb_2 = pfl.embedding(input=data,
                                  size=(3, 4),
                                  param_attr=w_param_attrs,
                                  dtype='int64')
    """
    if is_sparse:
        warnings.warn("The process on sparse data is the same as that on dense data; "
                      "'is_sparse' is always set to False in paddle_encrypted.")
    if is_distributed:
        warnings.warn("Distributed deployment of paddle_encrypted has not been implemented; "
                      "'is_distributed' is always set to False in paddle_encrypted.")
    if padding_idx is not None:
        warnings.warn("'padding_idx' is not supported in paddle_encrypted; "
                      "'padding_idx' is always set to None in paddle_encrypted.")

    helper = MpcLayerHelper('embedding', **locals())
    check_variable_and_dtype(input, 'input', ['int64'],
                             'paddle_encrypted.embedding')
    check_dtype(dtype, 'dtype', ['int64'], 'paddle_encrypted.embedding')

    # create the shared embedding table parameter W
    w = helper.create_mpc_parameter(
        attr=helper.param_attr, shape=size, dtype='int64', is_bias=False)

    # the lookup itself is performed by the 'mpc_lookup_table_v2' operator;
    # the unsupported attributes are pinned to their defaults
    tmp = helper.create_mpc_variable_for_type_inference(dtype)
    helper.append_op(
        type='mpc_lookup_table_v2',
        inputs={'Ids': input,
                'W': w},
        outputs={'Out': tmp},
        attrs={
            'is_sparse': False,
            'is_distributed': False,
            'remote_prefetch': False,
            'padding_idx': None
        })
    return tmp
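
# -----------------------------------------------------------------------------
# Illustrative sketch (not part of the paddle_mpc API): on reconstructed values,
# the lookup performed by 'mpc_lookup_table_v2' is equivalent to a one-hot
# matrix multiplication with the embedding table. The plain-numpy check below
# reproduces the reconstructed result of Case 1 in the docstring above; the
# names `one_hot` and `table` are hypothetical and used only for demonstration.
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    # plaintext one-hot ids (what aby3.make_share would split into secret shares)
    one_hot = np.array([[1, 0, 0],
                        [0, 1, 0]], dtype='int64')
    # plaintext embedding table of shape (vocab_size, emb_size) = (3, 2)
    table = np.array([[1, 2],
                      [2, 3],
                      [3, 4]], dtype='int64')
    # one-hot lookup == matrix multiplication; expected output: [[1, 2], [2, 3]]
    print(one_hot.dot(table))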