data_preprocessing.py 4.2 KB
Newer Older
H
He, Kai 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
mpc data preprocessing op layers.
"""
from paddle.fluid.data_feeder import check_type, check_dtype
from ..framework import check_mpc_variable_and_dtype
from ..mpc_layer_helper import MpcLayerHelper
H
He, Kai 已提交
20
from .math import reduce_sum
H
He, Kai 已提交
21 22 23

# Public API of this module: the names exported by a wildcard import.
__all__ = ['mean_normalize']

H
He, Kai 已提交
24
def mean_normalize(f_min, f_max, f_mean, sample_num):
    '''
    Mean normalization is a method used to normalize the range of independent
    variables or features of data.
    Refer to:
    https://en.wikipedia.org/wiki/Feature_scaling#Mean_normalization

    This layer aggregates the per-party statistics into the global feature
    range and the global feature mean by appending an ``mpc_mean_normalize``
    op to the current program.

    Args:
        f_min (Variable): A 2-D tensor with shape [P, N], where P is the party
                          num and N is the feature num. Each row contains the
                          local min feature val of N features.
        f_max (Variable): A 2-D tensor with shape [P, N], where P is the party
                          num and N is the feature num. Each row contains the
                          local max feature val of N features.
        f_mean (Variable): A 2-D tensor with shape [P, N], where P is the party
                           num and N is the feature num. Each row contains the
                           local mean feature val of N features.
        sample_num (Variable): A 1-D tensor with shape [P], where P is the
                               party num. Each element contains sample num
                               of party_i.

    Returns:
        f_range (Variable): A 1-D tensor with shape [N], where N is the
                            feature num. Each element contains global
                            range of feature_i.
        f_mean_out (Variable): A 1-D tensor with shape [N], where N is the
                               feature num. Each element contains global
                               mean of feature_i.

    Raises:
        TypeError: Raised by ``check_dtype`` when the dtype of any input is
                   not int64.

    Examples:
        .. code-block:: python
            import paddle.fluid as fluid
            import paddle_fl.mpc as pfl_mpc

            pfl_mpc.init("aby3", role, "localhost", redis_server, redis_port)

            # 2 for share, 4 for 4 party, 100 for feat_num
            input_size = [2, 4, 100]

            mi = pfl_mpc.data(name='mi', shape=input_size, dtype='int64')
            ma = pfl_mpc.data(name='ma', shape=input_size, dtype='int64')
            me = pfl_mpc.data(name='me', shape=input_size, dtype='int64')
            sn = pfl_mpc.data(name='sn', shape=input_size[:-1], dtype='int64')

            out0, out1 = pfl_mpc.layers.mean_normalize(f_min=mi, f_max=ma,
                    f_mean=me, sample_num=sn)

            exe = fluid.Executor(place=fluid.CPUPlace())

            # feed encrypted data
            f_range, f_mean = exe.run(feed={'mi': f_min, 'ma': f_max,
            'me': f_mean, 'sn': sample_num}, fetch_list=[out0, out1])
    '''
    # Must stay the first statement: locals() forwards exactly the four
    # arguments of this function to the helper as keyword arguments.
    helper = MpcLayerHelper("mean_normalize", **locals())

    # Validate the dtype each input actually carries. Checking a hard-coded
    # 'int64' literal instead would make the validation a no-op.
    for input_name, input_var in (('f_min', f_min),
                                  ('f_max', f_max),
                                  ('f_mean', f_mean),
                                  ('sample_num', sample_num)):
        check_dtype(input_var.dtype, input_name, ['int64'], 'mean_normalize')

    f_range = helper.create_mpc_variable_for_type_inference(dtype=f_min.dtype)
    f_mean_out = helper.create_mpc_variable_for_type_inference(
        dtype=f_min.dtype)

    # Sum of the per-party sample counts; passed to the op as "TotalNum".
    total_num = reduce_sum(sample_num)

    helper.append_op(
        type='mpc_mean_normalize',
        inputs={
            "Min": f_min,
            "Max": f_max,
            "Mean": f_mean,
            "SampleNum": sample_num,
            "TotalNum": total_num,
        },
        outputs={
            "Range": f_range,
            "MeanOut": f_mean_out,
        },
    )

    return f_range, f_mean_out