Commit 57c82ab5 authored by He, Kai

add mean normalize demo

Parent e8240167
## Instructions for PaddleFL-MPC Mean Normalize Demo
This document introduces how to run the Mean Normalize demo based on Paddle-MPC.
The demo runs on a single machine.
### Running on Single Machine
#### (1). Prepare Data
Create an empty directory for data and, if needed, modify `data_path` in
`process_data.py`; the default path is `./data`.
Then run `python prepare.py` to generate random data for the demo.
Alternatively, generate your own data, move it to `data_path`, and modify the
corresponding meta info in `prepare.py`.
Encrypted data files of feature statistics will be generated and saved in the
`data_path` directory. Different suffixes are used in the file names to
indicate the owning data source and the computation party.
For instance, a file named `feature_max.1.part2` contains the max feature
values from data owner 1 and needs to be fed to computing party 2.
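For illustration, a data owner could also encrypt custom feature statistics directly with `generate_encrypted_data` from `process_data.py`; this is a minimal sketch, where the matrix `my_feats` and owner id `0` are made up (note that `prepare.py` additionally saves the encrypted per-owner sample counts as `sample_num`):
```python
import numpy as np
import process_data

# hypothetical feature matrix for data owner 0: 10 samples, 100 features
# (100 matches the default `feat_width` in prepare.py)
my_feats = np.random.rand(10, 100)

# writes feature_max.0.part{0,1,2}, feature_min.0.part{0,1,2} and
# feature_mean.0.part{0,1,2} into `data_path`
process_data.generate_encrypted_data(0, my_feats)
```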
#### (2). Launch Demo with a Shell Script
You should set the environment variables as follows:
```
export PYTHON=/your/python
export PATH_TO_REDIS_BIN=/path/to/redis_bin
export LOCALHOST=/your/localhost
export REDIS_PORT=/your/redis/port
```
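For example, assuming Redis binaries were built under `~/redis/src` and the server listens locally on port 6379 (these values are only illustrative):
```
export PYTHON=/usr/bin/python3
export PATH_TO_REDIS_BIN=~/redis/src
export LOCALHOST=127.0.0.1
export REDIS_PORT=6379
```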
Launch the demo with the `run_standalone.sh` script. The concrete command is:
```bash
bash ../run_standalone.sh mean_normalize_demo.py
```
The ciphertext results of the global feature range and feature mean will be
saved in the `data_path` directory, named `result.part{i}`.
#### (3). Decrypt Data
Finally, use `decrypt_data()` in the `process_data.py` script to decrypt the
result, which each data owner can then use to rescale its local feature data.
```python
import prepare
import process_data
# 0 for f_range, 1 for f_mean
# use decrypted global f_range and f_mean to rescale local feature data
res = process_data.decrypt_data(prepare.data_path + 'result', (2, prepare.feat_width, ))
```
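For example, each data owner could then rescale its local feature matrix with the decrypted global statistics. This is only a sketch: the file name `feature_data.0.npy` follows the naming used by `prepare.py` for the randomly generated data, and `(x - f_mean) / f_range` is the standard mean normalization formula.
```python
import numpy as np

f_range, f_mean = res[0], res[1]
# local feature matrix saved by prepare.py for data owner 0
local_feats = np.load(prepare.data_path + 'feature_data.0.npy')
# mean normalization using the decrypted global statistics
normalized = (local_feats - f_mean) / f_range
```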
Also, `verify.py` can be used to calculate the error between a direct
plaintext numpy calculation and the MPC mean normalize result.
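After the demo finishes, the verification script can be run directly with the Python interpreter configured above, for example:
```bash
$PYTHON verify.py
```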
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Mean normalize demo.
"""
import sys
import numpy as np
import paddle.fluid as fluid
import paddle_fl.mpc as pfl_mpc
import paddle_fl.mpc.data_utils.aby3 as aby3
import prepare
import process_data
role, server, port = sys.argv[1], sys.argv[2], sys.argv[3]
role, port = int(role), int(port)
share_num = aby3.ABY3_SHARE_DIM
party_num = len(prepare.sample_nums)
feat_num = prepare.feat_width
data_path = prepare.data_path
def get_shares(path):
    '''
    collect encrypted feature stats from all data owners
    '''
    data = []
    for i in range(party_num):
        reader = aby3.load_aby3_shares(path + '.' + str(i),
                                       id=role, shape=(feat_num,))
        data.append([x for x in reader()])

    data = np.array(data).reshape([party_num, share_num, feat_num])
    return np.transpose(data, axes=[1, 0, 2])
def get_sample_num(path):
    '''
    get encrypted sample nums
    '''
    reader = aby3.load_aby3_shares(path,
                                   id=role, shape=(party_num,))
    for n in reader():
        return n
# load the encrypted feature statistics shared by every data owner
f_max = get_shares(data_path + 'feature_max')
f_min = get_shares(data_path + 'feature_min')
f_mean = get_shares(data_path + 'feature_mean')
sample_num = get_sample_num(data_path + 'sample_num')

# init the ABY3 protocol for this computation party
pfl_mpc.init("aby3", int(role), "localhost", server, int(port))

shape = [party_num, feat_num]

mi = pfl_mpc.data(name='mi', shape=shape, dtype='int64')
ma = pfl_mpc.data(name='ma', shape=shape, dtype='int64')
me = pfl_mpc.data(name='me', shape=shape, dtype='int64')
sn = pfl_mpc.data(name='sn', shape=shape[:-1], dtype='int64')

out0, out1 = pfl_mpc.layers.mean_normalize(f_min=mi, f_max=ma,
                                           f_mean=me, sample_num=sn)

exe = fluid.Executor(place=fluid.CPUPlace())

f_range, f_mean = exe.run(feed={'mi': f_min, 'ma': f_max, 'me': f_mean,
                                'sn': sample_num}, fetch_list=[out0, out1])

# save this party's share of the ciphertext result (f_range and f_mean)
result = np.transpose(np.array([f_range, f_mean]), axes=[1, 0, 2])

result_file = data_path + "result.part{}".format(role)
with open(result_file, 'wb') as f:
    f.write(result.tostring())
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Prepare data for mean normalize demo.
"""
import numpy as np
import process_data
from paddle_fl.mpc.data_utils import aby3
data_path = process_data.data_path
feat_width = 100
# assume data owner i has sample_nums[i] samples
sample_nums = [1, 2, 3, 4]
def gen_random_data():
    for i, num in enumerate(sample_nums):
        suffix = '.' + str(i)

        f_mat = np.random.rand(num, feat_width)
        np.save(data_path + 'feature_data' + suffix, f_mat)

        process_data.generate_encrypted_data(i, f_mat)

    aby3.save_aby3_shares(process_data.encrypted_data(np.array(sample_nums)),
                          data_path + 'sample_num')
if __name__ == "__main__":
gen_random_data()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Process data for mean normalize demo.
"""
import numpy as np
import six
import os
import paddle
from paddle_fl.mpc.data_utils import aby3
data_path = './data/'
def encrypted_data(data):
    """
    feature stat reader
    """
    def func():
        yield aby3.make_shares(data)

    return func
def generate_encrypted_data(party_id, f_mat):
    """
    generate encrypted data from feature matrix (np.array)
    """
    f_max = np.amax(f_mat, axis=0)
    f_min = np.amin(f_mat, axis=0)
    f_mean = np.mean(f_mat, axis=0)

    suffix = '.' + str(party_id)

    aby3.save_aby3_shares(encrypted_data(f_max),
                          data_path + "feature_max" + suffix)
    aby3.save_aby3_shares(encrypted_data(f_min),
                          data_path + "feature_min" + suffix)
    aby3.save_aby3_shares(encrypted_data(f_mean),
                          data_path + "feature_mean" + suffix)
def decrypt_data(filepath, shape):
    """
    load the encrypted data and reconstruct
    """
    part_readers = []
    for id in six.moves.range(3):
        part_readers.append(
            aby3.load_aby3_shares(filepath, id=id, shape=shape))
    aby3_share_reader = paddle.reader.compose(part_readers[0], part_readers[1],
                                              part_readers[2])

    for instance in aby3_share_reader():
        p = aby3.reconstruct(np.array(instance))
        return p
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Verification for mean normalize demo.
"""
import prepare
import process_data
import numpy as np
import paddle_fl.mpc.data_utils.aby3 as aby3
# 0 for f_range, 1 for f_mean
# use decrypted global f_range and f_mean to rescale local feature data
res = process_data.decrypt_data(prepare.data_path + 'result', (2, prepare.feat_width, ))
# reconstruct plaintext global data to verify
row, col = sum(prepare.sample_nums), prepare.feat_width
plain_mat = np.empty((row, col))
row = 0
for i, num in enumerate(prepare.sample_nums):
    m = np.load(prepare.data_path + 'feature_data.' + str(i) + '.npy')
    plain_mat[row:row + num] = m
    row += num
def mean_normalize(f_mat):
    '''
    get plain text f_range & f_mean
    '''
    ma = np.amax(f_mat, axis=0)
    mi = np.amin(f_mat, axis=0)

    return ma - mi, np.mean(f_mat, axis=0)
plain_range, plain_mean = mean_normalize(plain_mat)
print("max error in featrue range:", np.max(np.abs(res[0] - plain_range)))
print("max error in featrue mean:", np.max(np.abs(res[1] - plain_mean)))