// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #pragma once #include #include #include "paddle/fluid/platform/enforce.h" #include "prng.h" namespace aby3 { template FixedPointTensor::FixedPointTensor(TensorAdapter* share_tensor[2]) { // TODO: check tensors' shapes _share[0] = share_tensor[0]; _share[1] = share_tensor[1]; } template FixedPointTensor::FixedPointTensor(TensorAdapter* share_tensor_0, TensorAdapter* share_tensor_1) { // TODO: check tensors' shapes _share[0] = share_tensor_0; _share[1] = share_tensor_1; } template TensorAdapter* FixedPointTensor::mutable_share(size_t idx) { PADDLE_ENFORCE_LT(idx, 2, "Input should be less than 2."); return _share[idx]; } template const TensorAdapter* FixedPointTensor::share(size_t idx) const { PADDLE_ENFORCE_LT(idx, 2, "Input should be less than 2."); return _share[idx]; } // reveal fixedpointtensor to one party template void FixedPointTensor::reveal_to_one(size_t party, TensorAdapter* ret) const { if (party == this->party()) { // TODO: check if tensor shape equal auto buffer = tensor_factory()->template create(ret->shape()); aby3_ctx()->network()->template recv(pre_party(), *buffer); share(0)->add(buffer.get(), ret); share(1)->add(ret, ret); ret->scaling_factor() = N; } else if (party == next_party()) { aby3_ctx()->network()->template send(party, *share(0)); } } // reveal fixedpointtensor to all parties template void FixedPointTensor::reveal(TensorAdapter* ret) const { for (size_t i = 0; i < 3; ++i) { reveal_to_one(i, ret); } } template const std::vector FixedPointTensor::shape() const { return _share[0]->shape(); } //convert TensorAdapter to shares template void FixedPointTensor::share(const TensorAdapter* input, TensorAdapter* output_shares[3], block seed) { if (equals(seed, g_zero_block)) { seed = block_from_dev_urandom(); } //set seed of prng[2] aby3_ctx()->set_random_seed(seed, 2); aby3_ctx()->template gen_random_private(*output_shares[0]); aby3_ctx()->template gen_random_private(*output_shares[1]); auto temp = tensor_factory()->template create(input->shape()); output_shares[0]->add(output_shares[1], temp.get()); input->sub(temp.get(), output_shares[2]); for (int i = 0; i < 3; ++i) { output_shares[i]->scaling_factor() = input->scaling_factor(); } } template void FixedPointTensor::add(const FixedPointTensor* rhs, FixedPointTensor* ret) const { _share[0]->add(rhs->_share[0], ret->_share[0]); _share[1]->add(rhs->_share[1], ret->_share[1]); } template void FixedPointTensor::add(const TensorAdapter* rhs, FixedPointTensor* ret) const { PADDLE_ENFORCE_EQ(N, rhs->scaling_factor(), "no match scaling factor"); if (party() == 0) { _share[0]->add(rhs, ret->_share[0]); _share[1]->copy(ret->_share[1]); } else if (party() == 1) { _share[0]->copy(ret->_share[0]); _share[1]->copy(ret->_share[1]); } else { _share[0]->copy(ret->_share[0]); _share[1]->add(rhs, ret->_share[1]); } } template void FixedPointTensor::sub(const FixedPointTensor* rhs, FixedPointTensor* ret) const { _share[0]->sub(rhs->_share[0], ret->_share[0]); _share[1]->sub(rhs->_share[1], ret->_share[1]); } template void FixedPointTensor::sub(const TensorAdapter* rhs, FixedPointTensor* ret) const { PADDLE_ENFORCE_EQ(N, rhs->scaling_factor(), "no match scaling factor"); if (party() == 0) { _share[0]->sub(rhs, ret->_share[0]); _share[1]->copy(ret->_share[1]); } else if (party() == 1) { _share[0]->copy(ret->_share[0]); _share[1]->copy(ret->_share[1]); } else { _share[0]->copy(ret->_share[0]); _share[1]->sub(rhs, ret->_share[1]); } } template void FixedPointTensor::negative(FixedPointTensor* ret) const { _share[0]->negative(ret->_share[0]); _share[1]->negative(ret->_share[1]); } template void FixedPointTensor::mul(const FixedPointTensor* rhs, FixedPointTensor* ret) const { mul_trunc(this, rhs, ret, &TensorAdapter::mul); } #ifdef USE_ABY3_TRUNC1 //use aby3 trunc1 template void FixedPointTensor::truncate(const FixedPointTensor* op, FixedPointTensor* ret, size_t scaling_factor) { if (scaling_factor == 0) { op->share(0)->copy(ret->mutable_share(0)); op->share(1)->copy(ret->mutable_share(1)); } // implement ABY3's truncate1 algorithm if (party() == 0) { // party0 op->_share[0]->rshift(scaling_factor, ret->_share[0]); aby3_ctx()->network()->template recv(1, *(ret->_share[1])); } else if (party() == 1) { // party1 auto r_12 = tensor_factory()->template create(op->shape()); aby3_ctx()->template gen_random(*r_12.get(), true); op->_share[0]->add(op->_share[1], ret->_share[0]); // trunc from [SecureML, Thm.1] ret->_share[0]->negative(ret->_share[0]); ret->_share[0]->rshift(scaling_factor, ret->_share[0]); ret->_share[0]->negative(ret->_share[0]); ret->_share[0]->sub(r_12.get(), ret->_share[0]); aby3_ctx()->network()->template send(0, *(ret->_share[0])); r_12->copy(ret->_share[1]); } else { // party2 op->_share[1]->rshift(scaling_factor, ret->_share[1]); auto r_21 = tensor_factory()->template create(op->shape()); aby3_ctx()->template gen_random(*r_21.get(), false); r_21->copy(ret->_share[0]); } return; } #else // use truncate3 // Protocol. `truncate3` // motivation: // truncates in aby3 may cause msb error with small probability // the reason is that before rishft op, its masked value e.g., x' - r' may overflow in int64 // so that, in `truncate3`, we limit r' in (-2^62, 2^62) to avoid the problem. // notice: // when r' is contrainted in (-2^62, 2^62), // the SD (statistical distance) of x' - r' between this // and r' in Z_{2^64} is equal to |X| / (2^63 + |X|) // detail protocol: // P2 randomly generates r' \in (-2^62, 2^62), randomly generates r'_0, r_0, r_1 in Z_{2^64}, // P2 compute r'_1 = r' - r'_0, r_2 = r'/2^N - r_0 - r_1, let x2 = r_2 // P2 send r_0, r'_0 to P0, send r_1, r'_1 to P1 // P1 and P0 execute "reveal x - r' to P1" // P1 compute x1 = (x - r') / 2^N + r_1 // P0 set x0 = r_0 // P0, P1, P2 invoke reshare() with inputs x0, x1, x2 respectively. template void FixedPointTensor::truncate(const FixedPointTensor* op, FixedPointTensor* ret, size_t scaling_factor) { if (scaling_factor == 0) { op->share(0)->copy(ret->mutable_share(0)); op->share(1)->copy(ret->mutable_share(1)); return; } std::vector>> temp; if (party() == 2) { for (int i = 0; i < 7; ++i) { temp.emplace_back( tensor_factory()->template create(op->shape())); } // r', contraint in (-2^62, 2^62) aby3_ctx()->template gen_random_private(*temp[0]); int64_t contraint_upper = ~((uint64_t) 1 << 62); int64_t contraint_low = (uint64_t) 1 << 62; std::for_each(temp[0]->data(), temp[0]->data() + temp[0]->numel(), [&contraint_upper, &contraint_low] (T& a) { // contraint -2^62 < a < 2^62 if (a >= 0) { a &= contraint_upper; } else { a |= contraint_low; } }); //r'_0, r'_1 aby3_ctx()->template gen_random_private(*temp[1]); temp[0]->sub(temp[1].get(), temp[2].get()); // r, r_0, r_1 temp[0]->rshift(scaling_factor, temp[3].get()); aby3_ctx()->template gen_random_private(*temp[4]); aby3_ctx()->template gen_random_private(*temp[5]); // r_2 temp[3]->sub(temp[4].get(), temp[6].get()); temp[6]->sub(temp[5].get(), temp[6].get()); aby3_ctx()->network()->template send(1, *temp[2]); aby3_ctx()->network()->template send(1, *temp[5]); aby3_ctx()->network()->template send(0, *temp[1]); aby3_ctx()->network()->template send(0, *temp[4]); temp[6]->copy(ret->mutable_share(0)); } else if (party() == 1) { for (int i = 0; i < 4; ++i) { temp.emplace_back( tensor_factory()->template create(op->shape())); } // r'_1, r_1 aby3_ctx()->network()->template recv(2, *temp[0]); aby3_ctx()->network()->template recv(2, *temp[1]); // recv x0 - r'_0 from party 0 aby3_ctx()->network()->template recv(0, *temp[2]); //reveal x - r' to party 1 op->share(0)->add(op->share(1), temp[3].get()); temp[3]->add(temp[2].get(), temp[3].get()); temp[3]->sub(temp[0].get(), temp[3].get()); // truncate x-r' temp[3]->rshift(scaling_factor, temp[3].get()); temp[3]->add(temp[1].get(), ret->mutable_share(0)); } else { for (int i = 0; i < 3; ++i) { temp.emplace_back( tensor_factory()->template create(op->shape())); } // r'_0, r_0 aby3_ctx()->network()->template recv(2, *temp[0]); aby3_ctx()->network()->template recv(2, *temp[1]); //send x0 - r'_0 to party 1 op->share(0)->sub(temp[0].get(), temp[2].get()); aby3_ctx()->network()->template send(1, *temp[2]); temp[1]->copy(ret->mutable_share(0)); } reshare(ret->share(0), ret->mutable_share(1)); // compensation for carry in auto tensor_carry_in = tensor_factory()->template create(ret->shape()); assign_to_tensor(tensor_carry_in.get(), (T)1); tensor_carry_in->scaling_factor() = N; ret->add(tensor_carry_in.get(), ret); } #endif //USE_ABY3_TRUNC1 template template void FixedPointTensor::mul_trunc(const FixedPointTensor* lhs, const FixedPointTensor* rhs, FixedPointTensor* ret, MulFunc mul_func) { auto r_zero = tensor_factory()->template create(ret->shape()); aby3_ctx()->gen_zero_sharing_arithmetic(*r_zero.get()); // temp = _share[0]->mul(rhs->_share[0]) + // _share[0]->mul(rhs->_share[1]) + // _share[1]->mul(rhs->_share[0]) + // r_zero auto temp = tensor_factory()->template create(ret->shape()); auto temp1 = tensor_factory()->template create(ret->shape()); // use mul_func to fit both element_wise mul and mat mul (lhs->share(0)->*mul_func)(rhs->share(0), temp.get()); (lhs->share(0)->*mul_func)(rhs->share(1), temp1.get()); temp1->add(temp.get(), temp1.get()); (lhs->share(1)->*mul_func)(rhs->share(0), temp.get()); temp1->add(r_zero.get(), temp1.get()); temp->add(temp1.get(), temp.get()); auto temp2 = tensor_factory()->template create(ret->shape()); auto temp3 = tensor_factory()->template create(ret->shape()); TensorAdapter* temp_array[2] = {temp2.get(), temp3.get()}; std::shared_ptr> ret_no_trunc = std::make_shared>(temp_array); temp->copy(ret_no_trunc->_share[0]); reshare(temp.get(), ret_no_trunc->_share[1]); truncate(ret_no_trunc.get(), ret, N); } template void FixedPointTensor::mul(const TensorAdapter* rhs, FixedPointTensor* ret) const { // PADDLE_ENFORCE_EQ(N, rhs->scaling_factor(), // "no match scaling factor"); auto temp0 = tensor_factory()->template create(this->shape()); auto temp1 = tensor_factory()->template create(this->shape()); std::shared_ptr> temp = std::make_shared>(temp0.get(), temp1.get()); _share[0]->mul(rhs, temp->_share[0]); _share[1]->mul(rhs, temp->_share[1]); truncate(temp.get(), ret, rhs->scaling_factor()); } template void FixedPointTensor::sum(FixedPointTensor* ret) const { PADDLE_ENFORCE_EQ(ret->numel(), 1, "output size should be 1."); T sum1 = (T) 0; T sum2 = (T) 0; T* iter_0 = _share[0]->data(); T* iter_1 = _share[1]->data(); for (int i = 0; i < this->numel(); ++i) { sum1 += *(iter_0 + i); sum2 += *(iter_1 + i); } assign_to_tensor(ret->_share[0], sum1); assign_to_tensor(ret->_share[1], sum2); } template template class CTensor, size_t... N1> void FixedPointTensor::dot_mul(const CTensor* rhs, FixedPointTensor* ret) const { PADDLE_ENFORCE_EQ(ret->numel(), 1, "output size should be 1."); auto temp0 = tensor_factory()->template create(this->shape()); auto temp1 = tensor_factory()->template create(this->shape()); std::shared_ptr> temp = std::make_shared>(temp0.get(), temp1.get()); this->mul(rhs, temp.get()); temp->sum(ret); } template void FixedPointTensor::mat_mul(const FixedPointTensor* rhs, FixedPointTensor* ret) const { mul_trunc(this, rhs, ret, &TensorAdapter::mat_mul); } template void FixedPointTensor::mat_mul(const TensorAdapter* rhs, FixedPointTensor* ret) const { _share[0]->mat_mul(rhs, ret->_share[0]); _share[1]->mat_mul(rhs, ret->_share[1]); truncate(ret, ret, rhs->scaling_factor()); } template< typename T, size_t N> void FixedPointTensor::div(const TensorAdapter* rhs, FixedPointTensor* ret) const { PADDLE_ENFORCE_EQ(N, rhs->scaling_factor(), "no match scaling factor"); auto temp = tensor_factory()->template create(this->shape()); double scale = std::pow(2, rhs->scaling_factor()); auto inverse = [scale](T d) -> T { return 1.0 * scale / d * scale; }; std::transform(rhs->data(), rhs->data() + rhs->numel(), temp->data(), inverse); temp->scaling_factor() = rhs->scaling_factor(); this->mul(temp.get(), ret); } template void FixedPointTensor::div(const FixedPointTensor* rhs, FixedPointTensor* ret, size_t iter, double x0) const { auto temp0 = tensor_factory()->template create(ret->shape()); auto temp1 = tensor_factory()->template create(ret->shape()); std::shared_ptr> temp = std::make_shared>(temp0.get(), temp1.get()); reciprocal(rhs, temp.get(), iter, x0); this->mul(temp.get(), ret); } template void FixedPointTensor::exp(FixedPointTensor* ret, size_t iter) const { // exp approximate: exp(x) = \lim_{n->inf} (1+x/n)^n // where n = 2^ite auto pow_iter = tensor_factory()->template create(this->shape()); assign_to_tensor(pow_iter.get(), (T) (pow(2, N -iter))); pow_iter->scaling_factor() = N; auto tensor_one = tensor_factory()->template create(this->shape()); assign_to_tensor(tensor_one.get(), (T) 1 << N); tensor_one->scaling_factor() = N; this->mul(pow_iter.get(), ret); ret->add(tensor_one.get(), ret); for (int i = 0; i < iter; ++i) { ret->mul(ret, ret); } } template< typename T, size_t N> void FixedPointTensor::relu(FixedPointTensor* ret) const { //utilize polynomial_piecewise // break_point = {0}, coeff[0] = {0, 0}, coeff[1] = {0, 1} // break_point.shape = {1, this->shape}, coeff.shape = {2, 2, this->shape} auto shape_ = shape(); //construct break_point auto b_shape = shape_; b_shape.insert(b_shape.begin(), 1); auto break_point = tensor_factory()->template create(b_shape); T* b_ptr = break_point->data(); for (size_t i = 0; i < break_point->numel(); ++i) { b_ptr[i] = 0; } break_point->scaling_factor() = N; //contruct coeff std::vector c_shape = {2, 2}; c_shape.insert(c_shape.end(), shape_.begin(), shape_.end()); auto coeff = tensor_factory()->template create(c_shape); T* c_ptr = coeff->data(); for (size_t i = 0; i < 3 * this->numel(); ++i) { c_ptr[i] = 0; } for (size_t i = 3 * this->numel(); i < 4 * this->numel(); ++i) { c_ptr[i] = (T) 1 << N; } coeff->scaling_factor() = N; this->polynomial_piecewise(coeff.get(), break_point.get(), ret); } template< typename T, size_t N> void FixedPointTensor::relu_with_derivative( FixedPointTensor* ret, BooleanTensor* derivative) const { auto shape_ = shape(); auto zero = tensor_factory()->template create(shape_); assign_to_tensor(zero.get(), (T)0); zero->scaling_factor() = N; auto tmp0 = tensor_factory()->template create(shape_); auto tmp1 = tensor_factory()->template create(shape_); BooleanTensor der(tmp0.get(), tmp1.get()); gt(zero.get(), &der); der.mul(this, ret); if (derivative) { der.share(0)->copy(derivative->share(0)); der.share(1)->copy(derivative->share(1)); } } template< typename T, size_t N> void FixedPointTensor::sigmoid_chebyshev(FixedPointTensor* ret) const { //utilize Chebyshev polynomial approximation // more accurate in small range, such as [-4, 4] auto shape = ret->shape(); std::vector shape_ = shape; shape_.insert(shape_.begin(), 10); auto numel = ret->numel(); auto coeff = tensor_factory()->template create(shape_); std::vector w; w.resize(10, 0.0f); w[0] = 0.5; w[1] = 0.2159198015; w[3] = -0.0082176259; w[5] = 0.0001825597; w[7] = -0.0000018848; w[9] = 0.0000000072; for (int i = 0; i < 10; ++i) { for (int j = 0; j < numel; ++j) { *(coeff->data() + i * numel + j) = (T) (w[i] * pow(2, N)); } } coeff->scaling_factor() = N; polynomial(coeff.get(), ret); } template< typename T, size_t N> void FixedPointTensor::sigmoid(FixedPointTensor* ret) const { //utilize polynomial_piecewise // break_point = {-2.5, 2.5} // coeff[0] = {10^-4, 0}, coeff[1] = {0.5, 0.17} // coeff[2] = {1 - 10^-4, 0} // break_point.shape = {2, this->shape}, coeff.shape = {3, 2, this->shape} //construct break_point auto shape_ = shape(); //construct break_point auto b_shape = shape_; b_shape.insert(b_shape.begin(), 2); auto break_point = tensor_factory()->template create(b_shape); T* b_ptr = break_point->data(); for (size_t i = 0; i < break_point->numel(); ++i) { b_ptr[i] = 0; } for (size_t i = 0; i < break_point->numel() / 2; ++i) { b_ptr[i] = (T) (-2.5 * pow(2, N)); } for (size_t i = break_point->numel() / 2; i < break_point->numel(); ++i) { b_ptr[i] = (T) (2.5 * pow(2, N)); } break_point->scaling_factor() = N; //contruct coeff std::vector c_shape = {3, 2}; c_shape.insert(c_shape.end(), shape_.begin(), shape_.end()); auto coeff = tensor_factory()->template create(c_shape); T* c_ptr = coeff->data(); size_t numel = this->numel(); double scale = std::pow(2, N); for (size_t i = 0; i < numel; ++i) { c_ptr[i] = 0.0001 * scale; c_ptr[i + numel] = 0; c_ptr[i + 2 * numel] = 0.5 * scale; c_ptr[i + 3 * numel] = 0.17 * scale; c_ptr[i + 4 * numel] = (1 - 0.0001) * scale; c_ptr[i + 5 * numel] = 0; } coeff->scaling_factor() = N; this->polynomial_piecewise(coeff.get(), break_point.get(), ret); } template< typename T, size_t N> void FixedPointTensor::sigmoid_enhanced(FixedPointTensor* ret) const { //utilize polynomial_piecewise // break_point = {-5, -2.5, 2.5, 5} // coeff[0] = {10^-4, 0}, coeff[1] = {0.145, 0.02776} // coeff[2] = {0.5, 0.17}, coeff[3] = {0.85498, 0.02776}, coeff[4] = {0.9999, 0} // break_point.shape = {4, this->shape}, coeff.shape = {5, 2, this->shape} //construct break_point auto shape_ = shape(); //construct break_point auto b_shape = shape_; b_shape.insert(b_shape.begin(), 4); auto break_point = tensor_factory()->template create(b_shape); T* b_ptr = break_point->data(); auto numel = ret->numel(); double scale = std::pow(2, N); for (size_t i = 0; i < numel; ++i) { b_ptr[i] = (T) (-5 * scale); b_ptr[i + numel] = (T) (-2.5 * scale); b_ptr[i + 2 * numel] = (T) (2.5 * scale); b_ptr[i + 3 * numel] = (T) (5 * scale); } break_point->scaling_factor() = N; //contruct coeff std::vector c_shape = {5, 2}; c_shape.insert(c_shape.end(), shape_.begin(), shape_.end()); auto coeff = tensor_factory()->template create(c_shape); T* c_ptr = coeff->data(); for (size_t i = 0; i < numel; ++i) { c_ptr[i] = 0.0001 * scale; c_ptr[i + numel] = 0; c_ptr[i + 2 * numel] = 0.145 * scale; c_ptr[i + 3 * numel] = 0.02776 * scale; c_ptr[i + 4 * numel] = 0.5 * scale; c_ptr[i + 5 * numel] = 0.17 * scale; c_ptr[i + 6 * numel] = 0.85498 * scale; c_ptr[i + 7 * numel] = 0.02776 * scale; c_ptr[i + 8 * numel] = 0.9999 * scale; c_ptr[i + 9 * numel] = 0 * scale; } coeff->scaling_factor() = N; this->polynomial_piecewise(coeff.get(), break_point.get(), ret); } template< typename T, size_t N> void FixedPointTensor::softmax(FixedPointTensor* ret, bool use_relu, bool use_long_div) const { // softmax axis = -1 const size_t col = *(shape().end() - 1); const size_t row = numel() / col; std::vector>> temp; // 11 for allocating temp tensor for (size_t i = 0; i < 11; ++i) { temp.emplace_back( tensor_factory()->template create()); } temp[0]->reshape({row, col}); temp[1]->reshape({row, col}); FixedPointTensor x(temp[0].get(), temp[1].get()); if (!use_relu) { temp[2]->reshape({col, row}); temp[3]->reshape({col, row}); temp[4]->reshape({1, row}); temp[5]->reshape({1, row}); } FixedPointTensor x_t(temp[2].get(), temp[3].get()); FixedPointTensor max_x_t(temp[4].get(), temp[5].get()); temp[6]->reshape({row, 1}); temp[7]->reshape({row, 1}); FixedPointTensor max_x(temp[6].get(), temp[7].get()); temp[8]->reshape({row, col}); temp[9]->reshape({row, col}); FixedPointTensor max_x_broadcast(temp[8].get(), temp[9].get()); temp[10]->reshape({row, col}); auto exp_lower_bound = temp[10].get(); auto transpose = [](const TensorAdapter* in, TensorAdapter* out) { // suppose input dims = 2 const size_t col = in->shape()[1]; const size_t row = in->shape()[0]; const size_t numel = in->numel(); for (size_t k = 0; k < numel; ++k) { size_t i = k / row; size_t j = k % row; out->data()[k] = in->data()[j * col + i]; } }; auto broadcast = [](const TensorAdapter* in, TensorAdapter* out) { // suppose input dims = 2 // in shape = [row, 1] const size_t col = out->shape()[1]; const size_t row = out->shape()[0]; for (size_t k = 0; k < out->numel(); ++k) { size_t i = k / col; out->data()[k] = in->data()[i]; } }; share(0)->copy(x.mutable_share(0)); share(1)->copy(x.mutable_share(1)); if (use_relu) { x.relu(&x); } else { // use exp transpose(x.share(0), x_t.mutable_share(0)); transpose(x.share(1), x_t.mutable_share(1)); // x = max(input - max(input), exp_lower_bound) x_t.max_pooling(&max_x_t); transpose(max_x_t.share(0), max_x.mutable_share(0)); transpose(max_x_t.share(1), max_x.mutable_share(1)); broadcast(max_x.share(0), max_x_broadcast.mutable_share(0)); broadcast(max_x.share(1), max_x_broadcast.mutable_share(1)); x.sub(&max_x_broadcast, &x); // n = 64, see exp assign_to_tensor(exp_lower_bound, (T)(-64 * (1 << N))); exp_lower_bound->scaling_factor() = N; x.sub(exp_lower_bound, &x); x.relu(&x); x.add(exp_lower_bound, &x); x.exp(&x); } // reuse max_x as sum reduce(&x, &max_x); if (!use_long_div) { // invert sum by Newton's method // divisor range = [1/col, 1.0] // TODO: find better iter num & init val reciprocal(&max_x, &max_x, 16, 0.5 / col); } broadcast(max_x.share(0), max_x_broadcast.mutable_share(0)); broadcast(max_x.share(1), max_x_broadcast.mutable_share(1)); if (use_long_div) { x.long_div(&max_x_broadcast, &x, 1); } else { x.mul(&max_x_broadcast, &x); } x.share(0)->copy(ret->mutable_share(0)); x.share(1)->copy(ret->mutable_share(1)); } template void FixedPointTensor::long_div(const FixedPointTensor* rhs, FixedPointTensor* ret, size_t int_len) const { std::vector>> temp; for (int i = 0; i < 16; ++i) { temp.emplace_back( tensor_factory()->template create(ret->shape())); } BooleanTensor sign_lhs(temp[0].get(), temp[1].get()); BooleanTensor sign_rhs(temp[2].get(), temp[3].get()); BooleanTensor sign_ret(temp[4].get(), temp[5].get()); FixedPointTensor abs_lhs(temp[6].get(), temp[7].get()); FixedPointTensor abs_rhs(temp[8].get(), temp[9].get()); FixedPointTensor sub_rhs(temp[10].get(), temp[11].get()); BooleanTensor cmp_res(temp[12].get(), temp[13].get()); BooleanTensor cmp_res_all(temp[14].get(), temp[15].get()); assign_to_tensor(cmp_res_all.share(0), (T)0); assign_to_tensor(cmp_res_all.share(1), (T)0); const size_t msb = sizeof(T) * 8 - 1; sign_lhs.bit_extract(msb, this); sign_rhs.bit_extract(msb, rhs); sign_lhs.bitwise_xor(&sign_rhs, &sign_ret); auto lshift = [] (const FixedPointTensor* in, size_t rhs, FixedPointTensor* out) { in->share(0)->lshift(rhs, out->mutable_share(0)); in->share(1)->lshift(rhs, out->mutable_share(1)); }; // abs = val - 2 * sign * val auto abs = [lshift] (const FixedPointTensor* in, const BooleanTensor* sign, FixedPointTensor* out) { lshift(in, 1, out); sign->mul(out, out); in->sub(out, out); }; auto out0 = tensor_factory()->template create(ret->shape()); abs(this, &sign_lhs, &abs_lhs); abs(rhs, &sign_rhs, &abs_rhs); for (ssize_t i = int_len - 1; i >= 0; --i) { lshift(&abs_rhs, i, &sub_rhs); abs_lhs.gt(&sub_rhs, &cmp_res); cmp_res.mul(&sub_rhs, &sub_rhs); cmp_res.lshift(N + i, &cmp_res); abs_lhs.sub(&sub_rhs, &abs_lhs); cmp_res.bitwise_xor(&cmp_res_all, &cmp_res_all); } for (size_t i = 1; i <= N; ++i) { truncate(&abs_rhs, &sub_rhs, i); abs_lhs.gt(&sub_rhs, &cmp_res); cmp_res.mul(&sub_rhs, &sub_rhs); cmp_res.lshift(N - i, &cmp_res); abs_lhs.sub(&sub_rhs, &abs_lhs); cmp_res.bitwise_xor(&cmp_res_all, &cmp_res_all); } // use abs_lhs as buffer cmp_res_all.b2a(&abs_lhs); abs(&abs_lhs, &sign_ret, ret); } // reduce last dim template void FixedPointTensor::reduce(FixedPointTensor* input, FixedPointTensor* ret) { //enfoce shape: input->shape[0 ... (n-2)] == ret shape auto& shape = input->shape(); size_t ite_size = shape[shape.size() - 1]; T* ret_begin_ptr_0 = ret->_share[0]->data(); T* ret_begin_ptr_1 = ret->_share[1]->data(); T* input_begin_ptr_0 = input->_share[0]->data(); T* input_begin_ptr_1 = input->_share[1]->data(); for (int j = 0; j < ret->numel(); ++j) { *(ret_begin_ptr_0 + j) = *(input_begin_ptr_0 + j * ite_size); *(ret_begin_ptr_1 + j) = *(input_begin_ptr_1 + j * ite_size); for (int i = 1; i < ite_size; ++i) { *(ret_begin_ptr_0 + j) += *(input_begin_ptr_0 + j * ite_size + i); *(ret_begin_ptr_1 + j) += *(input_begin_ptr_1 + j * ite_size + i); } } } template< typename T, size_t N> void FixedPointTensor::polynomial(const TensorAdapter* coeff, FixedPointTensor* ret) const { // e.g., x.shape = {2, 3}, coeff.shape = {n, 2, 3} (n: polynomial power) //TODO: improve performance: [ABY3] std::vector>> temp; for (int i = 0; i < 7; ++i) { temp.emplace_back( tensor_factory()->template create(this->shape())); } std::shared_ptr> x_pow_i = std::make_shared>( temp[0].get(), temp[1].get()); std::shared_ptr> temp_fixed = std::make_shared>( temp[2].get(), temp[3].get()); std::shared_ptr> result = std::make_shared>( temp[5].get(), temp[6].get()); assign_to_tensor(result->_share[0], (T) 0); assign_to_tensor(result->_share[1], (T) 0); //x_pow_i.get() = 1; assign_to_tensor(x_pow_i.get()->_share[0], (T) 0); assign_to_tensor(x_pow_i.get()->_share[1], (T) 0); assign_to_tensor(temp[4].get(), (T) 1 << N); temp[4]->scaling_factor() = N; x_pow_i->add(temp[4].get(), x_pow_i.get()); for (int i = 0; i < coeff->shape()[0]; ++i) { auto t = tensor_factory()->template create(); coeff->slice(i, i + 1, t.get()); auto t_shape = t->shape(); // remove leading 1 t_shape.erase(t_shape.begin()); t->reshape(t_shape); x_pow_i->mul(t.get(), temp_fixed.get()); result->add(temp_fixed.get(), result.get()); x_pow_i->mul(this, x_pow_i.get()); } result->share(0)->copy(ret->mutable_share(0)); result->share(1)->copy(ret->mutable_share(1)); } template< typename T, size_t N> void FixedPointTensor::polynomial_piecewise( const TensorAdapter* coeff, const TensorAdapter* break_point, FixedPointTensor* ret) const { // e.g., x.shape = {2, 3}, // break_point.shape = {k, 2, 3} (k: num of break point) // coeff.shape = {k + 1, n, 2, 3} (n: poly power) // copy ret auto ret_cpy_s0 = tensor_factory()->create_int64_t(ret->share(0)->shape()); ret->share(0)->copy(ret_cpy_s0.get()); auto ret_cpy_s1 = tensor_factory()->create_int64_t(ret->share(1)->shape()); ret->share(1)->copy(ret_cpy_s1.get()); std::shared_ptr> ret_cpy{new FixedPointTensor(ret_cpy_s0.get(), ret_cpy_s1.get())}; std::vector>> msb; int len_break_point = break_point->shape()[0]; int len_coeff = coeff->shape()[0]; //number of temp tensor used int temp_total = 4 * len_break_point + 2 + 2 * (len_break_point - 1) + 2 + 4 * len_coeff; std::vector>> temp; for (int i = 0; i < temp_total; ++i) { temp.emplace_back(tensor_factory()-> template create(this->shape())); } int temp_index = 0; // std::vector>> paddle_t_break; std::vector>> temp1; for (int i = 0; i < break_point->shape()[0]; ++i) { // msb[i] = msb(x - break_point[i]) auto t_break = tensor_factory()->template create(); break_point->slice(i, i + 1, t_break.get()); auto t_shape = t_break->shape(); // remove leading 1 t_shape.erase(t_shape.begin()); t_break->reshape(t_shape); temp1.emplace_back( std::make_shared>( temp[temp_index++].get(), temp[temp_index++].get())); this->sub(t_break.get(), temp1[i].get()); msb.emplace_back(std::make_shared>( temp[temp_index++].get(), temp[temp_index++].get())); msb[i]->bit_extract(sizeof(T) * 8 - 1, temp1[i].get()); } // b[0] = msb[0], b[i + 1] = ~ msb[i] & msb[i + 1] std::vector>> b; b.emplace_back(std::make_shared>( temp[temp_index++].get(), temp[temp_index++].get())); b[0] = msb[0]; for (int i = 0; i < len_break_point - 1; ++i) { b.emplace_back(std::make_shared>( temp[temp_index++].get(), temp[temp_index++].get())); msb[i]->bitwise_not(b[i + 1].get()); b[i + 1]->bitwise_and(msb[i + 1].get(), b[i + 1].get()); } b.emplace_back(std::make_shared>( temp[temp_index++].get(), temp[temp_index++].get())); msb[len_break_point - 1]->bitwise_not(b[len_break_point].get()); // ret += b[i].mul(polynomial(coeff[i])) std::vector>> temp_fixed; std::vector>> temp_fixed1; assign_to_tensor(ret_cpy->_share[0], (T) 0); assign_to_tensor(ret_cpy->_share[1], (T) 0); for (int i = 0; i < len_coeff; ++i) { temp_fixed.emplace_back( std::make_shared>( temp[temp_index++].get(), temp[temp_index++].get())); temp_fixed1.emplace_back( std::make_shared>( temp[temp_index++].get(), temp[temp_index++].get())); auto t = tensor_factory()->template create(); coeff->slice(i, i + 1, t.get()); auto t_shape = t->shape(); // remove leading 1 t_shape.erase(t_shape.begin()); t->reshape(t_shape);; this->polynomial(t.get(), temp_fixed[i].get()); b[i]->bit_extract(0, b[i].get()); b[i]->mul(temp_fixed[i].get(), temp_fixed1[i].get()); ret_cpy->add(temp_fixed1[i].get(), ret_cpy.get()); } ret_cpy->share(0)->copy(ret->mutable_share(0)); ret_cpy->share(1)->copy(ret->mutable_share(1)); } template template class CTensor, size_t... N1> void FixedPointTensor::lt(const CTensor* rhs, BooleanTensor* ret) const { std::vector>> temp; for (int i = 0; i < 2; ++i) { temp.emplace_back( tensor_factory()->template create(this->shape())); } std::shared_ptr> sub_result = std::make_shared>( temp[0].get(), temp[1].get()); this->sub(rhs, sub_result.get()); ret->bit_extract(sizeof(T) * 8 - 1, sub_result.get()); } template template class CTensor, size_t... N1> void FixedPointTensor::leq(const CTensor* rhs, BooleanTensor* ret) const { this->gt(rhs, ret); auto tensor_one = tensor_factory()-> template create(this->shape()); assign_to_tensor(tensor_one.get(), (T) 1); ret->bitwise_xor(tensor_one.get(), ret); } template template class CTensor, size_t... N1> void FixedPointTensor::gt(const CTensor* rhs, BooleanTensor* ret) const { std::vector>> temp; for (int i = 0; i < 2; ++i) { temp.emplace_back( tensor_factory()->template create(this->shape())); } std::shared_ptr> sub_result = std::make_shared>( temp[0].get(), temp[1].get()); this->sub(rhs, sub_result.get()); sub_result->negative(sub_result.get()); ret->template bit_extract(sizeof(T) * 8 - 1, sub_result.get()); } template template class CTensor, size_t... N1> void FixedPointTensor::geq(const CTensor* rhs, BooleanTensor* ret) const { this->lt(rhs, ret); auto tensor_one = tensor_factory()-> template create(this->shape()); assign_to_tensor(tensor_one.get(), (T) 1); ret->bitwise_xor(tensor_one.get(), ret); } template template class CTensor, size_t... N1> void FixedPointTensor::eq(const CTensor* rhs, BooleanTensor* ret) const { this->neq(rhs, ret); auto tensor_one = tensor_factory()->template create(this->shape()); assign_to_tensor(tensor_one.get(), (T) 1); ret->bitwise_xor(tensor_one.get(), ret); } template template class CTensor, size_t... N1> void FixedPointTensor::neq(const CTensor* rhs, BooleanTensor* ret) const { std::vector>> temp; for (int i = 0; i < 4; i ++) { temp.emplace_back(tensor_factory()-> template create(this->shape())); } std::shared_ptr> lt = std::make_shared>( temp[0].get(), temp[1].get()); std::shared_ptr> gt = std::make_shared>( temp[2].get(), temp[3].get()); this->lt(rhs, lt.get()); this->gt(rhs, gt.get()); lt->bitwise_or(gt.get(), ret); } template void FixedPointTensor::reciprocal(const FixedPointTensor* op, FixedPointTensor* ret, size_t iter, double x0) { auto temp0 = tensor_factory()->template create(ret->shape()); auto temp1 = tensor_factory()->template create(ret->shape()); auto temp2 = tensor_factory()->template create(ret->shape()); auto temp3 = tensor_factory()->template create(ret->shape()); std::shared_ptr> result = std::make_shared>(temp0.get(), temp1.get()); std::shared_ptr> x_copy = std::make_shared>(temp2.get(), temp3.get()); assign_to_tensor(result->mutable_share(0), (T) 0); assign_to_tensor(result->mutable_share(1), (T) 0); auto tensor_x0 = tensor_factory()->template create(op->shape()); assign_to_tensor(tensor_x0.get(), (T)(x0 * pow(2, N))); tensor_x0->scaling_factor() = N; result->add(tensor_x0.get(), result.get()); auto tensor_2 = tensor_factory()->template create(op->shape()); tensor_2->scaling_factor() = N; assign_to_tensor(tensor_2.get(), (T)(2 << N)); for (int i = 0; i < iter; ++i) { result->share(0)->copy(x_copy->mutable_share(0)); result->share(1)->copy(x_copy->mutable_share(1)); auto res_ptr = result.get(); op->mul(res_ptr, res_ptr); result->negative(res_ptr); result->add(tensor_2.get(), res_ptr); x_copy->mul(res_ptr, res_ptr); } result->share(0)->copy(ret->mutable_share(0)); result->share(1)->copy(ret->mutable_share(1)); } template void FixedPointTensor::inverse_square_root(FixedPointTensor* ret, size_t iter, double x0) const { inverse_square_root(this, ret, iter, x0); } // Newton's method, var naming from Quake III Arena: Q_rsqrt // float threehalfs = 1.5F; // x2 = number * 0.5F; // y = x0; // since 0x5f3759df does not fit fixed-point arithmetic // y = y * ( threehalfs - ( x2 * y * y ) ); // iteration of Newton's method template void FixedPointTensor::inverse_square_root(const FixedPointTensor* op, FixedPointTensor* ret, size_t iter, double x0) { std::vector>> temp; for (int i = 0; i < 7; ++i) { temp.emplace_back( tensor_factory()->template create(op->shape())); } std::shared_ptr> y = std::make_shared>(temp[0].get(), temp[1].get()); std::shared_ptr> x2 = std::make_shared>(temp[2].get(), temp[3].get()); // x2 = 0.5 * op truncate(op, x2.get(), 1); assign_to_tensor(y->mutable_share(0), (T)(x0 * pow(2, N))); assign_to_tensor(y->mutable_share(1), (T)(x0 * pow(2, N))); // threehalfs temp[4]->scaling_factor() = N; assign_to_tensor(temp[4].get(), T(1.5 * pow(2, N))); std::shared_ptr> y_copy = std::make_shared>(temp[5].get(), temp[6].get()); for (int i = 0; i < iter; ++i) { y->share(0)->copy(y_copy->mutable_share(0)); y->share(1)->copy(y_copy->mutable_share(1)); y->mul(y.get(), y.get()); y->mul(x2.get(), y.get()); y->negative(y.get()); y->add(temp[4].get(), y.get()); y_copy->mul(y.get(), y.get()); } y->share(0)->copy(ret->mutable_share(0)); y->share(1)->copy(ret->mutable_share(1)); } template template class CTensor, size_t... N1> void FixedPointTensor::max(const CTensor* rhs, FixedPointTensor* ret, BooleanTensor* cmp) const { // max = lhs + (rhs - lhs) if rhs > lhs else lhs std::vector>> temp; bool output_cmp = cmp != nullptr; // if cmp is not null, store cmp results in cmp // else, store them in tmp tensors for (int i = 0; i < 2 + 2 * (!output_cmp); ++i) { temp.emplace_back( tensor_factory()->template create(this->shape())); } FixedPointTensor delta(temp[0].get(), temp[1].get()); sub(rhs, &delta); BooleanTensor sign; if (output_cmp) { sign = *cmp; } else { sign = BooleanTensor(temp[2].get(), temp[3].get()); } sign.template bit_extract(sizeof(T) * 8 - 1, &delta); delta.negative(&delta); sign.mul(&delta, &delta); add(&delta, ret); } template void FixedPointTensor::max_pooling(FixedPointTensor* ret, BooleanTensor* pos) const { size_t k = shape()[0]; std::vector>> tmp; for (int i = 0; i < 4; ++i) { tmp.emplace_back( tensor_factory()->template create()); } FixedPointTensor now(tmp[0].get(), tmp[1].get()); BooleanTensor cmp(tmp[2].get(), tmp[3].get()); auto cmp_ptr = pos ? &cmp : nullptr; share(0)->slice(0, 1, tmp[0].get()); share(1)->slice(0, 1, tmp[1].get()); tmp[0]->copy(ret->mutable_share(0)); tmp[1]->copy(ret->mutable_share(1)); if (pos) { pos->share(0)->slice(0, 1, tmp[2].get()); pos->share(1)->slice(0, 1, tmp[3].get()); // set init 1, slice_0 is larger than null if (party() == 0 || party() == 2) { size_t idx = 2 + (party() == 2); assign_to_tensor(tmp[idx].get(), T(1)); assign_to_tensor(tmp[5 - idx].get(), T(0)); } else { assign_to_tensor(tmp[2].get(), T(0)); assign_to_tensor(tmp[3].get(), T(0)); } } for (size_t i = 1; i < k; ++i) { share(0)->slice(i, i + 1, tmp[0].get()); share(1)->slice(i, i + 1, tmp[1].get()); if (pos) { pos->share(0)->slice(i, i + 1, tmp[2].get()); pos->share(1)->slice(i, i + 1, tmp[3].get()); } ret->max(&now, ret, cmp_ptr); } if (pos) { pos->onehot_from_cmp(); } } } // namespace aby3