diff --git a/paddle/cuda/include/hl_tensor_ops.h b/paddle/cuda/include/hl_tensor_ops.h index 93d38b7d2299d994cde0934213668a525bffa80c..b2bf334dab9799153fe1d4fe2c74cce9d57168b9 100644 --- a/paddle/cuda/include/hl_tensor_ops.h +++ b/paddle/cuda/include/hl_tensor_ops.h @@ -461,7 +461,7 @@ class add { public: INLINE float32x4_t operator()(const float32x4_t a, const float32x4_t b) const { - return vmulq_f32(a, b); + return vaddq_f32(a, b); } }; diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu index 5435808fb7f70fdf1ac98815f7fe8890fb85527c..53dd5383601782231e6e742784007d1c9154dc6b 100644 --- a/paddle/math/BaseMatrix.cu +++ b/paddle/math/BaseMatrix.cu @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "BaseMatrix.h" #include "MathFunctions.h" +#include "NEONFunctions.h" #include "SIMDFunctions.h" #include "hl_matrix_apply.cuh" #include "hl_matrix_base.cuh" @@ -666,6 +667,13 @@ void BaseMatrixT::relu(BaseMatrixT& b) { applyBinary(binary::Relu(), b); } +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +template <> +void BaseMatrixT::relu(BaseMatrixT& b) { + neon::relu(data_, b.data_, height_ * width_); +} +#endif + DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f)); template void BaseMatrixT::reluDerivative(BaseMatrixT& b) { diff --git a/paddle/math/NEONFunctions.cpp b/paddle/math/NEONFunctions.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3bf47901f1069ac228fa1b877e29848d8cc130e8 --- /dev/null +++ b/paddle/math/NEONFunctions.cpp @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + +#include "NEONFunctions.h" +#include + +namespace paddle { +namespace neon { + +// b[i] = a[i] > 0.0f ? a[i] : 0.0f +void relu(const float* a, float* b, int len) { + int offset = len % 16; + float32x4_t ma0, ma1, ma2, ma3; + float32x4_t mb0, mb1, mb2, mb3; + + float32x4_t zero = vdupq_n_f32(0.f); + for (int k = 0; k < len / 16; k++, a += 16, b += 16) { + ma0 = vld1q_f32(a); + ma1 = vld1q_f32(a + 4); + ma2 = vld1q_f32(a + 8); + ma3 = vld1q_f32(a + 12); + + mb0 = vmaxq_f32(ma0, zero); + mb1 = vmaxq_f32(ma1, zero); + mb2 = vmaxq_f32(ma2, zero); + mb3 = vmaxq_f32(ma3, zero); + + vst1q_f32(b, mb0); + vst1q_f32(b + 4, mb1); + vst1q_f32(b + 8, mb2); + vst1q_f32(b + 12, mb3); + } + + for (int i = 0; i < offset; i++) { + b[i] = a[i] > 0.0f ? a[i] : 0.0f; + } +} + +} // namespace neon +} // namespace paddle + +#endif diff --git a/paddle/math/NEONFunctions.h b/paddle/math/NEONFunctions.h new file mode 100644 index 0000000000000000000000000000000000000000..69085e333547a31a341fbfde247f1e30adb957ee --- /dev/null +++ b/paddle/math/NEONFunctions.h @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace paddle { +namespace neon { + +void relu(const float* a, float* b, int len); + +} // namespace neon +} // namespace paddle