Commit 4ed6e236 authored by Liangliang He

Merge branch 'master' into 'master'

Implement ReluN

See merge request !46
@@ -12,26 +12,53 @@ template <>
void ReluFunctor<DeviceType::NEON, float>::operator()(const float *input,
                                                       float *output,
                                                       index_t size) {
  if (max_limit_ < 0) {
    // No upper bound: plain ReLU.
#pragma omp parallel for num_threads(1)  // no significant performance improvement
    for (int64_t i = 0; i < size; i += kCostPerGroup) {
      int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
      int block = count >> 2;
      int remain = count - (block << 2);
      const float *inptr = input + i;
      float *outptr = output + i;
      float32x4_t zero = vdupq_n_f32(0.f);
      for (; block > 0; --block) {
        float32x4_t in = vld1q_f32(inptr);
        float32x4_t out = vmaxq_f32(in, zero);
        vst1q_f32(outptr, out);
        inptr += 4;
        outptr += 4;
      }
      for (; remain > 0; --remain) {
        *outptr = std::max(*inptr, 0.f);
        ++inptr;
        ++outptr;
      }
    }
  } else {
    // Clamp to [0, max_limit_] (ReluN).
#pragma omp parallel for num_threads(1)  // no significant performance improvement
    for (int64_t i = 0; i < size; i += kCostPerGroup) {
      int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
      int block = count >> 2;
      int remain = count - (block << 2);
      const float *inptr = input + i;
      float *outptr = output + i;
      float32x4_t zero = vdupq_n_f32(0.f);
      float32x4_t vmax = vdupq_n_f32(max_limit_);
      for (; block > 0; --block) {
        float32x4_t in = vld1q_f32(inptr);
        float32x4_t out = vmaxq_f32(in, zero);
        out = vminq_f32(out, vmax);
        vst1q_f32(outptr, out);
        inptr += 4;
        outptr += 4;
      }
      for (; remain > 0; --remain) {
        *outptr = std::min(std::max(*inptr, 0.f), max_limit_);
        ++inptr;
        ++outptr;
      }
    }
  }
};
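For reference, the scalar semantics that both NEON branches implement can be sketched as a small standalone function. The relu_n helper, the buffer sizes, and the main driver below are illustrative only and not part of this merge request; a negative max_limit selects the unbounded ReLU path, mirroring the max_limit_ < 0 check above.

// Illustrative scalar reference (not part of the merge request):
// applies ReLU when max_limit < 0, otherwise clamps to [0, max_limit].
#include <algorithm>
#include <cstdio>

void relu_n(const float *input, float *output, int size, float max_limit) {
  if (max_limit < 0) {
    for (int i = 0; i < size; ++i) {
      output[i] = std::max(input[i], 0.f);
    }
  } else {
    for (int i = 0; i < size; ++i) {
      output[i] = std::min(std::max(input[i], 0.f), max_limit);
    }
  }
}

int main() {
  const float in[4] = {-1.f, 0.25f, 3.f, 8.f};
  float relu_out[4], relu6_out[4];
  relu_n(in, relu_out, 4, -1.f);  // plain ReLU: -1 -> 0, others unchanged
  relu_n(in, relu6_out, 4, 6.f);  // ReLU6-style: 8 -> 6 as well
  for (int i = 0; i < 4; ++i) {
    std::printf("%g -> %g / %g\n", in[i], relu_out[i], relu6_out[i]);
  }
  return 0;
}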
@@ -12,9 +12,17 @@ namespace kernels {
template <DeviceType D, typename T>
struct ReluFunctor {
  T max_limit_;

  void operator()(const T *input, T *output, index_t size) {
    if (max_limit_ < 0) {
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::max(input[i], static_cast<T>(0));
      }
    } else {
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::min(std::max(input[i], static_cast<T>(0)), max_limit_);
      }
    }
  }
};
@@ -14,7 +14,10 @@ template <DeviceType D, class T>
class ReluOp : public Operator<D, T> {
 public:
  ReluOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<D, T>(operator_def, ws) {
    functor_.max_limit_ =
        OperatorBase::GetSingleArgument<T>("max_limit", static_cast<T>(-1));
  }

  bool Run() override {
    const Tensor* input_tensor = this->inputs_[0];
    Tensor* output_tensor = this->outputs_[0];
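The constructor reads the optional max_limit argument with a default of -1, so a Relu op defined without it keeps plain ReLU behavior. Below is a minimal sketch of that named-argument-with-default pattern; ArgMap and get_single_argument are hypothetical stand-ins for the OperatorDef arguments and OperatorBase::GetSingleArgument, not MACE APIs.

// Hypothetical stand-ins; not MACE types or functions.
#include <cstdio>
#include <map>
#include <string>

using ArgMap = std::map<std::string, float>;

float get_single_argument(const ArgMap &args, const std::string &name,
                          float default_value) {
  auto it = args.find(name);
  return it == args.end() ? default_value : it->second;
}

int main() {
  ArgMap plain_relu_def;                     // no "max_limit" argument
  ArgMap relu6_def = {{"max_limit", 6.0f}};  // clamp at 6

  // -1 default -> unbounded ReLU; 6 -> ReLU6-style clamp.
  std::printf("%g %g\n",
              get_single_argument(plain_relu_def, "max_limit", -1.0f),
              get_single_argument(relu6_def, "max_limit", -1.0f));
  return 0;
}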
@@ -18,7 +18,7 @@ TEST_F(ReluOpTest, ReluOp) {
      .Finalize(net.operator_def());

  // Add input data
  net.AddRandomInput<float>("Input", {1, 2, 3, 5});

  // Run
  net.RunOp();
@@ -32,4 +32,29 @@ TEST_F(ReluOpTest, ReluOp) {
  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.01);
}

TEST_F(ReluOpTest, ReluOpWithMax) {
  // Construct graph
  auto& net = test_net();
  OpDefBuilder("Relu", "ReluTestWithMax")
      .Input("Input")
      .Output("Output")
      .Finalize(net.operator_def());

  // Add input data
  net.AddRandomInput<float>("Input", {1, 2, 3, 5});
  net.AddFloatArg("max_limit", 0.5);

  // Run
  net.RunOp();

  Tensor expected;
  expected.Copy(*net.GetOutput("Output"));

  // Check
  net.RunOp(DeviceType::NEON);
  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.01);
}
} // namespace mace