From 9040e47317357c33d4d3ff74032e633d84363f5a Mon Sep 17 00:00:00 2001 From: luxuhui Date: Mon, 8 Jul 2019 15:30:04 +0800 Subject: [PATCH] support NHWC format and opt code for softmax op. N/A Signed-off-by: Luxuhui --- mace/core/tensor.h | 9 +- mace/ops/softmax.cc | 238 ++++++++++++------ .../ccbenchmark/mace/ops/softmax_benchmark.cc | 61 +++-- test/ccunit/mace/ops/softmax_test.cc | 1 + test/ccutils/mace/ops/ops_test_util.h | 2 + 5 files changed, 206 insertions(+), 105 deletions(-) diff --git a/mace/core/tensor.h b/mace/core/tensor.h index f4bd2a37..92414a72 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -111,7 +111,8 @@ class Tensor { scale_(0.f), zero_point_(0), minval_(0.f), - maxval_(0.f) {} + maxval_(0.f), + data_format_(DataFormat::NONE) {} Tensor(BufferBase *buffer, DataType dtype, bool is_weight = false, @@ -125,7 +126,8 @@ class Tensor { scale_(0.f), zero_point_(0), minval_(0.f), - maxval_(0.f) {} + maxval_(0.f), + data_format_(DataFormat::NONE) {} Tensor(const BufferSlice &buffer_slice, DataType dtype, @@ -140,7 +142,8 @@ class Tensor { scale_(0.f), zero_point_(0), minval_(0.f), - maxval_(0.f) { + maxval_(0.f), + data_format_(DataFormat::NONE) { buffer_ = &buffer_slice_; } diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index 82a684b1..3784d2dd 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -43,120 +43,202 @@ class SoftmaxOp : public Operation { public: explicit SoftmaxOp(OpConstructContext *context) : Operation(context), - use_log_(Operation::GetOptionalArg("use_log", false)) {} + use_log_(Operation::GetOptionalArg("use_log", false)), + has_df_(Operation::GetOptionalArg("has_data_format", 0)) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); - const Tensor *input = this->Input(0); - Tensor *output = this->Output(0); + const Tensor *input = this->Input(INPUT); + Tensor *output = this->Output(OUTPUT); MACE_RETURN_IF_ERROR(output->ResizeLike(input)); Tensor::MappingGuard input_guard(input); Tensor::MappingGuard output_guard(output); + + if (isNCHW(input)) { // NCHW + return RunForNCHW(context); + } else { + return RunForNHWC(context); + } + } + + protected: + bool use_log_; + bool has_df_; + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); + + protected: + MaceStatus RunForNCHW(OpContext *context) { + const Tensor *input = this->Input(INPUT); const float *input_data = input->data(); + Tensor *output = this->Output(OUTPUT); float *output_data = output->mutable_data(); + MACE_CHECK(input->dim_size() == 4, "The dim size of NCHW should be 4."); + index_t hw_stride = input->dim(3); + index_t hw_size = hw_stride * input->dim(2); + index_t class_stride = hw_size; + index_t class_size = class_stride * input->dim(1); + index_t batch_stride = class_size; + index_t batch_size = batch_stride * input->dim(0); + + Buffer cache_buffer(context->device()->allocator()); + MACE_RETURN_IF_ERROR(cache_buffer.Allocate(hw_size * sizeof(float))); utils::ThreadPool &thread_pool = context->device()->cpu_runtime()->thread_pool(); + float std_lowest = std::numeric_limits::lowest(); + float *cache_ptr = cache_buffer.mutable_data(); - // softmax for nchw image - if (input->dim_size() == 4) { - const index_t batch = input->dim(0); - const index_t class_count = input->dim(1); - const index_t class_size = input->dim(2) * input->dim(3); - const index_t batch_size = class_count * class_size; + for (index_t b_offset = 0; + b_offset < batch_size; b_offset += batch_stride) { + const float *input_b_base = input_data + b_offset; + float *output_b_base = output_data + b_offset; + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + const auto raw_step_size = step * sizeof(float); + for (index_t k = start; k < end; k += step) { + float *cache_k_ptr = cache_ptr + k; + for (index_t i = 0; i < step; ++i) { + cache_k_ptr[i] = std_lowest; + } + } - for (index_t b = 0; b < batch; ++b) { - thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t c_offset = 0; c_offset < class_size; + c_offset += class_stride) { + const float *input_c_base = input_b_base + c_offset; for (index_t k = start; k < end; k += step) { - const float *input_ptr = input_data + b * batch_size + k; - float *output_ptr = output_data + b * batch_size + k; - - float max_val = std::numeric_limits::lowest(); - index_t channel_offset = 0; - for (index_t c = 0; c < class_count; ++c) { - float data = input_ptr[channel_offset]; - if (data > max_val) { - max_val = data; - } - channel_offset += class_size; + const float *input_ptr = input_c_base + k; + float *cache_k_ptr = cache_ptr + k; + for (index_t i = 0; i < step; ++i) { + cache_k_ptr[i] = std::max(cache_k_ptr[i], input_ptr[i]); } + } + } - channel_offset = 0; - float sum = 0; - for (index_t c = 0; c < class_count; ++c) { - float exp_value = ::exp(input_ptr[channel_offset] - max_val); - sum += exp_value; - output_ptr[channel_offset] = exp_value; - channel_offset += class_size; + for (index_t c_offset = 0; c_offset < class_size; + c_offset += class_stride) { + const float *input_c_base = input_b_base + c_offset; + float *output_c_base = output_b_base + c_offset; + for (index_t k = start; k < end; k += step) { + const float *input_ptr = input_c_base + k; + float *output_ptr = output_c_base + k; + float *cache_k_ptr = cache_ptr + k; + for (index_t i = 0; i < step; ++i) { + output_ptr[i] = ::exp(input_ptr[i] - cache_k_ptr[i]); } + } + } - sum = std::max(sum, std::numeric_limits::min()); - channel_offset = 0; - if (use_log_) { - for (index_t c = 0; c < class_count; ++c) { - output_ptr[channel_offset] /= sum; - output_ptr[channel_offset] = - std::log(output_ptr[channel_offset]); - channel_offset += class_size; - } - } else { - for (index_t c = 0; c < class_count; ++c) { - output_ptr[channel_offset] /= sum; - channel_offset += class_size; + for (index_t k = start; k < end; k += step) { + memset(cache_ptr + k, 0, raw_step_size); + } + + for (index_t c_offset = 0; c_offset < class_size; + c_offset += class_stride) { + float *output_c_base = output_b_base + c_offset; + for (index_t k = start; k < end; k += step) { + float *output_ptr = output_c_base + k; + float *cache_k_ptr = cache_ptr + k; + for (index_t i = 0; i < step; ++i) { + cache_k_ptr[i] += output_ptr[i]; + } + } + } + + for (index_t c_offset = 0; c_offset < class_size; + c_offset += class_stride) { + float *output_c_base = output_b_base + c_offset; + for (index_t k = start; k < end; k += step) { + float *output_ptr = output_c_base + k; + float *cache_k_ptr = cache_ptr + k; + for (index_t i = 0; i < step; ++i) { + output_ptr[i] = output_ptr[i] / cache_k_ptr[i]; + } + } + } + + if (use_log_) { + for (index_t c_offset = 0; c_offset < class_size; + c_offset += class_stride) { + float *output_c_base = output_b_base + c_offset; + for (index_t k = start; k < end; k += step) { + float *output_ptr = output_c_base + k; + for (index_t i = 0; i < step; ++i) { + output_ptr[i] = std::log(output_ptr[i]); } } - } // k - }, 0, class_size, 1); - } // b - } else if (input->dim_size() == 2 || input->dim_size() == 3) { - // normal 2d softmax and 3d softmax (dim(0) is batch) - index_t class_size = 0; - index_t class_count = 0; - if (input->dim_size() == 2) { - class_size = input->dim(0); - class_count = input->dim(1); - } else { - class_size = input->dim(0) * input->dim(1); - class_count = input->dim(2); - } + } + } // use_log_ + }, 0, hw_size, hw_stride); + } + + return MaceStatus::MACE_SUCCESS; + } + + MaceStatus RunForNHWC(OpContext *context) { + const Tensor *input = this->Input(INPUT); + Tensor *output = this->Output(OUTPUT); + float *output_data = output->mutable_data(); + + MACE_CHECK(input->dim_size() >= 2, "The input->dim_size() >= 2 failed."); + index_t class_size = input->dim(input->dim_size() - 1); + index_t hw_stride = class_size; + index_t hw_size = std::accumulate(input->shape().begin() + 1, + input->shape().end() - 1, + hw_stride, + std::multiplies()); + index_t batch_stride = hw_size; + index_t batch_size = std::accumulate(input->shape().begin(), + input->shape().end(), + 1, + std::multiplies()); + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + const float *input_data = input->data(); + float std_lowest = std::numeric_limits::lowest(); + for (index_t b_offset = 0; b_offset < batch_size; + b_offset += batch_stride) { + const float *input_b_ptr = input_data + b_offset; + float *output_b_ptr = output_data + b_offset; thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { for (index_t k = start; k < end; k += step) { - const float *input_ptr = input_data + k * class_count; - float *output_ptr = output_data + k * class_count; + const float *input_ptr = input_b_ptr + k; + float *output_ptr = output_b_ptr + k; - float max_val = std::numeric_limits::lowest(); - for (index_t c = 0; c < class_count; ++c) { + float max_val = std_lowest; + for (index_t c = 0; c < class_size; ++c) { max_val = std::max(max_val, input_ptr[c]); } float sum = 0; - for (index_t c = 0; c < class_count; ++c) { - float exp_value = std::exp(input_ptr[c] - max_val); + for (index_t c = 0; c < class_size; ++c) { + float exp_value = ::exp(input_ptr[c] - max_val); sum += exp_value; output_ptr[c] = exp_value; } - sum = std::max(sum, std::numeric_limits::min()); if (use_log_) { - for (index_t c = 0; c < class_count; ++c) { + for (index_t c = 0; c < class_size; ++c) { output_ptr[c] /= sum; output_ptr[c] = std::log(output_ptr[c]); } } else { - for (index_t c = 0; c < class_count; ++c) { + for (index_t c = 0; c < class_size; ++c) { output_ptr[c] /= sum; } } - } - }, 0, class_size, 1); - } else { - MACE_NOT_IMPLEMENTED; - } + } // k + }, 0, hw_size, hw_stride); + } // b_offset + return MaceStatus::MACE_SUCCESS; } - protected: - bool use_log_; + inline bool isNCHW(const Tensor *input) { + auto data_format = input->data_format(); + auto dim_size = input->dim_size(); + return dim_size == 4 && (has_df_ || data_format == DataFormat::NCHW); + } }; #ifdef MACE_ENABLE_QUANTIZE @@ -173,8 +255,8 @@ class SoftmaxOp : public Operation { MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); MACE_CHECK(!use_log_, "MACE dose not support quantized logsoftmax yet."); - const Tensor *input = this->Input(0); - Tensor *output = this->Output(0); + const Tensor *input = this->Input(INPUT); + Tensor *output = this->Output(OUTPUT); MACE_RETURN_IF_ERROR(output->ResizeLike(input)); // Ignore range stat, fix range to [0, 1]. For large depth, each softmax // output may be too small (<<1), which causes precision issue. But it is @@ -403,6 +485,8 @@ class SoftmaxOp : public Operation { protected: bool use_log_; + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; #endif // MACE_ENABLE_QUANTIZE @@ -421,8 +505,8 @@ class SoftmaxOp : public Operation { } } MaceStatus Run(OpContext *context) override { - const Tensor *input = this->Input(0); - Tensor *output = this->Output(0); + const Tensor *input = this->Input(INPUT); + Tensor *output = this->Output(OUTPUT); MACE_RETURN_IF_ERROR(output->ResizeLike(input)); return kernel_->Compute(context, input, output); @@ -430,6 +514,8 @@ class SoftmaxOp : public Operation { private: std::unique_ptr kernel_; + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; #endif // MACE_ENABLE_OPENCL diff --git a/test/ccbenchmark/mace/ops/softmax_benchmark.cc b/test/ccbenchmark/mace/ops/softmax_benchmark.cc index a5b06c50..34d1e65e 100644 --- a/test/ccbenchmark/mace/ops/softmax_benchmark.cc +++ b/test/ccbenchmark/mace/ops/softmax_benchmark.cc @@ -22,17 +22,17 @@ namespace ops { namespace test { namespace { -template -void SoftmaxBenchmark( - int iters, int batch, int channels, int height, int width) { +template +void SoftmaxBenchmark(int iters, int batch, int channels, + int height, int width, DataFormat data_format) { mace::testing::StopTiming(); OpsTestNet net; // Add input data - if (D == DeviceType::CPU) { + if (D == DeviceType::CPU && data_format == DataFormat::NCHW) { net.AddRandomInput("Input", {batch, channels, height, width}); - } else if (D == DeviceType::GPU) { + } else if (D == DeviceType::GPU || data_format == DataFormat::NHWC) { net.AddRandomInput("Input", {batch, height, width, channels}); } else { MACE_NOT_IMPLEMENTED; @@ -42,6 +42,7 @@ void SoftmaxBenchmark( .Input("Input") .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("has_data_format", data_format == DataFormat::NCHW) .Finalize(net.NewOperatorDef()); // Warm-up @@ -58,10 +59,12 @@ void SoftmaxBenchmark( } #ifdef MACE_ENABLE_QUANTIZE -template <> +template<> void SoftmaxBenchmark( - int iters, int batch, int channels, int height, int width) { + int iters, int batch, int channels, int height, + int width, DataFormat data_format) { mace::testing::StopTiming(); + MACE_UNUSED(data_format); OpsTestNet net; @@ -100,33 +103,39 @@ void SoftmaxBenchmark( } // namespace -#define MACE_BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE) \ - static void MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ - int iters) { \ +#define MACE_BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE, DF) \ + static void \ + MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE##_##DF( \ + int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - SoftmaxBenchmark(iters, N, C, H, W); \ + SoftmaxBenchmark(iters, N, C, H, W, (DataFormat::DF)); \ } \ - MACE_BENCHMARK(MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + MACE_BENCHMARK( \ + MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE##_##DF) #if defined(MACE_ENABLE_OPENCL) && defined(MACE_ENABLE_QUANTIZE) -#define MACE_BM_SOFTMAX(N, C, H, W) \ - MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, CPU); \ - MACE_BM_SOFTMAX_MACRO(N, C, H, W, uint8_t, CPU); \ - MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, GPU); \ - MACE_BM_SOFTMAX_MACRO(N, C, H, W, half, GPU) +#define MACE_BM_SOFTMAX(N, C, H, W) \ + MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, CPU, NCHW); \ + MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, CPU, NHWC); \ + MACE_BM_SOFTMAX_MACRO(N, C, H, W, uint8_t, CPU, NHWC); \ + MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, GPU, NHWC); \ + MACE_BM_SOFTMAX_MACRO(N, C, H, W, half, GPU, NHWC) #elif defined(MACE_ENABLE_OPENCL) -#define MACE_BM_SOFTMAX(N, C, H, W) \ - MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, CPU); \ - MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, GPU); \ - MACE_BM_SOFTMAX_MACRO(N, C, H, W, half, GPU) +#define MACE_BM_SOFTMAX(N, C, H, W) \ + MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, CPU, NCHW); \ + MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, CPU, NHWC); \ + MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, GPU, NHWC); \ + MACE_BM_SOFTMAX_MACRO(N, C, H, W, half, GPU, NHWC) #elif defined(MACE_ENABLE_QUANTIZE) -#define MACE_BM_SOFTMAX(N, C, H, W) \ - MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, CPU); \ - MACE_BM_SOFTMAX_MACRO(N, C, H, W, uint8_t, CPU) +#define MACE_BM_SOFTMAX(N, C, H, W) \ + MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, CPU, NCHW); \ + MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, CPU, NHWC); \ + MACE_BM_SOFTMAX_MACRO(N, C, H, W, uint8_t, CPU, NHWC) #else -#define MACE_BM_SOFTMAX(N, C, H, W) \ - MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, CPU) +#define MACE_BM_SOFTMAX(N, C, H, W) \ + MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, CPU, NCHW); \ + MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, CPU, NHWC) #endif MACE_BM_SOFTMAX(1, 2, 512, 512); diff --git a/test/ccunit/mace/ops/softmax_test.cc b/test/ccunit/mace/ops/softmax_test.cc index eb3398db..b73309b9 100644 --- a/test/ccunit/mace/ops/softmax_test.cc +++ b/test/ccunit/mace/ops/softmax_test.cc @@ -55,6 +55,7 @@ void Simple(bool use_log = false) { OpDefBuilder("Softmax", "SoftmaxTest") .Input("InputNCHW") .Output("OutputNCHW") + .AddIntArg("has_data_format", 1) .AddIntArg("use_log", static_cast(use_log)) .Finalize(net.NewOperatorDef()); diff --git a/test/ccutils/mace/ops/ops_test_util.h b/test/ccutils/mace/ops/ops_test_util.h index a6e2aa8d..0869cea1 100644 --- a/test/ccutils/mace/ops/ops_test_util.h +++ b/test/ccutils/mace/ops/ops_test_util.h @@ -244,6 +244,7 @@ class OpsTestNet { } } } + output->set_data_format(DataFormat::NCHW); } else if (src_format == DataFormat::NCHW && dst_format == DataFormat::NHWC) { index_t batch = input_shape[0]; @@ -265,6 +266,7 @@ class OpsTestNet { } } } + output->set_data_format(DataFormat::NHWC); } else { MACE_NOT_IMPLEMENTED; } -- GitLab