diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake index 76f62765aff791594123d689341b0876b3d0184d..0597ef0cc4ba4c0bcec172c767d66d0f362e1459 100644 --- a/cmake/cross_compiling/ios.cmake +++ b/cmake/cross_compiling/ios.cmake @@ -120,6 +120,7 @@ # ## Lite settings +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flto") if (ARM_TARGET_OS STREQUAL "ios") set(PLATFORM "OS") elseif(ARM_TARGET_OS STREQUAL "ios64") diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 64cced8009e57e58c532c9b8fcf21f184ccdbe25..de1a76c9c391102b8d7a1d113164f45beb913e6e 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -305,6 +305,26 @@ if(NOT IOS) FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) + + lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} + ARM_DEPS ${arm_kernels} + NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels} + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) + lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels} + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) endif() #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc diff --git a/lite/api/lite_multithread_test.cc b/lite/api/lite_multithread_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..addd512eb0039c43edeca562b8f568528aab76f9 --- /dev/null +++ b/lite/api/lite_multithread_test.cc @@ -0,0 +1,360 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/core/device_info.h" +#include "lite/core/profile/timer.h" +#include "lite/utils/cp_logging.h" +#include "lite/utils/string.h" +#ifdef LITE_WITH_PROFILE +#include "lite/core/profile/basic_profiler.h" +#endif // LITE_WITH_PROFILE +#include // NOLINT + +using paddle::lite::profile::Timer; + +DEFINE_string(input_shape, + "1,3,224,224", + "input shapes, separated by colon and comma"); + +DEFINE_string(model_dir_0, "", "model_dir_0"); +DEFINE_string(input_shape_0, + "1,3,224,224", + "input shapes another, separated by colon and comma"); + +DEFINE_bool(use_optimize_nb, + false, + "optimized & naive buffer model for mobile devices"); + +DEFINE_int32(test_type, 0, "multithread test type"); + +namespace paddle { +namespace lite_api { + +void OutputOptModel(const std::string& load_model_dir, + const std::string& save_optimized_model_dir, + const std::vector>& input_shapes) { + lite_api::CxxConfig config; + config.set_model_dir(load_model_dir); + config.set_valid_places({ + Place{TARGET(kARM), PRECISION(kFloat)}, + }); + auto predictor = lite_api::CreatePaddlePredictor(config); + + // delete old optimized model + int ret = system( + paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str()) + .c_str()); + if (ret == 0) { + LOG(INFO) << "delete old optimized model " << save_optimized_model_dir; + } + 
predictor->SaveOptimizedModel(save_optimized_model_dir, + LiteModelType::kNaiveBuffer); + LOG(INFO) << "Load model from " << load_model_dir; + LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; +} + +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK +void Run(const std::vector>& input_shapes, + const std::string& model_dir, + const PowerMode power_mode, + const int thread_num, + const int repeat, + int tid, + const int warmup_times = 5) { + lite_api::MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode(power_mode); + config.set_threads(thread_num); + + auto predictor = lite_api::CreatePaddlePredictor(config); + + for (int j = 0; j < input_shapes.size(); ++j) { + auto input_tensor = predictor->GetInput(j); + input_tensor->Resize(input_shapes[j]); + auto input_data = input_tensor->mutable_data(); + int input_num = 1; + for (int i = 0; i < input_shapes[j].size(); ++i) { + input_num *= input_shapes[j][i]; + } + for (int i = 0; i < input_num; ++i) { + input_data[i] = 1.f; + } + } + + for (int i = 0; i < warmup_times; ++i) { + predictor->Run(); + } + + Timer ti; + for (int j = 0; j < repeat; ++j) { + ti.Start(); + predictor->Run(); + float t = ti.Stop(); + auto output = predictor->GetOutput(0); + auto out = output->data(); + LOG(INFO) << "[thread " << tid << "] Model: " << model_dir + << " output[0]:" << out[0] << "; output[1]:" << out[1]; + } + LOG(INFO) << "[thread " << tid << "] Model: " << model_dir + << ", power_mode: " << static_cast(power_mode) + << ", threads num " << thread_num + << ", avg time: " << ti.LapTimes().Avg() << "ms" + << ", min time: " << ti.LapTimes().Min() << " ms" + << ", max time: " << ti.LapTimes().Max() << " ms."; +} + +void RunTestType_00(const std::vector>& input_shapes, + const std::string& model_dir, + const PowerMode power_mode, + const int thread_num, + const int repeat, + const int warmup_times = 5) { + std::thread run_th0(Run, + input_shapes, + model_dir, + power_mode, + thread_num, + repeat, + 0, + 
warmup_times); + Run(input_shapes, model_dir, power_mode, thread_num, repeat, 1, warmup_times); + run_th0.join(); +} +void RunTestType_01(const std::vector>& input_shapes, + const std::string& model_dir, + const std::vector>& input_shapes_0, + const std::string& model_dir_0, + const PowerMode power_mode, + const int thread_num, + const int repeat, + const int warmup_times = 5) { + std::thread run_th0(Run, + input_shapes, + model_dir, + power_mode, + thread_num, + repeat, + 0, + warmup_times); + Run(input_shapes_0, + model_dir_0, + power_mode, + thread_num, + repeat, + 1, + warmup_times); + run_th0.join(); +} + +void run_with_predictor(std::shared_ptr predictor, + const std::vector>& input_shapes, + int index, + const std::string& name) { + for (int j = 0; j < input_shapes.size(); ++j) { + auto input_tensor = predictor->GetInput(j); + input_tensor->Resize(input_shapes[j]); + auto input_data = input_tensor->mutable_data(); + int input_num = 1; + for (int i = 0; i < input_shapes[j].size(); ++i) { + input_num *= input_shapes[j][i]; + } + for (int i = 0; i < input_num; ++i) { + input_data[i] = 1.f; + } + } + + Timer ti; + ti.Start(); + predictor->Run(); + float t = ti.Stop(); + + auto output = predictor->GetOutput(0); + auto out = output->data(); + LOG(INFO) << "[thread " << index << "] name: " << name + << ",run time: " << ti.LapTimes().Avg() << "ms" + << " output[0]:" << out[0] << "; output[1]:" << out[1]; +} +void RunTestType_10(const std::vector>& input_shapes, + const std::string& model_dir, + const PowerMode power_mode, + const int thread_num, + const int repeat, + int warmup = 5) { + lite_api::MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode(power_mode); + config.set_threads(thread_num); + + auto predictor = lite_api::CreatePaddlePredictor(config); + + for (int i = 0; i < repeat; ++i) { + std::thread pre_th0( + run_with_predictor, predictor, input_shapes, i, model_dir); + pre_th0.join(); + } +} +void RunTestType_11(const 
std::vector>& input_shapes, + const std::string& model_dir, + const std::vector>& input_shapes_0, + const std::string& model_dir_0, + const PowerMode power_mode, + const int thread_num, + const int repeat, + int warmup = 5) { + lite_api::MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode(power_mode); + config.set_threads(thread_num); + + auto predictor = lite_api::CreatePaddlePredictor(config); + + config.set_model_dir(model_dir_0); + auto predictor_0 = lite_api::CreatePaddlePredictor(config); + + for (int i = 0; i < 2 * repeat; i += 2) { + std::thread pre_th0( + run_with_predictor, predictor, input_shapes, i, model_dir); + std::thread pre_th1( + run_with_predictor, predictor_0, input_shapes_0, i + 1, model_dir_0); + pre_th0.join(); + pre_th1.join(); + } +} + +#endif + +} // namespace lite_api +} // namespace paddle + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_dir == "") { + LOG(INFO) << "usage: " + << "--model_dir /path/to/your/model"; + exit(0); + } + std::string save_optimized_model_dir = ""; + std::string save_optimized_model_dir_0 = ""; + if (FLAGS_use_optimize_nb) { + save_optimized_model_dir = FLAGS_model_dir; + save_optimized_model_dir_0 = FLAGS_model_dir_0; + } else { + save_optimized_model_dir = FLAGS_model_dir + "opt2"; + save_optimized_model_dir_0 = FLAGS_model_dir_0 + "opt2"; + } + + auto split_string = + [](const std::string& str_in) -> std::vector { + std::vector str_out; + std::string tmp_str = str_in; + while (!tmp_str.empty()) { + size_t next_offset = tmp_str.find(":"); + str_out.push_back(tmp_str.substr(0, next_offset)); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return str_out; + }; + + auto get_shape = [](const std::string& str_shape) -> std::vector { + std::vector shape; + std::string tmp_str = str_shape; + while (!tmp_str.empty()) { + int dim = atoi(tmp_str.data()); + 
shape.push_back(dim); + size_t next_offset = tmp_str.find(","); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return shape; + }; + + std::vector str_input_shapes = split_string(FLAGS_input_shape); + std::vector> input_shapes; + for (int i = 0; i < str_input_shapes.size(); ++i) { + input_shapes.push_back(get_shape(str_input_shapes[i])); + } + std::vector str_input_shapes_0 = + split_string(FLAGS_input_shape_0); + std::vector> input_shapes_0; + for (int i = 0; i < str_input_shapes_0.size(); ++i) { + input_shapes_0.push_back(get_shape(str_input_shapes_0[i])); + } + + if (!FLAGS_use_optimize_nb) { + // Output optimized model + paddle::lite_api::OutputOptModel( + FLAGS_model_dir, save_optimized_model_dir, input_shapes); + paddle::lite_api::OutputOptModel( + FLAGS_model_dir_0, save_optimized_model_dir_0, input_shapes_0); + } + +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK + // Run inference using optimized model + if (FLAGS_test_type == 0) { + paddle::lite_api::RunTestType_00( + input_shapes, + save_optimized_model_dir, + static_cast(0), + FLAGS_threads, + FLAGS_repeats, + 5); + LOG(INFO) << "=========above is case 0, below is case " + "1============================"; + paddle::lite_api::RunTestType_10( + input_shapes, + save_optimized_model_dir, + static_cast(0), + FLAGS_threads, + FLAGS_repeats); + } + if (FLAGS_test_type == 1) { + paddle::lite_api::RunTestType_01( + input_shapes, + save_optimized_model_dir, + input_shapes_0, + save_optimized_model_dir_0, + static_cast(0), + FLAGS_threads, + FLAGS_repeats, + 5); + LOG(INFO) << "=========above is case 0, below is case " + "1============================"; + paddle::lite_api::RunTestType_11( + input_shapes, + save_optimized_model_dir, + input_shapes_0, + save_optimized_model_dir_0, + static_cast(0), + FLAGS_threads, + FLAGS_repeats); + } + +#endif + return 0; +} diff --git a/lite/backends/x86/cpu_info.cc b/lite/backends/x86/cpu_info.cc index 
c2759d6191aaa7ba277ff2a935ea6fdda8383e1e..aa097f947a0289b4a44417160fbe5d6e6db48020 100644 --- a/lite/backends/x86/cpu_info.cc +++ b/lite/backends/x86/cpu_info.cc @@ -32,26 +32,37 @@ #include #include -DEFINE_double(fraction_of_cpu_memory_to_use, - 1, - "Default use 100% of CPU memory for PaddlePaddle," - "reserve the rest for page tables, etc"); -DEFINE_uint64(initial_cpu_memory_in_mb, - 500ul, - "Initial CPU memory for PaddlePaddle, in MD unit."); - -DEFINE_double( - fraction_of_cuda_pinned_memory_to_use, - 0.5, - "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," - "reserve the rest for page tables, etc"); +#include "lite/utils/env.h" + +// DEFINE_double(fraction_of_cpu_memory_to_use, +// 1, +// "Default use 100% of CPU memory for PaddlePaddle," +// "reserve the rest for page tables, etc"); +double fraction_of_cpu_memory_to_use = + paddle::lite::GetDoubleFromEnv("fraction_of_cpu_memory_to_use", 1); + +// DEFINE_uint64(initial_cpu_memory_in_mb, +// 500ul, +// "Initial CPU memory for PaddlePaddle, in MD unit."); +uint64_t initial_cpu_memory_in_mb = + paddle::lite::GetUInt64FromEnv("initial_cpu_memory_in_mb", 500ul); + +// DEFINE_double( +// fraction_of_cuda_pinned_memory_to_use, +// 0.5, +// "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," +// "reserve the rest for page tables, etc"); +double fraction_of_cuda_pinned_memory_to_use = paddle::lite::GetDoubleFromEnv( + "fraction_of_cuda_pinned_memory_to_use", 0.5); // If use_pinned_memory is true, CPUAllocator calls mlock, which // returns pinned and locked memory as staging areas for data exchange // between host and device. Allocates too much would reduce the amount // of memory available to the system for paging. So, by default, we // should set false to use_pinned_memory. 
-DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory."); +// DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory."); +bool use_pinned_memory = + paddle::lite::GetBoolFromEnv("use_pinned_memory", true); namespace paddle { namespace lite { @@ -81,7 +92,7 @@ size_t CpuTotalPhysicalMemory() { size_t CpuMaxAllocSize() { // For distributed systems, it requires configuring and limiting // the fraction of memory to use. - return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); + return fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); } size_t CpuMinChunkSize() { @@ -92,15 +103,14 @@ size_t CpuMinChunkSize() { size_t CpuMaxChunkSize() { // Allow to allocate the maximum chunk size is roughly 3% of CPU memory, // or the initial_cpu_memory_in_mb. - return std::min( - static_cast(CpuMaxAllocSize() / 32), - static_cast(FLAGS_initial_cpu_memory_in_mb * 1 << 20)); + return std::min(static_cast(CpuMaxAllocSize() / 32), + static_cast(initial_cpu_memory_in_mb * 1 << 20)); } size_t CUDAPinnedMaxAllocSize() { // For distributed systems, it requires configuring and limiting // the fraction of memory to use. - return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory(); + return fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory(); } size_t CUDAPinnedMinChunkSize() { diff --git a/lite/backends/x86/dynamic_loader.cc b/lite/backends/x86/dynamic_loader.cc index 75bb528f38664fc1061653e1036b73eed74daae9..a05a57e93b23008e49683764b5ed669d5c425e5b 100644 --- a/lite/backends/x86/dynamic_loader.cc +++ b/lite/backends/x86/dynamic_loader.cc @@ -22,36 +22,46 @@ limitations under the License. */ #include "lite/backends/x86/cupti_lib_path.h" #include "lite/backends/x86/port.h" #include "lite/backends/x86/warpctc_lib_path.h" +#include "lite/utils/env.h" #include "lite/utils/paddle_enforce.h" -DEFINE_string(cudnn_dir, - "", - "Specify path for loading libcudnn.so. For instance, " - "/usr/local/cudnn/lib. 
If empty [default], dlopen " - "will search cudnn from LD_LIBRARY_PATH"); +// DEFINE_string(cudnn_dir, +// "", +// "Specify path for loading libcudnn.so. For instance, " +// "/usr/local/cudnn/lib. If empty [default], dlopen " +// "will search cudnn from LD_LIBRARY_PATH"); +std::string cudnn_dir = paddle::lite::GetStringFromEnv("cudnn_dir"); // NOLINT -DEFINE_string(cuda_dir, - "", - "Specify path for loading cuda library, such as libcublas, " - "libcurand. For instance, /usr/local/cuda/lib64. If default, " - "dlopen will search cuda from LD_LIBRARY_PATH"); +// DEFINE_string(cuda_dir, +// "", +// "Specify path for loading cuda library, such as libcublas, " +// "libcurand. For instance, /usr/local/cuda/lib64. If default, " +// "dlopen will search cuda from LD_LIBRARY_PATH"); +std::string cuda_dir = paddle::lite::GetStringFromEnv("cuda_dir"); // NOLINT -DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); +// DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); +std::string f_warpctc_dir = // NOLINT + paddle::lite::GetStringFromEnv("warpctc_dir"); // NOLINT -DEFINE_string(nccl_dir, - "", - "Specify path for loading nccl library, such as libcublas, " - "libcurand. For instance, /usr/local/cuda/lib64. If default, " - "dlopen will search cuda from LD_LIBRARY_PATH"); +// DEFINE_string(nccl_dir, +// "", +// "Specify path for loading nccl library, such as libcublas, " +// "libcurand. For instance, /usr/local/cuda/lib64. 
If default, " +// "dlopen will search cuda from LD_LIBRARY_PATH"); +std::string nccl_dir = paddle::lite::GetStringFromEnv("nccl_dir"); // NOLINT -DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); +// DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); +std::string cupti_dir = paddle::lite::GetStringFromEnv("cupti_dir"); // NOLINT -DEFINE_string( - tensorrt_dir, - "", - "Specify path for loading tensorrt library, such as libnvinfer.so."); +// DEFINE_string( +// tensorrt_dir, +// "", +// "Specify path for loading tensorrt library, such as libnvinfer.so."); +std::string tensorrt_dir = // NOLINT + paddle::lite::GetStringFromEnv("tensorrt_dir"); // NOLINT -DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); +// DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); +std::string mklml_dir = paddle::lite::GetStringFromEnv("mklml_dir"); // NOLINT namespace paddle { namespace lite { @@ -180,28 +190,28 @@ auto error_msg = void* GetCublasDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); + return GetDsoHandleFromSearchPath(cuda_dir, "libcublas.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib); + return GetDsoHandleFromSearchPath(cuda_dir, win_cublas_lib); #else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); + return GetDsoHandleFromSearchPath(cuda_dir, "libcublas.so"); #endif } void* GetCUDNNDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false); + return GetDsoHandleFromSearchPath(cudnn_dir, "libcudnn.dylib", false); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib); + return GetDsoHandleFromSearchPath(cudnn_dir, win_cudnn_lib); #else - return 
GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false); + return GetDsoHandleFromSearchPath(cudnn_dir, "libcudnn.so", false); #endif } void* GetCUPTIDsoHandle() { std::string cupti_path = cupti_lib_path; - if (!FLAGS_cupti_dir.empty()) { - cupti_path = FLAGS_cupti_dir; + if (!cupti_dir.empty()) { + cupti_path = cupti_dir; } #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", false); @@ -212,18 +222,18 @@ void* GetCUPTIDsoHandle() { void* GetCurandDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); + return GetDsoHandleFromSearchPath(cuda_dir, "libcurand.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib); + return GetDsoHandleFromSearchPath(cuda_dir, win_curand_lib); #else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); + return GetDsoHandleFromSearchPath(cuda_dir, "libcurand.so"); #endif } void* GetWarpCTCDsoHandle() { std::string warpctc_dir = warpctc_lib_path; - if (!FLAGS_warpctc_dir.empty()) { - warpctc_dir = FLAGS_warpctc_dir; + if (!f_warpctc_dir.empty()) { + warpctc_dir = f_warpctc_dir; } #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib"); @@ -236,27 +246,27 @@ void* GetWarpCTCDsoHandle() { void* GetNCCLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib"); + return GetDsoHandleFromSearchPath(nccl_dir, "libnccl.dylib"); #else - return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so"); + return GetDsoHandleFromSearchPath(nccl_dir, "libnccl.so"); #endif } void* GetTensorRtDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib"); + return GetDsoHandleFromSearchPath(tensorrt_dir, "libnvinfer.dylib"); #else - return 
GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so"); + return GetDsoHandleFromSearchPath(tensorrt_dir, "libnvinfer.so"); #endif } void* GetMKLMLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); + return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.dylib"); #elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll"); + return GetDsoHandleFromSearchPath(mklml_dir, "mklml.dll"); #else - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); + return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.so"); #endif } diff --git a/lite/backends/x86/jit/gen_base.cc b/lite/backends/x86/jit/gen_base.cc index 38250d533dd8c94afc87b5f9113ea165d6b7e9ed..7d051aa6f5802844753b71fd43400e20b7f5965b 100644 --- a/lite/backends/x86/jit/gen_base.cc +++ b/lite/backends/x86/jit/gen_base.cc @@ -21,13 +21,15 @@ // posix_memalign #include "lite/backends/x86/cpu_info.h" #include "lite/backends/x86/jit/macro.h" +#include "lite/utils/env.h" #include "lite/utils/paddle_enforce.h" #ifndef _WIN32 #define posix_memalign_free free #endif -DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); +// DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); +bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode"); namespace paddle { namespace lite { diff --git a/lite/backends/x86/jit/gen_base.h b/lite/backends/x86/jit/gen_base.h index b5f942615aa001a119273b52c70116ae66e66126..4af93c2447d64e52676a60e33c01c63ba7221910 100644 --- a/lite/backends/x86/jit/gen_base.h +++ b/lite/backends/x86/jit/gen_base.h @@ -20,7 +20,8 @@ #include #include "lite/backends/x86/jit/kernel_base.h" -DECLARE_bool(dump_jitcode); +// DECLARE_bool(dump_jitcode); +extern bool dump_jitcode; namespace paddle { namespace lite { @@ -36,7 +37,7 @@ class GenBase : public Kernel { template Func getCode() const { const unsigned char* code = 
this->getCodeInternal(); - if (FLAGS_dump_jitcode) { + if (dump_jitcode) { this->dumpCode(code); } // Note: failed to cast with reinterpret_cast on Mac clang, diff --git a/lite/backends/x86/math/beam_search.cc b/lite/backends/x86/math/beam_search.cc index 8d61fb3bbb97705c697fba934e6cab9424f85bad..645137c1d6b171a9d9aa8aa0b2fca9469bd112b0 100644 --- a/lite/backends/x86/math/beam_search.cc +++ b/lite/backends/x86/math/beam_search.cc @@ -86,7 +86,8 @@ class BeamSearchFunctor { // selected_ids->mutable_data(dims, platform::CPUPlace()); // auto *selected_scores_data = // selected_scores->mutable_data(dims, platform::CPUPlace()); - parent_idx->Resize({static_cast(num_instances)}); + parent_idx->Resize( + std::vector({static_cast(num_instances)})); auto *parent_idx_data = parent_idx ? parent_idx->mutable_data(TARGET(kX86)) : nullptr; // auto *parent_idx_data = diff --git a/lite/backends/x86/math/detail/avx_mathfun.h b/lite/backends/x86/math/detail/avx_mathfun.h index c95c881512900efb4b39df3ba16b8de686caefcb..2ad0866d6346a24690b30d0da317c6d86e9aebba 100644 --- a/lite/backends/x86/math/detail/avx_mathfun.h +++ b/lite/backends/x86/math/detail/avx_mathfun.h @@ -41,9 +41,11 @@ (this is the zlib license) */ - +#pragma once #include "lite/backends/x86/cpu_info.h" +namespace paddle { +namespace lite { /* __m128 is ugly to write */ typedef __m256 v8sf; // vector of 8 float (avx) typedef __m256i v8si; // vector of 8 int (avx) @@ -134,7 +136,7 @@ typedef union imm_xmm_union { return (ret); \ } -//#warning "Using SSE2 to perform AVX2 bitshift ops" +// #warning "Using SSE2 to perform AVX2 bitshift ops" AVX2_BITOP_USING_SSE2(slli_epi32) AVX2_BITOP_USING_SSE2(srli_epi32) @@ -152,7 +154,7 @@ AVX2_BITOP_USING_SSE2(srli_epi32) return (ret); \ } -//#warning "Using SSE2 to perform AVX2 integer ops" +// #warning "Using SSE2 to perform AVX2 integer ops" AVX2_INTOP_USING_SSE2(and_si128) AVX2_INTOP_USING_SSE2(andnot_si128) AVX2_INTOP_USING_SSE2(cmpeq_epi32) @@ -175,23 +177,23 @@ 
AVX2_INTOP_USING_SSE2(add_epi32) */ v8sf log256_ps(v8sf x) { v8si imm0; - v8sf one = *(v8sf *)_ps256_1; + v8sf one = *(v8sf *)_ps256_1; // NOLINT // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); - x = _mm256_max_ps( - x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */ + x = _mm256_max_ps(x, *(v8sf *)_ps256_min_norm_pos); // NOLINT + /* cut off denormalized stuff */ // NOLINT // can be done with AVX2 imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23); /* keep only the fractional part */ - x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask); - x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5); + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask); // NOLINT + x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5); // NOLINT // this is again another AVX2 instruction - imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f); + imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f); // NOLINT v8sf e = _mm256_cvtepi32_ps(imm0); e = _mm256_add_ps(e, one); @@ -203,7 +205,8 @@ v8sf log256_ps(v8sf x) { } else { x = x - 1.0; } */ // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF); - v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS); + v8sf mask = + _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS); // NOLINT v8sf tmp = _mm256_and_ps(x, mask); x = _mm256_sub_ps(x, one); e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); @@ -211,34 +214,34 @@ v8sf log256_ps(v8sf x) { v8sf z = _mm256_mul_ps(x, x); - v8sf y = *(v8sf *)_ps256_cephes_log_p0; + v8sf y = *(v8sf *)_ps256_cephes_log_p0; // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf 
*)_ps256_cephes_log_p3); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8); // NOLINT y = _mm256_mul_ps(y, x); y = _mm256_mul_ps(y, z); - tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1); + tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1); // NOLINT y = _mm256_add_ps(y, tmp); - tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); + tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); // NOLINT y = _mm256_sub_ps(y, tmp); - tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2); + tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2); // NOLINT x = _mm256_add_ps(x, y); x = _mm256_add_ps(x, tmp); x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN @@ -262,14 +265,14 @@ _PS256_CONST(cephes_exp_p5, 5.0000001201E-1); v8sf exp256_ps(v8sf x) { v8sf tmp = _mm256_setzero_ps(), fx; v8si imm0; - v8sf one = *(v8sf *)_ps256_1; + v8sf one = *(v8sf *)_ps256_1; // NOLINT - x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi); - x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo); + x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi); // NOLINT + x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo); // NOLINT /* express exp(x) as exp(g + n*log(2)) */ - fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF); - fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5); + fx = _mm256_mul_ps(x, *(v8sf 
*)_ps256_cephes_LOG2EF); // NOLINT + fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5); // NOLINT /* how to perform a floorf with SSE: just below */ // imm0 = _mm256_cvttps_epi32(fx); @@ -283,24 +286,24 @@ v8sf exp256_ps(v8sf x) { mask = _mm256_and_ps(mask, one); fx = _mm256_sub_ps(tmp, mask); - tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1); - v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2); + tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1); // NOLINT + v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2); // NOLINT x = _mm256_sub_ps(x, tmp); x = _mm256_sub_ps(x, z); z = _mm256_mul_ps(x, x); - v8sf y = *(v8sf *)_ps256_cephes_exp_p0; + v8sf y = *(v8sf *)_ps256_cephes_exp_p0; // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5); // NOLINT y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, x); y = _mm256_add_ps(y, one); @@ -308,7 +311,7 @@ v8sf exp256_ps(v8sf x) { /* build 2^n */ imm0 = _mm256_cvttps_epi32(fx); // another two AVX2 instructions - imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f); + imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f); // NOLINT imm0 = avx2_mm256_slli_epi32(imm0, 23); v8sf pow2n = _mm256_castsi256_ps(imm0); y = _mm256_mul_ps(y, pow2n); @@ -349,12 +352,12 @@ v8sf sin256_ps(v8sf x) { // any x sign_bit = x; /* take the absolute value */ - x = 
_mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); // NOLINT /* extract the sign bit (upper one) */ - sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask); + sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask); // NOLINT /* scale by 4/Pi */ - y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); + y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); // NOLINT /* Here we start a series of integer operations, which are in the @@ -367,12 +370,12 @@ v8sf sin256_ps(v8sf x) { // any x imm2 = _mm256_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ // another two AVX2 instruction - imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); + imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); // NOLINT + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); // NOLINT y = _mm256_cvtepi32_ps(imm2); /* get the swap sign flag */ - imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); + imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); // NOLINT imm0 = avx2_mm256_slli_epi32(imm0, 29); /* get the polynom selection mask there is one polynom for 0 <= x <= Pi/4 @@ -380,31 +383,31 @@ v8sf sin256_ps(v8sf x) { // any x Both branches will be computed. 
*/ - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); - imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); // NOLINT + imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); // NOLINT #else /* we use SSE2 routines to perform the integer ops */ COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2); - imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); - imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); + imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); // NOLINT + imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); // NOLINT - imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); - imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); + imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); // NOLINT + imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); // NOLINT COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); y = _mm256_cvtepi32_ps(imm2); - imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4); - imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4); + imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4); // NOLINT + imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4); // NOLINT imm0_1 = _mm_slli_epi32(imm0_1, 29); imm0_2 = _mm_slli_epi32(imm0_2, 29); COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); - imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); - imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); + imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); // NOLINT + imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); // NOLINT imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); @@ -418,9 +421,9 @@ v8sf sin256_ps(v8sf x) { // any x /* The magic pass: "Extended precision modular arithmetic" x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; - xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; - xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; + xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; // NOLINT + xmm2 = *(v8sf 
*)_ps256_minus_cephes_DP2; // NOLINT + xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; // NOLINT xmm1 = _mm256_mul_ps(y, xmm1); xmm2 = _mm256_mul_ps(y, xmm2); xmm3 = _mm256_mul_ps(y, xmm3); @@ -429,26 +432,26 @@ v8sf sin256_ps(v8sf x) { // any x x = _mm256_add_ps(x, xmm3); /* Evaluate the first polynom (0 <= x <= Pi/4) */ - y = *(v8sf *)_ps256_coscof_p0; + y = *(v8sf *)_ps256_coscof_p0; // NOLINT v8sf z = _mm256_mul_ps(x, x); y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); // NOLINT y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); + y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); // NOLINT y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z); - v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); + v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); // NOLINT y = _mm256_sub_ps(y, tmp); - y = _mm256_add_ps(y, *(v8sf *)_ps256_1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_1); // NOLINT /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - v8sf y2 = *(v8sf *)_ps256_sincof_p0; + v8sf y2 = *(v8sf *)_ps256_sincof_p0; // NOLINT y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); + y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); // NOLINT y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); + y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); // NOLINT y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, x); y2 = _mm256_add_ps(y2, x); @@ -475,53 +478,53 @@ v8sf cos256_ps(v8sf x) { // any x #endif /* take the absolute value */ - x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); // NOLINT /* scale by 4/Pi */ - y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); + y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); // NOLINT #ifdef __AVX2__ /* store the integer part of y in mm0 */ imm2 = _mm256_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ - imm2 = 
avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); + imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); // NOLINT + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); // NOLINT y = _mm256_cvtepi32_ps(imm2); - imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2); + imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2); // NOLINT /* get the swap sign flag */ - imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4); + imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4); // NOLINT imm0 = avx2_mm256_slli_epi32(imm0, 29); /* get the polynom selection mask */ - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); - imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); // NOLINT + imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); // NOLINT #else /* we use SSE2 routines to perform the integer ops */ COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2); - imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); - imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); + imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); // NOLINT + imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); // NOLINT - imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); - imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); + imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); // NOLINT + imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); // NOLINT COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); y = _mm256_cvtepi32_ps(imm2); - imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2); - imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2); + imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2); // NOLINT + imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2); // NOLINT - imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4); - imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4); + imm0_1 = _mm_andnot_si128(imm2_1, *(v4si 
*)_pi32avx_4); // NOLINT + imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4); // NOLINT imm0_1 = _mm_slli_epi32(imm0_1, 29); imm0_2 = _mm_slli_epi32(imm0_2, 29); COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); - imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); - imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); + imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); // NOLINT + imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); // NOLINT imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); @@ -534,9 +537,9 @@ v8sf cos256_ps(v8sf x) { // any x /* The magic pass: "Extended precision modular arithmetic" x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; - xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; - xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; + xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; // NOLINT + xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; // NOLINT + xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; // NOLINT xmm1 = _mm256_mul_ps(y, xmm1); xmm2 = _mm256_mul_ps(y, xmm2); xmm3 = _mm256_mul_ps(y, xmm3); @@ -545,26 +548,26 @@ v8sf cos256_ps(v8sf x) { // any x x = _mm256_add_ps(x, xmm3); /* Evaluate the first polynom (0 <= x <= Pi/4) */ - y = *(v8sf *)_ps256_coscof_p0; + y = *(v8sf *)_ps256_coscof_p0; // NOLINT v8sf z = _mm256_mul_ps(x, x); y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); // NOLINT y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); + y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); // NOLINT y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z); - v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); + v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); // NOLINT y = _mm256_sub_ps(y, tmp); - y = _mm256_add_ps(y, *(v8sf *)_ps256_1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_1); // NOLINT /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - v8sf y2 = *(v8sf *)_ps256_sincof_p0; + v8sf y2 = 
*(v8sf *)_ps256_sincof_p0; // NOLINT y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); + y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); // NOLINT y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); + y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); // NOLINT y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, x); y2 = _mm256_add_ps(y2, x); @@ -595,42 +598,43 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { sign_bit_sin = x; /* take the absolute value */ - x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); // NOLINT /* extract the sign bit (upper one) */ - sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask); + sign_bit_sin = + _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask); // NOLINT /* scale by 4/Pi */ - y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); + y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); // NOLINT #ifdef __AVX2__ /* store the integer part of y in imm2 */ imm2 = _mm256_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ - imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); + imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); // NOLINT + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); // NOLINT y = _mm256_cvtepi32_ps(imm2); imm4 = imm2; /* get the swap sign flag for the sine */ - imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); + imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); // NOLINT imm0 = avx2_mm256_slli_epi32(imm0, 29); // v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0); /* get the polynom selection mask for the sine*/ - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); - imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); // NOLINT + imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); // NOLINT // v8sf 
poly_mask = _mm256_castsi256_ps(imm2); #else /* we use SSE2 routines to perform the integer ops */ COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2); - imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); - imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); + imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); // NOLINT + imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); // NOLINT - imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); - imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); + imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); // NOLINT + imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); // NOLINT COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); y = _mm256_cvtepi32_ps(imm2); @@ -638,16 +642,16 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { imm4_1 = imm2_1; imm4_2 = imm2_2; - imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4); - imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4); + imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4); // NOLINT + imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4); // NOLINT imm0_1 = _mm_slli_epi32(imm0_1, 29); imm0_2 = _mm_slli_epi32(imm0_2, 29); COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); - imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); - imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); + imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); // NOLINT + imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); // NOLINT imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); @@ -659,9 +663,9 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { /* The magic pass: "Extended precision modular arithmetic" x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; - xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; - xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; + xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; // NOLINT + xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; // NOLINT + xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; // NOLINT xmm1 = 
_mm256_mul_ps(y, xmm1); xmm2 = _mm256_mul_ps(y, xmm2); xmm3 = _mm256_mul_ps(y, xmm3); @@ -670,15 +674,15 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { x = _mm256_add_ps(x, xmm3); #ifdef __AVX2__ - imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2); - imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4); + imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2); // NOLINT + imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4); // NOLINT imm4 = avx2_mm256_slli_epi32(imm4, 29); #else - imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2); - imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2); + imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2); // NOLINT + imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2); // NOLINT - imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4); - imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4); + imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4); // NOLINT + imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4); // NOLINT imm4_1 = _mm_slli_epi32(imm4_1, 29); imm4_2 = _mm_slli_epi32(imm4_2, 29); @@ -692,25 +696,25 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { /* Evaluate the first polynom (0 <= x <= Pi/4) */ v8sf z = _mm256_mul_ps(x, x); - y = *(v8sf *)_ps256_coscof_p0; + y = *(v8sf *)_ps256_coscof_p0; // NOLINT y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); // NOLINT y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); + y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); // NOLINT y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z); - v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); + v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); // NOLINT y = _mm256_sub_ps(y, tmp); - y = _mm256_add_ps(y, *(v8sf *)_ps256_1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_1); // NOLINT /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - v8sf y2 = *(v8sf *)_ps256_sincof_p0; + v8sf y2 = *(v8sf *)_ps256_sincof_p0; // NOLINT y2 
= _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); + y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); // NOLINT y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); + y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); // NOLINT y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, x); y2 = _mm256_add_ps(y2, x); @@ -729,3 +733,6 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { *s = _mm256_xor_ps(xmm1, sign_bit_sin); *c = _mm256_xor_ps(xmm2, sign_bit_cos); } + +} // namespace lite +} // namespace paddle diff --git a/lite/core/kernel.h b/lite/core/kernel.h index 86193235a2984b15a33c2eeaff15865d9f126eeb..18a1243c11652afc181f13f0f5a497858a30885f 100644 --- a/lite/core/kernel.h +++ b/lite/core/kernel.h @@ -83,14 +83,11 @@ class KernelBase { #if defined(LITE_WITH_CUDA) WorkSpace::Global_CUDA().AllocReset(); #endif - #ifdef LITE_WITH_PROFILE - CHECK(profiler_) << "Profiler pointer of kernel can not be nullptr. " - "When LITE_WITH_PROFILE is defined, please set a " - "Profiler for Instruction."; - profiler_->StartTiming(profile_id_, ctx_.get()); + profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get()); + profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); Run(); - profiler_->StopTiming(profile_id_, ctx_.get()); + profiler_->StopTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); #else Run(); #endif diff --git a/lite/core/memory.h b/lite/core/memory.h index 18b9958911a6173c088b415369555235d63d184d..001db760a00596306e1004fbe062f497181b1a85 100644 --- a/lite/core/memory.h +++ b/lite/core/memory.h @@ -120,6 +120,7 @@ class Buffer { if (space_ > 0) { TargetFree(target_, data_); } + data_ = nullptr; target_ = TargetType::kHost; space_ = 0; } diff --git a/lite/core/profile/profiler.cc b/lite/core/profile/profiler.cc index 78317f78ac6bf7024c1984c2127434d55b738ad6..f4d0e3c0afbe1f9df4e381a502e1800a3d58ba68 100644 --- a/lite/core/profile/profiler.cc +++ b/lite/core/profile/profiler.cc @@ 
-28,36 +28,55 @@ auto op_comp = [](const OpCharacter& c1, const OpCharacter& c2) { }; } -int Profiler::NewTimer(const OpCharacter& ch) { - StatisUnit unit; - unit.character = ch; +std::map TypeStr{ + {Type::kUnk, "Unknown"}, + {Type::kCreate, "Create"}, + {Type::kDispatch, "Dispatch"}, +}; + +StatisUnit::StatisUnit(const OpCharacter& ch) : character(ch) { + create_t.reset(new DeviceTimer()); if (ch.target == TargetType::kCUDA) { #ifdef LITE_WITH_CUDA - unit.timer.reset(new DeviceTimer()); + dispatch_t.reset(new DeviceTimer()); #else LOG(ERROR) << "The timer type specified as cuda is uninitialized, so the " "default x86 timer is used instead."; #endif } else { - unit.timer.reset(new DeviceTimer()); + dispatch_t.reset(new DeviceTimer()); } +} + +lite::profile::Timer* StatisUnit::Timer(Type type) { + if (type == Type::kCreate) { + return create_t.get(); + } else if (type == Type::kDispatch) { + return dispatch_t.get(); + } + LOG(FATAL) << "Timer cannot be returned for unknown platforms."; + return nullptr; +} + +int Profiler::NewTimer(const OpCharacter& ch) { + StatisUnit unit(ch); units_.push_back(std::move(unit)); return units_.size() - 1; } -void Profiler::StartTiming(const int index, KernelContext* ctx) { +void Profiler::StartTiming(Type type, const int index, KernelContext* ctx) { CHECK_LT(index, units_.size()) << "The timer index in the profiler is out of range."; - units_[index].timer->Start(ctx); + units_[index].Timer(type)->Start(ctx); } -float Profiler::StopTiming(const int index, KernelContext* ctx) { +float Profiler::StopTiming(Type type, const int index, KernelContext* ctx) { CHECK_LT(index, units_.size()) << "The timer index in the profiler is out of range."; - return units_[index].timer->Stop(ctx); + return units_[index].Timer(type)->Stop(ctx); } -std::string Profiler::Summary(bool concise, size_t w) { +std::string Profiler::Summary(Type type, bool concise, size_t w) { using std::setw; using std::left; using std::fixed; @@ -65,12 +84,14 @@ std::string 
Profiler::Summary(bool concise, size_t w) { std::string title; // Title. if (concise) { - ss << "Timing cycle = " << units_.front().timer->LapTimes().Size() + ss << "Timing cycle = " << units_.front().Timer(type)->LapTimes().Size() << std::endl; - ss << "===== Concise Profiler Summary: " << name_ << ", Exclude " << w + ss << "===== Concise " << TypeStr.find(type)->second + << " Profiler Summary: " << name_ << ", Exclude " << w << " warm-ups =====" << std::endl; } else { - ss << "===== Detailed Profiler Summary: " << name_ << ", Exclude " << w + ss << "===== Detailed " << TypeStr.find(type)->second + << " Profiler Summary: " << name_ << ", Exclude " << w << " warm-ups =====" << std::endl; } ss << setw(25) << left << "Operator Type" @@ -84,16 +105,16 @@ std::string Profiler::Summary(bool concise, size_t w) { if (concise) { std::map summary(op_comp); for (auto& unit : units_) { - auto ch = summary.find(unit.character); + auto ch = summary.find(unit.Character()); if (ch != summary.end()) { - ch->second.avg += unit.timer->LapTimes().Avg(w); - ch->second.min += unit.timer->LapTimes().Min(w); - ch->second.max += unit.timer->LapTimes().Max(w); + ch->second.avg += unit.Timer(type)->LapTimes().Avg(w); + ch->second.min += unit.Timer(type)->LapTimes().Min(w); + ch->second.max += unit.Timer(type)->LapTimes().Max(w); } else { - TimeInfo info({unit.timer->LapTimes().Avg(w), - unit.timer->LapTimes().Min(w), - unit.timer->LapTimes().Max(w)}); - summary.insert({unit.character, info}); + TimeInfo info({unit.Timer(type)->LapTimes().Avg(w), + unit.Timer(type)->LapTimes().Min(w), + unit.Timer(type)->LapTimes().Max(w)}); + summary.insert({unit.Character(), info}); } } for (const auto& item : summary) { @@ -109,14 +130,15 @@ std::string Profiler::Summary(bool concise, size_t w) { } } else { for (auto& unit : units_) { + const auto& times = unit.Timer(type)->LapTimes(); // clang-format off - ss << setw(25) << left << fixed << unit.character.op_type \ - << " " << setw(40) << left << fixed 
<< unit.character.kernel_name \ - << " " << setw(12) << left << fixed << unit.character.remark \ - << " " << setw(12) << left << fixed << unit.timer->LapTimes().Avg(w) \ - << " " << setw(12) << left << fixed << unit.timer->LapTimes().Min(w) \ - << " " << setw(12) << left << fixed << unit.timer->LapTimes().Max(w) \ - << " " << setw(12) << left << fixed << unit.timer->LapTimes().Last(w) \ + ss << setw(25) << left << fixed << unit.Character().op_type \ + << " " << setw(40) << left << fixed << unit.Character().kernel_name \ + << " " << setw(12) << left << fixed << unit.Character().remark \ + << " " << setw(12) << left << fixed << times.Avg(w) \ + << " " << setw(12) << left << fixed << times.Min(w) \ + << " " << setw(12) << left << fixed << times.Max(w) \ + << " " << setw(12) << left << fixed << times.Last(w) \ << std::endl; // clang-format on } diff --git a/lite/core/profile/profiler.h b/lite/core/profile/profiler.h index 4e9e9ae31c1a6d7f331eac2e77c4971986bd42a1..3933e5ba01ebcb20420494a955cbc0e202879f76 100644 --- a/lite/core/profile/profiler.h +++ b/lite/core/profile/profiler.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once +#include #include #include #include @@ -22,6 +23,14 @@ namespace paddle { namespace lite { namespace profile { +enum class Type { + kUnk = 0, + kCreate, + kDispatch, +}; + +extern std::map TypeStr; + struct TimeInfo { float avg; float min; @@ -35,8 +44,15 @@ struct OpCharacter { std::string remark{std::string("N/A")}; }; -struct StatisUnit { - std::unique_ptr timer; +class StatisUnit final { + public: + explicit StatisUnit(const OpCharacter& ch); + lite::profile::Timer* Timer(Type type); + const OpCharacter& Character() const { return character; } + + protected: + std::unique_ptr create_t; + std::unique_ptr dispatch_t; OpCharacter character; }; @@ -45,9 +61,9 @@ class Profiler final { Profiler() = default; explicit Profiler(const std::string& name) : name_(name) {} int NewTimer(const OpCharacter& ch); - void StartTiming(const int index, KernelContext* ctx); - float StopTiming(const int index, KernelContext* ctx); - std::string Summary(bool concise = true, size_t warm_up = 10); + void StartTiming(Type type, const int index, KernelContext* ctx); + float StopTiming(Type type, const int index, KernelContext* ctx); + std::string Summary(Type type, bool concise = true, size_t warm_up = 10); private: std::string name_{std::string("N/A")}; diff --git a/lite/core/profile/test_timer.cc b/lite/core/profile/test_timer.cc index 6f49698ef4a8f83e4192a16801566fdcbd7baf9a..3841f0151890d377a87f4f5d4b6d069ee75b560e 100644 --- a/lite/core/profile/test_timer.cc +++ b/lite/core/profile/test_timer.cc @@ -69,10 +69,10 @@ TEST(profiler, real_latency) { ch.op_type = "operator/1"; ch.kernel_name = "kernel/1"; int idx = profiler.NewTimer(ch); - profiler.StartTiming(idx, &ctx); + profiler.StartTiming(Type::kDispatch, idx, &ctx); std::this_thread::sleep_for(std::chrono::milliseconds(10)); - profiler.StopTiming(idx, &ctx); - std::cout << profiler.Summary(); + profiler.StopTiming(Type::kDispatch, idx, &ctx); + std::cout << profiler.Summary(Type::kDispatch); } #endif diff --git 
a/lite/core/program.cc b/lite/core/program.cc index 8dc8fb0dddc54d7d83b2368b31b5f30725469296..a26e97886f6b7cc6bd894093ba744613acf31419 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -147,7 +147,7 @@ void RuntimeProgram::Run() { #endif // LITE_WITH_PROFILE } #ifdef LITE_WITH_PROFILE - LOG(INFO) << "\n" << profiler_.Summary(false, 0); + LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0); #endif // LITE_WITH_PROFILE } @@ -252,8 +252,16 @@ void Program::PrepareWorkspace(const cpp::ProgramDesc& prog) { } void Instruction::Run() { +#ifdef LITE_WITH_PROFILE + CHECK(profiler_) << "Profiler pointer of kernel can not be nullptr. " + "When LITE_WITH_PROFILE is defined, please set a " + "Profiler for Instruction."; + profiler_->StartTiming( + profile::Type::kCreate, profile_id_, kernel_->mutable_context()); +#endif CHECK(op_) << "op null"; CHECK(kernel_) << "kernel null"; + if (first_epoch_) { first_epoch_ = false; CHECK(op_->CheckShape()); @@ -263,10 +271,7 @@ void Instruction::Run() { return; } - // VLOG(4) << "kernel launch"; op_->InferShape(); - // VLOG(4) << ">> Running kernel: " << op_->op_info()->Repr() << " on Target " - // << TargetToStr(kernel_->target()); kernel_->Launch(); has_run_ = true; } diff --git a/lite/core/program.h b/lite/core/program.h index 291252619b396f18576b935a0189f4ecdba7867f..e3f6642d3a41add4af531f64c6ee697c755274e6 100644 --- a/lite/core/program.h +++ b/lite/core/program.h @@ -143,7 +143,8 @@ class LITE_API RuntimeProgram { } ~RuntimeProgram() { #ifdef LITE_WITH_PROFILE - LOG(INFO) << "\n" << profiler_.Summary(); + LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kCreate); + LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch); #endif // LITE_WITH_PROFILE } diff --git a/lite/core/tensor.h b/lite/core/tensor.h index 0c454a696a215cf6b923c061cb6367728502db87..b7dd57d9b50bb333aad60eadd5fb54078c39fce9 100644 --- a/lite/core/tensor.h +++ b/lite/core/tensor.h @@ -233,6 +233,10 @@ class TensorLite 
{ (static_cast(buffer_->data()) + offset_)); } + void clear() { + buffer_->Free(); + offset_ = 0; + } size_t data_size() const { return this->dims().production(); } size_t memory_size() const { return memory_size_; } diff --git a/lite/kernels/arm/conditional_block_compute.cc b/lite/kernels/arm/conditional_block_compute.cc index 225709b793d4718545a4077ba469c484fc8b36a3..f0bd43e1300d4034241c03d3e4ce27dcaa59c1e5 100644 --- a/lite/kernels/arm/conditional_block_compute.cc +++ b/lite/kernels/arm/conditional_block_compute.cc @@ -34,6 +34,9 @@ void ConditionalBlockCompute::PrepareForRun() { } void ConditionalBlockCompute::Run() { auto& param = Param(); + for (auto& out : param.outs) { + out->clear(); + } bool need_run = true; if (param.is_scalar_condition) { auto* cond = param.cond; diff --git a/lite/kernels/arm/split_lod_tensor_compute.cc b/lite/kernels/arm/split_lod_tensor_compute.cc index 16603bc5fd5965e525122b76801281b0f48ccae7..8bb5e4ae6b182a8f02e5d72ca763ca0fb0d4122f 100644 --- a/lite/kernels/arm/split_lod_tensor_compute.cc +++ b/lite/kernels/arm/split_lod_tensor_compute.cc @@ -82,6 +82,10 @@ void SplitLodTensorCompute::Run() { ranges.begin(), ranges.end(), 0UL, [](size_t a, const CopyRange &b) { return a + b.end - b.begin; }); + if (height == 0) { + out->clear(); + continue; + } auto x_dim = x->dims(); x_dim[0] = static_cast(height); out->Resize(x_dim); diff --git a/lite/kernels/arm/unsqueeze_compute.cc b/lite/kernels/arm/unsqueeze_compute.cc index e623407c2e718a51b51e880a4d81df4ee0d96f87..91c8c0423b6fcc5bade5751985f190b3395b0779 100644 --- a/lite/kernels/arm/unsqueeze_compute.cc +++ b/lite/kernels/arm/unsqueeze_compute.cc @@ -54,12 +54,12 @@ REGISTER_LITE_KERNEL(unsqueeze, kNCHW, paddle::lite::kernels::host::UnsqueezeCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .BindInput("AxesTensor", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) 
.BindInput("AxesTensorList", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .Finalize(); REGISTER_LITE_KERNEL(unsqueeze2, @@ -68,11 +68,11 @@ REGISTER_LITE_KERNEL(unsqueeze2, kNCHW, paddle::lite::kernels::host::Unsqueeze2Compute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .BindInput("AxesTensor", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindInput("AxesTensorList", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); diff --git a/lite/kernels/arm/yolo_box_compute.cc b/lite/kernels/arm/yolo_box_compute.cc index 1336e5e1e0a6438a08f542d299eddc30d15dad15..ad8a630b8c0064af7358674d1b7424eff25a194a 100644 --- a/lite/kernels/arm/yolo_box_compute.cc +++ b/lite/kernels/arm/yolo_box_compute.cc @@ -54,7 +54,8 @@ REGISTER_LITE_KERNEL(yolo_box, paddle::lite::kernels::arm::YoloBoxCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("ImgSize", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("ImgSize", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Scores", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); diff --git a/lite/kernels/cuda/softmax_compute.cu b/lite/kernels/cuda/softmax_compute.cu index 157c6ae889d322197d4286d6adbca21ade4ad792..431bd6eb561f6213b4609b39e8c3f638fed8261a 100644 --- a/lite/kernels/cuda/softmax_compute.cu +++ b/lite/kernels/cuda/softmax_compute.cu @@ -156,8 +156,8 @@ void SoftmaxCompute::PrepareForRun() { cudaGetDevice(&device_id); cudaDeviceProp deviceProp; 
cudaGetDeviceProperties(&deviceProp, device_id); - sharedmem_size = deviceProp.sharedMemPerBlock; - max_dimsize = sharedmem_size / sizeof(float) / CUDA_NUM_THREADS; + sharedmem_size_ = deviceProp.sharedMemPerBlock; + max_dimsize_ = sharedmem_size_ / sizeof(float) / CUDA_NUM_THREADS; } void SoftmaxCompute::Run() { @@ -174,29 +174,27 @@ void SoftmaxCompute::Run() { int outer_num = x_dims.Slice(0, axis).production(); int inner_num = x_dims.Slice(axis + 1, x_rank).production(); int total_threads = inner_num * outer_num; - int axis_size = x_dims[axis]; + axis_size_ = x_dims[axis]; const int threads = CUDA_NUM_THREADS; const int blocks = (total_threads + threads - 1) / threads; auto input_data = param.x->data(); auto output_data = param.output->mutable_data(TARGET(kCUDA)); - if (axis_size <= max_dimsize) { - int use_sharemem_size = axis_size * threads * sizeof(float); + if (axis_size_ <= max_dimsize_) { + int use_sharemem_size = axis_size_ * threads * sizeof(float); sharemem_softmax_kernel<<>>( total_threads, input_data, output_data, inner_num, outer_num, - axis_size); + axis_size_); } else { //! re_alloc device memory - Tensor tmax_data; - Tensor tsum_data; - tmax_data.Resize({1, 1, 1, outer_num * inner_num}); - tsum_data.Resize({1, 1, 1, outer_num * inner_num}); - auto max_data = tmax_data.mutable_data(TARGET(kCUDA)); - auto sum_data = tsum_data.mutable_data(TARGET(kCUDA)); + tmax_data_.Resize({1, 1, 1, outer_num * inner_num}); + tsum_data_.Resize({1, 1, 1, outer_num * inner_num}); + auto max_data = tmax_data_.mutable_data(TARGET(kCUDA)); + auto sum_data = tsum_data_.mutable_data(TARGET(kCUDA)); //! firstly, get maximum data float min_data = std::numeric_limits::lowest(); softmax_max_kernel<<>>(total_threads, @@ -205,7 +203,7 @@ void SoftmaxCompute::Run() { min_data, inner_num, outer_num, - axis_size); + axis_size_); //! 
then, compute exp and sum data softmax_sub_exp_sum_kernel<<>>( total_threads, @@ -215,10 +213,10 @@ void SoftmaxCompute::Run() { sum_data, inner_num, outer_num, - axis_size); + axis_size_); //! last, compute divided output softmax_divid_output_kernel<<>>( - total_threads, output_data, sum_data, inner_num, outer_num, axis_size); + total_threads, output_data, sum_data, inner_num, outer_num, axis_size_); } cudaError_t error = cudaGetLastError(); if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); diff --git a/lite/kernels/cuda/softmax_compute.h b/lite/kernels/cuda/softmax_compute.h index 72d43a8eff0e55efe4b08ca8d4a665b35b22e5b9..e563b36178fa0824d77de9942c1ec1a0f0fbd94f 100644 --- a/lite/kernels/cuda/softmax_compute.h +++ b/lite/kernels/cuda/softmax_compute.h @@ -30,9 +30,11 @@ class SoftmaxCompute virtual ~SoftmaxCompute() = default; private: - size_t sharedmem_size; - int num_threads; - int max_dimsize; + lite::Tensor tmax_data_; + lite::Tensor tsum_data_; + size_t sharedmem_size_; + int max_dimsize_; + int axis_size_; }; } // namespace cuda diff --git a/lite/kernels/npu/bridges/engine.h b/lite/kernels/npu/bridges/engine.h index db39063417d7023d697639236043a66c442ca8fa..61a4e12cf3ad6e3eab608a585f165fde9dec081d 100644 --- a/lite/kernels/npu/bridges/engine.h +++ b/lite/kernels/npu/bridges/engine.h @@ -28,12 +28,14 @@ namespace subgraph { class Engine { public: - Engine(int block_idx, + Engine(KernelContext *ctx, + int block_idx, cpp::BlockDesc *block_desc, const std::vector &input_names, const std::vector &output_names, lite::Scope *scope) - : block_idx_(block_idx), + : ctx_(ctx), + block_idx_(block_idx), block_desc_(block_desc), input_names_(input_names), output_names_(output_names), @@ -55,6 +57,7 @@ class Engine { virtual bool InputShapeChanged(); + KernelContext *ctx_{nullptr}; int block_idx_; cpp::BlockDesc *block_desc_; std::vector input_names_; diff --git a/lite/kernels/npu/subgraph_compute.cc b/lite/kernels/npu/subgraph_compute.cc index 
c6cbea46fafc5bd6e3d7431be23fbea8bf1c93fa..d9b191950668660ae2b76b70ac2b5c12aece92c0 100644 --- a/lite/kernels/npu/subgraph_compute.cc +++ b/lite/kernels/npu/subgraph_compute.cc @@ -207,7 +207,8 @@ int SubgraphEngine::LaunchDeviceProgram() { void SubgraphCompute::PrepareForRun() { auto& param = this->Param(); - engine_.reset(new SubgraphEngine(param.sub_block_idx, + engine_.reset(new SubgraphEngine(ctx_.get(), + param.sub_block_idx, param.sub_block_desc, param.input_data_names, param.output_data_names, diff --git a/lite/kernels/npu/subgraph_compute.h b/lite/kernels/npu/subgraph_compute.h index dd0bf82bc9e743287d2ad4cb81db9a5fdd57c276..27b4a36cfeadf6cca328fb9c980d53c9c5e79095 100644 --- a/lite/kernels/npu/subgraph_compute.h +++ b/lite/kernels/npu/subgraph_compute.h @@ -29,13 +29,14 @@ namespace npu { class SubgraphEngine : public subgraph::Engine { public: - SubgraphEngine(int block_idx, + SubgraphEngine(KernelContext *ctx, + int block_idx, cpp::BlockDesc *block_desc, const std::vector &input_names, const std::vector &output_names, Scope *scope) : subgraph::Engine( - block_idx, block_desc, input_names, output_names, scope) {} + ctx, block_idx, block_desc, input_names, output_names, scope) {} protected: int BuildDeviceProgram() override; diff --git a/lite/kernels/x86/gru_compute.cc b/lite/kernels/x86/gru_compute.cc index d8e70833aaa9b4e2914c13f3ae40c84a5083c909..23842957fa7bfc6b0710a5bd9b8644d888a7e7b4 100644 --- a/lite/kernels/x86/gru_compute.cc +++ b/lite/kernels/x86/gru_compute.cc @@ -13,10 +13,13 @@ // limitations under the License. 
#include "lite/kernels/x86/gru_compute.h" +#include "lite/utils/env.h" -DEFINE_int32(paddle_num_threads, - 1, - "Number of threads for each paddle instance."); +// DEFINE_int32(paddle_num_threads, +// 1, +// "Number of threads for each paddle instance."); +int32_t paddle_num_threads = + paddle::lite::GetIntFromEnv("paddle_num_threads", 1); REGISTER_LITE_KERNEL(gru, kX86, diff --git a/lite/kernels/x86/gru_compute.h b/lite/kernels/x86/gru_compute.h index e3c6f70fdbe3d0e0ff025c7b41528b50ff06fca3..948485105a763aeefbbd7a77b91a7eefdeb17b57 100644 --- a/lite/kernels/x86/gru_compute.h +++ b/lite/kernels/x86/gru_compute.h @@ -26,7 +26,8 @@ #include "lite/core/types.h" #include "lite/fluid/eigen.h" -DECLARE_int32(paddle_num_threads); +// DECLARE_int32(paddle_num_threads); +extern int32_t paddle_num_threads; namespace paddle { namespace lite { @@ -109,7 +110,7 @@ class GRUCompute : public KernelLite { #ifdef PADDLE_WITH_MKLML // use MKL packed to speedup GEMM - if (FLAGS_paddle_num_threads >= 4) { + if (paddle_num_threads >= 4) { auto blas = lite::x86::math::GetBlas(context); T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/, diff --git a/lite/kernels/xpu/bridges/matmul_op.cc b/lite/kernels/xpu/bridges/matmul_op.cc index eaf2370ada95e77f25c1b75fa09e19a669c15b93..330b336840148fa54d5c9f2eae39a08fdfad9557 100644 --- a/lite/kernels/xpu/bridges/matmul_op.cc +++ b/lite/kernels/xpu/bridges/matmul_op.cc @@ -49,9 +49,10 @@ int MatmulConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto out_type = kernel->GetOutputDeclType("Out"); CHECK(out_type->precision() == PRECISION(kFloat)); CHECK(out_type->layout() == DATALAYOUT(kNCHW)); + auto out = scope->FindMutableTensor(out_name); + auto out_dims = out->dims(); auto transpose_x = op_info->GetAttr("transpose_X"); - CHECK(!transpose_x) << "XPU only support transpose_x == true now"; auto transpose_y = op_info->GetAttr("transpose_Y"); auto alpha = op_info->GetAttr("alpha"); @@ -71,11 +72,68 @@ int MatmulConverter(void* ctx, 
OpLite* op, KernelBase* kernel) { y_node = graph->AddNode(y_name, y_dims); } - auto matmul_node = - graph->builder_.CreateMatmul2D(*x_node, *y_node, transpose_y); - graph->AddNode(out_name, graph->builder_.CreateScale(matmul_node, alpha)); - - return SUCCESS; + // Matmul node + if (x_dims.size() > 2 && y_dims.size() >= 2) { + // x: [B, ..., M, K], y: [B, ..., K, N], out: [B, ..., M, N] + // x: [B, M, K], y: [K, N], out: [B, M, N] + // Reshape and transposed X node + if (x_dims.size() != 3) { + auto m = static_cast(x_dims[x_dims.size() - 2]); + auto k = static_cast(x_dims[x_dims.size() - 1]); + x_node = + graph->AddNode(x_name + "/reshape", + graph->builder_.CreateReshape(*x_node, {-1, m, k})); + if (transpose_x) { + x_node = + graph->AddNode(x_name + "/reshape/transpose", + graph->builder_.CreateTranspose(*x_node, {0, 2, 1})); + } + } + // Reshape and transposed Y node + if (y_dims.size() != 3) { + auto k = static_cast(y_dims[y_dims.size() - 2]); + auto n = static_cast(y_dims[y_dims.size() - 1]); + y_node = + graph->AddNode(y_name + "/reshape", + graph->builder_.CreateReshape(*y_node, {-1, k, n})); + if (!transpose_y) { + y_node = + graph->AddNode(y_name + "/reshape/transpose", + graph->builder_.CreateTranspose(*y_node, {0, 2, 1})); + } + } + // Matmul node + auto matmul_node = graph->AddNode( + out_name, graph->builder_.CreateBatchMatmul(*x_node, *y_node)); + if (fabs(alpha - 1) > 1e-6f) { + matmul_node = graph->AddNode( + out_name, graph->builder_.CreateScale(*matmul_node, alpha)); + } + if (out_dims.size() != 3) { + graph->AddNode(out_name, + graph->builder_.CreateReshape( + *matmul_node, CvtShape(out_dims))); + } + } else if (x_dims.size() == 2 && y_dims.size() == 2) { + // x: [M, K], y: [K, N], out: [M, N] + if (transpose_x) { + x_node = graph->AddNode(x_name + "/transpose", + graph->builder_.CreateTranspose(*x_node, {1, 0})); + } + auto matmul_node = graph->AddNode( + out_name, + graph->builder_.CreateMatmul2D(*x_node, *y_node, transpose_y)); + if (fabs(alpha 
- 1) > 1e-6f) { + matmul_node = graph->AddNode( + out_name, graph->builder_.CreateScale(*matmul_node, alpha)); + } + } else if (x_dims.size() == 1 && y_dims.size() == 1) { + // x: [K], y: [K], out: [1] + // x: [M], y: [N], x_transpose: true, y_transpose: true, out: [M, N] + LOG(FATAL) << "[XPU] Not supported."; + return FAILED; + } + return REBUILD_WHEN_SHAPE_CHANGED; } } // namespace xpu diff --git a/lite/kernels/xpu/bridges/mul_op.cc b/lite/kernels/xpu/bridges/mul_op.cc index 9d2684ac409bd4bdc482c68c3a43e9146d94d35f..40780557457e3ed9b99e1cec2b5bdead7f2564dd 100644 --- a/lite/kernels/xpu/bridges/mul_op.cc +++ b/lite/kernels/xpu/bridges/mul_op.cc @@ -67,15 +67,27 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { x_node = graph->AddNode(x_name + "/reshape", graph->builder_.CreateReshape( - *x_node, {-1, static_cast(y_matrix_dims[0])})); + *x_node, {-1, static_cast(x_matrix_dims[1])})); } // Y node - auto y_const_node = graph->AddNode(y_name, *y, y_matrix_dims); + std::shared_ptr y_node = nullptr; + if (graph->HasNode(y_name)) { + y_node = graph->GetNode(y_name); + } else { + y_node = graph->AddNode(y_name, y_dims); + } + // Flatten Y node + if (y_dims.size() != 2) { + y_node = + graph->AddNode(y_name + "/reshape", + graph->builder_.CreateReshape( + *y_node, {static_cast(y_matrix_dims[0]), -1})); + } // Reshape the matmul node with the inferred shape as the output node auto matmul_node = graph->AddNode( - out_name, graph->builder_.CreateMatmul2D(*x_node, *y_const_node, false)); + out_name, graph->builder_.CreateMatmul2D(*x_node, *y_node, false)); if (out_dims.size() != 2) { graph->AddNode(out_name, graph->builder_.CreateReshape( diff --git a/lite/kernels/xpu/subgraph_compute.cc b/lite/kernels/xpu/subgraph_compute.cc index 0a7a4d2aa5431d04c19f531ca118ac422417cbba..07a74b045477bcdff0d60913f20e79ff8497705b 100644 --- a/lite/kernels/xpu/subgraph_compute.cc +++ b/lite/kernels/xpu/subgraph_compute.cc @@ -197,7 +197,8 @@ int 
SubgraphEngine::LaunchDeviceProgram() { void SubgraphCompute::PrepareForRun() { auto& param = this->Param(); - engine_.reset(new SubgraphEngine(param.sub_block_idx, + engine_.reset(new SubgraphEngine(ctx_.get(), + param.sub_block_idx, param.sub_block_desc, param.input_data_names, param.output_data_names, diff --git a/lite/kernels/xpu/subgraph_compute.h b/lite/kernels/xpu/subgraph_compute.h index 2196eb3621d1acb6fb6c76426118d150a8228214..c21a1b7b054fd642f330ee95bff972f581e65c6b 100644 --- a/lite/kernels/xpu/subgraph_compute.h +++ b/lite/kernels/xpu/subgraph_compute.h @@ -29,13 +29,14 @@ namespace xpu { class SubgraphEngine : public subgraph::Engine { public: - SubgraphEngine(int block_idx, + SubgraphEngine(KernelContext *ctx, + int block_idx, cpp::BlockDesc *block_desc, const std::vector &input_names, const std::vector &output_names, Scope *scope) : subgraph::Engine( - block_idx, block_desc, input_names, output_names, scope) {} + ctx, block_idx, block_desc, input_names, output_names, scope) {} protected: int BuildDeviceProgram() override; diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt index 190cf7194c19a47f377755a9e9b61d890bc1a262..f307cb66acb5b34fea63a42646fc00ca957264bb 100644 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -50,6 +50,7 @@ add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS}) add_operator(instance_norm_op basic SRCS instance_norm_op.cc DEPS ${op_DEPS}) add_operator(subgraph_op basic SRCS subgraph_op.cc DEPS ${op_DEPS}) add_operator(grid_sampler_op basic SRCS grid_sampler_op.cc DEPS ${op_DEPS}) +add_operator(flatten_op basic SRCS flatten_op.cc DEPS ${op_DEPS}) # 2.basic ops not used in basic models add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS}) @@ -78,11 +79,9 @@ add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEP add_operator(generate_proposals_op extra SRCS generate_proposals_op.cc DEPS ${op_DEPS}) add_operator(roi_align_op extra SRCS 
roi_align_op.cc DEPS ${op_DEPS}) add_operator(box_clip_op extra SRCS box_clip_op.cc DEPS ${op_DEPS}) -add_operator(flatten_op extra SRCS flatten_op.cc DEPS ${op_DEPS}) add_operator(fake_quantize_range_abs_max_op extra SRCS fake_quantize_range_abs_max.cc DEPS ${op_DEPS}) add_operator(sequence_expand_as_op_lite extra SRCS sequence_expand_as_op.cc DEPS ${op_DEPS}) add_operator(assign_value_op extra SRCS assign_value_op.cc DEPS ${op_DEPS}) - add_operator(fake_quantize_dequantize_moving_avg_abs_max_op extra SRCS fake_quantize_dequantize_moving_avg_max_abs.cc DEPS ${op_DEPS}) add_operator(fake_channel_wise_dequantize_max_abs_op extra SRCS fake_channel_wise_dequantize_max_abs.cc DEPS ${op_DEPS}) add_operator(split_lod_tensor_op_lite extra SRCS split_lod_tensor_op.cc DEPS ${op_DEPS}) diff --git a/lite/operators/attention_padding_mask_op.cc b/lite/operators/attention_padding_mask_op.cc index a88df0e7a902c6cac63eb77377bb0b49ee30c9b3..6b8ee3fed102ac5b6f32bee2bc5a123ea2a167e2 100644 --- a/lite/operators/attention_padding_mask_op.cc +++ b/lite/operators/attention_padding_mask_op.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "lite/operators/attention_padding_mask_op.h" +#include #include "lite/core/op_registry.h" #include "lite/core/scope.h" @@ -39,7 +40,8 @@ bool AttentionPaddingMaskOp::InferShape() const { << "Mismatch batch size, bottom0: " << att_batch << ", bottom1: " << src_batch; - param_.pad_begin->Resize({static_cast(src_batch)}); + param_.pad_begin->Resize( + std::vector({static_cast(src_batch)})); param_.Out->Resize(param_.X->dims()); param_.Out->set_lod(param_.X->lod()); diff --git a/lite/operators/instance_norm_op.cc b/lite/operators/instance_norm_op.cc index 510402ba1fb363f383b3cba8eb322a4ff7975c18..261b647721a1647664b74bc066e3d8b49185625d 100644 --- a/lite/operators/instance_norm_op.cc +++ b/lite/operators/instance_norm_op.cc @@ -46,8 +46,9 @@ bool InstanceNormOp::InferShape() const { auto x_dims = param_.x->dims(); int64_t batch_size = x_dims[0]; int64_t channel_size = x_dims[1]; - param_.saved_mean->Resize({batch_size * channel_size}); - param_.saved_variance->Resize({batch_size * channel_size}); + param_.saved_mean->Resize(std::vector({batch_size * channel_size})); + param_.saved_variance->Resize( + std::vector({batch_size * channel_size})); param_.out->Resize(x_dims); return true; } diff --git a/lite/operators/reduce_prod_op.cc b/lite/operators/reduce_prod_op.cc index 90da13c8643fa030c376ca25cb3a67b70f3485a4..d82dedc8ea5767f9d86bc3df5e5fa50c071696f0 100644 --- a/lite/operators/reduce_prod_op.cc +++ b/lite/operators/reduce_prod_op.cc @@ -50,7 +50,7 @@ bool ReduceProdOpLite::InferShape() const { if (keep_dim) { out->Resize({static_cast(x_rank), 1}); } else { - out->Resize({1}); + out->Resize(std::vector({1L})); } } else { auto dims_vector = x_dims.Vectorize(); diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index 7cd7f5363c9e6d2a58b0b9f7b786feef904a41a9..d2794f6c847727e5d539cdebde3bd769189b73bc 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -30,6 +30,7 @@ if((NOT LITE_WITH_OPENCL AND 
NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) if(LITE_BUILD_EXTRA) lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
diff --git a/lite/tests/kernels/mul_compute_test.cc b/lite/tests/kernels/mul_compute_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e70f443985536cb6493558cc6e9aee4584d969f5
--- /dev/null
+++ b/lite/tests/kernels/mul_compute_test.cc
@@ -0,0 +1,160 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include
+#include
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/core/arena/framework.h"
+#include "lite/tests/utils/fill_data.h"
+
+namespace paddle {
+namespace lite {
+
+// Arena test case for the "mul" op. RunBaseline() computes the reference
+// result on the host by flattening X and Y to 2-D (at x_num_col_dims_ and
+// y_num_col_dims_) and multiplying them as [M, K] x [K, N].
+class MulComputeTester : public arena::TestCase {
+ protected:
+  // common attributes for this op.
+  std::string type_ = "mul";
+  std::string x_ = "x";
+  std::string y_ = "y";
+  std::string out_ = "out";
+  DDim x_dims_{{1, 2}};
+  DDim y_dims_{{2, 1}};
+  int x_num_col_dims_{1};
+  int y_num_col_dims_{1};
+
+ public:
+  MulComputeTester(const Place& place,
+                   const std::string& alias,
+                   DDim x_dims,
+                   DDim y_dims,
+                   int x_num_col_dims,
+                   int y_num_col_dims)
+      : TestCase(place, alias),
+        x_dims_(x_dims),
+        y_dims_(y_dims),
+        x_num_col_dims_(x_num_col_dims),
+        y_num_col_dims_(y_num_col_dims) {}
+
+  // Writes the expected output into a new tensor named out_ in `scope`.
+  void RunBaseline(Scope* scope) override {
+    auto* x = scope->FindTensor(x_);
+    auto* y = scope->FindTensor(y_);
+    auto x_mat_dims = x_dims_.Flatten2D(x_num_col_dims_);
+    auto y_mat_dims = y_dims_.Flatten2D(y_num_col_dims_);
+    CHECK_EQ(x_mat_dims[1], y_mat_dims[0]);
+
+    auto* out = scope->NewTensor(out_);
+    CHECK(out);
+    // Output shape: the first x_num_col_dims_ dims of X followed by the dims
+    // of Y from index y_num_col_dims_ onwards.
+    std::vector out_shape;
+    for (int i = 0; i < x_num_col_dims_; i++) {
+      out_shape.push_back(x_dims_[i]);
+    }
+    for (int i = y_num_col_dims_; i < y_dims_.size(); i++) {
+      out_shape.push_back(y_dims_[i]);
+    }
+    out->Resize(DDim(out_shape));
+
+    auto x_data = x->data();
+    auto y_data = y->data();
+    auto* out_data = out->mutable_data();
+
+    // Naive row-major GEMM: out[m, n] = sum over k of x[m, k] * y[k, n].
+    const int M = x_mat_dims[0];
+    const int K = x_mat_dims[1];
+    const int N = y_mat_dims[1];
+    for (int m = 0; m < M; ++m) {
+      for (int n = 0; n < N; ++n) {
+        out_data[m * N + n] = 0;
+        for (int k = 0; k < K; ++k) {
+          out_data[m * N + n] += x_data[m * K + k] * y_data[k * N + n];
+        }
+      }
+    }
+  }
+
+  // Fills op_desc with the op type, the X/Y/Out variable names and the two
+  // *_num_col_dims attributes.
+  void PrepareOpDesc(cpp::OpDesc* op_desc) {
+    op_desc->SetType(type_);
+    op_desc->SetInput("X", {x_});
+    op_desc->SetInput("Y", {y_});
+    op_desc->SetOutput("Out", {out_});
+    op_desc->SetAttr("x_num_col_dims", x_num_col_dims_);
+    op_desc->SetAttr("y_num_col_dims", y_num_col_dims_);
+  }
+
+  // Generates X and Y with fill_data_rand(data, -1.f, 1.f, size) and registers
+  // them through SetCommonTensor().
+  void PrepareData() override {
+    std::vector x(x_dims_.production());
+    fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production());
+    SetCommonTensor(x_, x_dims_, x.data());
+
+    std::vector y(y_dims_.production());
+    fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production());
+    SetCommonTensor(y_, y_dims_, y.data());
+  }
+};
+
+// Wraps a MulComputeTester for the given shapes in an arena::Arena (built
+// with `place` and `abs_error`) and calls TestPrecision() on it.
+void TestMul(const std::vector& x_dims,
+             const std::vector& y_dims,
+             int x_num_col_dims,
+             int y_num_col_dims,
+             const Place& place,
+             float abs_error) {
+  std::unique_ptr tester(new MulComputeTester(place,
+                                               "def",
+                                               DDim(x_dims),
+                                               DDim(y_dims),
+                                               x_num_col_dims,
+                                               y_num_col_dims));
+  arena::Arena arena(std::move(tester), place, abs_error);
+  arena.TestPrecision();
+}
+
+// Exercises several X/Y shape and num_col_dims combinations. The cases only
+// run when LITE_WITH_XPU is defined; otherwise the test returns immediately.
+TEST(Mul, precision) {
+  LOG(INFO) << "test mul op";
+  float abs_error = 2e-5;
+  Place place;
+#if defined(LITE_WITH_XPU)
+  place = TARGET(kXPU);
+#else
+  return;
+#endif
+
+  TestMul({4, 5}, {5, 4}, 1, 1, place, abs_error);
+  TestMul({4, 5}, {5, 4, 3, 2}, 1, 1, place, abs_error);
+  TestMul({4, 20}, {5, 4, 3, 2}, 1, 2, place, abs_error);
+  TestMul({4, 60}, {5, 4, 3, 2}, 1, 3, place, abs_error);
+  TestMul({2, 3, 4, 5}, {60, 4}, 1, 1, place, abs_error);
+  TestMul({2, 3, 4, 5}, {20, 4}, 2, 1, place, abs_error);
+  TestMul({2, 3, 4, 5}, {5, 4}, 3, 1, place, abs_error);
+  TestMul({2, 3, 4, 5}, {60, 3, 4, 5}, 1, 1, place, abs_error);
+  TestMul({2, 3, 4, 5}, {4, 5, 6, 2}, 2, 2, place, abs_error);
+  TestMul({2, 3, 4, 5}, {5, 1, 4, 2}, 3, 2, place, abs_error);
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/tests/kernels/unsqueeze_compute_test.cc b/lite/tests/kernels/unsqueeze_compute_test.cc index 22e475672a87dafee29d68a3824e4f8ac0c15615..590d3fd29c37e16cfeec53557a825a4acf9684ca 100644 --- a/lite/tests/kernels/unsqueeze_compute_test.cc +++ b/lite/tests/kernels/unsqueeze_compute_test.cc @@ -107,6 +107,7 @@ class UnsqueezeComputeTester : public
arena::TestCase { } void PrepareData() override { + SetPrecisionType(out_, PRECISION(kFloat)); std::vector in_data(dims_.production()); for (int i = 0; i < dims_.production(); ++i) { in_data[i] = i; @@ -213,6 +214,7 @@ class Unsqueeze2ComputeTester : public arena::TestCase { } void PrepareData() override { + SetPrecisionType(out_, PRECISION(kFloat)); std::vector in_data(dims_.production()); for (int i = 0; i < dims_.production(); ++i) { in_data[i] = i; diff --git a/lite/tools/ci_build.sh b/lite/tools/ci_build.sh index fc0117d2f2188632e8b15f426eab1929dc94fbd4..91afc5039cf1c863038cb6c8c5ce79aa856edf04 100755 --- a/lite/tools/ci_build.sh +++ b/lite/tools/ci_build.sh @@ -1042,23 +1042,6 @@ function main { build_test_arm_subtask_armlinux shift ;; - build_test_arm_model_mobilenetv1) - build_test_arm_subtask_model test_mobilenetv1 mobilenet_v1 - build_test_arm_subtask_model test_mobilenetv1_int8 MobileNetV1_quant - shift - ;; - build_test_arm_model_mobilenetv2) - build_test_arm_subtask_model test_mobilenetv2 mobilenet_v2_relu - shift - ;; - build_test_arm_model_resnet50) - build_test_arm_subtask_model test_resnet50 resnet50 - shift - ;; - build_test_arm_model_inceptionv4) - build_test_arm_subtask_model test_inceptionv4 inception_v4_simple - shift - ;; check_style) check_style shift diff --git a/lite/utils/env.h b/lite/utils/env.h new file mode 100644 index 0000000000000000000000000000000000000000..86af8c9e7e0749e75b35bbf23ff4c1d903ad5764 --- /dev/null +++ b/lite/utils/env.h @@ -0,0 +1,71 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include
+#include
+
+#include
+#include
+
+namespace paddle {
+namespace lite {
+
+// Returns the value of environment variable `str`, or `def` when it is unset.
+inline std::string GetStringFromEnv(const std::string& str,
+                                    const std::string& def = "") {
+  const char* variable = std::getenv(str.c_str());
+  return variable ? std::string(variable) : def;
+}
+
+// Returns `def` when `str` is unset; otherwise false only for the exact values
+// "false" and "0", and true for any other value (including the empty string).
+inline bool GetBoolFromEnv(const std::string& str, bool def = false) {
+  const char* variable = std::getenv(str.c_str());
+  if (!variable) {
+    return def;
+  }
+  return strcmp(variable, "false") != 0 && strcmp(variable, "0") != 0;
+}
+
+// Returns `str` parsed with atoi(), or `def` when it is unset.
+inline int GetIntFromEnv(const std::string& str, int def = 0) {
+  const char* variable = std::getenv(str.c_str());
+  if (!variable) {
+    return def;
+  }
+  return atoi(variable);
+}
+
+// Returns `str` parsed with atof(), or `def` when it is unset.
+inline double GetDoubleFromEnv(const std::string& str, double def = 0.0) {
+  const char* variable = std::getenv(str.c_str());
+  if (!variable) {
+    return def;
+  }
+  return atof(variable);
+}
+
+// Returns `str` parsed by strtoull() (base 10), or `def` when it is unset.
+// atol() is avoided: its `long` result is 32-bit on ILP32/LLP64 targets.
+inline uint64_t GetUInt64FromEnv(const std::string& str, uint64_t def = 0ul) {
+  const char* variable = std::getenv(str.c_str());
+  if (!variable) {
+    return def;
+  }
+  return static_cast<uint64_t>(strtoull(variable, nullptr, 10));
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/mobile/src/framework/cl/cl_image.cpp b/mobile/src/framework/cl/cl_image.cpp index 0d4cf87db0d34953936d107b6bb6c9adbd985560..1b8966742d77db8c63d89ab4ca8176494ba7cab0 100644 --- a/mobile/src/framework/cl/cl_image.cpp +++ b/mobile/src/framework/cl/cl_image.cpp @@ -18,6 +18,37 @@ limitations under the License.
*/ namespace paddle_mobile { namespace framework {
+// Reads `cl_image` back to the host (blocking read), converts it to NCHW
+// floats with its converter and prints every element to stdout.
+void CLImage::PrintTensor(const CLImage &cl_image) const {
+  // Validate first: nothing has been allocated yet if this check fails.
+  PADDLE_MOBILE_ENFORCE(cl_image.numel() != 0,
+                        "cl_image numel should not be 0 ");
+  size_t width = cl_image.ImageDims()[0];
+  size_t height = cl_image.ImageDims()[1];
+
+  // Host staging buffer: four half_t values per image pixel.
+  half_t *image_data = new half_t[height * width * 4];
+  cl_mem image = cl_image.GetCLImage();
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {width, height, 1};
+  cl_int err =
+      clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin,
+                         region, 0, 0, image_data, 0, NULL, NULL);
+  CL_CHECK_ERRORS(err);
+
+  float *tensor_data = new float[cl_image.numel()];
+  auto converter = cl_image.Converter();
+  converter->ImageToNCHW(image_data, tensor_data, cl_image.ImageDims(),
+                         cl_image.dims());
+  for (int i = 0; i < cl_image.numel(); i++) {
+    printf("%f \n", tensor_data[i]);
+  }
+
+  delete[](tensor_data);
+  delete[](image_data);
+}
+
void CLImageToTensor(CLImage *cl_image, Tensor *tensor, cl_context context, cl_command_queue commandQueue, cl_kernel kernel) { tensor->mutable_data(); diff --git a/mobile/src/framework/cl/cl_image.h b/mobile/src/framework/cl/cl_image.h index f41d0ed659e1fe529d8662fe6ebb3e9f56e2d09d..d3d48cda8b86b07e76658ef903863268042ab36f 100644 --- a/mobile/src/framework/cl/cl_image.h +++ b/mobile/src/framework/cl/cl_image.h @@ -14,6 +14,7 @@ limitations under the License.
*/ #pragma once +#include #include #include @@ -285,6 +286,7 @@ class CLImage { cl_event GetClEvent() const { return cl_event_.get(); } CLImageConverterBase *Converter() const { return image_converter_; } + void PrintTensor(const CLImage &cl_image) const; private: void InitCLImage(cl_context context, size_t width, size_t height,
diff --git a/mobile/src/framework/cl/cl_tool.h b/mobile/src/framework/cl/cl_tool.h
index 25d5bfc584b59e4fe9d22a922b601f8c32892fd1..ccc97779ece91b881312b031a92a6992ba5fed86 100644
--- a/mobile/src/framework/cl/cl_tool.h
+++ b/mobile/src/framework/cl/cl_tool.h
@@ -21,13 +21,19 @@ namespace framework {
 
 const char* opencl_error_to_str(cl_int error);
 
-#define CL_CHECK_ERRORS(ERR)                                                \
-  if (ERR != CL_SUCCESS) {                                                  \
-    printf(                                                                 \
-        "OpenCL error with code %s happened in file %s at line %d. "        \
-        "Exiting.\n",                                                       \
-        paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__,       \
-        __LINE__);                                                          \
+// Prints a diagnostic wrapped in ANSI colour escapes when the OpenCL status
+// `ERR` is not CL_SUCCESS. Despite the "Exiting." wording the macro only
+// prints; execution continues after it. `ERR` is parenthesised so a compound
+// expression is compared as a whole. It is expanded twice on the error path,
+// so pass a plain variable rather than an expression with side effects.
+#define CL_CHECK_ERRORS(ERR)                                                  \
+  if ((ERR) != CL_SUCCESS) {                                                  \
+    printf(                                                                   \
+        "\033[1;31;40mOpenCL error with code %s happened in file %s at line " \
+        "%d. "                                                                \
+        "Exiting.\033[0m\n",                                                  \
+        paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__,         \
+        __LINE__);                                                            \
   }
 
 }  // namespace framework
diff --git a/mobile/src/framework/executor.cpp b/mobile/src/framework/executor.cpp index 169ab63307b37af89e4c19a7dcb41d4b640d3db5..d03cefe59a221093d4e5fb4e86273b3007097d9f 100644 --- a/mobile/src/framework/executor.cpp +++ b/mobile/src/framework/executor.cpp @@ -363,7 +363,10 @@ void Executor::InitNoPersistableMemory(const Tensor &input_tensor) { DLOG << "InitNoPersistableMemory var " << var_desc->Name(); auto tensor = var->template GetMutable(); if (tensor->IsInitialized() && tensor->dims().size() == 4) { - DLOG << "var's tensor is Initialized or dims size != 4"; + // don't change user's input and avoid memory leaks + if (feed_indices_.find(var_desc->Name()) != feed_indices_.end()) { + break; + } DDim tensor_dim = tensor->dims(); DDim new_dim = make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2], diff --git a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp index 72d94ced5d11557c4961d0cc0a8f416355ae217c..a4dfd8321edbcc24b1d942bbe55abbdddba009c1 100644 --- a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp +++ b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp @@ -241,7 +241,9 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper, cl_int status; int index = 0; - if (param.Filter()->dims()[2] == 1 && param.Filter()->dims()[3] == 1) { + const int filter_height = param.Filter()->dims()[2]; + const int filter_width = param.Filter()->dims()[3]; + if (filter_height == 1 && filter_width == 1) { status = clSetKernelArg(kernel, index++, sizeof(int), &c_block); CL_CHECK_ERRORS(status); @@ -404,7 +406,7 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper, status = clSetKernelArg(kernel, index++, sizeof(int), &output_height); CL_CHECK_ERRORS(status); - if (param.Filter()->dims()[2] == 3 && param.Filter()->dims()[3] == 3) { + if
(filter_height == 3 && filter_width == 3) { // normal conv if (param.Filter()->dims()[0] == param.Output()->dims()[1] && param.Filter()->dims()[1] == param.Input()->dims()[1]) { @@ -425,6 +427,17 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper, status = clSetKernelArg(kernel, index++, sizeof(int), &group); CL_CHECK_ERRORS(status); } + } else if (filter_height != 3 && filter_width != 3) { + // not 3x3 + if (param.Filter()->dims()[1] == 1 && + param.Input()->dims()[1] == param.Output()->dims()[1]) { + // deepwise basic use in not 3x3 + status = clSetKernelArg(kernel, index++, sizeof(int), &filter_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &filter_height); + CL_CHECK_ERRORS(status); + } } status = clEnqueueNDRangeKernel( diff --git a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl old mode 100755 new mode 100644 index d3078e6a5c09a400fe90c8cb9eda7cf091eda381..bf31f329708aacac59f3a67cf987998a8a4a28dd --- a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl +++ b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -24,980 +24,1101 @@ conv_add_bn_relu #include "cl_common.h" -__kernel void conv_3x3(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter, - +__kernel void conv_3x3( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - 
__private const int dilation, - __private const int input_width,/* of one block */ - __private const int input_height,/* of one block */ - __private const int output_width, - __private const int output_height, - __private const int output_c, - __private const int filter_channel, - __private const int group) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - - if (out_c >= global_size_dim0 || - out_w >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, __private const int output_height, + __private const int output_c, __private const int filter_channel, + __private const int group) { - int2 stride_xy; - stride_xy.x = stride; - stride_xy.y = stride; + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); - int2 ouput_pos_in_one_block; - ouput_pos_in_one_block.x = out_w; - ouput_pos_in_one_block.y = out_nh; + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; - int2 in_pos_in_one_block; - in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; - in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + 
ouput_pos_in_one_block.y = out_nh; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; #ifdef BIASE_CH - half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - half4 output = read_imageh(bias, sampler, output_pos); + half4 output = read_imageh(bias, sampler, output_pos); #else - half4 output = 0.0f; -#endif - - half4 input[9]; - if (group == 1) { - for (int i = 0; i < input_c; ++i) { - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - input[0] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - - input[1] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - - input[2] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - - input[3] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || 
in_pos_in_one_block.y >= input_height) << 15)); - - input[4] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - input[5] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - input[6] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - - input[7] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - - input[8] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - - -/* - for (int j = 0; j < 9; ++j) { - int2 pos_of_weight; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - half4 weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - half4 weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 
* 3 + 2 * 3 + j / 3; - half4 weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - half4 weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - } -*/ - int j = 0; - int2 pos_of_weight; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - half4 weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - half4 weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - half4 weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - half4 weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 1; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 2; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 
* 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 3; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 4; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 5; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, 
pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 6; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 7; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 8; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - 
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); + half4 output = 0.0f; +#endif + half4 input[9]; + if (group == 1) { + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, + in_pos_in_one_block.y); + input[0] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + + input[1] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + + input[2] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + + input[3] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[4] = select( + read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[5] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, 
pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[6] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + input[7] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + input[8] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + /* + for (int j = 0; j < 9; ++j) { + int2 pos_of_weight; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + half4 weight_x = read_imageh(filter, sampler, + pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + half4 weight_y = read_imageh(filter, sampler, + pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + half4 weight_z = read_imageh(filter, sampler, + pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + half4 weight_w = read_imageh(filter, sampler, + pos_of_weight); + output.w += dot(input[j], weight_w); + } + */ + int j = 0; + int2 pos_of_weight; + pos_of_weight.x = i * 3 + 
j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + half4 weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + half4 weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + half4 weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + half4 weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 1; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 2; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 3; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 
0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 4; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 5; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 6; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, 
pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 7; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 8; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + } + } else { + for (int i = 0; i < 4; i++) { + int used_input_channel_num = + (out_c * 4 + i) / (output_c / group) * filter_channel; + for (int f_c = 0; f_c < filter_channel; ++f_c) { + int input_c = 
used_input_channel_num + f_c; + int input_block = input_c / 4; + int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x, + in_pos_in_one_block.y); + input[0] = select( + read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[1] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[2] = select( + read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[3] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + input[4] = select( + read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + input[5] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y >= 
input_height) + << 15)); + input[6] = select( + read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + input[7] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + input[8] = select( + read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + half tmp_out = 0; + for (int j = 0; j < 9; j++) { + int2 pos_of_weight; + pos_of_weight.x = (f_c / 4) * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3; + half4 weight = read_imageh(filter, sampler, pos_of_weight); + int f_c_offset = f_c % 4; + half f_value; + if (f_c_offset == 0) { + f_value = weight.x; + } else if (f_c_offset == 1) { + f_value = weight.y; + } else if (f_c_offset == 2) { + f_value = weight.z; + } else if (f_c_offset == 3) { + f_value = weight.w; + } + int input_c_offset = input_c % 4; + half input_value; + if (input_c_offset == 0) { + input_value = input[j].x; + } else if (input_c_offset == 1) { + input_value = input[j].y; + } else if (input_c_offset == 2) { + input_value = input[j].z; + } else if (input_c_offset == 3) { + input_value = input[j].w; + } + tmp_out += f_value * input_value; } - } else { - for (int i = 0; i < 4; i++) { - int used_input_channel_num = (out_c * 4 + i) / (output_c / group) * filter_channel; - for (int f_c = 0; f_c < 
filter_channel; ++f_c) { - int input_c = used_input_channel_num + f_c; - int input_block = input_c / 4; - int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - input[0] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - input[1] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - input[2] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - input[3] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - input[4] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - input[5] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); 
- input[6] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - input[7] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - input[8] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - - half tmp_out = 0; - for (int j = 0; j < 9; j++) { - int2 pos_of_weight; - pos_of_weight.x = (f_c / 4) * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3; - half4 weight = read_imageh(filter, sampler, pos_of_weight); - int f_c_offset = f_c % 4; - half f_value; - if (f_c_offset == 0) { - f_value = weight.x; - } else if (f_c_offset == 1) { - f_value = weight.y; - } else if (f_c_offset == 2) { - f_value = weight.z; - } else if (f_c_offset == 3) { - f_value = weight.w; - } - int input_c_offset = input_c % 4; - half input_value; - if (input_c_offset == 0) { - input_value = input[j].x; - } else if (input_c_offset == 1) { - input_value = input[j].y; - } else if (input_c_offset == 2) { - input_value = input[j].z; - } else if (input_c_offset == 3) { - input_value = input[j].w; - } - tmp_out += f_value * input_value; - } - - if (i == 0) { - output.x += tmp_out; - } else if (i == 1) { - output.y += tmp_out; - } else if (i == 2) { - output.z += tmp_out; - } else if (i == 3) { - output.w += tmp_out; - } - } + + if (i == 0) { + 
output.x += tmp_out; + } else if (i == 1) { + output.y += tmp_out; + } else if (i == 2) { + output.z += tmp_out; + } else if (i == 3) { + output.w += tmp_out; } + } } - + } #ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif #ifdef RELU - output = activation(output); + output = activation(output); #endif - write_imageh(output_image, output_pos, output); + write_imageh(output_image, output_pos, output); } - // dilation == 1 -__kernel void conv_3x3spl(__private const int item_ch, - __private const int item_w, - __private const int item_h, - __read_only image2d_t input_image, - __read_only image2d_t filter_image, +// dilation == 1 +__kernel void conv_3x3spl( + __private const int item_ch, __private const int item_w, + __private const int item_h, __read_only image2d_t input_image, + __read_only image2d_t filter_image, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM -__read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int in_ch, - __private const int in_w, - __private const int in_h, - __private const int out_w, - __private const int out_h) { - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - // item_id - const int item_ch_id = get_global_id(0); - const int item_w_id = get_global_id(1); - const int item_h_id = get_global_id(2); - - // out_width_id_per_blk and out_batch_id - int out_batch_id = item_h_id / in_h; - int out_w_base_id = item_ch_id * out_w; - int out_w_id0 = item_w_id; - int out_w_id1 = out_w_id0 + item_w; - int out_w_id2 = out_w_id1 + 
item_w; - int out_w_id3 = out_w_id2 + item_w; - int out_w_id4 = out_w_id3 + item_w; - - // in_width_id_per_blk and in_height_id_per_batch - int in_h_id = (item_h_id % out_h) * stride - pad; - int in_w_id0 = item_w_id * stride - pad; - int in_w_id1 = in_w_id0 + item_w * stride; - int in_w_id2 = in_w_id1 + item_w * stride; - int in_w_id3 = in_w_id2 + item_w * stride; - int in_w_id4 = in_w_id3 + item_w * stride; + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int pad, __private const int dilation, + __private const int in_ch, __private const int in_w, + __private const int in_h, __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_batch_id = item_h_id / in_h; + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; #ifdef BIASE_CH - half4 output[5]; - output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0)); - output[1] = output[0]; - output[2] = output[0]; - output[3] = output[0]; - output[4] = output[0]; + half4 output[5]; + output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + 
output[4] = output[0]; #elif defined(BIASE_ELE) - half4 output[5]; - output[0] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id)); - if (out_w_id1 < out_w) { - output[1] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id1, item_h_id)); - } - if (out_w_id2 < out_w) { - output[2] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id2, item_h_id)); - } - if (out_w_id3 < out_w) { - output[3] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id3, item_h_id)); - } - if (out_w_id4 < out_w) { - output[4] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id4, item_h_id)); - } + half4 output[5]; + output[0] = + read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = read_imageh(bias, sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = read_imageh(bias, sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = read_imageh(bias, sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = read_imageh(bias, sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } #else - half4 output[5] = {0.0f}; -#endif - - half4 filter[4] = {0.0f}; - half4 filter_trans[4] = {0.0f}; - half4 input[5] = {0.0f}; - - int filter_h_val0 = item_ch_id * 4 * 3; - int filter_h_val1 = filter_h_val0 + 3; - int filter_h_val2 = filter_h_val1 + 3; - int filter_h_val3 = filter_h_val2 + 3; - - for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { - int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? 
(ch + 1) * 4 - in_ch : 0; - - const int in_w_base_id = mul24(ch, in_w); - - int filter_w_val = ch * 3; - - for (int h = 0; h < 3; h++) { - - int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1, - (out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h)); - - for (int w = 0; w < 3; w++) { - - int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, - (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); - int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1, - (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); - int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1, - (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); - int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1, - (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); - int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1, - (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); - - filter[0] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val0 + h)); // in_ch:0-3,out_ch:0 - filter[1] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val1 + h)); // in_ch:0-3,out_ch:1 - filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2 - filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3 - - filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x); // in_ch:0,out_ch:0-3 - filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y); // in_ch:1,out_ch:0-3 - filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z); // in_ch:2,out_ch:0-3 - filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, filter[3].w); // in_ch:3,out_ch:0-3 - - input[0] = read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val)); - input[1] = read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val)); - input[2] = read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val)); - input[3] = read_imageh(input_image, 
sampler, (int2)(in_w_val3, in_h_val)); - input[4] = read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val)); - - output[0] = mad(input[0].x, filter_trans[0], output[0]); - output[1] = mad(input[1].x, filter_trans[0], output[1]); - output[2] = mad(input[2].x, filter_trans[0], output[2]); - output[3] = mad(input[3].x, filter_trans[0], output[3]); - output[4] = mad(input[4].x, filter_trans[0], output[4]); - - if (ch_surplus < 3) { - output[0] = mad(input[0].y, filter_trans[1], output[0]); - output[1] = mad(input[1].y, filter_trans[1], output[1]); - output[2] = mad(input[2].y, filter_trans[1], output[2]); - output[3] = mad(input[3].y, filter_trans[1], output[3]); - output[4] = mad(input[4].y, filter_trans[1], output[4]); - } - if (ch_surplus < 2) { - output[0] = mad(input[0].z, filter_trans[2], output[0]); - output[1] = mad(input[1].z, filter_trans[2], output[1]); - output[2] = mad(input[2].z, filter_trans[2], output[2]); - output[3] = mad(input[3].z, filter_trans[2], output[3]); - output[4] = mad(input[4].z, filter_trans[2], output[4]); - } - if (ch_surplus < 1) { - output[0] = mad(input[0].w, filter_trans[3], output[0]); - output[1] = mad(input[1].w, filter_trans[3], output[1]); - output[2] = mad(input[2].w, filter_trans[3], output[2]); - output[3] = mad(input[3].w, filter_trans[3], output[3]); - output[4] = mad(input[4].w, filter_trans[3], output[4]); - } - } + half4 output[5] = {0.0f}; +#endif + + half4 filter[4] = {0.0f}; + half4 filter_trans[4] = {0.0f}; + half4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * 3; + int filter_h_val1 = filter_h_val0 + 3; + int filter_h_val2 = filter_h_val1 + 3; + int filter_h_val3 = filter_h_val2 + 3; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? 
(ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * 3; + + for (int h = 0; h < 3; h++) { + int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1, + (out_batch_id * in_h + in_h_id + h < 0 || + out_batch_id * in_h + in_h_id + h >= in_h)); + + for (int w = 0; w < 3; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = read_imageh( + filter_image, sampler, + (int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = read_imageh( + filter_image, sampler, + (int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = read_imageh( + filter_image, sampler, + (int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = read_imageh( + filter_image, sampler, + (int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = + read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = + read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = + read_imageh(input_image, sampler, (int2)(in_w_val2, 
in_h_val)); + input[3] = + read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = + read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); } + } } + } #ifdef BATCH_NORM - half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0)); - half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0)); - output[0] = mad(scale, output[0], biase); - if (out_w_id1 < out_w) { - output[1] = mad(scale, output[1], biase); - } - if (out_w_id2 < out_w) { - output[2] = mad(scale, output[2], biase); - } - if (out_w_id3 < out_w) { - output[3] = mad(scale, output[3], biase); - } - if (out_w_id4 < out_w) { - output[4] = mad(scale, output[4], biase); - } + half4 scale = read_imageh(new_scale, sampler, 
(int2)(item_ch_id, 0)); + half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0)); + output[0] = mad(scale, output[0], biase); + if (out_w_id1 < out_w) { + output[1] = mad(scale, output[1], biase); + } + if (out_w_id2 < out_w) { + output[2] = mad(scale, output[2], biase); + } + if (out_w_id3 < out_w) { + output[3] = mad(scale, output[3], biase); + } + if (out_w_id4 < out_w) { + output[4] = mad(scale, output[4], biase); + } #endif #ifdef RELU - output[0] = activation(output[0]); - output[1] = activation(output[1]); - output[2] = activation(output[2]); - output[3] = activation(output[3]); - output[4] = activation(output[4]); -#endif - write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), output[0]); - if (out_w_id1 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), output[1]); - } - if (out_w_id2 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), output[2]); - } - if (out_w_id3 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), output[3]); - } - if (out_w_id4 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]); - } + output[0] = activation(output[0]); + output[1] = activation(output[1]); + output[2] = activation(output[2]); + output[3] = activation(output[3]); + output[4] = activation(output[4]); +#endif + write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } } - - -__kernel void 
depth_conv_3x3(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input, - __read_only image2d_t filter, +__kernel void depth_conv_3x3( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, + __read_only image2d_t new_scale, __read_only image2d_t new_biase, #endif - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int dilation, - __private const int input_width,/* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, - __private const int output_height) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, __private const int output_height) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - const int batch_index = out_nh / output_height; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - const 
int out_nh_in_one_batch = out_nh % output_height; + const int batch_index = out_nh / output_height; + const int out_nh_in_one_batch = out_nh % output_height; - int2 stride_xy = (int2)(stride, stride); - int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); - int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + int2 in_pos_in_one_block = + ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); #ifdef BIASE_CH - half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - half4 output = read_imageh(bias, sampler, output_pos); + half4 output = read_imageh(bias, sampler, output_pos); #else - half4 output = 0.0f; + half4 output = 0.0f; #endif - const int filter_width = 3; - const int filter_height = 3; - - int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height); - - int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height); - - int filter_x = pos_in_filter_block.x ; - int filter_y = pos_in_filter_block.y ; - - half4 inputs[9]; - - inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); - - inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); - - inputs[2] = select(read_imageh(input, sampler, 
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); - - inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - /* - if (output_pos.x == 112 && output_pos.y == 0) { - half4 input1 = inputs[3]; - float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); - printf(" input4 3 - %v4hlf \n", in); - printf(" --- %d ---\n", in_pos_in_one_block.x - 1); - } - */ - - - inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); - - inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + 
in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); - - inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); - - half4 filters[9]; - filters[0] = read_imageh(filter, sampler,(int2)(filter_x,filter_y)); - filters[1] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y)); - filters[2] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y)); - filters[3] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 1)); - filters[4] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 1)); - filters[5] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 1)); - filters[6] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 2)); - filters[7] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 2)); - filters[8] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 2)); - - for(int i = 0 ;i < 9 ; i++){ - output += inputs[i] * filters[i]; - } + const int filter_width = 3; + const int filter_height = 3; + + int2 pos_in_input_block = + (int2)(out_c * input_width, batch_index * input_height); + + int2 pos_in_filter_block = + (int2)(out_c * filter_width, batch_index * filter_height); + + int filter_x = pos_in_filter_block.x; + int filter_y = pos_in_filter_block.y; + + half4 inputs[9]; + + inputs[0] = select( + read_imageh(input, sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, + pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), (ushort4)((in_pos_in_one_block.x - 1 < 0 || + in_pos_in_one_block.y - 1 < 0 
|| + in_pos_in_one_block.x - 1 >= input_width || + in_pos_in_one_block.y - 1 >= input_height) + << 15)); + + inputs[1] = select( + read_imageh(input, sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x, + pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y - 1 >= input_height) + << 15)); + + inputs[2] = select( + read_imageh(input, sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, + pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), (ushort4)((in_pos_in_one_block.x + 1 < 0 || + in_pos_in_one_block.y - 1 < 0 || + in_pos_in_one_block.x + 1 >= input_width || + in_pos_in_one_block.y - 1 >= input_height) + << 15)); + + inputs[3] = select( + read_imageh(input, sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, + pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - 1 >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + /* + if (output_pos.x == 112 && output_pos.y == 0) { + half4 input1 = inputs[3]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 3 - %v4hlf \n", in); + printf(" --- %d ---\n", in_pos_in_one_block.x - 1); + } + */ + + inputs[4] = select( + read_imageh(input, sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x, + pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + inputs[5] = select( + read_imageh(input, sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, + pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || 
in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x + 1 >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + inputs[6] = select( + read_imageh(input, sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, + pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), (ushort4)((in_pos_in_one_block.x - 1 < 0 || + in_pos_in_one_block.y + 1 < 0 || + in_pos_in_one_block.x - 1 >= input_width || + in_pos_in_one_block.y + 1 >= input_height) + << 15)); + + inputs[7] = select( + read_imageh(input, sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x, + pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y + 1 >= input_height) + << 15)); + + inputs[8] = select( + read_imageh(input, sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, + pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), (ushort4)((in_pos_in_one_block.x + 1 < 0 || + in_pos_in_one_block.y + 1 < 0 || + in_pos_in_one_block.x + 1 >= input_width || + in_pos_in_one_block.y + 1 >= input_height) + << 15)); + + half4 filters[9]; + filters[0] = read_imageh(filter, sampler, (int2)(filter_x, filter_y)); + filters[1] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y)); + filters[2] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y)); + filters[3] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 1)); + filters[4] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 1)); + filters[5] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 1)); + filters[6] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 2)); + filters[7] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 2)); + filters[8] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 2)); + + for (int i = 0; i < 9; i++) { + output += inputs[i] * 
filters[i]; + } #ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif #ifdef RELU - output = activation(output); + output = activation(output); #endif + /* + if (output_pos.x == 112 && output_pos.y == 0) { + for (int i = 0; i < 9; ++i) { + half4 input1 = inputs[i]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 %d - %v4hlf \n", i, in); + } + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf(" depth wise output output4 = %v4hlf \n", out); + printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x); + printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y); + printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x); + printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y); + } + */ - /* - - if (output_pos.x == 112 && output_pos.y == 0) { - - for (int i = 0; i < 9; ++i) { - half4 input1 = inputs[i]; - float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); - printf(" input4 %d - %v4hlf \n", i, in); - } - - float4 out = (float4)(output.x, output.y, output.z, output.w); - printf(" depth wise output output4 = %v4hlf \n", out); - printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x); - printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y); - printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x); - printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y); - } - - */ - - write_imageh(output_image, output_pos, output); - + write_imageh(output_image, output_pos, output); } - - -__kernel void depth_conv_3x3s1(__private const int ou_ch_blk, - __private const int ou_w_blk, - __private const int ou_nh, - __read_only image2d_t input, - __read_only image2d_t filter, +__kernel void depth_conv_3x3s1( + __private const int ou_ch_blk, __private const 
int ou_w_blk, + __private const int ou_nh, __read_only image2d_t input, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int in_ch, - __private const int in_w,/* of one block */ - __private const int in_h, /* of one block */ - __private const int ou_w, - __private const int ou_h) { - - const int ou_ch_blk_id = get_global_id(0); - const int ou_w_blk_id = get_global_id(1); - const int ou_nh_id = get_global_id(2); - const int w_blk_size = 2; - - const int batch_id = ou_nh_id / ou_h; - int ou_col_id = ou_w_blk_id * w_blk_size; - int ou_row_id = ou_nh_id % ou_h; - int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id); - - // input pos in one block and on batch - int col_id = ou_col_id - pad; - int row_id = ou_row_id - pad; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - -#ifdef BIASE_CH - half4 output[2]; - output[0] = read_imageh(bias, sampler, (int2)(ou_ch_blk_id, 0)); - output[1] = output[0]; -#elif defined(BIASE_ELE) - half4 output[2]; - output[0] = read_imageh(bias, sampler, (int2)(ou_x, ou_nh_id)); - if (ou_col_id + 1 < ou_w) { - output[1] = read_imageh(bias, sampler, (int2)(ou_x + 1, ou_nh_id)); - } -#else - half4 output[2] = {0.0f}; + __read_only image2d_t new_scale, __read_only image2d_t new_biase, #endif + __write_only image2d_t output_image, __private const int stride, + __private const int pad, __private const int dilation, + __private const int in_ch, __private const int in_w, /* of one block */ + __private const int in_h, /* of one block */ + __private const int ou_w, __private const int ou_h) { - half4 inputs[12]; + const int ou_ch_blk_id = get_global_id(0); + const int 
ou_w_blk_id = get_global_id(1); + const int ou_nh_id = get_global_id(2); + const int w_blk_size = 2; - int filter_x = ou_ch_blk_id * 3; - int filter_y = 0; - half4 filters[9]; - filters[0] = read_imageh(filter, sampler,(int2)(filter_x,filter_y)); - filters[1] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y)); - filters[2] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y)); + const int batch_id = ou_nh_id / ou_h; + int ou_col_id = ou_w_blk_id * w_blk_size; + int ou_row_id = ou_nh_id % ou_h; + int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id); - int in_x = mad24(ou_ch_blk_id, in_w, col_id); - int in_y = mad24(batch_id, in_h, row_id); + // input pos in one block and on batch + int col_id = ou_col_id - pad; + int row_id = ou_row_id - pad; - int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h); - int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w); - inputs[0] = read_imageh(input, sampler, (int2)(x0, y0)); - int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w); - inputs[1] = read_imageh(input, sampler, (int2)(x1, y0)); - int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w); - inputs[2] = read_imageh(input, sampler, (int2)(x2, y0)); - int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w); - inputs[3] = read_imageh(input, sampler, (int2)(x3, y0)); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - output[0] = mad(inputs[0], filters[0], output[0]); - output[1] = mad(inputs[1], filters[0], output[1]); +#ifdef BIASE_CH + half4 output[2]; + output[0] = read_imageh(bias, sampler, (int2)(ou_ch_blk_id, 0)); + output[1] = output[0]; +#elif defined(BIASE_ELE) + half4 output[2]; + output[0] = read_imageh(bias, sampler, (int2)(ou_x, ou_nh_id)); + if (ou_col_id + 1 < ou_w) { + output[1] = read_imageh(bias, sampler, (int2)(ou_x + 1, ou_nh_id)); + } +#else + half4 output[2] = {0.0f}; +#endif - output[0] = mad(inputs[1], filters[1], output[0]); - output[1] = 
mad(inputs[2], filters[1], output[1]); + half4 inputs[12]; - output[0] = mad(inputs[2], filters[2], output[0]); - output[1] = mad(inputs[3], filters[2], output[1]); + int filter_x = ou_ch_blk_id * 3; + int filter_y = 0; + half4 filters[9]; + filters[0] = read_imageh(filter, sampler, (int2)(filter_x, filter_y)); + filters[1] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y)); + filters[2] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y)); + int in_x = mad24(ou_ch_blk_id, in_w, col_id); + int in_y = mad24(batch_id, in_h, row_id); - filters[3] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 1)); - filters[4] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 1)); - filters[5] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 1)); + int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h); + int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w); + inputs[0] = read_imageh(input, sampler, (int2)(x0, y0)); + int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w); + inputs[1] = read_imageh(input, sampler, (int2)(x1, y0)); + int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w); + inputs[2] = read_imageh(input, sampler, (int2)(x2, y0)); + int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w); + inputs[3] = read_imageh(input, sampler, (int2)(x3, y0)); + output[0] = mad(inputs[0], filters[0], output[0]); + output[1] = mad(inputs[1], filters[0], output[1]); - int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h); - inputs[4] = read_imageh(input, sampler, (int2)(x0, y1)); - inputs[5] = read_imageh(input, sampler, (int2)(x1, y1)); - inputs[6] = read_imageh(input, sampler, (int2)(x2, y1)); - inputs[7] = read_imageh(input, sampler, (int2)(x3, y1)); + output[0] = mad(inputs[1], filters[1], output[0]); + output[1] = mad(inputs[2], filters[1], output[1]); + output[0] = mad(inputs[2], filters[2], output[0]); + output[1] = mad(inputs[3], filters[2], output[1]); - output[0] = 
mad(inputs[4], filters[3], output[0]); - output[1] = mad(inputs[5], filters[3], output[1]); + filters[3] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 1)); + filters[4] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 1)); + filters[5] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 1)); - output[0] = mad(inputs[5], filters[4], output[0]); - output[1] = mad(inputs[6], filters[4], output[1]); + int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h); + inputs[4] = read_imageh(input, sampler, (int2)(x0, y1)); + inputs[5] = read_imageh(input, sampler, (int2)(x1, y1)); + inputs[6] = read_imageh(input, sampler, (int2)(x2, y1)); + inputs[7] = read_imageh(input, sampler, (int2)(x3, y1)); - output[0] = mad(inputs[6], filters[5], output[0]); - output[1] = mad(inputs[7], filters[5], output[1]); + output[0] = mad(inputs[4], filters[3], output[0]); + output[1] = mad(inputs[5], filters[3], output[1]); + output[0] = mad(inputs[5], filters[4], output[0]); + output[1] = mad(inputs[6], filters[4], output[1]); - filters[6] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 2)); - filters[7] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 2)); - filters[8] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 2)); + output[0] = mad(inputs[6], filters[5], output[0]); + output[1] = mad(inputs[7], filters[5], output[1]); - int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h); - inputs[8] = read_imageh(input, sampler, (int2)(x0, y2)); - inputs[9] = read_imageh(input, sampler, (int2)(x1, y2)); - inputs[10] = read_imageh(input, sampler, (int2)(x2, y2)); - inputs[11] = read_imageh(input, sampler, (int2)(x3, y2)); + filters[6] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 2)); + filters[7] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 2)); + filters[8] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 2)); + int y2 = select(in_y + 2, -1, row_id + 2 < 0 || 
row_id + 2 >= in_h); + inputs[8] = read_imageh(input, sampler, (int2)(x0, y2)); + inputs[9] = read_imageh(input, sampler, (int2)(x1, y2)); + inputs[10] = read_imageh(input, sampler, (int2)(x2, y2)); + inputs[11] = read_imageh(input, sampler, (int2)(x3, y2)); - output[0] = mad(inputs[8], filters[6], output[0]); - output[1] = mad(inputs[9], filters[6], output[1]); + output[0] = mad(inputs[8], filters[6], output[0]); + output[1] = mad(inputs[9], filters[6], output[1]); - output[0] = mad(inputs[9], filters[7], output[0]); - output[1] = mad(inputs[10], filters[7], output[1]); + output[0] = mad(inputs[9], filters[7], output[0]); + output[1] = mad(inputs[10], filters[7], output[1]); - output[0] = mad(inputs[10], filters[8], output[0]); - output[1] = mad(inputs[11], filters[8], output[1]); + output[0] = mad(inputs[10], filters[8], output[0]); + output[1] = mad(inputs[11], filters[8], output[1]); #ifdef BATCH_NORM - half4 scale = read_imageh(new_scale, sampler, (int2)(ou_ch_blk_id, 0)); - half4 biase = read_imageh(new_biase, sampler, (int2)(ou_ch_blk_id, 0)); - output[0] = mad(scale, output[0], biase); - if (ou_col_id + 1 < ou_w) { - output[1] = mad(scale, output[1], biase); - } + half4 scale = read_imageh(new_scale, sampler, (int2)(ou_ch_blk_id, 0)); + half4 biase = read_imageh(new_biase, sampler, (int2)(ou_ch_blk_id, 0)); + output[0] = mad(scale, output[0], biase); + if (ou_col_id + 1 < ou_w) { + output[1] = mad(scale, output[1], biase); + } #endif #ifdef RELU - output[0] = activation(output[0]); - output[1] = activation(output[1]); + output[0] = activation(output[0]); + output[1] = activation(output[1]); #endif - write_imageh(output_image, (int2)(ou_x, ou_nh_id), output[0]); - if (ou_col_id + 1 < ou_w) { - write_imageh(output_image, (int2)(ou_x + 1, ou_nh_id), output[1]); - } - + write_imageh(output_image, (int2)(ou_x, ou_nh_id), output[0]); + if (ou_col_id + 1 < ou_w) { + write_imageh(output_image, (int2)(ou_x + 1, ou_nh_id), output[1]); + } } -__kernel void 
conv_1x1(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter, +__kernel void conv_1x1( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int dilation, - __private const int input_width,/* of one block */ - __private const int input_height,/* of one block */ - __private const int output_width, - __private const int output_height) { + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, __private const int output_height) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; const uint kernelHXW = 1; int2 stride_xy = (int2)(stride, stride); int2 ouput_pos_in_one_block = (int2)(out_w, out_nh); - int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + int2 in_pos_in_one_block = + 
ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); #ifdef BIASE_CH - half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - half4 output = read_imageh(bias, sampler, output_pos); + half4 output = read_imageh(bias, sampler, output_pos); #else - half4 output = 0.0f; + half4 output = 0.0f; #endif - for (int i = 0; i < input_c; ++i) { - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - half4 input = read_imageh(input_image, sampler, pos_in); - - half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); - half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); - half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); - half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); -/* - output.x = dot(input, weight0); - output.y = dot(input, weight1); - output.z = dot(input, weight2); - output.w = dot(input, weight3); -*/ + for (int i = 0; i < input_c; ++i) { + int2 pos_in = + (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + half4 input = read_imageh(input_image, sampler, pos_in); - output = mad(input.x, weight0, output); - output = mad(input.y, weight1, output); - output = mad(input.z, weight2, output); - output = mad(input.w, weight3, output); + half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); + half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); + half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); + half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); + /* + output.x = dot(input, weight0); + output.y = dot(input, weight1); + output.z = dot(input, weight2); + output.w = dot(input, weight3); + */ - } + output = mad(input.x, weight0, output); + output = mad(input.y, weight1, output); + output = mad(input.z, weight2, output); + output = mad(input.w, weight3, output); + 
} #ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif #ifdef RELU @@ -1017,14 +1138,12 @@ __kernel void conv_1x1_simple( __read_only image2d_t new_scale, __read_only image2d_t new_biase, #endif __write_only image2d_t output_image, __private const int stride, - __private const int offset, __private const int input_c,__private const int input_c_origin, - __private const int dilation, + __private const int offset, __private const int input_c, + __private const int input_c_origin, __private const int dilation, __private const int input_width, /* of one block */ __private const int input_height, /* of one block */ - __private const int output_width, - __private const int output_height, - __private const int old_w -) { + __private const int output_width, __private const int output_height, + __private const int old_w) { half zero = 0.0f; const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -1035,7 +1154,7 @@ __kernel void conv_1x1_simple( int out_w2 = out_w + global_size_dim1 * 2; int out_w3 = out_w + global_size_dim1 * 3; - int outpos_main = mul24(out_c , old_w); + int outpos_main = mul24(out_c, old_w); int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh); int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); @@ -1064,14 +1183,14 @@ __kernel void conv_1x1_simple( #ifdef BIASE_CH half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output1 = output0; - half4 output2 = output0; - half4 output3 = output0; + half4 output1 = output0; + half4 output2 = output0; + half4 output3 = output0; #elif defined(BIASE_ELE) half4 output0 = read_imageh(bias, sampler, output_pos0); - half4 output1 = output0; - half4 output2 = output0; - half4 output3 = output0; + half4 
output1 = output0; + half4 output2 = output0; + half4 output3 = output0; #else half4 output0 = 0.0f; @@ -1082,7 +1201,8 @@ __kernel void conv_1x1_simple( for (int i = 0; i < input_c; ++i) { // ------------0--------------- - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, + in_pos_in_one_block0.y); half4 input0 = read_imageh(input_image, sampler, pos_in); half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); @@ -1095,7 +1215,8 @@ __kernel void conv_1x1_simple( output0 = mad(input0.z, weight2, output0); output0 = mad(input0.w, weight3, output0); // -------------1-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); + pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, + in_pos_in_one_block1.y); half4 input1 = read_imageh(input_image, sampler, pos_in); output1 = mad(input1.x, weight0, output1); @@ -1104,7 +1225,8 @@ __kernel void conv_1x1_simple( output1 = mad(input1.w, weight3, output1); // -------------2-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y); + pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, + in_pos_in_one_block2.y); half4 input2 = read_imageh(input_image, sampler, pos_in); output2 = mad(input2.x, weight0, output2); @@ -1113,7 +1235,8 @@ __kernel void conv_1x1_simple( output2 = mad(input2.w, weight3, output2); // -------------3-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); + pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, + in_pos_in_one_block3.y); half4 input3 = read_imageh(input_image, sampler, pos_in); output3 = mad(input3.x, weight0, output3); @@ -1124,38 +1247,38 @@ __kernel void conv_1x1_simple( #ifdef BATCH_NORM output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + 
read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif #ifdef RELU output0 = activation(output0); - output1 = activation(output1); - output2 = activation(output2); - output3 = activation(output3); + output1 = activation(output1); + output2 = activation(output2); + output3 = activation(output3); #endif if (out_w0 < old_w) { write_imageh(output_image, output_pos0, output0); } - if (out_w1 < old_w){ + if (out_w1 < old_w) { write_imageh(output_image, output_pos1, output1); } - if (out_w2 < old_w){ + if (out_w2 < old_w) { write_imageh(output_image, output_pos2, output2); } - if (out_w3 < old_w){ + if (out_w3 < old_w) { write_imageh(output_image, output_pos3, output3); } } @@ -1170,14 +1293,12 @@ __kernel void conv_1x1_wrapped( __read_only image2d_t new_scale, __read_only image2d_t new_biase, #endif __write_only image2d_t output_image, __private const int stride, - __private const int offset, __private const int input_c,__private const int input_c_origin, - __private const int dilation, + __private const int offset, __private const int input_c, + __private const int input_c_origin, __private const int dilation, __private const int input_width, /* of one block */ __private const int input_height, /* of one block */ - 
__private const int output_width, - __private const int output_height, - __private const int old_w - ) { + __private const int output_width, __private const int output_height, + __private const int old_w) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -1188,7 +1309,7 @@ __kernel void conv_1x1_wrapped( int out_w2 = out_w + global_size_dim1 * 2; int out_w3 = out_w + global_size_dim1 * 3; - int outpos_main = mul24(out_c , old_w); + int outpos_main = mul24(out_c, old_w); int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh); int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); @@ -1216,15 +1337,15 @@ __kernel void conv_1x1_wrapped( ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); #ifdef BIASE_CH - half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - half4 output0 = read_imageh(bias, sampler, output_pos0); - half4 output1 = read_imageh(bias, sampler, output_pos1); - half4 output2 = read_imageh(bias, sampler, output_pos2); - half4 output3 = read_imageh(bias, sampler, output_pos3); + half4 output0 = read_imageh(bias, sampler, output_pos0); + half4 output1 = read_imageh(bias, sampler, output_pos1); + half4 output2 = read_imageh(bias, sampler, output_pos2); + half4 output3 = read_imageh(bias, sampler, output_pos3); #else half4 output0 = 0.0f; @@ -1237,7 +1358,8 @@ __kernel void conv_1x1_wrapped( int burndary_index = input_c * 4 - input_c_origin; for (int i = 0; i < input_c; ++i) { // 
------------0--------------- - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, + in_pos_in_one_block0.y); half4 input0 = read_imageh(input_image, sampler, pos_in); half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); @@ -1245,30 +1367,31 @@ __kernel void conv_1x1_wrapped( half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); - if ((max_w_bound - pos_in.x-1) < input_width && (max_w_bound - pos_in.x-1)>=0 ){ - if (burndary_index==0){ + if ((max_w_bound - pos_in.x - 1) < input_width && + (max_w_bound - pos_in.x - 1) >= 0) { + if (burndary_index == 0) { output0 = mad(input0.x, weight0, output0); output0 = mad(input0.y, weight1, output0); output0 = mad(input0.z, weight2, output0); output0 = mad(input0.w, weight3, output0); - } else if (burndary_index==1){ + } else if (burndary_index == 1) { output0 = mad(input0.x, weight0, output0); output0 = mad(input0.y, weight1, output0); output0 = mad(input0.z, weight2, output0); output0 = mad(0.0f, weight3, output0); - } else if (burndary_index==2){ + } else if (burndary_index == 2) { output0 = mad(input0.x, weight0, output0); output0 = mad(input0.y, weight1, output0); output0 = mad(0.0f, weight2, output0); output0 = mad(0.0f, weight3, output0); - } else if (burndary_index==3){ + } else if (burndary_index == 3) { output0 = mad(input0.x, weight0, output0); output0 = mad(0.0f, weight1, output0); output0 = mad(0.0f, weight2, output0); output0 = mad(0.0f, weight3, output0); } - }else { + } else { output0 = mad(input0.x, weight0, output0); output0 = mad(input0.y, weight1, output0); output0 = mad(input0.z, weight2, output0); @@ -1276,33 +1399,34 @@ __kernel void conv_1x1_wrapped( } // -------------1-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); + pos_in = (int2)(i * 
input_width + in_pos_in_one_block1.x, + in_pos_in_one_block1.y); half4 input1 = read_imageh(input_image, sampler, pos_in); - if (abs(max_w_bound - pos_in.x) < input_width){ - if (burndary_index==0){ + if (abs(max_w_bound - pos_in.x) < input_width) { + if (burndary_index == 0) { output1 = mad(input1.x, weight0, output1); output1 = mad(input1.y, weight1, output1); output1 = mad(input1.z, weight2, output1); output1 = mad(input1.w, weight3, output1); - } else if (burndary_index==1){ + } else if (burndary_index == 1) { output1 = mad(input1.x, weight0, output1); output1 = mad(input1.y, weight1, output1); output1 = mad(input1.z, weight2, output1); output1 = mad(0.0f, weight3, output1); - } else if (burndary_index==2){ + } else if (burndary_index == 2) { output1 = mad(input1.x, weight0, output1); output1 = mad(input1.y, weight1, output1); output1 = mad(0.0f, weight2, output1); output1 = mad(0.0f, weight3, output1); - } else if (burndary_index==3){ + } else if (burndary_index == 3) { output1 = mad(input1.x, weight0, output1); output1 = mad(0.0f, weight1, output1); output1 = mad(0.0f, weight2, output1); output1 = mad(0.0f, weight3, output1); } - }else { + } else { output1 = mad(input1.x, weight0, output1); output1 = mad(input1.y, weight1, output1); output1 = mad(input1.z, weight2, output1); @@ -1310,33 +1434,34 @@ __kernel void conv_1x1_wrapped( } // -------------2-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y); + pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, + in_pos_in_one_block2.y); half4 input2 = read_imageh(input_image, sampler, pos_in); - if (abs(max_w_bound - pos_in.x) < input_width){ - if (burndary_index==0){ + if (abs(max_w_bound - pos_in.x) < input_width) { + if (burndary_index == 0) { output2 = mad(input2.x, weight0, output2); output2 = mad(input2.y, weight1, output2); output2 = mad(input2.z, weight2, output2); output2 = mad(input2.w, weight3, output2); - } else if (burndary_index==1){ + } else if 
(burndary_index == 1) { output2 = mad(input2.x, weight0, output2); output2 = mad(input2.y, weight1, output2); output2 = mad(input2.z, weight2, output2); output2 = mad(0.0f, weight3, output2); - } else if (burndary_index==2){ + } else if (burndary_index == 2) { output2 = mad(input2.x, weight0, output2); output2 = mad(input2.y, weight1, output2); output2 = mad(0.0f, weight2, output2); output2 = mad(0.0f, weight3, output2); - } else if (burndary_index==3){ + } else if (burndary_index == 3) { output2 = mad(input2.x, weight0, output2); output2 = mad(0.0f, weight1, output2); output2 = mad(0.0f, weight2, output2); output2 = mad(0.0f, weight3, output2); } - }else { + } else { output2 = mad(input2.x, weight0, output2); output2 = mad(input2.y, weight1, output2); output2 = mad(input2.z, weight2, output2); @@ -1344,33 +1469,34 @@ __kernel void conv_1x1_wrapped( } // -------------3-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); + pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, + in_pos_in_one_block3.y); half4 input3 = read_imageh(input_image, sampler, pos_in); - if (abs(max_w_bound - pos_in.x) < input_width){ - if (burndary_index==0){ + if (abs(max_w_bound - pos_in.x) < input_width) { + if (burndary_index == 0) { output3 = mad(input3.x, weight0, output3); output3 = mad(input3.y, weight1, output3); output3 = mad(input3.z, weight2, output3); output3 = mad(input3.w, weight3, output3); - } else if (burndary_index==1){ + } else if (burndary_index == 1) { output3 = mad(input3.x, weight0, output3); output3 = mad(input3.y, weight1, output3); output3 = mad(input3.z, weight2, output3); output3 = mad(0.0f, weight3, output3); - } else if (burndary_index==2){ + } else if (burndary_index == 2) { output3 = mad(input3.x, weight0, output3); output3 = mad(input3.y, weight1, output3); output3 = mad(0.0f, weight2, output3); output3 = mad(0.0f, weight3, output3); - } else if (burndary_index==3){ + } else if (burndary_index == 3) { 
output3 = mad(input3.x, weight0, output3); output3 = mad(0.0f, weight1, output3); output3 = mad(0.0f, weight2, output3); output3 = mad(0.0f, weight3, output3); } - }else { + } else { output3 = mad(input3.x, weight0, output3); output3 = mad(input3.y, weight1, output3); output3 = mad(input3.z, weight2, output3); @@ -1379,1015 +1505,1060 @@ __kernel void conv_1x1_wrapped( } #ifdef BATCH_NORM - output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif #ifdef RELU - output0 = activation(output0); - output1 = activation(output1); - output2 = activation(output2); - output3 = activation(output3); + output0 = activation(output0); + output1 = activation(output1); + output2 = activation(output2); + output3 = activation(output3); #endif if (out_w0 < old_w) { write_imageh(output_image, output_pos0, output0); } - if (out_w1 < old_w){ + if (out_w1 < old_w) { write_imageh(output_image, output_pos1, output1); } - if (out_w2 < old_w){ + if (out_w2 < old_w) { write_imageh(output_image, output_pos2, output2); } - if (out_w3 < old_w){ + if 
(out_w3 < old_w) { write_imageh(output_image, output_pos3, output3); } } -__kernel void conv_7x7(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter_image, - +__kernel void conv_7x7( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter_image, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int dilation, - __private const int input_width,/* of one block */ - __private const int input_height,/* of one block */ - __private const int output_width, - __private const int output_height) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - - if (out_c >= global_size_dim0 || - out_w >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } - const int filter_n0 = 4 * out_c + 0; - const int filter_n1 = 4 * out_c + 1; - const int filter_n2 = 4 * out_c + 2; - const int filter_n3 = 4 * out_c + 3; + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = 
get_global_id(1); + const int out_nh = get_global_id(2); + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - int2 stride_xy; - stride_xy.x = stride; - stride_xy.y = stride; + if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } + const int filter_n0 = 4 * out_c + 0; + const int filter_n1 = 4 * out_c + 1; + const int filter_n2 = 4 * out_c + 2; + const int filter_n3 = 4 * out_c + 3; - int2 ouput_pos_in_one_block; - ouput_pos_in_one_block.x = out_w; - ouput_pos_in_one_block.y = out_nh; + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 in_pos_in_one_block; - in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; - in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; #ifdef BIASE_CH - half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - half4 output = read_imageh(bias, sampler, output_pos); + half4 output = read_imageh(bias, sampler, output_pos); #else - half4 output = 0.0f; -#endif - - half4 input; - half4 filter[4]; - int2 filter_pos0; - int2 filter_pos1; - int2 filter_pos2; - int2 filter_pos3; - for (int i = 0; i < input_c; ++i) { - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - for(int j = 0; j < 7; j++){ - for(int k = 0; k < 7; k++){ - input = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + (j - 3) * dilation, pos_in.y + (k - 
3) * dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + (j - 3) * dilation < 0 || in_pos_in_one_block.y + (k - 3) * dilation < 0 || in_pos_in_one_block.x + (j - 3) * dilation >= input_width || in_pos_in_one_block.y + (k - 3) * dilation >= input_height) << 15)); - int filter_h = k; - int filter_w = j; - int filter_c = i; - - filter_pos0.x = filter_c * 7 + filter_w; - filter_pos0.y = filter_n0 * 7 + filter_h; - - filter_pos1.x = filter_c * 7 + filter_w; - filter_pos1.y = filter_n1 * 7 + filter_h; - - filter_pos2.x = filter_c * 7 + filter_w; - filter_pos2.y = filter_n2 * 7 + filter_h; - - filter_pos3.x = filter_c * 7 + filter_w; - filter_pos3.y = filter_n3 * 7 + filter_h; - - filter[0] = read_imageh(filter_image, sampler, filter_pos0); - filter[1] = read_imageh(filter_image, sampler, filter_pos1); - filter[2] = read_imageh(filter_image, sampler, filter_pos2); - filter[3] = read_imageh(filter_image, sampler, filter_pos3); - - output.x += dot(input, filter[0]); - output.y += dot(input, filter[1]); - output.z += dot(input, filter[2]); - output.w += dot(input, filter[3]); - } - } + half4 output = 0.0f; +#endif + + half4 input; + half4 filter[4]; + int2 filter_pos0; + int2 filter_pos1; + int2 filter_pos2; + int2 filter_pos3; + for (int i = 0; i < input_c; ++i) { + int2 pos_in = + (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + for (int j = 0; j < 7; j++) { + for (int k = 0; k < 7; k++) { + input = select( + read_imageh(input_image, sampler, + (int2)(pos_in.x + (j - 3) * dilation, + pos_in.y + (k - 3) * dilation)), + (half4)(0.0f), + (ushort4)( + (in_pos_in_one_block.x + (j - 3) * dilation < 0 || + in_pos_in_one_block.y + (k - 3) * dilation < 0 || + in_pos_in_one_block.x + (j - 3) * dilation >= input_width || + in_pos_in_one_block.y + (k - 3) * dilation >= input_height) + << 15)); + int filter_h = k; + int filter_w = j; + int filter_c = i; + + filter_pos0.x = filter_c * 7 + filter_w; + filter_pos0.y = filter_n0 * 7 + filter_h; + + 
filter_pos1.x = filter_c * 7 + filter_w; + filter_pos1.y = filter_n1 * 7 + filter_h; + + filter_pos2.x = filter_c * 7 + filter_w; + filter_pos2.y = filter_n2 * 7 + filter_h; + + filter_pos3.x = filter_c * 7 + filter_w; + filter_pos3.y = filter_n3 * 7 + filter_h; + + filter[0] = read_imageh(filter_image, sampler, filter_pos0); + filter[1] = read_imageh(filter_image, sampler, filter_pos1); + filter[2] = read_imageh(filter_image, sampler, filter_pos2); + filter[3] = read_imageh(filter_image, sampler, filter_pos3); + + output.x += dot(input, filter[0]); + output.y += dot(input, filter[1]); + output.z += dot(input, filter[2]); + output.w += dot(input, filter[3]); + } } + } #ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif #ifdef RELU - output = activation(output); + output = activation(output); #endif - write_imageh(output_image, output_pos, output); + write_imageh(output_image, output_pos, output); } -__kernel void conv_7x7Pt1x2(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter_image, - +__kernel void conv_7x7Pt1x2( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter_image, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int dilation, - __private const int input_width,/* of one block 
*/ - __private const int input_height,/* of one block */ - __private const int output_width, - __private const int output_height) { - - const int out_c = get_global_id(0); - const int out_w1 = get_global_id(1); - const int out_nh = get_global_id(2); - - if (out_c >= global_size_dim0 || - out_w1 >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } - const int out_w = out_w1 * 2; + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w1 = get_global_id(1); + const int out_nh = get_global_id(2); - int2 output_pos = (int2)(out_c * output_width + out_w, out_nh); + if (out_c >= global_size_dim0 || out_w1 >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } + const int out_w = out_w1 * 2; - const int filter_n0 = 4 * out_c + 0; - const int filter_n1 = 4 * out_c + 1; - const int filter_n2 = 4 * out_c + 2; - const int filter_n3 = 4 * out_c + 3; + int2 output_pos = (int2)(out_c * output_width + out_w, out_nh); - int2 stride_xy; - stride_xy.x = stride; - stride_xy.y = stride; + const int filter_n0 = 4 * out_c + 0; + const int filter_n1 = 4 * out_c + 1; + const int filter_n2 = 4 * out_c + 2; + const int filter_n3 = 4 * out_c + 3; - int2 ouput_pos_in_one_block; - ouput_pos_in_one_block.x = out_w; - ouput_pos_in_one_block.y = out_nh; + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + 
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 in_pos_in_one_block; - in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; - in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; - half4 output0 = 0.0f; - half4 output1 = 0.0f; + half4 output0 = 0.0f; + half4 output1 = 0.0f; #ifdef BIASE_CH - output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); - output1 = output0; + output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); + output1 = output0; #elif defined(BIASE_ELE) - output0 = read_imageh(bias, sampler, output_pos); - output1 = read_imageh(bias, sampler, (int2)(output_pos.x + 1, output_pos.y)); + output0 = read_imageh(bias, sampler, output_pos); + output1 = read_imageh(bias, sampler, (int2)(output_pos.x + 1, output_pos.y)); #else - output0 = 0.0f; - output1 = 0.0f; -#endif - - half4 input[8]; - half4 filter0[4]; - half4 filter1[4]; - half4 filter2[4]; - half4 filter3[4]; - int2 filter_pos0; - int2 filter_pos1; - int2 filter_pos2; - int2 filter_pos3; - for (int i = 0; i < input_c; ++i) { - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - for(int k = 0; k < 7; k++){ - for (int j = 0; j < 8; j++) { - input[j] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + (j - 3) * dilation, pos_in.y + (k - 3) * dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + (j - 3) * dilation < 0 || in_pos_in_one_block.y + (k - 3) * dilation < 0 || in_pos_in_one_block.x + (j - 3) * dilation >= input_width || in_pos_in_one_block.y + (k - 3) * dilation >= input_height) << 15)); - - int filter_h = k; - int filter_w = j; - int filter_c = i; - - if (j < 7) { - filter_pos0.x = filter_c * 7 + filter_w; - filter_pos0.y = filter_n0 * 7 + filter_h; - - filter_pos1.x = filter_c * 7 + filter_w; - filter_pos1.y = 
filter_n1 * 7 + filter_h; - - filter_pos2.x = filter_c * 7 + filter_w; - filter_pos2.y = filter_n2 * 7 + filter_h; - - filter_pos3.x = filter_c * 7 + filter_w; - filter_pos3.y = filter_n3 * 7 + filter_h; - - filter0[0] = read_imageh(filter_image, sampler, filter_pos0); - filter0[1] = read_imageh(filter_image, sampler, filter_pos1); - filter0[2] = read_imageh(filter_image, sampler, filter_pos2); - filter0[3] = read_imageh(filter_image, sampler, filter_pos3); - - output0.x += dot(input[j], filter0[0]); - output0.y += dot(input[j], filter0[1]); - output0.z += dot(input[j], filter0[2]); - output0.w += dot(input[j], filter0[3]); - } - - if (j > 0) { - output1.x += dot(input[j], filter1[0]); - output1.y += dot(input[j], filter1[1]); - output1.z += dot(input[j], filter1[2]); - output1.w += dot(input[j], filter1[3]); - } - - filter1[0] = filter0[0]; - filter1[1] = filter0[1]; - filter1[2] = filter0[2]; - filter1[3] = filter0[3]; - } + output0 = 0.0f; + output1 = 0.0f; +#endif + + half4 input[8]; + half4 filter0[4]; + half4 filter1[4]; + half4 filter2[4]; + half4 filter3[4]; + int2 filter_pos0; + int2 filter_pos1; + int2 filter_pos2; + int2 filter_pos3; + for (int i = 0; i < input_c; ++i) { + int2 pos_in = + (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + for (int k = 0; k < 7; k++) { + for (int j = 0; j < 8; j++) { + input[j] = select( + read_imageh(input_image, sampler, + (int2)(pos_in.x + (j - 3) * dilation, + pos_in.y + (k - 3) * dilation)), + (half4)(0.0f), + (ushort4)( + (in_pos_in_one_block.x + (j - 3) * dilation < 0 || + in_pos_in_one_block.y + (k - 3) * dilation < 0 || + in_pos_in_one_block.x + (j - 3) * dilation >= input_width || + in_pos_in_one_block.y + (k - 3) * dilation >= input_height) + << 15)); + + int filter_h = k; + int filter_w = j; + int filter_c = i; + + if (j < 7) { + filter_pos0.x = filter_c * 7 + filter_w; + filter_pos0.y = filter_n0 * 7 + filter_h; + + filter_pos1.x = filter_c * 7 + filter_w; + filter_pos1.y = filter_n1 * 
7 + filter_h; + + filter_pos2.x = filter_c * 7 + filter_w; + filter_pos2.y = filter_n2 * 7 + filter_h; + + filter_pos3.x = filter_c * 7 + filter_w; + filter_pos3.y = filter_n3 * 7 + filter_h; + + filter0[0] = read_imageh(filter_image, sampler, filter_pos0); + filter0[1] = read_imageh(filter_image, sampler, filter_pos1); + filter0[2] = read_imageh(filter_image, sampler, filter_pos2); + filter0[3] = read_imageh(filter_image, sampler, filter_pos3); + + output0.x += dot(input[j], filter0[0]); + output0.y += dot(input[j], filter0[1]); + output0.z += dot(input[j], filter0[2]); + output0.w += dot(input[j], filter0[3]); + } + + if (j > 0) { + output1.x += dot(input[j], filter1[0]); + output1.y += dot(input[j], filter1[1]); + output1.z += dot(input[j], filter1[2]); + output1.w += dot(input[j], filter1[3]); } - } + + filter1[0] = filter0[0]; + filter1[1] = filter0[1]; + filter1[2] = filter0[2]; + filter1[3] = filter0[3]; + } + } + } #ifdef BATCH_NORM - half4 s = read_imageh(new_scale, sampler, (int2)(out_c, 0)); - half4 b = read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output0 = output0 * s + b; - output1 = output1 * s + b; + half4 s = read_imageh(new_scale, sampler, (int2)(out_c, 0)); + half4 b = read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output0 = output0 * s + b; + output1 = output1 * s + b; #endif #ifdef RELU - output0 = activation(output0); - output1 = activation(output1); + output0 = activation(output0); + output1 = activation(output1); #endif - write_imageh(output_image, output_pos, output0); - if ((output_pos.x + 1) % output_width != 0) { - write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output1); - } + write_imageh(output_image, output_pos, output0); + if ((output_pos.x + 1) % output_width != 0) { + write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output1); + } } // dilation == 1 -__kernel void conv_7x7spl(__private const int item_ch, - __private const int item_w, - __private const int item_h, - __read_only 
image2d_t input_image, - __read_only image2d_t filter_image, +__kernel void conv_7x7spl( + __private const int item_ch, __private const int item_w, + __private const int item_h, __read_only image2d_t input_image, + __read_only image2d_t filter_image, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM -__read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int in_ch, - __private const int in_w, - __private const int in_h, - __private const int out_w, - __private const int out_h) { - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - // filter - const int filter_w = 7; - const int filter_h = 7; - - // item_id - const int item_ch_id = get_global_id(0); - const int item_w_id = get_global_id(1); - const int item_h_id = get_global_id(2); - - // out_width_id_per_blk and out_batch_id - int out_batch_id = item_h_id / in_h; - int out_w_base_id = item_ch_id * out_w; - int out_w_id0 = item_w_id; - int out_w_id1 = out_w_id0 + item_w; - int out_w_id2 = out_w_id1 + item_w; - int out_w_id3 = out_w_id2 + item_w; - int out_w_id4 = out_w_id3 + item_w; - - // in_width_id_per_blk and in_height_id_per_batch - int in_h_id = (item_h_id % out_h) * stride - pad; - int in_w_id0 = item_w_id * stride - pad; - int in_w_id1 = in_w_id0 + item_w * stride; - int in_w_id2 = in_w_id1 + item_w * stride; - int in_w_id3 = in_w_id2 + item_w * stride; - int in_w_id4 = in_w_id3 + item_w * stride; + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int pad, __private const int dilation, + __private const int in_ch, __private const int in_w, + __private const int in_h, __private const int out_w, + __private 
const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 7; + const int filter_h = 7; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_batch_id = item_h_id / in_h; + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; #ifdef BIASE_CH - half4 output[5]; - output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0)); - output[1] = output[0]; - output[2] = output[0]; - output[3] = output[0]; - output[4] = output[0]; + half4 output[5]; + output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; #elif defined(BIASE_ELE) - half4 output[5]; - output[0] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id)); - if (out_w_id1 < out_w) { - output[1] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id1, item_h_id)); - } - if (out_w_id2 < out_w) { - output[2] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id2, item_h_id)); - } - if (out_w_id3 < out_w) { - output[3] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id3, item_h_id)); - } - if (out_w_id4 < out_w) { - output[4] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id4, item_h_id)); - } + half4 output[5]; + output[0] = + 
read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = read_imageh(bias, sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = read_imageh(bias, sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = read_imageh(bias, sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = read_imageh(bias, sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } #else - half4 output[5] = {0.0f}; -#endif - - half4 filter[4] = {0.0f}; - half4 filter_trans[4] = {0.0f}; - half4 input[5] = {0.0f}; - - int filter_h_val0 = item_ch_id * 4 * filter_h; - int filter_h_val1 = filter_h_val0 + filter_h; - int filter_h_val2 = filter_h_val1 + filter_h; - int filter_h_val3 = filter_h_val2 + filter_h; - - for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { - int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; - - const int in_w_base_id = mul24(ch, in_w); - - int filter_w_val = ch * filter_w; - - for (int h = 0; h < filter_h; h++) { - - int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1, - (out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h)); - - for (int w = 0; w < filter_w; w++) { - - int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, - (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); - int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1, - (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); - int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1, - (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); - int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1, - (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); - int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1, - (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); - - filter[0] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val0 + h)); // in_ch:0-3,out_ch:0 - filter[1] = 
read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val1 + h)); // in_ch:0-3,out_ch:1 - filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2 - filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3 - - filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x); // in_ch:0,out_ch:0-3 - filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y); // in_ch:1,out_ch:0-3 - filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z); // in_ch:2,out_ch:0-3 - filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, filter[3].w); // in_ch:3,out_ch:0-3 - - input[0] = read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val)); - input[1] = read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val)); - input[2] = read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val)); - input[3] = read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val)); - input[4] = read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val)); - - output[0] = mad(input[0].x, filter_trans[0], output[0]); - output[1] = mad(input[1].x, filter_trans[0], output[1]); - output[2] = mad(input[2].x, filter_trans[0], output[2]); - output[3] = mad(input[3].x, filter_trans[0], output[3]); - output[4] = mad(input[4].x, filter_trans[0], output[4]); - - if (ch_surplus < 3) { - output[0] = mad(input[0].y, filter_trans[1], output[0]); - output[1] = mad(input[1].y, filter_trans[1], output[1]); - output[2] = mad(input[2].y, filter_trans[1], output[2]); - output[3] = mad(input[3].y, filter_trans[1], output[3]); - output[4] = mad(input[4].y, filter_trans[1], output[4]); - } - if (ch_surplus < 2) { - output[0] = mad(input[0].z, filter_trans[2], output[0]); - output[1] = mad(input[1].z, filter_trans[2], output[1]); - output[2] = mad(input[2].z, filter_trans[2], output[2]); - output[3] = mad(input[3].z, 
filter_trans[2], output[3]); - output[4] = mad(input[4].z, filter_trans[2], output[4]); - } - if (ch_surplus < 1) { - output[0] = mad(input[0].w, filter_trans[3], output[0]); - output[1] = mad(input[1].w, filter_trans[3], output[1]); - output[2] = mad(input[2].w, filter_trans[3], output[2]); - output[3] = mad(input[3].w, filter_trans[3], output[3]); - output[4] = mad(input[4].w, filter_trans[3], output[4]); - } - } + half4 output[5] = {0.0f}; +#endif + + half4 filter[4] = {0.0f}; + half4 filter_trans[4] = {0.0f}; + half4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1, + (out_batch_id * in_h + in_h_id + h < 0 || + out_batch_id * in_h + in_h_id + h >= in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = read_imageh( + filter_image, sampler, + (int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = read_imageh( + filter_image, sampler, + (int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = read_imageh( + filter_image, 
sampler, + (int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = read_imageh( + filter_image, sampler, + (int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = + read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = + read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = + read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = + read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = + read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] 
= mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } } + } #ifdef BATCH_NORM - half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0)); - half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0)); - output[0] = mad(scale, output[0], biase); - if (out_w_id1 < out_w) { - output[1] = mad(scale, output[1], biase); - } - if (out_w_id2 < out_w) { - output[2] = mad(scale, output[2], biase); - } - if (out_w_id3 < out_w) { - output[3] = mad(scale, output[3], biase); - } - if (out_w_id4 < out_w) { - output[4] = mad(scale, output[4], biase); - } + half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0)); + half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0)); + output[0] = mad(scale, output[0], biase); + if (out_w_id1 < out_w) { + output[1] = mad(scale, output[1], biase); + } + if (out_w_id2 < out_w) { + output[2] = mad(scale, output[2], biase); + } + if (out_w_id3 < out_w) { + output[3] = mad(scale, output[3], biase); + } + if (out_w_id4 < out_w) { + output[4] = mad(scale, output[4], biase); + } #endif #ifdef RELU - output[0] = activation(output[0]); - output[1] = activation(output[1]); - output[2] = activation(output[2]); - output[3] = activation(output[3]); - output[4] = activation(output[4]); -#endif - write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), output[0]); - if (out_w_id1 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), output[1]); - } - if (out_w_id2 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), output[2]); - } - if (out_w_id3 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), output[3]); - } - if (out_w_id4 < out_w) { - write_imageh(output_image, 
(int2)(out_w_base_id + out_w_id4, item_h_id), output[4]); - } + output[0] = activation(output[0]); + output[1] = activation(output[1]); + output[2] = activation(output[2]); + output[3] = activation(output[3]); + output[4] = activation(output[4]); +#endif + write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } } -__kernel void conv_5x5(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter_image, - +__kernel void conv_5x5( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter_image, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int dilation, - __private const int input_width,/* of one block */ - __private const int input_height,/* of one block */ - __private const int output_width, - __private const int output_height) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, 
out_nh); - - if (out_c >= global_size_dim0 || - out_w >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } - const filter_n0 = 4 * out_c + 0; - const filter_n1 = 4 * out_c + 1; - const filter_n2 = 4 * out_c + 2; - const filter_n3 = 4 * out_c + 3; + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, __private const int output_height) { - int2 stride_xy; - stride_xy.x = stride; - stride_xy.y = stride; + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + + if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } + const filter_n0 = 4 * out_c + 0; + const filter_n1 = 4 * out_c + 1; + const filter_n2 = 4 * out_c + 2; + const filter_n3 = 4 * out_c + 3; - int2 ouput_pos_in_one_block; - ouput_pos_in_one_block.x = out_w; - ouput_pos_in_one_block.y = out_nh; + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 in_pos_in_one_block; - in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; - in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; 
#ifdef BIASE_CH - half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - half4 output = read_imageh(bias, sampler, output_pos); + half4 output = read_imageh(bias, sampler, output_pos); #else - half4 output = 0.0f; -#endif - - half4 input; - half4 filter[4]; - int2 filter_pos0; - int2 filter_pos1; - int2 filter_pos2; - int2 filter_pos3; - for (int i = 0; i < input_c; ++i) { - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - for(int j = 0; j < 5; j++){ - for(int k = 0; k < 5; k++){ - input = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + (j - 2) * dilation, pos_in.y + (k - 2) * dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + (j - 2) * dilation < 0 || in_pos_in_one_block.y + (k - 2) * dilation < 0 || in_pos_in_one_block.x + (j - 2) * dilation >= input_width || in_pos_in_one_block.y + (k - 2) * dilation >= input_height) << 15)); - int filter_h = k; - int filter_w = j; - int filter_c = i; - - filter_pos0.x = filter_c * 5 + filter_w; - filter_pos0.y = filter_n0 * 5 + filter_h; - - filter_pos1.x = filter_c * 5 + filter_w; - filter_pos1.y = filter_n1 * 5 + filter_h; - - filter_pos2.x = filter_c * 5 + filter_w; - filter_pos2.y = filter_n2 * 5 + filter_h; - - filter_pos3.x = filter_c * 5 + filter_w; - filter_pos3.y = filter_n3 * 5 + filter_h; - - filter[0] = read_imageh(filter_image, sampler, filter_pos0); - filter[1] = read_imageh(filter_image, sampler, filter_pos1); - filter[2] = read_imageh(filter_image, sampler, filter_pos2); - filter[3] = read_imageh(filter_image, sampler, filter_pos3); - - output.x += dot(input, filter[0]); - output.y += dot(input, filter[1]); - output.z += dot(input, filter[2]); - output.w += dot(input, filter[3]); - } - } + half4 output = 0.0f; +#endif + + half4 input; + half4 filter[4]; + int2 filter_pos0; + int2 filter_pos1; + int2 filter_pos2; + int2 filter_pos3; + for (int i = 0; i 
< input_c; ++i) { + int2 pos_in = + (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + for (int j = 0; j < 5; j++) { + for (int k = 0; k < 5; k++) { + input = select( + read_imageh(input_image, sampler, + (int2)(pos_in.x + (j - 2) * dilation, + pos_in.y + (k - 2) * dilation)), + (half4)(0.0f), + (ushort4)( + (in_pos_in_one_block.x + (j - 2) * dilation < 0 || + in_pos_in_one_block.y + (k - 2) * dilation < 0 || + in_pos_in_one_block.x + (j - 2) * dilation >= input_width || + in_pos_in_one_block.y + (k - 2) * dilation >= input_height) + << 15)); + int filter_h = k; + int filter_w = j; + int filter_c = i; + + filter_pos0.x = filter_c * 5 + filter_w; + filter_pos0.y = filter_n0 * 5 + filter_h; + + filter_pos1.x = filter_c * 5 + filter_w; + filter_pos1.y = filter_n1 * 5 + filter_h; + + filter_pos2.x = filter_c * 5 + filter_w; + filter_pos2.y = filter_n2 * 5 + filter_h; + + filter_pos3.x = filter_c * 5 + filter_w; + filter_pos3.y = filter_n3 * 5 + filter_h; + + filter[0] = read_imageh(filter_image, sampler, filter_pos0); + filter[1] = read_imageh(filter_image, sampler, filter_pos1); + filter[2] = read_imageh(filter_image, sampler, filter_pos2); + filter[3] = read_imageh(filter_image, sampler, filter_pos3); + + output.x += dot(input, filter[0]); + output.y += dot(input, filter[1]); + output.z += dot(input, filter[2]); + output.w += dot(input, filter[3]); + } } + } #ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif #ifdef RELU - output = activation(output); + output = activation(output); #endif - write_imageh(output_image, output_pos, output); + write_imageh(output_image, output_pos, output); } -__kernel void convBNAdd_3x3(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int 
global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter, - +__kernel void convBNAdd_3x3( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int dilation, - __private const int input_width,/* of one block */ - __private const int input_height,/* of one block */ - __private const int output_width, - __private const int output_height) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - - if (out_c >= global_size_dim0 || - out_w >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } - - - int2 stride_xy; - stride_xy.x = stride; - stride_xy.y = stride; - - int2 ouput_pos_in_one_block; - ouput_pos_in_one_block.x = out_w; - ouput_pos_in_one_block.y = out_nh; - - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - int2 in_pos_in_one_block; - in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; - in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, __private 
const int output_height) { - half4 output = (half4)0.0f; + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); - half4 input[9]; + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - for (int i = 0; i < input_c; ++i) { - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - input[0] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } - input[1] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; - input[2] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; - input[3] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | 
CLK_FILTER_NEAREST; - input[4] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; - input[5] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + half4 output = (half4)0.0f; - input[6] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + half4 input[9]; - input[7] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + for (int i = 0; i < input_c; ++i) { + int2 pos_in = + (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + input[0] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + + input[1] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (half4)(0.0f), + 
(ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + + input[2] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + + input[3] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[4] = select( + read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[5] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[6] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + input[7] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x >= input_width || + 
in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + input[8] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); - input[8] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + /* + for (int j = 0; j < 9; ++j) { + int2 pos_of_weight; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + half4 weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + half4 weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); -/* - for (int j = 0; j < 9; ++j) { - int2 pos_of_weight; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - half4 weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - half4 weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - half4 weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - half4 weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - } -*/ - int j = 0; - int2 pos_of_weight; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - half4 weight_x = 
read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - half4 weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - half4 weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - half4 weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 1; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 2; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 3; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - 
output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 4; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 5; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 6; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 
4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 7; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 8; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + half4 weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); - } + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + half4 weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += 
dot(input[j], weight_w); + } + */ + int j = 0; + int2 pos_of_weight; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + half4 weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + half4 weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + half4 weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + half4 weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 1; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 2; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += 
dot(input[j], weight_w); + + j = 3; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 4; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 5; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 6; + pos_of_weight.x = i * 3 + j % 
3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 7; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 8; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + } #ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + 
read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif #ifdef BIASE_CH - output += read_imageh(bias, sampler, (int2)(out_c, 0)); + output += read_imageh(bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - output += read_imageh(bias, sampler, output_pos); + output += read_imageh(bias, sampler, output_pos); #endif #ifdef RELU - output = activation(output); + output = activation(output); #endif - write_imageh(output_image, output_pos, output); + write_imageh(output_image, output_pos, output); } -__kernel void convBNAdd_1x1(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter, +__kernel void convBNAdd_1x1( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int dilation, - __private const int input_width,/* of one block */ - __private const int input_height,/* of one block */ - __private const int output_width, - __private const int output_height) { + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, 
__private const int output_height) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; const uint kernelHXW = 1; int2 stride_xy = (int2)(stride, stride); int2 ouput_pos_in_one_block = (int2)(out_w, out_nh); - int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); - + int2 in_pos_in_one_block = + ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); half4 output = 0.0f; - for (int i = 0; i < input_c; ++i) { - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - half4 input = read_imageh(input_image, sampler, pos_in); - - half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); - half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); - half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); - half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); -/* - output.x = dot(input, weight0); - output.y = dot(input, weight1); - output.z = dot(input, weight2); - output.w = dot(input, weight3); -*/ + for (int i = 0; i < input_c; ++i) { + int2 pos_in = + (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + half4 input = read_imageh(input_image, sampler, pos_in); - output = mad(input.x, weight0, output); - output = mad(input.y, weight1, output); - output = mad(input.z, weight2, output); - output = mad(input.w, weight3, output); + half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); + half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); + half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); + half4 weight3 = read_imageh(filter, sampler, 
(int2)(out_c, i * 4 + 3)); + /* + output.x = dot(input, weight0); + output.y = dot(input, weight1); + output.z = dot(input, weight2); + output.w = dot(input, weight3); + */ - } + output = mad(input.x, weight0, output); + output = mad(input.y, weight1, output); + output = mad(input.z, weight2, output); + output = mad(input.w, weight3, output); + } #ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif #ifdef BIASE_CH - output += read_imageh(bias, sampler, (int2)(out_c, 0)); + output += read_imageh(bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - output += read_imageh(bias, sampler, output_pos); + output += read_imageh(bias, sampler, output_pos); #endif #ifdef RELU @@ -2398,24 +2569,22 @@ __kernel void convBNAdd_1x1(__private const int global_size_dim0, } __kernel void convBNAdd_1x1_spl( - __private const int global_size_dim0, __private const int global_size_dim1, - __private const int global_size_dim2, __read_only image2d_t input_image, - __read_only image2d_t filter, + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, __private const int stride, - __private const int offset, __private const int input_c, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, - __private const int output_height, - __private const int old_w -) { + __read_only image2d_t 
new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, __private const int output_height, + __private const int old_w) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -2426,33 +2595,32 @@ __kernel void convBNAdd_1x1_spl( int out_w2 = out_w + global_size_dim1 * 2; int out_w3 = out_w + global_size_dim1 * 3; - int outpos_main = mul24(out_c , old_w); + int outpos_main = mul24(out_c, old_w); int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh); int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh); const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; int2 stride_xy = (int2)(stride, stride); int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh); int2 in_pos_in_one_block0 = - ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset); + ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset); int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh); int2 in_pos_in_one_block1 = - ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset); + ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset); int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh); int2 in_pos_in_one_block2 = - ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset); + ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset); int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh); int2 in_pos_in_one_block3 = - ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); - + ouput_pos_in_one_block3 * stride_xy + (int2)(offset, 
offset); half4 output0 = 0.0f; half4 output1 = 0.0f; @@ -2461,7 +2629,8 @@ __kernel void convBNAdd_1x1_spl( for (int i = 0; i < input_c; ++i) { // ------------0--------------- - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, + in_pos_in_one_block0.y); half4 input0 = read_imageh(input_image, sampler, pos_in); half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); @@ -2475,7 +2644,8 @@ __kernel void convBNAdd_1x1_spl( output0 = mad(input0.w, weight3, output0); // -------------1-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); + pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, + in_pos_in_one_block1.y); half4 input1 = read_imageh(input_image, sampler, pos_in); // // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + @@ -2490,7 +2660,8 @@ __kernel void convBNAdd_1x1_spl( output1 = mad(input1.w, weight3, output1); // -------------2-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y); + pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, + in_pos_in_one_block2.y); half4 input2 = read_imageh(input_image, sampler, pos_in); // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + @@ -2505,7 +2676,8 @@ __kernel void convBNAdd_1x1_spl( output2 = mad(input2.w, weight3, output2); // -------------3-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); + pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, + in_pos_in_one_block3.y); half4 input3 = read_imageh(input_image, sampler, pos_in); // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + @@ -2521,29 +2693,29 @@ __kernel void convBNAdd_1x1_spl( } #ifdef BATCH_NORM - output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output0 = 
output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif #ifdef BIASE_CH - output0 += read_imageh(bias, sampler, (int2)(out_c, 0)); - output1 += read_imageh(bias, sampler, (int2)(out_c, 0)); - output2 += read_imageh(bias, sampler, (int2)(out_c, 0)); - output3 += read_imageh(bias, sampler, (int2)(out_c, 0)); + output0 += read_imageh(bias, sampler, (int2)(out_c, 0)); + output1 += read_imageh(bias, sampler, (int2)(out_c, 0)); + output2 += read_imageh(bias, sampler, (int2)(out_c, 0)); + output3 += read_imageh(bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - output0 += read_imageh(bias, sampler, output_pos0); - output1 += read_imageh(bias, sampler, output_pos1); - output2 += read_imageh(bias, sampler, output_pos2); - output3 += read_imageh(bias, sampler, output_pos3); + output0 += read_imageh(bias, sampler, output_pos0); + output1 += read_imageh(bias, sampler, output_pos1); + output2 += read_imageh(bias, sampler, output_pos2); + output3 += read_imageh(bias, sampler, output_pos3); #endif #ifdef RELU @@ -2557,22 +2729,108 @@ __kernel void convBNAdd_1x1_spl( write_imageh(output_image, output_pos0, output0); } - if (out_w1 < old_w){ + if (out_w1 < 
old_w) { write_imageh(output_image, output_pos1, output1); } - if (out_w2 < old_w){ + if (out_w2 < old_w) { write_imageh(output_image, output_pos2, output2); } - if (out_w3 < old_w){ + if (out_w3 < old_w) { write_imageh(output_image, output_pos3, output3); } } +__kernel void depth_conv( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input, + __read_only image2d_t filter, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, __private const int output_height, + __private const int filter_width, __private const int filter_height) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + const int batch_index = out_nh / output_height; + const int out_nh_in_one_batch = out_nh % output_height; + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); + int2 in_pos_in_one_block = + ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); +#ifdef BIASE_CH + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#elif defined(BIASE_ELE) + half4 output = read_imageh(bias, sampler, output_pos); +#else + half4 output = 0.0f; +#endif + int2 pos_in_input_block = + (int2)(out_c * input_width, batch_index * input_height); + int2 pos_in_filter_block = + (int2)(out_c * filter_width, 
batch_index * filter_height); + int filter_x = pos_in_filter_block.x; + int filter_y = pos_in_filter_block.y; + int input_x_base = pos_in_input_block.x + in_pos_in_one_block.x; + int input_y_base = pos_in_input_block.y + in_pos_in_one_block.y; + int2 align = {filter_width / 2, filter_height / 2}; + /* if (output_pos.x == 0 && output_pos.y == 0){ + printf("align.x=%d align.y=%d \n ",align.x,align.y); + printf("stride=%d \n ",stride); + }*/ + for (int fy = 0; fy < filter_height; ++fy) { + for (int fx = 0; fx < filter_width; ++fx) { + int x_off = fx - align.x; + int y_off = fy - align.y; + /* if (output_pos.x == 0 && output_pos.y == 0){ + printf("fx=%d fy=%d \n ",fx,fy); + printf("x_off=%d y_off=%d \n ",x_off,y_off); + }*/ + half4 in = select( + read_imageh(input, sampler, + (int2)(input_x_base + x_off, input_y_base + y_off)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + x_off < 0 || + in_pos_in_one_block.y + y_off < 0 || + in_pos_in_one_block.x + x_off >= input_width || + in_pos_in_one_block.y + y_off >= input_height) + << 15)); + half4 f = + read_imageh(filter, sampler, (int2)(filter_x + fx, filter_y + fy)); + output += in * f; + /*if (output_pos.x ==111 && output_pos.y == 0){ + printf("in={ %f , %f , %f , %f } \n + ",convert_float(in.x),convert_float(in.y),convert_float(in.z),convert_float(in.w)); + printf("filter={ %f , %f , %f , %f } \n + ",convert_float(f.x),convert_float(f.y),convert_float(f.z),convert_float(f.w)); + printf("output={ %f , %f , %f , %f } \n + ",convert_float(output.x),convert_float(output.y),convert_float(output.z),convert_float(output.w)); + }*/ + } + } +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +#endif - - +#ifdef RELU + output = activation(output); +#endif + write_imageh(output_image, output_pos, output); +} \ No newline at end of file diff --git a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl 
b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl index b975eb405633b3d7252aea30671818066459b3ea..4895c07d201283d2b82e52209baf2baa896bc329 100644 --- a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl +++ b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl @@ -13,33 +13,101 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) { - int x = get_global_id(0); - int y = get_global_id(1); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 coords; - coords.x = x; - coords.y = y; - half4 in = read_imageh(input, sampler, coords); - half4 biase = read_imageh(bias, sampler, coords); - half4 output = in * biase; - write_imageh(outputImage,coords,output); - } - - -__kernel void channel_mul(__global image2d_t input, __global image2d_t bias,__write_only -image2d_t outputImage, int w) { +__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias, + __write_only image2d_t outputImage) { int x = get_global_id(0); int y = get_global_id(1); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + half4 in = read_imageh(input, sampler, coords); + half4 biase = read_imageh(bias, sampler, coords); + half4 output = in * biase; + write_imageh(outputImage, coords, output); +} + +__kernel void channel_mul(__global image2d_t input, __global image2d_t bias, + __write_only image2d_t outputImage, int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; int2 coords; 
coords.x = x; coords.y = y; int2 coords_bias; - coords_bias.x = x/w; + coords_bias.x = x / w; coords_bias.y = 0; half4 in = read_imageh(input, sampler, coords); half4 biase = read_imageh(bias, sampler, coords_bias); half4 output = in * biase; - write_imageh(outputImage,coords,output); + write_imageh(outputImage, coords, output); } + +// etc : 1 1 1 72 +// run time Y [value,0,0,0] * 72 +__kernel void channel_mul_d2(__global image2d_t input, __global image2d_t bias, + __write_only image2d_t outputImage, int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + + int2 coords_bias0; + int2 coords_bias1; + int2 coords_bias2; + int2 coords_bias3; + + /* if (x == 0 && y == 0) { + half4 b = (half4){0, 0, 0, 0}; + #define PPI(j, k) \ + b = read_imageh(bias, sampler, (int2){j, k}); \ + printf("bias(%d,%d)={ %f , %f , %f , %f }\n ", j, k, convert_float(b.x), \ + convert_float(b.y), convert_float(b.z), convert_float(b.w)); + for (int i = 0; i < 73; ++i) { + PPI(i, 0); + } + #undef PPI + }*/ + + coords_bias0.x = x / w * 4; + coords_bias0.y = 0; + + coords_bias1.x = x / w * 4 + 1; + coords_bias1.y = 0; + + coords_bias2.x = x / w * 4 + 2; + coords_bias2.y = 0; + + coords_bias3.x = x / w * 4 + 3; + coords_bias3.y = 0; + + half4 biase0 = read_imageh(bias, sampler, coords_bias0); + half4 biase1 = read_imageh(bias, sampler, coords_bias1); + half4 biase2 = read_imageh(bias, sampler, coords_bias2); + half4 biase3 = read_imageh(bias, sampler, coords_bias3); + /* if (x == 0 && y == 0) { + printf("bias0={ %f , %f , %f , %f }\n ", + convert_float(biase0.x), convert_float(biase0.y), + convert_float(biase0.z), convert_float(biase0.w)); + + printf("bias1={ %f , %f , %f , %f }\n ", + convert_float(biase1.x), convert_float(biase1.y), + convert_float(biase1.z), convert_float(biase1.w)); + printf("bias2={ %f , %f , %f , %f }\n ", + 
convert_float(biase2.x), convert_float(biase2.y), + convert_float(biase2.z), convert_float(biase2.w)); + printf("bias3={ %f , %f , %f , %f }\n ", + convert_float(biase3.x), convert_float(biase3.y), + convert_float(biase3.z), convert_float(biase3.w)); + }*/ + half4 biase = {biase0.x, biase1.x, biase2.x, biase3.x}; + half4 in = read_imageh(input, sampler, coords); + half4 output = mad(in, biase, 0); + write_imageh(outputImage, coords, output); +} \ No newline at end of file diff --git a/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp index a3a469dc8699fc0b185794d681cacb27d9f352ec..1772cd275b77901b2dfa389fec1c521cdfc85bac 100644 --- a/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -174,6 +174,16 @@ bool ConvAddBNReluKernel::Init( build_options); } + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] != 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + // other depthwise not with filter 3x3 + DLOG << "depth_conv basic "; + param->ExecMode() = ConvParam::EXEC_DEPTHWISEBASIC_FLOAT; + this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options); + } else if (param->Filter()->dims()[2] == 3 && param->Filter()->dims()[3] == 3) { // if (param->Strides()[0] == param->Strides()[1] && @@ -214,6 +224,7 @@ void ConvAddBNReluKernel::Compute( case ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT: case ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT: case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: + case ConvParam::EXEC_DEPTHWISEBASIC_FLOAT: ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(), param.NewScale(), param.NewBias()); break; diff --git a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp index 
a0e890a70b31e36c1743ae35b54fc5cb0446a8b3..94ffc001b4cbba7dc31f5073612cc01b47b7ec5c 100644 --- a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp @@ -71,6 +71,14 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { build_options); } + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] != 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + param->ExecMode() = ConvParam::EXEC_DEPTHWISEBASIC_FLOAT; + this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options); + } else if (param->Filter()->dims()[2] == 3 && param->Filter()->dims()[3] == 3) { // if (param->Strides()[0] == param->Strides()[1] && @@ -124,6 +132,7 @@ void ConvAddKernel::Compute( case ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT: case ConvParam::EXEC_SLIDINGWINDOW5x5_FLOAT: case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: + case ConvParam::EXEC_DEPTHWISEBASIC_FLOAT: ConvAddBnRelu(&this->cl_helper_, param, false, param.Bias()); break; case ConvParam::EXEC_SLIDINGWINDOW7x7_FLOAT: diff --git a/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp index 77738fe34c0f53816fcf726c0b2bc2f1c13a9010..370934849c08bca2a27411ea80468ec829e064ca 100644 --- a/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp @@ -72,6 +72,14 @@ bool ConvAddReluKernel::Init( build_options); } + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] != 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + DLOG << "init depwise conv basic"; + param->ExecMode() = ConvParam::EXEC_DEPTHWISEBASIC_FLOAT; + this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options); } else if (param->Filter()->dims()[2] 
== 3 && param->Filter()->dims()[3] == 3) { // if (param->Strides()[0] == param->Strides()[1] && @@ -130,6 +138,7 @@ void ConvAddReluKernel::Compute( case ConvParam::EXEC_SLIDINGWINDOW5x5_FLOAT: case ConvParam::EXEC_SLIDINGWINDOW7x7_FLOAT: case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: + case ConvParam::EXEC_DEPTHWISEBASIC_FLOAT: ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias()); break; case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: diff --git a/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp index c8cb97c2e2b6c23dbc0558593bb6200b286a63e2..02fdfb782e8e052ed3d4206e886bb2d50944a68f 100644 --- a/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp @@ -129,6 +129,14 @@ bool ConvBNReluKernel::Init( build_options); } + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] != 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + param->ExecMode() = ConvParam::EXEC_DEPTHWISEBASIC_FLOAT; + this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options); } else if (param->Filter()->dims()[2] == 3 && param->Filter()->dims()[3] == 3) { // if (param->Strides()[0] == param->Strides()[1] && @@ -168,6 +176,7 @@ void ConvBNReluKernel::Compute( case ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT: case ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT: case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: + case ConvParam::EXEC_DEPTHWISEBASIC_FLOAT: ConvAddBnRelu(&this->cl_helper_, param, true, nullptr, param.NewScale(), param.NewBias()); break; diff --git a/mobile/src/operators/kernel/cl/conv_kernel.cpp b/mobile/src/operators/kernel/cl/conv_kernel.cpp index 2859715b9c9ff9f7653849dcd952ed2d148e2f53..0965e5feb200a0c0d4f3489d0e241eb043e7f93f 100644 --- a/mobile/src/operators/kernel/cl/conv_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_kernel.cpp @@ -66,6 
+66,14 @@ bool ConvKernel::Init(ConvParam *param) { } DLOG << "depth_conv 3x3"; + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] != 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + param->ExecMode() = ConvParam::EXEC_DEPTHWISEBASIC_FLOAT; + this->cl_helper_.AddKernel("depth_conv", conv_kernel_file); } else if (param->Filter()->dims()[2] == 3 && param->Filter()->dims()[3] == 3) { // if (param->Strides()[0] == param->Strides()[1] && @@ -115,6 +123,7 @@ void ConvKernel::Compute(const ConvParam ¶m) { case ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT: case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: case ConvParam::EXEC_SLIDINGWINDOW7x7_FLOAT: + case ConvParam::EXEC_DEPTHWISEBASIC_FLOAT: ConvAddBnRelu(&this->cl_helper_, param); break; case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: diff --git a/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp index 0e63ccb095667529feb5dc28344bd54fbbd5b7cb..ecfc5fbd10bd7ff027d2d731805d63fc86821837 100644 --- a/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp @@ -72,6 +72,14 @@ bool ConvReluKernel::Init(FusionConvReluParam *param) { DLOG << "depth_conv 3x3"; + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] != 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + param->ExecMode() = ConvParam::EXEC_DEPTHWISEBASIC_FLOAT; + this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options); } else if (param->Filter()->dims()[2] == 3 && param->Filter()->dims()[3] == 3) { // if (param->Strides()[0] == param->Strides()[1] && @@ -120,6 +128,7 @@ void ConvReluKernel::Compute( case ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT: case ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT: case 
ConvParam::EXEC_DEPTHWISE3x3_FLOAT: + case ConvParam::EXEC_DEPTHWISEBASIC_FLOAT: ConvAddBnRelu(&this->cl_helper_, param, true); break; case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: diff --git a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp index 9f2aca78509ea45525f1dcd39a7a8154ca75060e..fd5b9e6bc3ffcce5ddde03e575cec0d1649758fc 100644 --- a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp +++ b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp @@ -15,6 +15,8 @@ limitations under the License. */ #ifdef ELEMENTWISEMUL_OP #include "operators/kernel/elementwise_mul_kernel.h" +#include +#include #include "framework/cl/cl_image.h" namespace paddle_mobile { @@ -23,19 +25,24 @@ namespace operators { template <> bool ElementwiseMulKernel::Init( ElementwiseMulParam *param) { - DLOG << "-----init add-----"; framework::CLImage *bias = reinterpret_cast( const_cast(param->InputY())); if (bias->dims() == param->InputX()->dims()) { + DLOG << "init element wise mul"; this->cl_helper_.AddKernel("elementwise_mul", "elementwise_mul_kernel.cl"); - } else if (bias->dims().size() == 4) { + } else if (bias->dims().size() == 1) { + DLOG << "init channel_mul"; this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl"); + } else if (bias->dims().size() == 2) { + // etc. 
input 1 72 28 28 + // filter 1 72 + DLOG << "init channel_mul_d2"; + this->cl_helper_.AddKernel("channel_mul_d2", "elementwise_mul_kernel.cl"); } else { - DLOG << "error:bias dims is error"; + PADDLE_MOBILE_ENFORCE(false, "element mul not supported yet"); } return true; } - template <> void ElementwiseMulKernel::Compute( const ElementwiseMulParam ¶m) { @@ -64,8 +71,8 @@ void ElementwiseMulKernel::Compute( clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); - } else if (bias->dims().size() == 4) { - DLOG << "zp7 444"; + } else if (bias->dims().size() == 1) { + DLOG << "channel mul"; cl_mem input_image = input->GetCLImage(); cl_mem bias_image = bias->GetCLImage(); cl_mem output_image = output->GetCLImage(); @@ -84,14 +91,48 @@ void ElementwiseMulKernel::Compute( CL_CHECK_ERRORS(status); auto width = input->ImageWidth(); auto height = input->ImageHeight(); - DLOG << "dede:" << width << "," << height; size_t global_work_size[2] = {width, height}; status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); + } else if (bias->dims().size() == 2) { + DLOG << "channel mul d2"; + + // etc. 
input 1 72 28 28 + // filter 1 72 --> 1 1 1 72 + DLOG << "input->ImageDims(): " << input->ImageDims(); + DLOG << "bias->ImageDims(): " << bias->ImageDims(); + DLOG << "out->ImageDims(): " << output->ImageDims(); + + DLOG << "channel mul d2"; + cl_mem input_image = input->GetCLImage(); + cl_mem bias_image = bias->GetCLImage(); + cl_mem output_image = output->GetCLImage(); + int tensor_w = input->dims()[input->dims().size() - 1]; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), + reinterpret_cast(&input_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), + reinterpret_cast(&bias_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), + reinterpret_cast(&output_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), + reinterpret_cast(&tensor_w)); + CL_CHECK_ERRORS(status); + auto width = input->ImageWidth(); + auto height = input->ImageHeight(); + size_t global_work_size[2] = {width, height}; + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + + // bias->PrintTensor(*bias); } else { - DLOG << "error:bias dims is error"; + PADDLE_MOBILE_ENFORCE(false, "element mul not support this situation yet") } } diff --git a/mobile/src/operators/op_param.h b/mobile/src/operators/op_param.h index 0415291a73eae356b0c25e1b98de471246e0e6c2..e58159fbb74e7a91a88c3e76f8aa713b679d94b8 100644 --- a/mobile/src/operators/op_param.h +++ b/mobile/src/operators/op_param.h @@ -489,6 +489,7 @@ class ConvParam : public OpParam { EXEC_SLIDINGWINDOW5x5_FLOAT, EXEC_SLIDINGWINDOW7x7_FLOAT, EXEC_GEMM1x1s1_FLOAT, + EXEC_DEPTHWISEBASIC_FLOAT, }; ExecMode &ExecMode() const { return exec_mode_; } diff --git a/mobile/test/net/test_net_multi_feed.cpp b/mobile/test/net/test_net_multi_feed.cpp index e15b37d92655f45bfcff4ff0b4b2ff2c3c2d9ae9..5c04a76ad31928a1f89cfaa35b708b5291401481 100644 --- 
a/mobile/test/net/test_net_multi_feed.cpp +++ b/mobile/test/net/test_net_multi_feed.cpp @@ -216,4 +216,6 @@ void test(int argc, char *argv[]) { std::cout << std::endl; } } +#else +int main() {} #endif