提交 6554854a 编写于 作者: L Liu Yiqun

Merge branch 'develop' into step_rnn/opt_ddim_lite

test=develop
......@@ -120,6 +120,7 @@
#
## Lite settings
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flto")
if (ARM_TARGET_OS STREQUAL "ios")
set(PLATFORM "OS")
elseif(ARM_TARGET_OS STREQUAL "ios64")
......
......@@ -305,6 +305,26 @@ if(NOT IOS)
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
# benchmark_bin: executable built from benchmark.cc. It links both the full and
# the light Paddle-Lite API plus gflags/utils, all ops and host kernels, and the
# kernel lists of each backend passed through the matching *_DEPS keyword.
lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
# multithread_test: executable built from lite_multithread_test.cc. It takes the
# same dependency set as benchmark_bin and additionally passes paddle_cv_arm
# through CV_DEPS.
lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
endif()
#lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/device_info.h"
#include "lite/core/profile/timer.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/basic_profiler.h"
#endif // LITE_WITH_PROFILE
#include <thread> // NOLINT
using paddle::lite::profile::Timer;
// Command-line flags specific to this test. model_dir, threads and repeats are
// also read by main() but are not defined in this file.

// Shapes of the first model's inputs. main() splits the value on ':' to get one
// entry per input tensor, then splits each entry on ',' to get its dimensions.
DEFINE_string(input_shape,
"1,3,224,224",
"input shapes, separated by colon and comma");
// Directory of the second model; main() only hands it to the two-model cases
// selected by test_type == 1.
DEFINE_string(model_dir_0, "", "model_dir_0");
// Shapes of the second model's inputs, in the same format as input_shape.
DEFINE_string(input_shape_0,
"1,3,224,224",
"input shapes another, separated by colon and comma");
// When true, main() uses the model directories as given and skips
// OutputOptModel(); when false, it first writes an optimized copy to
// "<model_dir>opt2" and runs from there.
DEFINE_bool(use_optimize_nb,
false,
"optimized & naive buffer model for mobile devices");
// 0: run RunTestType_00 then RunTestType_10 (single model).
// 1: run RunTestType_01 then RunTestType_11 (two models).
DEFINE_int32(test_type, 0, "multithread test type");
namespace paddle {
namespace lite_api {
void OutputOptModel(const std::string& load_model_dir,
const std::string& save_optimized_model_dir,
const std::vector<std::vector<int64_t>>& input_shapes) {
lite_api::CxxConfig config;
config.set_model_dir(load_model_dir);
config.set_valid_places({
Place{TARGET(kARM), PRECISION(kFloat)},
});
auto predictor = lite_api::CreatePaddlePredictor(config);
// delete old optimized model
int ret = system(
paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str())
.c_str());
if (ret == 0) {
LOG(INFO) << "delete old optimized model " << save_optimized_model_dir;
}
predictor->SaveOptimizedModel(save_optimized_model_dir,
LiteModelType::kNaiveBuffer);
LOG(INFO) << "Load model from " << load_model_dir;
LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
// Builds a private MobileConfig predictor for `model_dir`, fills every input
// tensor with 1.f, executes `warmup_times` untimed passes followed by `repeat`
// timed passes, and logs the first two output values of each timed pass plus
// the average / minimum / maximum latency.
//
// input_shapes  one shape per model input, indexed like GetInput().
// model_dir     directory of the (optimized) model to load.
// power_mode    forwarded to MobileConfig::set_power_mode.
// thread_num    forwarded to MobileConfig::set_threads.
// repeat        number of timed passes.
// tid           label used only to tag the log lines of this caller.
// warmup_times  number of untimed passes executed first.
void Run(const std::vector<std::vector<int64_t>>& input_shapes,
         const std::string& model_dir,
         const PowerMode power_mode,
         const int thread_num,
         const int repeat,
         int tid,
         const int warmup_times = 5) {
  lite_api::MobileConfig config;
  config.set_model_dir(model_dir);
  config.set_power_mode(power_mode);
  config.set_threads(thread_num);

  auto predictor = lite_api::CreatePaddlePredictor(config);

  for (size_t j = 0; j < input_shapes.size(); ++j) {
    auto input_tensor = predictor->GetInput(static_cast<int>(j));
    input_tensor->Resize(input_shapes[j]);
    auto input_data = input_tensor->mutable_data<float>();
    // Dimensions are int64_t, so accumulate the element count in 64 bits;
    // a plain int would silently narrow (and may overflow) for large shapes.
    int64_t input_num = 1;
    for (const int64_t dim : input_shapes[j]) {
      input_num *= dim;
    }
    for (int64_t i = 0; i < input_num; ++i) {
      input_data[i] = 1.f;
    }
  }

  for (int i = 0; i < warmup_times; ++i) {
    predictor->Run();
  }

  Timer ti;
  for (int j = 0; j < repeat; ++j) {
    ti.Start();
    predictor->Run();
    // The lap is recorded inside the timer; statistics are read back below.
    ti.Stop();
    auto output = predictor->GetOutput(0);
    auto out = output->data<float>();
    // NOTE(review): reads out[1] unconditionally, i.e. assumes output 0 holds
    // at least two elements -- confirm for the models under test.
    LOG(INFO) << "[thread " << tid << "] Model: " << model_dir
              << " output[0]:" << out[0] << "; output[1]:" << out[1];
  }
  LOG(INFO) << "[thread " << tid << "] Model: " << model_dir
            << ", power_mode: " << static_cast<int>(power_mode)
            << ", threads num " << thread_num
            << ", avg time: " << ti.LapTimes().Avg() << "ms"
            << ", min time: " << ti.LapTimes().Min() << " ms"
            << ", max time: " << ti.LapTimes().Max() << " ms.";
}
// Case 0/0: the same model is benchmarked concurrently by two independent
// predictors -- one created on a worker thread (tagged 0) and one on the
// calling thread (tagged 1). Returns once both have finished.
void RunTestType_00(const std::vector<std::vector<int64_t>>& input_shapes,
                    const std::string& model_dir,
                    const PowerMode power_mode,
                    const int thread_num,
                    const int repeat,
                    const int warmup_times = 5) {
  // All captures are references to parameters that outlive the join below.
  std::thread worker([&]() {
    Run(input_shapes,
        model_dir,
        power_mode,
        thread_num,
        repeat,
        /*tid=*/0,
        warmup_times);
  });

  Run(input_shapes,
      model_dir,
      power_mode,
      thread_num,
      repeat,
      /*tid=*/1,
      warmup_times);

  worker.join();
}
// Case 0/1: two different models are benchmarked concurrently. The first
// model (`model_dir`, `input_shapes`) runs on a worker thread tagged 0, the
// second (`model_dir_0`, `input_shapes_0`) on the calling thread tagged 1.
// Both share the same power mode, thread count, repeat and warm-up settings.
void RunTestType_01(const std::vector<std::vector<int64_t>>& input_shapes,
                    const std::string& model_dir,
                    const std::vector<std::vector<int64_t>>& input_shapes_0,
                    const std::string& model_dir_0,
                    const PowerMode power_mode,
                    const int thread_num,
                    const int repeat,
                    const int warmup_times = 5) {
  // All captures are references to parameters that outlive the join below.
  std::thread worker([&]() {
    Run(input_shapes,
        model_dir,
        power_mode,
        thread_num,
        repeat,
        /*tid=*/0,
        warmup_times);
  });

  Run(input_shapes_0,
      model_dir_0,
      power_mode,
      thread_num,
      repeat,
      /*tid=*/1,
      warmup_times);

  worker.join();
}
// Executes a single timed inference on an already-created predictor: every
// input is resized to the matching entry of `input_shapes` and filled with
// 1.f, the predictor is run once, and the elapsed time together with the first
// two values of output 0 is logged.
//
// predictor     predictor to run; shared with the caller.
// input_shapes  one shape per model input, indexed like GetInput().
// index         label used only to tag the log line.
// name          free-form name printed in the log line (callers pass the
//               model directory).
void run_with_predictor(std::shared_ptr<PaddlePredictor> predictor,
                        const std::vector<std::vector<int64_t>>& input_shapes,
                        int index,
                        const std::string& name) {
  for (size_t j = 0; j < input_shapes.size(); ++j) {
    auto input_tensor = predictor->GetInput(static_cast<int>(j));
    input_tensor->Resize(input_shapes[j]);
    auto input_data = input_tensor->mutable_data<float>();
    // Dimensions are int64_t, so accumulate the element count in 64 bits;
    // a plain int would silently narrow (and may overflow) for large shapes.
    int64_t input_num = 1;
    for (const int64_t dim : input_shapes[j]) {
      input_num *= dim;
    }
    for (int64_t i = 0; i < input_num; ++i) {
      input_data[i] = 1.f;
    }
  }

  Timer ti;
  ti.Start();
  predictor->Run();
  // The lap is recorded inside the timer and read back through LapTimes().
  ti.Stop();

  auto output = predictor->GetOutput(0);
  auto out = output->data<float>();
  // NOTE(review): reads out[1] unconditionally, i.e. assumes output 0 holds at
  // least two elements -- confirm for the models under test.
  LOG(INFO) << "[thread " << index << "] name: " << name
            << ",run time: " << ti.LapTimes().Avg() << "ms"
            << " output[0]:" << out[0] << "; output[1]:" << out[1];
}
void RunTestType_10(const std::vector<std::vector<int64_t>>& input_shapes,
const std::string& model_dir,
const PowerMode power_mode,
const int thread_num,
const int repeat,
int warmup = 5) {
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_power_mode(power_mode);
config.set_threads(thread_num);
auto predictor = lite_api::CreatePaddlePredictor(config);
for (int i = 0; i < repeat; ++i) {
std::thread pre_th0(
run_with_predictor, predictor, input_shapes, i, model_dir);
pre_th0.join();
}
}
void RunTestType_11(const std::vector<std::vector<int64_t>>& input_shapes,
const std::string& model_dir,
const std::vector<std::vector<int64_t>>& input_shapes_0,
const std::string& model_dir_0,
const PowerMode power_mode,
const int thread_num,
const int repeat,
int warmup = 5) {
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_power_mode(power_mode);
config.set_threads(thread_num);
auto predictor = lite_api::CreatePaddlePredictor(config);
config.set_model_dir(model_dir_0);
auto predictor_0 = lite_api::CreatePaddlePredictor(config);
for (int i = 0; i < 2 * repeat; i += 2) {
std::thread pre_th0(
run_with_predictor, predictor, input_shapes, i, model_dir);
std::thread pre_th1(
run_with_predictor, predictor_0, input_shapes_0, i + 1, model_dir_0);
pre_th0.join();
pre_th1.join();
}
}
#endif
} // namespace lite_api
} // namespace paddle
int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir == "") {
LOG(INFO) << "usage: "
<< "--model_dir /path/to/your/model";
exit(0);
}
std::string save_optimized_model_dir = "";
std::string save_optimized_model_dir_0 = "";
if (FLAGS_use_optimize_nb) {
save_optimized_model_dir = FLAGS_model_dir;
save_optimized_model_dir_0 = FLAGS_model_dir_0;
} else {
save_optimized_model_dir = FLAGS_model_dir + "opt2";
save_optimized_model_dir_0 = FLAGS_model_dir_0 + "opt2";
}
auto split_string =
[](const std::string& str_in) -> std::vector<std::string> {
std::vector<std::string> str_out;
std::string tmp_str = str_in;
while (!tmp_str.empty()) {
size_t next_offset = tmp_str.find(":");
str_out.push_back(tmp_str.substr(0, next_offset));
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return str_out;
};
auto get_shape = [](const std::string& str_shape) -> std::vector<int64_t> {
std::vector<int64_t> shape;
std::string tmp_str = str_shape;
while (!tmp_str.empty()) {
int dim = atoi(tmp_str.data());
shape.push_back(dim);
size_t next_offset = tmp_str.find(",");
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return shape;
};
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) {
input_shapes.push_back(get_shape(str_input_shapes[i]));
}
std::vector<std::string> str_input_shapes_0 =
split_string(FLAGS_input_shape_0);
std::vector<std::vector<int64_t>> input_shapes_0;
for (int i = 0; i < str_input_shapes_0.size(); ++i) {
input_shapes_0.push_back(get_shape(str_input_shapes_0[i]));
}
if (!FLAGS_use_optimize_nb) {
// Output optimized model
paddle::lite_api::OutputOptModel(
FLAGS_model_dir, save_optimized_model_dir, input_shapes);
paddle::lite_api::OutputOptModel(
FLAGS_model_dir_0, save_optimized_model_dir_0, input_shapes_0);
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
// Run inference using optimized model
if (FLAGS_test_type == 0) {
paddle::lite_api::RunTestType_00(
input_shapes,
save_optimized_model_dir,
static_cast<paddle::lite_api::PowerMode>(0),
FLAGS_threads,
FLAGS_repeats,
5);
LOG(INFO) << "=========above is case 0, below is case "
"1============================";
paddle::lite_api::RunTestType_10(
input_shapes,
save_optimized_model_dir,
static_cast<paddle::lite_api::PowerMode>(0),
FLAGS_threads,
FLAGS_repeats);
}
if (FLAGS_test_type == 1) {
paddle::lite_api::RunTestType_01(
input_shapes,
save_optimized_model_dir,
input_shapes_0,
save_optimized_model_dir_0,
static_cast<paddle::lite_api::PowerMode>(0),
FLAGS_threads,
FLAGS_repeats,
5);
LOG(INFO) << "=========above is case 0, below is case "
"1============================";
paddle::lite_api::RunTestType_11(
input_shapes,
save_optimized_model_dir,
input_shapes_0,
save_optimized_model_dir_0,
static_cast<paddle::lite_api::PowerMode>(0),
FLAGS_threads,
FLAGS_repeats);
}
#endif
return 0;
}
......@@ -32,26 +32,37 @@
#include <gflags/gflags.h>
#include <algorithm>
DEFINE_double(fraction_of_cpu_memory_to_use,
1,
"Default use 100% of CPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc");
DEFINE_uint64(initial_cpu_memory_in_mb,
500ul,
"Initial CPU memory for PaddlePaddle, in MD unit.");
DEFINE_double(
fraction_of_cuda_pinned_memory_to_use,
0.5,
"Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
"reserve the rest for page tables, etc");
#include "lite/utils/env.h"
// DEFINE_double(fraction_of_cpu_memory_to_use,
// 1,
// "Default use 100% of CPU memory for PaddlePaddle,"
// "reserve the rest for page tables, etc");
double fraction_of_cpu_memory_to_use =
paddle::lite::GetDoubleFromEnv("fraction_of_cpu_memory_to_use", 1);
// DEFINE_uint64(initial_cpu_memory_in_mb,
// 500ul,
// "Initial CPU memory for PaddlePaddle, in MB unit.");
uint64_t initial_cpu_memory_in_mb =
paddle::lite::GetUInt64FromEnv("initial_cpu_memory_in_mb", 500ul);
// DEFINE_double(
// fraction_of_cuda_pinned_memory_to_use,
// 0.5,
// "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
// "reserve the rest for page tables, etc");
double fraction_of_cuda_pinned_memory_to_use = paddle::lite::GetDoubleFromEnv(
"fraction_of_cuda_pinned_memory_to_use", 0.5);
// If use_pinned_memory is true, CPUAllocator calls mlock, which
// returns pinned and locked memory as staging areas for data exchange
// between host and device. Allocates too much would reduce the amount
// of memory available to the system for paging. So, by default, we
// should set false to use_pinned_memory.
DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
// DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
bool use_pinned_memory =
paddle::lite::GetBoolFromEnv("use_pinned_memory", true);
namespace paddle {
namespace lite {
......@@ -81,7 +92,7 @@ size_t CpuTotalPhysicalMemory() {
size_t CpuMaxAllocSize() {
// For distributed systems, it requires configuring and limiting
// the fraction of memory to use.
return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
return fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
}
size_t CpuMinChunkSize() {
......@@ -92,15 +103,14 @@ size_t CpuMinChunkSize() {
size_t CpuMaxChunkSize() {
// Allow to allocate the maximum chunk size is roughly 3% of CPU memory,
// or the initial_cpu_memory_in_mb.
return std::min(
static_cast<size_t>(CpuMaxAllocSize() / 32),
static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
return std::min(static_cast<size_t>(CpuMaxAllocSize() / 32),
static_cast<size_t>(initial_cpu_memory_in_mb * 1 << 20));
}
size_t CUDAPinnedMaxAllocSize() {
// For distributed systems, it requires configuring and limiting
// the fraction of memory to use.
return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory();
return fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory();
}
size_t CUDAPinnedMinChunkSize() {
......
......@@ -22,36 +22,46 @@ limitations under the License. */
#include "lite/backends/x86/cupti_lib_path.h"
#include "lite/backends/x86/port.h"
#include "lite/backends/x86/warpctc_lib_path.h"
#include "lite/utils/env.h"
#include "lite/utils/paddle_enforce.h"
DEFINE_string(cudnn_dir,
"",
"Specify path for loading libcudnn.so. For instance, "
"/usr/local/cudnn/lib. If empty [default], dlopen "
"will search cudnn from LD_LIBRARY_PATH");
// DEFINE_string(cudnn_dir,
// "",
// "Specify path for loading libcudnn.so. For instance, "
// "/usr/local/cudnn/lib. If empty [default], dlopen "
// "will search cudnn from LD_LIBRARY_PATH");
std::string cudnn_dir = paddle::lite::GetStringFromEnv("cudnn_dir"); // NOLINT
DEFINE_string(cuda_dir,
"",
"Specify path for loading cuda library, such as libcublas, "
"libcurand. For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH");
// DEFINE_string(cuda_dir,
// "",
// "Specify path for loading cuda library, such as libcublas, "
// "libcurand. For instance, /usr/local/cuda/lib64. If default, "
// "dlopen will search cuda from LD_LIBRARY_PATH");
std::string cuda_dir = paddle::lite::GetStringFromEnv("cuda_dir"); // NOLINT
DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
// DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
std::string f_warpctc_dir = // NOLINT
paddle::lite::GetStringFromEnv("warpctc_dir"); // NOLINT
DEFINE_string(nccl_dir,
"",
"Specify path for loading nccl library, such as libcublas, "
"libcurand. For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH");
// DEFINE_string(nccl_dir,
// "",
// "Specify path for loading nccl library, such as libcublas, "
// "libcurand. For instance, /usr/local/cuda/lib64. If default, "
// "dlopen will search cuda from LD_LIBRARY_PATH");
std::string nccl_dir = paddle::lite::GetStringFromEnv("nccl_dir"); // NOLINT
DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
// DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
std::string cupti_dir = paddle::lite::GetStringFromEnv("cupti_dir"); // NOLINT
DEFINE_string(
tensorrt_dir,
"",
"Specify path for loading tensorrt library, such as libnvinfer.so.");
// DEFINE_string(
// tensorrt_dir,
// "",
// "Specify path for loading tensorrt library, such as libnvinfer.so.");
std::string tensorrt_dir = // NOLINT
paddle::lite::GetStringFromEnv("tensorrt_dir"); // NOLINT
DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
// DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
std::string mklml_dir = paddle::lite::GetStringFromEnv("mklml_dir"); // NOLINT
namespace paddle {
namespace lite {
......@@ -180,28 +190,28 @@ auto error_msg =
void* GetCublasDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
return GetDsoHandleFromSearchPath(cuda_dir, "libcublas.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib);
return GetDsoHandleFromSearchPath(cuda_dir, win_cublas_lib);
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
return GetDsoHandleFromSearchPath(cuda_dir, "libcublas.so");
#endif
}
void* GetCUDNNDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false);
return GetDsoHandleFromSearchPath(cudnn_dir, "libcudnn.dylib", false);
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib);
return GetDsoHandleFromSearchPath(cudnn_dir, win_cudnn_lib);
#else
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false);
return GetDsoHandleFromSearchPath(cudnn_dir, "libcudnn.so", false);
#endif
}
void* GetCUPTIDsoHandle() {
std::string cupti_path = cupti_lib_path;
if (!FLAGS_cupti_dir.empty()) {
cupti_path = FLAGS_cupti_dir;
if (!cupti_dir.empty()) {
cupti_path = cupti_dir;
}
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", false);
......@@ -212,18 +222,18 @@ void* GetCUPTIDsoHandle() {
void* GetCurandDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
return GetDsoHandleFromSearchPath(cuda_dir, "libcurand.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib);
return GetDsoHandleFromSearchPath(cuda_dir, win_curand_lib);
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
return GetDsoHandleFromSearchPath(cuda_dir, "libcurand.so");
#endif
}
void* GetWarpCTCDsoHandle() {
std::string warpctc_dir = warpctc_lib_path;
if (!FLAGS_warpctc_dir.empty()) {
warpctc_dir = FLAGS_warpctc_dir;
if (!f_warpctc_dir.empty()) {
warpctc_dir = f_warpctc_dir;
}
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib");
......@@ -236,27 +246,27 @@ void* GetWarpCTCDsoHandle() {
void* GetNCCLDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib");
return GetDsoHandleFromSearchPath(nccl_dir, "libnccl.dylib");
#else
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so");
return GetDsoHandleFromSearchPath(nccl_dir, "libnccl.so");
#endif
}
void* GetTensorRtDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
return GetDsoHandleFromSearchPath(tensorrt_dir, "libnvinfer.dylib");
#else
return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
return GetDsoHandleFromSearchPath(tensorrt_dir, "libnvinfer.so");
#endif
}
void* GetMKLMLDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib");
return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll");
return GetDsoHandleFromSearchPath(mklml_dir, "mklml.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so");
return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.so");
#endif
}
......
......@@ -21,13 +21,15 @@
// posix_memalign
#include "lite/backends/x86/cpu_info.h"
#include "lite/backends/x86/jit/macro.h"
#include "lite/utils/env.h"
#include "lite/utils/paddle_enforce.h"
#ifndef _WIN32
#define posix_memalign_free free
#endif
DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
// DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode");
namespace paddle {
namespace lite {
......
......@@ -20,7 +20,8 @@
#include <vector>
#include "lite/backends/x86/jit/kernel_base.h"
DECLARE_bool(dump_jitcode);
// DECLARE_bool(dump_jitcode);
extern bool dump_jitcode;
namespace paddle {
namespace lite {
......@@ -36,7 +37,7 @@ class GenBase : public Kernel {
template <typename Func>
Func getCode() const {
const unsigned char* code = this->getCodeInternal();
if (FLAGS_dump_jitcode) {
if (dump_jitcode) {
this->dumpCode(code);
}
// Note: failed to cast with reinterpret_cast<const Func> on Mac clang,
......
......@@ -86,7 +86,8 @@ class BeamSearchFunctor<TARGET(kX86), T> {
// selected_ids->mutable_data<int64_t>(dims, platform::CPUPlace());
// auto *selected_scores_data =
// selected_scores->mutable_data<float>(dims, platform::CPUPlace());
parent_idx->Resize({static_cast<int64_t>(num_instances)});
parent_idx->Resize(
std::vector<int64_t>({static_cast<int64_t>(num_instances)}));
auto *parent_idx_data =
parent_idx ? parent_idx->mutable_data<int>(TARGET(kX86)) : nullptr;
// auto *parent_idx_data =
......
......@@ -41,9 +41,11 @@
(this is the zlib license)
*/
#pragma once
#include "lite/backends/x86/cpu_info.h"
namespace paddle {
namespace lite {
/* __m128 is ugly to write */
typedef __m256 v8sf; // vector of 8 float (avx)
typedef __m256i v8si; // vector of 8 int (avx)
......@@ -134,7 +136,7 @@ typedef union imm_xmm_union {
return (ret); \
}
//#warning "Using SSE2 to perform AVX2 bitshift ops"
// #warning "Using SSE2 to perform AVX2 bitshift ops"
AVX2_BITOP_USING_SSE2(slli_epi32)
AVX2_BITOP_USING_SSE2(srli_epi32)
......@@ -152,7 +154,7 @@ AVX2_BITOP_USING_SSE2(srli_epi32)
return (ret); \
}
//#warning "Using SSE2 to perform AVX2 integer ops"
// #warning "Using SSE2 to perform AVX2 integer ops"
AVX2_INTOP_USING_SSE2(and_si128)
AVX2_INTOP_USING_SSE2(andnot_si128)
AVX2_INTOP_USING_SSE2(cmpeq_epi32)
......@@ -175,23 +177,23 @@ AVX2_INTOP_USING_SSE2(add_epi32)
*/
v8sf log256_ps(v8sf x) {
v8si imm0;
v8sf one = *(v8sf *)_ps256_1;
v8sf one = *(v8sf *)_ps256_1; // NOLINT
// v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
x = _mm256_max_ps(
x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */
x = _mm256_max_ps(x, *(v8sf *)_ps256_min_norm_pos); // NOLINT
/* cut off denormalized stuff */ // NOLINT
// can be done with AVX2
imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23);
/* keep only the fractional part */
x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);
x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);
x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask); // NOLINT
x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5); // NOLINT
// this is again another AVX2 instruction
imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);
imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f); // NOLINT
v8sf e = _mm256_cvtepi32_ps(imm0);
e = _mm256_add_ps(e, one);
......@@ -203,7 +205,8 @@ v8sf log256_ps(v8sf x) {
} else { x = x - 1.0; }
*/
// v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
v8sf mask =
_mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS); // NOLINT
v8sf tmp = _mm256_and_ps(x, mask);
x = _mm256_sub_ps(x, one);
e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
......@@ -211,34 +214,34 @@ v8sf log256_ps(v8sf x) {
v8sf z = _mm256_mul_ps(x, x);
v8sf y = *(v8sf *)_ps256_cephes_log_p0;
v8sf y = *(v8sf *)_ps256_cephes_log_p0; // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_mul_ps(y, z);
tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);
tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1); // NOLINT
y = _mm256_add_ps(y, tmp);
tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); // NOLINT
y = _mm256_sub_ps(y, tmp);
tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);
tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2); // NOLINT
x = _mm256_add_ps(x, y);
x = _mm256_add_ps(x, tmp);
x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
......@@ -262,14 +265,14 @@ _PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
v8sf exp256_ps(v8sf x) {
v8sf tmp = _mm256_setzero_ps(), fx;
v8si imm0;
v8sf one = *(v8sf *)_ps256_1;
v8sf one = *(v8sf *)_ps256_1; // NOLINT
x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);
x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);
x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi); // NOLINT
x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo); // NOLINT
/* express exp(x) as exp(g + n*log(2)) */
fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);
fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);
fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF); // NOLINT
fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5); // NOLINT
/* how to perform a floorf with SSE: just below */
// imm0 = _mm256_cvttps_epi32(fx);
......@@ -283,24 +286,24 @@ v8sf exp256_ps(v8sf x) {
mask = _mm256_and_ps(mask, one);
fx = _mm256_sub_ps(tmp, mask);
tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);
v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);
tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1); // NOLINT
v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2); // NOLINT
x = _mm256_sub_ps(x, tmp);
x = _mm256_sub_ps(x, z);
z = _mm256_mul_ps(x, x);
v8sf y = *(v8sf *)_ps256_cephes_exp_p0;
v8sf y = *(v8sf *)_ps256_cephes_exp_p0; // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5); // NOLINT
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, x);
y = _mm256_add_ps(y, one);
......@@ -308,7 +311,7 @@ v8sf exp256_ps(v8sf x) {
/* build 2^n */
imm0 = _mm256_cvttps_epi32(fx);
// another two AVX2 instructions
imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);
imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f); // NOLINT
imm0 = avx2_mm256_slli_epi32(imm0, 23);
v8sf pow2n = _mm256_castsi256_ps(imm0);
y = _mm256_mul_ps(y, pow2n);
......@@ -349,12 +352,12 @@ v8sf sin256_ps(v8sf x) { // any x
sign_bit = x;
/* take the absolute value */
x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); // NOLINT
/* extract the sign bit (upper one) */
sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);
sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask); // NOLINT
/* scale by 4/Pi */
y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); // NOLINT
/*
Here we start a series of integer operations, which are in the
......@@ -367,12 +370,12 @@ v8sf sin256_ps(v8sf x) { // any x
imm2 = _mm256_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
// another two AVX2 instruction
imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); // NOLINT
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); // NOLINT
y = _mm256_cvtepi32_ps(imm2);
/* get the swap sign flag */
imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); // NOLINT
imm0 = avx2_mm256_slli_epi32(imm0, 29);
/* get the polynom selection mask
there is one polynom for 0 <= x <= Pi/4
......@@ -380,31 +383,31 @@ v8sf sin256_ps(v8sf x) { // any x
Both branches will be computed.
*/
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); // NOLINT
imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); // NOLINT
#else
/* we use SSE2 routines to perform the integer ops */
COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); // NOLINT
imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); // NOLINT
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); // NOLINT
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); // NOLINT
COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
y = _mm256_cvtepi32_ps(imm2);
imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4); // NOLINT
imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4); // NOLINT
imm0_1 = _mm_slli_epi32(imm0_1, 29);
imm0_2 = _mm_slli_epi32(imm0_2, 29);
COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); // NOLINT
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); // NOLINT
imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
......@@ -418,9 +421,9 @@ v8sf sin256_ps(v8sf x) { // any x
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; // NOLINT
xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; // NOLINT
xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; // NOLINT
xmm1 = _mm256_mul_ps(y, xmm1);
xmm2 = _mm256_mul_ps(y, xmm2);
xmm3 = _mm256_mul_ps(y, xmm3);
......@@ -429,26 +432,26 @@ v8sf sin256_ps(v8sf x) { // any x
x = _mm256_add_ps(x, xmm3);
/* Evaluate the first polynom (0 <= x <= Pi/4) */
y = *(v8sf *)_ps256_coscof_p0;
y = *(v8sf *)_ps256_coscof_p0; // NOLINT
v8sf z = _mm256_mul_ps(x, x);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); // NOLINT
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); // NOLINT
y = _mm256_mul_ps(y, z);
y = _mm256_mul_ps(y, z);
v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); // NOLINT
y = _mm256_sub_ps(y, tmp);
y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
y = _mm256_add_ps(y, *(v8sf *)_ps256_1); // NOLINT
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
v8sf y2 = *(v8sf *)_ps256_sincof_p0;
v8sf y2 = *(v8sf *)_ps256_sincof_p0; // NOLINT
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); // NOLINT
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); // NOLINT
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_mul_ps(y2, x);
y2 = _mm256_add_ps(y2, x);
......@@ -475,53 +478,53 @@ v8sf cos256_ps(v8sf x) { // any x
#endif
/* take the absolute value */
x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); // NOLINT
/* scale by 4/Pi */
y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); // NOLINT
#ifdef __AVX2__
/* store the integer part of y in mm0 */
imm2 = _mm256_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); // NOLINT
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); // NOLINT
y = _mm256_cvtepi32_ps(imm2);
imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2);
imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2); // NOLINT
/* get the swap sign flag */
imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4);
imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4); // NOLINT
imm0 = avx2_mm256_slli_epi32(imm0, 29);
/* get the polynom selection mask */
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); // NOLINT
imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); // NOLINT
#else
/* we use SSE2 routines to perform the integer ops */
COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); // NOLINT
imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); // NOLINT
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); // NOLINT
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); // NOLINT
COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
y = _mm256_cvtepi32_ps(imm2);
imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2);
imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2);
imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2); // NOLINT
imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2); // NOLINT
imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4);
imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4);
imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4); // NOLINT
imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4); // NOLINT
imm0_1 = _mm_slli_epi32(imm0_1, 29);
imm0_2 = _mm_slli_epi32(imm0_2, 29);
COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); // NOLINT
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); // NOLINT
imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
......@@ -534,9 +537,9 @@ v8sf cos256_ps(v8sf x) { // any x
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; // NOLINT
xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; // NOLINT
xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; // NOLINT
xmm1 = _mm256_mul_ps(y, xmm1);
xmm2 = _mm256_mul_ps(y, xmm2);
xmm3 = _mm256_mul_ps(y, xmm3);
......@@ -545,26 +548,26 @@ v8sf cos256_ps(v8sf x) { // any x
x = _mm256_add_ps(x, xmm3);
/* Evaluate the first polynom (0 <= x <= Pi/4) */
y = *(v8sf *)_ps256_coscof_p0;
y = *(v8sf *)_ps256_coscof_p0; // NOLINT
v8sf z = _mm256_mul_ps(x, x);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); // NOLINT
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); // NOLINT
y = _mm256_mul_ps(y, z);
y = _mm256_mul_ps(y, z);
v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); // NOLINT
y = _mm256_sub_ps(y, tmp);
y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
y = _mm256_add_ps(y, *(v8sf *)_ps256_1); // NOLINT
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
v8sf y2 = *(v8sf *)_ps256_sincof_p0;
v8sf y2 = *(v8sf *)_ps256_sincof_p0; // NOLINT
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); // NOLINT
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); // NOLINT
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_mul_ps(y2, x);
y2 = _mm256_add_ps(y2, x);
......@@ -595,42 +598,43 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
sign_bit_sin = x;
/* take the absolute value */
x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); // NOLINT
/* extract the sign bit (upper one) */
sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask);
sign_bit_sin =
_mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask); // NOLINT
/* scale by 4/Pi */
y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); // NOLINT
#ifdef __AVX2__
/* store the integer part of y in imm2 */
imm2 = _mm256_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); // NOLINT
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); // NOLINT
y = _mm256_cvtepi32_ps(imm2);
imm4 = imm2;
/* get the swap sign flag for the sine */
imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); // NOLINT
imm0 = avx2_mm256_slli_epi32(imm0, 29);
// v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
/* get the polynom selection mask for the sine*/
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); // NOLINT
imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); // NOLINT
// v8sf poly_mask = _mm256_castsi256_ps(imm2);
#else
/* we use SSE2 routines to perform the integer ops */
COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); // NOLINT
imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); // NOLINT
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); // NOLINT
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); // NOLINT
COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
y = _mm256_cvtepi32_ps(imm2);
......@@ -638,16 +642,16 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
imm4_1 = imm2_1;
imm4_2 = imm2_2;
imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4); // NOLINT
imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4); // NOLINT
imm0_1 = _mm_slli_epi32(imm0_1, 29);
imm0_2 = _mm_slli_epi32(imm0_2, 29);
COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); // NOLINT
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); // NOLINT
imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
......@@ -659,9 +663,9 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; // NOLINT
xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; // NOLINT
xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; // NOLINT
xmm1 = _mm256_mul_ps(y, xmm1);
xmm2 = _mm256_mul_ps(y, xmm2);
xmm3 = _mm256_mul_ps(y, xmm3);
......@@ -670,15 +674,15 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
x = _mm256_add_ps(x, xmm3);
#ifdef __AVX2__
imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2);
imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4);
imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2); // NOLINT
imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4); // NOLINT
imm4 = avx2_mm256_slli_epi32(imm4, 29);
#else
imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2);
imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2);
imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2); // NOLINT
imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2); // NOLINT
imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4);
imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4);
imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4); // NOLINT
imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4); // NOLINT
imm4_1 = _mm_slli_epi32(imm4_1, 29);
imm4_2 = _mm_slli_epi32(imm4_2, 29);
......@@ -692,25 +696,25 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
/* Evaluate the first polynom (0 <= x <= Pi/4) */
v8sf z = _mm256_mul_ps(x, x);
y = *(v8sf *)_ps256_coscof_p0;
y = *(v8sf *)_ps256_coscof_p0; // NOLINT
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); // NOLINT
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); // NOLINT
y = _mm256_mul_ps(y, z);
y = _mm256_mul_ps(y, z);
v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); // NOLINT
y = _mm256_sub_ps(y, tmp);
y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
y = _mm256_add_ps(y, *(v8sf *)_ps256_1); // NOLINT
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
v8sf y2 = *(v8sf *)_ps256_sincof_p0;
v8sf y2 = *(v8sf *)_ps256_sincof_p0; // NOLINT
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); // NOLINT
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); // NOLINT
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_mul_ps(y2, x);
y2 = _mm256_add_ps(y2, x);
......@@ -729,3 +733,6 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
*s = _mm256_xor_ps(xmm1, sign_bit_sin);
*c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
} // namespace lite
} // namespace paddle
......@@ -83,14 +83,11 @@ class KernelBase {
#if defined(LITE_WITH_CUDA)
WorkSpace::Global_CUDA().AllocReset();
#endif
#ifdef LITE_WITH_PROFILE
CHECK(profiler_) << "Profiler pointer of kernel can not be nullptr. "
"When LITE_WITH_PROFILE is defined, please set a "
"Profiler for Instruction.";
profiler_->StartTiming(profile_id_, ctx_.get());
profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get());
profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get());
Run();
profiler_->StopTiming(profile_id_, ctx_.get());
profiler_->StopTiming(profile::Type::kDispatch, profile_id_, ctx_.get());
#else
Run();
#endif
......
......@@ -120,6 +120,7 @@ class Buffer {
if (space_ > 0) {
TargetFree(target_, data_);
}
data_ = nullptr;
target_ = TargetType::kHost;
space_ = 0;
}
......
......@@ -28,36 +28,55 @@ auto op_comp = [](const OpCharacter& c1, const OpCharacter& c2) {
};
}
int Profiler::NewTimer(const OpCharacter& ch) {
StatisUnit unit;
unit.character = ch;
// Human-readable label for each profile Type. Profiler::Summary looks the
// label up with TypeStr.find(type)->second and does not check for end(), so
// every enumerator of Type needs an entry in this table.
std::map<Type, std::string> TypeStr{
{Type::kUnk, "Unknown"},
{Type::kCreate, "Create"},
{Type::kDispatch, "Dispatch"},
};
// Builds the timing record for one operator described by `ch`.
//
// `create_t` is always a host timer. `dispatch_t` is a CUDA timer only when
// the operator targets kCUDA and the build defines LITE_WITH_CUDA; in every
// other case it is a host timer.
//
// A kCUDA target in a build without LITE_WITH_CUDA used to leave `dispatch_t`
// unset even though the log message promises a fallback; the null pointer was
// then dereferenced by Timer(type)->Start()/Stop(). The fallback is now
// actually installed.
StatisUnit::StatisUnit(const OpCharacter& ch) : character(ch) {
  create_t.reset(new DeviceTimer<TargetType::kHost>());
#ifdef LITE_WITH_CUDA
  if (ch.target == TargetType::kCUDA) {
    dispatch_t.reset(new DeviceTimer<TargetType::kCUDA>());
    return;
  }
#else
  if (ch.target == TargetType::kCUDA) {
    LOG(ERROR) << "The timer type specified as cuda is uninitialized, so the "
                  "default x86 timer is used instead.";
  }
#endif
  // Host timer: the regular choice for non-CUDA targets and the fallback
  // announced by the message above.
  dispatch_t.reset(new DeviceTimer<TargetType::kHost>());
}
// Returns the timer that belongs to the requested profiling phase.
//
// @param type  Type::kCreate selects `create_t`, Type::kDispatch selects
//              `dispatch_t`.
// @return      Pointer obtained with .get(); this object keeps holding the
//              timer. Any other `type` is reported through LOG(FATAL); the
//              trailing `return nullptr` only gives the function a return
//              value on that path.
lite::profile::Timer* StatisUnit::Timer(Type type) {
  switch (type) {
    case Type::kCreate:
      return create_t.get();
    case Type::kDispatch:
      return dispatch_t.get();
    default:
      break;
  }
  // The failing condition is an unrecognised profile Type, not a platform,
  // so the message names the type.
  LOG(FATAL) << "Timer cannot be returned for unknown profile type.";
  return nullptr;
}
// Appends a fresh StatisUnit built from `ch` to `units_` and returns the
// position it was stored at; that position is the `index` argument expected by
// StartTiming() and StopTiming().
int Profiler::NewTimer(const OpCharacter& ch) {
  // The new unit lands at the current end of the container.
  const int slot = static_cast<int>(units_.size());
  units_.push_back(StatisUnit(ch));
  return slot;
}
void Profiler::StartTiming(const int index, KernelContext* ctx) {
void Profiler::StartTiming(Type type, const int index, KernelContext* ctx) {
CHECK_LT(index, units_.size())
<< "The timer index in the profiler is out of range.";
units_[index].timer->Start(ctx);
units_[index].Timer(type)->Start(ctx);
}
float Profiler::StopTiming(const int index, KernelContext* ctx) {
float Profiler::StopTiming(Type type, const int index, KernelContext* ctx) {
CHECK_LT(index, units_.size())
<< "The timer index in the profiler is out of range.";
return units_[index].timer->Stop(ctx);
return units_[index].Timer(type)->Stop(ctx);
}
std::string Profiler::Summary(bool concise, size_t w) {
std::string Profiler::Summary(Type type, bool concise, size_t w) {
using std::setw;
using std::left;
using std::fixed;
......@@ -65,12 +84,14 @@ std::string Profiler::Summary(bool concise, size_t w) {
std::string title;
// Title.
if (concise) {
ss << "Timing cycle = " << units_.front().timer->LapTimes().Size()
ss << "Timing cycle = " << units_.front().Timer(type)->LapTimes().Size()
<< std::endl;
ss << "===== Concise Profiler Summary: " << name_ << ", Exclude " << w
ss << "===== Concise " << TypeStr.find(type)->second
<< " Profiler Summary: " << name_ << ", Exclude " << w
<< " warm-ups =====" << std::endl;
} else {
ss << "===== Detailed Profiler Summary: " << name_ << ", Exclude " << w
ss << "===== Detailed " << TypeStr.find(type)->second
<< " Profiler Summary: " << name_ << ", Exclude " << w
<< " warm-ups =====" << std::endl;
}
ss << setw(25) << left << "Operator Type"
......@@ -84,16 +105,16 @@ std::string Profiler::Summary(bool concise, size_t w) {
if (concise) {
std::map<OpCharacter, TimeInfo, decltype(op_comp)> summary(op_comp);
for (auto& unit : units_) {
auto ch = summary.find(unit.character);
auto ch = summary.find(unit.Character());
if (ch != summary.end()) {
ch->second.avg += unit.timer->LapTimes().Avg(w);
ch->second.min += unit.timer->LapTimes().Min(w);
ch->second.max += unit.timer->LapTimes().Max(w);
ch->second.avg += unit.Timer(type)->LapTimes().Avg(w);
ch->second.min += unit.Timer(type)->LapTimes().Min(w);
ch->second.max += unit.Timer(type)->LapTimes().Max(w);
} else {
TimeInfo info({unit.timer->LapTimes().Avg(w),
unit.timer->LapTimes().Min(w),
unit.timer->LapTimes().Max(w)});
summary.insert({unit.character, info});
TimeInfo info({unit.Timer(type)->LapTimes().Avg(w),
unit.Timer(type)->LapTimes().Min(w),
unit.Timer(type)->LapTimes().Max(w)});
summary.insert({unit.Character(), info});
}
}
for (const auto& item : summary) {
......@@ -109,14 +130,15 @@ std::string Profiler::Summary(bool concise, size_t w) {
}
} else {
for (auto& unit : units_) {
const auto& times = unit.Timer(type)->LapTimes();
// clang-format off
ss << setw(25) << left << fixed << unit.character.op_type \
<< " " << setw(40) << left << fixed << unit.character.kernel_name \
<< " " << setw(12) << left << fixed << unit.character.remark \
<< " " << setw(12) << left << fixed << unit.timer->LapTimes().Avg(w) \
<< " " << setw(12) << left << fixed << unit.timer->LapTimes().Min(w) \
<< " " << setw(12) << left << fixed << unit.timer->LapTimes().Max(w) \
<< " " << setw(12) << left << fixed << unit.timer->LapTimes().Last(w) \
ss << setw(25) << left << fixed << unit.Character().op_type \
<< " " << setw(40) << left << fixed << unit.Character().kernel_name \
<< " " << setw(12) << left << fixed << unit.Character().remark \
<< " " << setw(12) << left << fixed << times.Avg(w) \
<< " " << setw(12) << left << fixed << times.Min(w) \
<< " " << setw(12) << left << fixed << times.Max(w) \
<< " " << setw(12) << left << fixed << times.Last(w) \
<< std::endl;
// clang-format on
}
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
......@@ -22,6 +23,14 @@ namespace paddle {
namespace lite {
namespace profile {
// Profiling phase recorded for each operator. Selects which of a StatisUnit's
// timers is started, stopped or summarised.
enum class Type {
// No phase. StatisUnit::Timer() reports this value through LOG(FATAL).
kUnk = 0,
// Started at the top of Instruction::Run() and stopped in the kernel launch
// path immediately before the kernel's Run() is invoked.
kCreate,
// Started and stopped directly around the kernel's Run() call.
kDispatch,
};
extern std::map<Type, std::string> TypeStr;
struct TimeInfo {
float avg;
float min;
......@@ -35,8 +44,15 @@ struct OpCharacter {
std::string remark{std::string("N/A")};
};
struct StatisUnit {
std::unique_ptr<Timer> timer;
// Timing record of a single operator: its description plus one timer per
// profiling phase (see Type).
class StatisUnit final {
 public:
  // Stores `ch` and creates the timers.
  explicit StatisUnit(const OpCharacter& ch);

  // Returns the timer for `type`. The return type is spelled with its full
  // qualification because this member function shares the name `Timer` with
  // the timer class.
  lite::profile::Timer* Timer(Type type);

  // Read-only access to the operator description given at construction.
  const OpCharacter& Character() const { return character; }

 private:
  // The class is final, so there can be no subclasses that would need
  // protected access; the members are private.
  std::unique_ptr<lite::profile::Timer> create_t;    // timer for Type::kCreate
  std::unique_ptr<lite::profile::Timer> dispatch_t;  // timer for Type::kDispatch
  OpCharacter character;
};
......@@ -45,9 +61,9 @@ class Profiler final {
Profiler() = default;
explicit Profiler(const std::string& name) : name_(name) {}
int NewTimer(const OpCharacter& ch);
void StartTiming(const int index, KernelContext* ctx);
float StopTiming(const int index, KernelContext* ctx);
std::string Summary(bool concise = true, size_t warm_up = 10);
void StartTiming(Type type, const int index, KernelContext* ctx);
float StopTiming(Type type, const int index, KernelContext* ctx);
std::string Summary(Type type, bool concise = true, size_t warm_up = 10);
private:
std::string name_{std::string("N/A")};
......
......@@ -69,10 +69,10 @@ TEST(profiler, real_latency) {
ch.op_type = "operator/1";
ch.kernel_name = "kernel/1";
int idx = profiler.NewTimer(ch);
profiler.StartTiming(idx, &ctx);
profiler.StartTiming(Type::kDispatch, idx, &ctx);
std::this_thread::sleep_for(std::chrono::milliseconds(10));
profiler.StopTiming(idx, &ctx);
std::cout << profiler.Summary();
profiler.StopTiming(Type::kDispatch, idx, &ctx);
std::cout << profiler.Summary(Type::kDispatch);
}
#endif
......
......@@ -147,7 +147,7 @@ void RuntimeProgram::Run() {
#endif // LITE_WITH_PROFILE
}
#ifdef LITE_WITH_PROFILE
LOG(INFO) << "\n" << profiler_.Summary(false, 0);
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0);
#endif // LITE_WITH_PROFILE
}
......@@ -252,8 +252,16 @@ void Program::PrepareWorkspace(const cpp::ProgramDesc& prog) {
}
void Instruction::Run() {
#ifdef LITE_WITH_PROFILE
CHECK(profiler_) << "Profiler pointer of kernel can not be nullptr. "
"When LITE_WITH_PROFILE is defined, please set a "
"Profiler for Instruction.";
profiler_->StartTiming(
profile::Type::kCreate, profile_id_, kernel_->mutable_context());
#endif
CHECK(op_) << "op null";
CHECK(kernel_) << "kernel null";
if (first_epoch_) {
first_epoch_ = false;
CHECK(op_->CheckShape());
......@@ -263,10 +271,7 @@ void Instruction::Run() {
return;
}
// VLOG(4) << "kernel launch";
op_->InferShape();
// VLOG(4) << ">> Running kernel: " << op_->op_info()->Repr() << " on Target "
// << TargetToStr(kernel_->target());
kernel_->Launch();
has_run_ = true;
}
......
......@@ -143,7 +143,8 @@ class LITE_API RuntimeProgram {
}
~RuntimeProgram() {
#ifdef LITE_WITH_PROFILE
LOG(INFO) << "\n" << profiler_.Summary();
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kCreate);
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch);
#endif // LITE_WITH_PROFILE
}
......
......@@ -233,6 +233,10 @@ class TensorLite {
(static_cast<char *>(buffer_->data()) + offset_));
}
// Releases the tensor's storage by calling Free() on the underlying buffer and
// resets the data offset to zero. No other member is modified here.
// NOTE(review): buffer_ is dereferenced without a null check — presumably it
// is always set for a constructed tensor; confirm against the constructors.
void clear() {
buffer_->Free();
offset_ = 0;
}
size_t data_size() const { return this->dims().production(); }
size_t memory_size() const { return memory_size_; }
......
......@@ -34,6 +34,9 @@ void ConditionalBlockCompute::PrepareForRun() {
}
void ConditionalBlockCompute::Run() {
auto& param = Param<operators::ConditionalBlockParam>();
for (auto& out : param.outs) {
out->clear();
}
bool need_run = true;
if (param.is_scalar_condition) {
auto* cond = param.cond;
......
......@@ -82,6 +82,10 @@ void SplitLodTensorCompute::Run() {
ranges.begin(), ranges.end(), 0UL, [](size_t a, const CopyRange &b) {
return a + b.end - b.begin;
});
if (height == 0) {
out->clear();
continue;
}
auto x_dim = x->dims();
x_dim[0] = static_cast<int64_t>(height);
out->Resize(x_dim);
......
......@@ -54,12 +54,12 @@ REGISTER_LITE_KERNEL(unsqueeze,
kNCHW,
paddle::lite::kernels::host::UnsqueezeCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindInput("AxesTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("AxesTensorList",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(unsqueeze2,
......@@ -68,11 +68,11 @@ REGISTER_LITE_KERNEL(unsqueeze2,
kNCHW,
paddle::lite::kernels::host::Unsqueeze2Compute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindInput("AxesTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("AxesTensorList",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
......@@ -54,7 +54,8 @@ REGISTER_LITE_KERNEL(yolo_box,
paddle::lite::kernels::arm::YoloBoxCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("ImgSize", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("ImgSize",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Scores", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
......@@ -156,8 +156,8 @@ void SoftmaxCompute::PrepareForRun() {
cudaGetDevice(&device_id);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, device_id);
sharedmem_size = deviceProp.sharedMemPerBlock;
max_dimsize = sharedmem_size / sizeof(float) / CUDA_NUM_THREADS;
sharedmem_size_ = deviceProp.sharedMemPerBlock;
max_dimsize_ = sharedmem_size_ / sizeof(float) / CUDA_NUM_THREADS;
}
void SoftmaxCompute::Run() {
......@@ -174,29 +174,27 @@ void SoftmaxCompute::Run() {
int outer_num = x_dims.Slice(0, axis).production();
int inner_num = x_dims.Slice(axis + 1, x_rank).production();
int total_threads = inner_num * outer_num;
int axis_size = x_dims[axis];
axis_size_ = x_dims[axis];
const int threads = CUDA_NUM_THREADS;
const int blocks = (total_threads + threads - 1) / threads;
auto input_data = param.x->data<float>();
auto output_data = param.output->mutable_data<float>(TARGET(kCUDA));
if (axis_size <= max_dimsize) {
int use_sharemem_size = axis_size * threads * sizeof(float);
if (axis_size_ <= max_dimsize_) {
int use_sharemem_size = axis_size_ * threads * sizeof(float);
sharemem_softmax_kernel<<<blocks, threads, use_sharemem_size, stream>>>(
total_threads,
input_data,
output_data,
inner_num,
outer_num,
axis_size);
axis_size_);
} else {
//! re_alloc device memory
Tensor tmax_data;
Tensor tsum_data;
tmax_data.Resize({1, 1, 1, outer_num * inner_num});
tsum_data.Resize({1, 1, 1, outer_num * inner_num});
auto max_data = tmax_data.mutable_data<float>(TARGET(kCUDA));
auto sum_data = tsum_data.mutable_data<float>(TARGET(kCUDA));
tmax_data_.Resize({1, 1, 1, outer_num * inner_num});
tsum_data_.Resize({1, 1, 1, outer_num * inner_num});
auto max_data = tmax_data_.mutable_data<float>(TARGET(kCUDA));
auto sum_data = tsum_data_.mutable_data<float>(TARGET(kCUDA));
//! firstly, get maximum data
float min_data = std::numeric_limits<float>::lowest();
softmax_max_kernel<float><<<blocks, threads, 0, stream>>>(total_threads,
......@@ -205,7 +203,7 @@ void SoftmaxCompute::Run() {
min_data,
inner_num,
outer_num,
axis_size);
axis_size_);
//! then, compute exp and sum data
softmax_sub_exp_sum_kernel<float><<<blocks, threads, 0, stream>>>(
total_threads,
......@@ -215,10 +213,10 @@ void SoftmaxCompute::Run() {
sum_data,
inner_num,
outer_num,
axis_size);
axis_size_);
//! last, compute divided output
softmax_divid_output_kernel<float><<<blocks, threads, 0, stream>>>(
total_threads, output_data, sum_data, inner_num, outer_num, axis_size);
total_threads, output_data, sum_data, inner_num, outer_num, axis_size_);
}
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
......
......@@ -30,9 +30,11 @@ class SoftmaxCompute
virtual ~SoftmaxCompute() = default;
private:
size_t sharedmem_size;
int num_threads;
int max_dimsize;
lite::Tensor tmax_data_;
lite::Tensor tsum_data_;
size_t sharedmem_size_;
int max_dimsize_;
int axis_size_;
};
} // namespace cuda
......
......@@ -28,12 +28,14 @@ namespace subgraph {
class Engine {
public:
Engine(int block_idx,
Engine(KernelContext *ctx,
int block_idx,
cpp::BlockDesc *block_desc,
const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names,
lite::Scope *scope)
: block_idx_(block_idx),
: ctx_(ctx),
block_idx_(block_idx),
block_desc_(block_desc),
input_names_(input_names),
output_names_(output_names),
......@@ -55,6 +57,7 @@ class Engine {
virtual bool InputShapeChanged();
KernelContext *ctx_{nullptr};
int block_idx_;
cpp::BlockDesc *block_desc_;
std::vector<std::string> input_names_;
......
......@@ -207,7 +207,8 @@ int SubgraphEngine::LaunchDeviceProgram() {
void SubgraphCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(param.sub_block_idx,
engine_.reset(new SubgraphEngine(ctx_.get(),
param.sub_block_idx,
param.sub_block_desc,
param.input_data_names,
param.output_data_names,
......
......@@ -29,13 +29,14 @@ namespace npu {
class SubgraphEngine : public subgraph::Engine {
public:
SubgraphEngine(int block_idx,
SubgraphEngine(KernelContext *ctx,
int block_idx,
cpp::BlockDesc *block_desc,
const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names,
Scope *scope)
: subgraph::Engine(
block_idx, block_desc, input_names, output_names, scope) {}
ctx, block_idx, block_desc, input_names, output_names, scope) {}
protected:
int BuildDeviceProgram() override;
......
......@@ -13,10 +13,13 @@
// limitations under the License.
#include "lite/kernels/x86/gru_compute.h"
#include "lite/utils/env.h"
DEFINE_int32(paddle_num_threads,
1,
"Number of threads for each paddle instance.");
// DEFINE_int32(paddle_num_threads,
// 1,
// "Number of threads for each paddle instance.");
int32_t paddle_num_threads =
paddle::lite::GetIntFromEnv("paddle_num_threads", 1);
REGISTER_LITE_KERNEL(gru,
kX86,
......
......@@ -26,7 +26,8 @@
#include "lite/core/types.h"
#include "lite/fluid/eigen.h"
DECLARE_int32(paddle_num_threads);
// DECLARE_int32(paddle_num_threads);
extern int32_t paddle_num_threads;
namespace paddle {
namespace lite {
......@@ -109,7 +110,7 @@ class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
#ifdef PADDLE_WITH_MKLML
// use MKL packed to speedup GEMM
if (FLAGS_paddle_num_threads >= 4) {
if (paddle_num_threads >= 4) {
auto blas = lite::x86::math::GetBlas<TARGET(kX86), T>(context);
T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix,
1 /*height of C*/,
......
......@@ -49,9 +49,10 @@ int MatmulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto out_type = kernel->GetOutputDeclType("Out");
CHECK(out_type->precision() == PRECISION(kFloat));
CHECK(out_type->layout() == DATALAYOUT(kNCHW));
auto out = scope->FindMutableTensor(out_name);
auto out_dims = out->dims();
auto transpose_x = op_info->GetAttr<bool>("transpose_X");
CHECK(!transpose_x) << "XPU only support transpose_x == true now";
auto transpose_y = op_info->GetAttr<bool>("transpose_Y");
auto alpha = op_info->GetAttr<float>("alpha");
......@@ -71,11 +72,68 @@ int MatmulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
y_node = graph->AddNode(y_name, y_dims);
}
auto matmul_node =
graph->builder_.CreateMatmul2D(*x_node, *y_node, transpose_y);
graph->AddNode(out_name, graph->builder_.CreateScale(matmul_node, alpha));
return SUCCESS;
// Matmul node
if (x_dims.size() > 2 && y_dims.size() >= 2) {
// x: [B, ..., M, K], y: [B, ..., K, N], out: [B, ..., M, N]
// x: [B, M, K], y: [K, N], out: [B, M, N]
// Reshape and transposed X node
if (x_dims.size() != 3) {
auto m = static_cast<int>(x_dims[x_dims.size() - 2]);
auto k = static_cast<int>(x_dims[x_dims.size() - 1]);
x_node =
graph->AddNode(x_name + "/reshape",
graph->builder_.CreateReshape(*x_node, {-1, m, k}));
if (transpose_x) {
x_node =
graph->AddNode(x_name + "/reshape/transpose",
graph->builder_.CreateTranspose(*x_node, {0, 2, 1}));
}
}
// Reshape and transposed Y node
if (y_dims.size() != 3) {
auto k = static_cast<int>(y_dims[y_dims.size() - 2]);
auto n = static_cast<int>(y_dims[y_dims.size() - 1]);
y_node =
graph->AddNode(y_name + "/reshape",
graph->builder_.CreateReshape(*y_node, {-1, k, n}));
if (!transpose_y) {
y_node =
graph->AddNode(y_name + "/reshape/transpose",
graph->builder_.CreateTranspose(*y_node, {0, 2, 1}));
}
}
// Matmul node
auto matmul_node = graph->AddNode(
out_name, graph->builder_.CreateBatchMatmul(*x_node, *y_node));
if (fabs(alpha - 1) > 1e-6f) {
matmul_node = graph->AddNode(
out_name, graph->builder_.CreateScale(*matmul_node, alpha));
}
if (out_dims.size() != 3) {
graph->AddNode(out_name,
graph->builder_.CreateReshape(
*matmul_node, CvtShape<xtcl::Integer>(out_dims)));
}
} else if (x_dims.size() == 2 && y_dims.size() == 2) {
// x: [M, K], y: [K, N], out: [M, N]
if (transpose_x) {
x_node = graph->AddNode(x_name + "/transpose",
graph->builder_.CreateTranspose(*x_node, {1, 0}));
}
auto matmul_node = graph->AddNode(
out_name,
graph->builder_.CreateMatmul2D(*x_node, *y_node, transpose_y));
if (fabs(alpha - 1) > 1e-6f) {
matmul_node = graph->AddNode(
out_name, graph->builder_.CreateScale(*matmul_node, alpha));
}
} else if (x_dims.size() == 1 && y_dims.size() == 1) {
// x: [K], y: [K], out: [1]
// x: [M], y: [N], x_transpose: true, y_transpose: true, out: [M, N]
LOG(FATAL) << "[XPU] Not supported.";
return FAILED;
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace xpu
......
......@@ -67,15 +67,27 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
x_node =
graph->AddNode(x_name + "/reshape",
graph->builder_.CreateReshape(
*x_node, {-1, static_cast<int>(y_matrix_dims[0])}));
*x_node, {-1, static_cast<int>(x_matrix_dims[1])}));
}
// Y node
auto y_const_node = graph->AddNode(y_name, *y, y_matrix_dims);
std::shared_ptr<xtcl::xExpr> y_node = nullptr;
if (graph->HasNode(y_name)) {
y_node = graph->GetNode(y_name);
} else {
y_node = graph->AddNode(y_name, y_dims);
}
// Flatten Y node
if (y_dims.size() != 2) {
y_node =
graph->AddNode(y_name + "/reshape",
graph->builder_.CreateReshape(
*y_node, {static_cast<int>(y_matrix_dims[0]), -1}));
}
// Reshape the matmul node with the inferred shape as the output node
auto matmul_node = graph->AddNode(
out_name, graph->builder_.CreateMatmul2D(*x_node, *y_const_node, false));
out_name, graph->builder_.CreateMatmul2D(*x_node, *y_node, false));
if (out_dims.size() != 2) {
graph->AddNode(out_name,
graph->builder_.CreateReshape(
......
......@@ -197,7 +197,8 @@ int SubgraphEngine::LaunchDeviceProgram() {
void SubgraphCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(param.sub_block_idx,
engine_.reset(new SubgraphEngine(ctx_.get(),
param.sub_block_idx,
param.sub_block_desc,
param.input_data_names,
param.output_data_names,
......
......@@ -29,13 +29,14 @@ namespace xpu {
class SubgraphEngine : public subgraph::Engine {
public:
SubgraphEngine(int block_idx,
SubgraphEngine(KernelContext *ctx,
int block_idx,
cpp::BlockDesc *block_desc,
const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names,
Scope *scope)
: subgraph::Engine(
block_idx, block_desc, input_names, output_names, scope) {}
ctx, block_idx, block_desc, input_names, output_names, scope) {}
protected:
int BuildDeviceProgram() override;
......
......@@ -50,6 +50,7 @@ add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS})
add_operator(instance_norm_op basic SRCS instance_norm_op.cc DEPS ${op_DEPS})
add_operator(subgraph_op basic SRCS subgraph_op.cc DEPS ${op_DEPS})
add_operator(grid_sampler_op basic SRCS grid_sampler_op.cc DEPS ${op_DEPS})
add_operator(flatten_op basic SRCS flatten_op.cc DEPS ${op_DEPS})
# 2.basic ops not used in basic models
add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS})
......@@ -78,11 +79,9 @@ add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEP
add_operator(generate_proposals_op extra SRCS generate_proposals_op.cc DEPS ${op_DEPS})
add_operator(roi_align_op extra SRCS roi_align_op.cc DEPS ${op_DEPS})
add_operator(box_clip_op extra SRCS box_clip_op.cc DEPS ${op_DEPS})
add_operator(flatten_op extra SRCS flatten_op.cc DEPS ${op_DEPS})
add_operator(fake_quantize_range_abs_max_op extra SRCS fake_quantize_range_abs_max.cc DEPS ${op_DEPS})
add_operator(sequence_expand_as_op_lite extra SRCS sequence_expand_as_op.cc DEPS ${op_DEPS})
add_operator(assign_value_op extra SRCS assign_value_op.cc DEPS ${op_DEPS})
add_operator(fake_quantize_dequantize_moving_avg_abs_max_op extra SRCS fake_quantize_dequantize_moving_avg_max_abs.cc DEPS ${op_DEPS})
add_operator(fake_channel_wise_dequantize_max_abs_op extra SRCS fake_channel_wise_dequantize_max_abs.cc DEPS ${op_DEPS})
add_operator(split_lod_tensor_op_lite extra SRCS split_lod_tensor_op.cc DEPS ${op_DEPS})
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "lite/operators/attention_padding_mask_op.h"
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/scope.h"
......@@ -39,7 +40,8 @@ bool AttentionPaddingMaskOp::InferShape() const {
<< "Mismatch batch size, bottom0: " << att_batch
<< ", bottom1: " << src_batch;
param_.pad_begin->Resize({static_cast<int64_t>(src_batch)});
param_.pad_begin->Resize(
std::vector<int64_t>({static_cast<int64_t>(src_batch)}));
param_.Out->Resize(param_.X->dims());
param_.Out->set_lod(param_.X->lod());
......
......@@ -46,8 +46,9 @@ bool InstanceNormOp::InferShape() const {
auto x_dims = param_.x->dims();
int64_t batch_size = x_dims[0];
int64_t channel_size = x_dims[1];
param_.saved_mean->Resize({batch_size * channel_size});
param_.saved_variance->Resize({batch_size * channel_size});
param_.saved_mean->Resize(std::vector<int64_t>({batch_size * channel_size}));
param_.saved_variance->Resize(
std::vector<int64_t>({batch_size * channel_size}));
param_.out->Resize(x_dims);
return true;
}
......
......@@ -50,7 +50,7 @@ bool ReduceProdOpLite::InferShape() const {
if (keep_dim) {
out->Resize({static_cast<int64_t>(x_rank), 1});
} else {
out->Resize({1});
out->Resize(std::vector<int64_t>({1L}));
}
} else {
auto dims_vector = x_dims.Vectorize();
......
......@@ -30,6 +30,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH
lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
if(LITE_BUILD_EXTRA)
lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <cmath>
#include <string>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/arena/framework.h"
#include "lite/tests/utils/fill_data.h"
namespace paddle {
namespace lite {
// Arena test case for the "mul" operator. X and Y are flattened to 2-D
// matrices at x_num_col_dims / y_num_col_dims, and the baseline computes
// their product with a plain triple loop.
class MulComputeTester : public arena::TestCase {
 protected:
  // Operator type and variable names shared by the op desc, the input
  // preparation and the baseline.
  std::string type_ = "mul";
  std::string x_ = "x";
  std::string y_ = "y";
  std::string out_ = "out";
  DDim x_dims_{{1, 2}};
  DDim y_dims_{{2, 1}};
  int x_num_col_dims_{1};
  int y_num_col_dims_{1};

 public:
  MulComputeTester(const Place& place,
                   const std::string& alias,
                   DDim x_dims,
                   DDim y_dims,
                   int x_num_col_dims,
                   int y_num_col_dims)
      : TestCase(place, alias),
        x_dims_(x_dims),
        y_dims_(y_dims),
        x_num_col_dims_(x_num_col_dims),
        y_num_col_dims_(y_num_col_dims) {}

  // Reference implementation: out = flatten2d(x) * flatten2d(y), stored in a
  // tensor shaped x_dims[0:x_num_col_dims] + y_dims[y_num_col_dims:].
  void RunBaseline(Scope* scope) override {
    auto* x_tensor = scope->FindTensor(x_);
    auto* y_tensor = scope->FindTensor(y_);

    auto x_matrix = x_dims_.Flatten2D(x_num_col_dims_);
    auto y_matrix = y_dims_.Flatten2D(y_num_col_dims_);
    // The inner dimensions of the two flattened matrices must agree.
    CHECK_EQ(x_matrix[1], y_matrix[0]);

    auto* out_tensor = scope->NewTensor(out_);
    CHECK(out_tensor);

    // Leading dims come from X, trailing dims come from Y.
    std::vector<int64_t> result_shape;
    for (int d = 0; d < x_num_col_dims_; d++) {
      result_shape.push_back(x_dims_[d]);
    }
    for (int d = y_num_col_dims_; d < y_dims_.size(); d++) {
      result_shape.push_back(y_dims_[d]);
    }
    out_tensor->Resize(DDim(result_shape));

    auto lhs = x_tensor->data<float>();
    auto rhs = y_tensor->data<float>();
    auto* dst = out_tensor->mutable_data<float>();

    const int M = x_matrix[0];
    const int K = x_matrix[1];
    const int N = y_matrix[1];
    for (int row = 0; row < M; ++row) {
      for (int col = 0; col < N; ++col) {
        // Accumulate directly in the output cell, in ascending k order.
        float* cell = dst + (row * N + col);
        *cell = 0;
        for (int k = 0; k < K; ++k) {
          *cell += lhs[row * K + k] * rhs[k * N + col];
        }
      }
    }
  }

  // Describes the op under test: inputs X/Y, output Out and the two
  // flattening attributes.
  void PrepareOpDesc(cpp::OpDesc* op_desc) {
    op_desc->SetType(type_);
    op_desc->SetInput("X", {x_});
    op_desc->SetInput("Y", {y_});
    op_desc->SetOutput("Out", {out_});
    op_desc->SetAttr("x_num_col_dims", x_num_col_dims_);
    op_desc->SetAttr("y_num_col_dims", y_num_col_dims_);
  }

  // Fills X first, then Y, with values produced by fill_data_rand using the
  // bounds -1.f and 1.f, and registers both as common tensors.
  void PrepareData() override {
    std::vector<float> x_values(x_dims_.production());
    fill_data_rand(x_values.data(), -1.f, 1.f, x_dims_.production());
    SetCommonTensor(x_, x_dims_, x_values.data());

    std::vector<float> y_values(y_dims_.production());
    fill_data_rand(y_values.data(), -1.f, 1.f, y_dims_.production());
    SetCommonTensor(y_, y_dims_, y_values.data());
  }
};
void TestMul(const std::vector<int64_t>& x_dims,
const std::vector<int64_t>& y_dims,
int x_num_col_dims,
int y_num_col_dims,
const Place& place,
float abs_error) {
std::unique_ptr<arena::TestCase> tester(new MulComputeTester(place,
"def",
DDim(x_dims),
DDim(y_dims),
x_num_col_dims,
y_num_col_dims));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision();
}
// Precision test for the mul op. Only runs when built with LITE_WITH_XPU;
// on any other build the test returns immediately.
TEST(Mul, precision) {
  LOG(INFO) << "test mul op";
  float abs_error = 2e-5;
  Place place;
#if defined(LITE_WITH_XPU)
  place = TARGET(kXPU);
#else
  return;
#endif

  // One row per scenario: X shape, Y shape, x_num_col_dims, y_num_col_dims.
  struct MulCase {
    std::vector<int64_t> x_dims;
    std::vector<int64_t> y_dims;
    int x_num_col_dims;
    int y_num_col_dims;
  };
  const MulCase mul_cases[] = {
      {{4, 5}, {5, 4}, 1, 1},
      {{4, 5}, {5, 4, 3, 2}, 1, 1},
      {{4, 20}, {5, 4, 3, 2}, 1, 2},
      {{4, 60}, {5, 4, 3, 2}, 1, 3},
      {{2, 3, 4, 5}, {60, 4}, 1, 1},
      {{2, 3, 4, 5}, {20, 4}, 2, 1},
      {{2, 3, 4, 5}, {5, 4}, 3, 1},
      {{2, 3, 4, 5}, {60, 3, 4, 5}, 1, 1},
      {{2, 3, 4, 5}, {4, 5, 6, 2}, 2, 2},
      {{2, 3, 4, 5}, {5, 1, 4, 2}, 3, 2},
  };
  for (const MulCase& mul_case : mul_cases) {
    TestMul(mul_case.x_dims,
            mul_case.y_dims,
            mul_case.x_num_col_dims,
            mul_case.y_num_col_dims,
            place,
            abs_error);
  }
}
} // namespace lite
} // namespace paddle
......@@ -107,6 +107,7 @@ class UnsqueezeComputeTester : public arena::TestCase {
}
void PrepareData() override {
SetPrecisionType(out_, PRECISION(kFloat));
std::vector<float> in_data(dims_.production());
for (int i = 0; i < dims_.production(); ++i) {
in_data[i] = i;
......@@ -213,6 +214,7 @@ class Unsqueeze2ComputeTester : public arena::TestCase {
}
void PrepareData() override {
SetPrecisionType(out_, PRECISION(kFloat));
std::vector<float> in_data(dims_.production());
for (int i = 0; i < dims_.production(); ++i) {
in_data[i] = i;
......
......@@ -1042,23 +1042,6 @@ function main {
build_test_arm_subtask_armlinux
shift
;;
build_test_arm_model_mobilenetv1)
build_test_arm_subtask_model test_mobilenetv1 mobilenet_v1
build_test_arm_subtask_model test_mobilenetv1_int8 MobileNetV1_quant
shift
;;
build_test_arm_model_mobilenetv2)
build_test_arm_subtask_model test_mobilenetv2 mobilenet_v2_relu
shift
;;
build_test_arm_model_resnet50)
build_test_arm_subtask_model test_resnet50 resnet50
shift
;;
build_test_arm_model_inceptionv4)
build_test_arm_subtask_model test_inceptionv4 inception_v4_simple
shift
;;
check_style)
check_style
shift
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
namespace paddle {
namespace lite {
// Returns the value of the environment variable named `str`, or `def` when
// the variable is not set. A variable set to the empty string yields "".
static std::string GetStringFromEnv(const std::string& str,
                                    const std::string& def = "") {
  const char* value = std::getenv(str.c_str());
  return value != nullptr ? std::string(value) : def;
}
// Reads the environment variable named `str` as a boolean. Returns `def`
// when the variable is not set. The exact strings "false" and "0" mean
// false; any other value (including the empty string) means true.
static bool GetBoolFromEnv(const std::string& str, bool def = false) {
  const char* value = std::getenv(str.c_str());
  if (value == nullptr) {
    return def;
  }
  const bool spelled_false =
      std::strcmp(value, "false") == 0 || std::strcmp(value, "0") == 0;
  return !spelled_false;
}
// Reads the environment variable named `str` as an int via atoi. Returns
// `def` when the variable is not set; text that does not start with a
// number parses as 0.
static int GetIntFromEnv(const std::string& str, int def = 0) {
  const char* value = std::getenv(str.c_str());
  return value != nullptr ? std::atoi(value) : def;
}
// Reads the environment variable named `str` as a double via atof. Returns
// `def` when the variable is not set; text that does not start with a
// number parses as 0.0.
static double GetDoubleFromEnv(const std::string& str, double def = 0.0) {
  const char* value = std::getenv(str.c_str());
  return value != nullptr ? std::atof(value) : def;
}
// Reads the environment variable named `str` as an unsigned 64-bit integer
// (base 10). Returns `def` when the variable is not set; text that does not
// start with a number parses as 0.
//
// Parsing uses strtoull rather than atol: atol goes through `long`, which
// cannot represent values above LONG_MAX (and is only 32 bits wide on some
// targets), so large settings could not be read back correctly. A leading
// minus sign still wraps around to the corresponding unsigned value, as the
// previous cast-from-long did.
static uint64_t GetUInt64FromEnv(const std::string& str, uint64_t def = 0ul) {
  const char* value = std::getenv(str.c_str());
  if (value == nullptr) {
    return def;
  }
  return static_cast<uint64_t>(std::strtoull(value, nullptr, 10));
}
} // namespace lite
} // namespace paddle
......@@ -18,6 +18,37 @@ limitations under the License. */
namespace paddle_mobile {
namespace framework {
void CLImage::PrintTensor(const CLImage &cl_image) const {
size_t width = cl_image.ImageDims()[0];
size_t height = cl_image.ImageDims()[1];
half_t *image_data = new half_t[height * width * 4];
cl_int err;
cl_mem image = cl_image.GetCLImage();
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin,
region, 0, 0, image_data, 0, NULL, NULL);
CL_CHECK_ERRORS(err);
PADDLE_MOBILE_ENFORCE(cl_image.numel() != 0,
"cl_image numel should not be 0 ");
float *tensor_data = new float[cl_image.numel()];
auto converter = cl_image.Converter();
converter->ImageToNCHW(image_data, tensor_data, cl_image.ImageDims(),
cl_image.dims());
int stride = cl_image.numel() / 20;
stride = stride > 0 ? stride : 1;
for (int i = 0; i < cl_image.numel(); i++) {
printf("%f \n", tensor_data[i]);
}
delete[](tensor_data);
delete[](image_data);
}
void CLImageToTensor(CLImage *cl_image, Tensor *tensor, cl_context context,
cl_command_queue commandQueue, cl_kernel kernel) {
tensor->mutable_data<float>();
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <iostream>
#include <memory>
#include <vector>
......@@ -285,6 +286,7 @@ class CLImage {
cl_event GetClEvent() const { return cl_event_.get(); }
CLImageConverterBase *Converter() const { return image_converter_; }
void PrintTensor(const CLImage &cl_image) const;
private:
void InitCLImage(cl_context context, size_t width, size_t height,
......
......@@ -21,13 +21,14 @@ namespace framework {
const char* opencl_error_to_str(cl_int error);
#define CL_CHECK_ERRORS(ERR) \
if (ERR != CL_SUCCESS) { \
printf( \
"OpenCL error with code %s happened in file %s at line %d. " \
"Exiting.\n", \
paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \
__LINE__); \
#define CL_CHECK_ERRORS(ERR) \
if (ERR != CL_SUCCESS) { \
printf( \
"\033[1;31;40mOpenCL error with code %s happened in file %s at line " \
"%d. " \
"Exiting.\033[0m\n", \
paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \
__LINE__); \
}
} // namespace framework
......
......@@ -363,7 +363,10 @@ void Executor<Device, T>::InitNoPersistableMemory(const Tensor &input_tensor) {
DLOG << "InitNoPersistableMemory var " << var_desc->Name();
auto tensor = var->template GetMutable<LoDTensor>();
if (tensor->IsInitialized() && tensor->dims().size() == 4) {
DLOG << "var's tensor is Initialized or dims size != 4";
// don't change user's input and avoid memory leaks
if (feed_indices_.find(var_desc->Name()) != feed_indices_.end()) {
break;
}
DDim tensor_dim = tensor->dims();
DDim new_dim =
make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
......
......@@ -241,7 +241,9 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
cl_int status;
int index = 0;
if (param.Filter()->dims()[2] == 1 && param.Filter()->dims()[3] == 1) {
const int filter_height = param.Filter()->dims()[2];
const int filter_width = param.Filter()->dims()[3];
if (filter_height == 1 && filter_width == 1) {
status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
......@@ -404,7 +406,7 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
if (param.Filter()->dims()[2] == 3 && param.Filter()->dims()[3] == 3) {
if (filter_height == 3 && filter_width == 3) {
// normal conv
if (param.Filter()->dims()[0] == param.Output()->dims()[1] &&
param.Filter()->dims()[1] == param.Input()->dims()[1]) {
......@@ -425,6 +427,17 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
status = clSetKernelArg(kernel, index++, sizeof(int), &group);
CL_CHECK_ERRORS(status);
}
} else if (filter_height != 3 && filter_width != 3) {
// not 3x3
if (param.Filter()->dims()[1] == 1 &&
param.Input()->dims()[1] == param.Output()->dims()[1]) {
// depthwise conv (basic kernel), used when the filter is not 3x3
status = clSetKernelArg(kernel, index++, sizeof(int), &filter_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &filter_height);
CL_CHECK_ERRORS(status);
}
}
status = clEnqueueNDRangeKernel(
......
......@@ -24,980 +24,1101 @@ conv_add_bn_relu
#include "cl_common.h"
__kernel void conv_3x3(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
__kernel void conv_3x3(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height,
__private const int output_c,
__private const int filter_channel,
__private const int group) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height,
__private const int output_c, __private const int filter_channel,
__private const int group) {
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos);
half4 output = read_imageh(bias, sampler, output_pos);
#else
half4 output = 0.0f;
#endif
half4 input[9];
if (group == 1) {
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
input[0] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[1] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[2] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[3] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[4] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[5] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[6] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[7] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
/*
for (int j = 0; j < 9; ++j) {
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
*/
int j = 0;
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 1;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 2;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 3;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 4;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 5;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 6;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 7;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 8;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
half4 output = 0.0f;
#endif
half4 input[9];
if (group == 1) {
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x,
in_pos_in_one_block.y);
input[0] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[1] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[2] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[3] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[4] = select(
read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[5] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[6] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[7] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[8] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
/*
for (int j = 0; j < 9; ++j) {
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler,
pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler,
pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler,
pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler,
pos_of_weight);
output.w += dot(input[j], weight_w);
}
*/
int j = 0;
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 1;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 2;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 3;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 4;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 5;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 6;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 7;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 8;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
} else {
for (int i = 0; i < 4; i++) {
int used_input_channel_num =
(out_c * 4 + i) / (output_c / group) * filter_channel;
for (int f_c = 0; f_c < filter_channel; ++f_c) {
int input_c = used_input_channel_num + f_c;
int input_block = input_c / 4;
int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x,
in_pos_in_one_block.y);
input[0] = select(
read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[1] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[2] = select(
read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[3] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[4] = select(
read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[5] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[6] = select(
read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[7] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[8] = select(
read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
half tmp_out = 0;
for (int j = 0; j < 9; j++) {
int2 pos_of_weight;
pos_of_weight.x = (f_c / 4) * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3;
half4 weight = read_imageh(filter, sampler, pos_of_weight);
int f_c_offset = f_c % 4;
half f_value;
if (f_c_offset == 0) {
f_value = weight.x;
} else if (f_c_offset == 1) {
f_value = weight.y;
} else if (f_c_offset == 2) {
f_value = weight.z;
} else if (f_c_offset == 3) {
f_value = weight.w;
}
int input_c_offset = input_c % 4;
half input_value;
if (input_c_offset == 0) {
input_value = input[j].x;
} else if (input_c_offset == 1) {
input_value = input[j].y;
} else if (input_c_offset == 2) {
input_value = input[j].z;
} else if (input_c_offset == 3) {
input_value = input[j].w;
}
tmp_out += f_value * input_value;
}
} else {
for (int i = 0; i < 4; i++) {
int used_input_channel_num = (out_c * 4 + i) / (output_c / group) * filter_channel;
for (int f_c = 0; f_c < filter_channel; ++f_c) {
int input_c = used_input_channel_num + f_c;
int input_block = input_c / 4;
int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
input[0] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[1] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[2] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[3] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[4] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[5] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[6] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[7] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
half tmp_out = 0;
for (int j = 0; j < 9; j++) {
int2 pos_of_weight;
pos_of_weight.x = (f_c / 4) * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3;
half4 weight = read_imageh(filter, sampler, pos_of_weight);
int f_c_offset = f_c % 4;
half f_value;
if (f_c_offset == 0) {
f_value = weight.x;
} else if (f_c_offset == 1) {
f_value = weight.y;
} else if (f_c_offset == 2) {
f_value = weight.z;
} else if (f_c_offset == 3) {
f_value = weight.w;
}
int input_c_offset = input_c % 4;
half input_value;
if (input_c_offset == 0) {
input_value = input[j].x;
} else if (input_c_offset == 1) {
input_value = input[j].y;
} else if (input_c_offset == 2) {
input_value = input[j].z;
} else if (input_c_offset == 3) {
input_value = input[j].w;
}
tmp_out += f_value * input_value;
}
if (i == 0) {
output.x += tmp_out;
} else if (i == 1) {
output.y += tmp_out;
} else if (i == 2) {
output.z += tmp_out;
} else if (i == 3) {
output.w += tmp_out;
}
}
if (i == 0) {
output.x += tmp_out;
} else if (i == 1) {
output.y += tmp_out;
} else if (i == 2) {
output.z += tmp_out;
} else if (i == 3) {
output.w += tmp_out;
}
}
}
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output = activation(output);
output = activation(output);
#endif
write_imageh(output_image, output_pos, output);
write_imageh(output_image, output_pos, output);
}
// 3x3 convolution kernel specialised for dilation == 1. The `dilation`
// argument is kept for interface compatibility but is never read, and the
// same holds for `item_ch` and `item_h`.
//
// Work decomposition (from the get_global_id() usage below):
//   dim 0 -> item_ch_id : selects one half4 of output channels (lanes x..w
//                         are out_ch 0..3 of that group)
//   dim 1 -> item_w_id  : first output column handled by this work-item; it
//                         also handles columns item_w_id + k * item_w for
//                         k = 1..4, each guarded by `< out_w` on write-back
//   dim 2 -> item_h_id  : output row, used directly as the y coordinate of
//                         output_image; decomposed into batch and row with
//                         out_h
//
// Image layout as addressed by this kernel:
//   input_image  : x = ch_block * in_w + column, y = batch * in_h + row
//   filter_image : x = ch_block * 3 + kw, y = (out_ch_group * 4 + oc) * 3 + kh
//   output_image : x = item_ch_id * out_w + column, y = item_h_id
//
// Padding is implemented by redirecting out-of-range taps to coordinate -1
// and letting CLK_ADDRESS_CLAMP supply the border colour.
// NOTE(review): this assumes the input image format yields an all-zero border
// colour (i.e. it has an alpha channel) - confirm against the host code.
//
// Compile-time switches: BIASE_CH / BIASE_ELE (per-channel / per-element
// bias), BATCH_NORM (scale + shift from new_scale / new_biase), RELU
// (applies activation(), which is defined elsewhere in this file).
__kernel void conv_3x3spl(
    __private const int item_ch, __private const int item_w,
    __private const int item_h, __read_only image2d_t input_image,
    __read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
    __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
    __write_only image2d_t output_image, __private const int stride,
    __private const int pad, __private const int dilation,
    __private const int in_ch, __private const int in_w,
    __private const int in_h, __private const int out_w,
    __private const int out_h) {
  // Every read below uses integer (int2) coordinates. The OpenCL C
  // specification only defines the result of such reads for samplers with
  // CLK_NORMALIZED_COORDS_FALSE and CLK_FILTER_NEAREST.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  // item_id
  const int item_ch_id = get_global_id(0);
  const int item_w_id = get_global_id(1);
  const int item_h_id = get_global_id(2);

  // out_width_id_per_blk and out_batch_id
  // item_h_id is decomposed with out_h (see in_h_id below), so the batch
  // index has to be derived with out_h as well.
  const int out_batch_id = item_h_id / out_h;
  const int out_w_base_id = item_ch_id * out_w;
  const int out_w_id0 = item_w_id;
  const int out_w_id1 = out_w_id0 + item_w;
  const int out_w_id2 = out_w_id1 + item_w;
  const int out_w_id3 = out_w_id2 + item_w;
  const int out_w_id4 = out_w_id3 + item_w;

  // in_width_id_per_blk and in_height_id_per_batch
  // These are the top-left corners of the 3x3 windows, relative to one
  // channel block / one batch; they are negative inside the padding region.
  const int in_h_id = (item_h_id % out_h) * stride - pad;
  const int in_h_base_id = out_batch_id * in_h;
  const int in_w_id0 = item_w_id * stride - pad;
  const int in_w_id1 = in_w_id0 + item_w * stride;
  const int in_w_id2 = in_w_id1 + item_w * stride;
  const int in_w_id3 = in_w_id2 + item_w * stride;
  const int in_w_id4 = in_w_id3 + item_w * stride;

  // Seed the five accumulators with the bias (or zero).
#ifdef BIASE_CH
  half4 output[5];
  output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
  output[1] = output[0];
  output[2] = output[0];
  output[3] = output[0];
  output[4] = output[0];
#elif defined(BIASE_ELE)
  // Zero-initialise so that accumulators of columns beyond out_w (which are
  // never written back) are not read uninitialised by the code below.
  half4 output[5] = {0.0f};
  output[0] =
      read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id));
  if (out_w_id1 < out_w) {
    output[1] = read_imageh(bias, sampler,
                            (int2)(out_w_base_id + out_w_id1, item_h_id));
  }
  if (out_w_id2 < out_w) {
    output[2] = read_imageh(bias, sampler,
                            (int2)(out_w_base_id + out_w_id2, item_h_id));
  }
  if (out_w_id3 < out_w) {
    output[3] = read_imageh(bias, sampler,
                            (int2)(out_w_base_id + out_w_id3, item_h_id));
  }
  if (out_w_id4 < out_w) {
    output[4] = read_imageh(bias, sampler,
                            (int2)(out_w_base_id + out_w_id4, item_h_id));
  }
#else
  half4 output[5] = {0.0f};
#endif

  half4 filter[4] = {0.0f};
  half4 filter_trans[4] = {0.0f};
  half4 input[5] = {0.0f};

  // Filter rows of the four output channels produced by this work-item.
  const int filter_h_val0 = item_ch_id * 4 * 3;
  const int filter_h_val1 = filter_h_val0 + 3;
  const int filter_h_val2 = filter_h_val1 + 3;
  const int filter_h_val3 = filter_h_val2 + 3;

  for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
    // Number of lanes in this channel block that lie beyond in_ch
    // (non-zero only for the last block when in_ch is not a multiple of 4).
    int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;

    const int in_w_base_id = mul24(ch, in_w);
    const int filter_w_val = ch * 3;

    for (int h = 0; h < 3; h++) {
      // Bounds are tested on the row offset inside the current batch, the
      // same way the column offsets are tested below; the image coordinate
      // additionally carries the batch base row.
      const int in_h_off = in_h_id + h;
      const int in_h_val = select(in_h_base_id + in_h_off, -1,
                                  (in_h_off < 0 || in_h_off >= in_h));

      for (int w = 0; w < 3; w++) {
        const int in_w_val0 =
            select(in_w_base_id + in_w_id0 + w, -1,
                   (in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
        const int in_w_val1 =
            select(in_w_base_id + in_w_id1 + w, -1,
                   (in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
        const int in_w_val2 =
            select(in_w_base_id + in_w_id2 + w, -1,
                   (in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
        const int in_w_val3 =
            select(in_w_base_id + in_w_id3 + w, -1,
                   (in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
        const int in_w_val4 =
            select(in_w_base_id + in_w_id4 + w, -1,
                   (in_w_id4 + w < 0 || in_w_id4 + w >= in_w));

        filter[0] = read_imageh(
            filter_image, sampler,
            (int2)(filter_w_val + w, filter_h_val0 + h));  // in_ch:0-3,out_ch:0
        filter[1] = read_imageh(
            filter_image, sampler,
            (int2)(filter_w_val + w, filter_h_val1 + h));  // in_ch:0-3,out_ch:1
        filter[2] = read_imageh(
            filter_image, sampler,
            (int2)(filter_w_val + w, filter_h_val2 + h));  // in_ch:0-3,out_ch:2
        filter[3] = read_imageh(
            filter_image, sampler,
            (int2)(filter_w_val + w, filter_h_val3 + h));  // in_ch:0-3,out_ch:3

        // Transpose the 4x4 weight tile so each vector holds the weights of
        // ONE input channel for the four output channels; a scalar input
        // lane can then update all four output lanes with a single mad().
        filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x,
                                  filter[3].x);  // in_ch:0,out_ch:0-3
        filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y,
                                  filter[3].y);  // in_ch:1,out_ch:0-3
        filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z,
                                  filter[3].z);  // in_ch:2,out_ch:0-3
        filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w,
                                  filter[3].w);  // in_ch:3,out_ch:0-3

        input[0] =
            read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val));
        input[1] =
            read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val));
        input[2] =
            read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val));
        input[3] =
            read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val));
        input[4] =
            read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val));

        // Lane x always holds a real input channel.
        output[0] = mad(input[0].x, filter_trans[0], output[0]);
        output[1] = mad(input[1].x, filter_trans[0], output[1]);
        output[2] = mad(input[2].x, filter_trans[0], output[2]);
        output[3] = mad(input[3].x, filter_trans[0], output[3]);
        output[4] = mad(input[4].x, filter_trans[0], output[4]);

        // Lanes y/z/w are skipped when they fall beyond in_ch.
        if (ch_surplus < 3) {
          output[0] = mad(input[0].y, filter_trans[1], output[0]);
          output[1] = mad(input[1].y, filter_trans[1], output[1]);
          output[2] = mad(input[2].y, filter_trans[1], output[2]);
          output[3] = mad(input[3].y, filter_trans[1], output[3]);
          output[4] = mad(input[4].y, filter_trans[1], output[4]);
        }
        if (ch_surplus < 2) {
          output[0] = mad(input[0].z, filter_trans[2], output[0]);
          output[1] = mad(input[1].z, filter_trans[2], output[1]);
          output[2] = mad(input[2].z, filter_trans[2], output[2]);
          output[3] = mad(input[3].z, filter_trans[2], output[3]);
          output[4] = mad(input[4].z, filter_trans[2], output[4]);
        }
        if (ch_surplus < 1) {
          output[0] = mad(input[0].w, filter_trans[3], output[0]);
          output[1] = mad(input[1].w, filter_trans[3], output[1]);
          output[2] = mad(input[2].w, filter_trans[3], output[2]);
          output[3] = mad(input[3].w, filter_trans[3], output[3]);
          output[4] = mad(input[4].w, filter_trans[3], output[4]);
        }
      }
    }
  }

#ifdef BATCH_NORM
  // output = scale * output + biase, per output-channel group.
  half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
  half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
  output[0] = mad(scale, output[0], biase);
  if (out_w_id1 < out_w) {
    output[1] = mad(scale, output[1], biase);
  }
  if (out_w_id2 < out_w) {
    output[2] = mad(scale, output[2], biase);
  }
  if (out_w_id3 < out_w) {
    output[3] = mad(scale, output[3], biase);
  }
  if (out_w_id4 < out_w) {
    output[4] = mad(scale, output[4], biase);
  }
#endif

#ifdef RELU
  output[0] = activation(output[0]);
  output[1] = activation(output[1]);
  output[2] = activation(output[2]);
  output[3] = activation(output[3]);
  output[4] = activation(output[4]);
#endif

  // Column 0 is written unconditionally; the strided columns only when they
  // fall inside the output width.
  write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id),
               output[0]);
  if (out_w_id1 < out_w) {
    write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id),
                 output[1]);
  }
  if (out_w_id2 < out_w) {
    write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id),
                 output[2]);
  }
  if (out_w_id3 < out_w) {
    write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id),
                 output[3]);
  }
  if (out_w_id4 < out_w) {
    write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id),
                 output[4]);
  }
}
__kernel void depth_conv_3x3(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input,
__read_only image2d_t filter,
__kernel void depth_conv_3x3(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
const int batch_index = out_nh / output_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int out_nh_in_one_batch = out_nh % output_height;
const int batch_index = out_nh / output_height;
const int out_nh_in_one_batch = out_nh % output_height;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
int2 in_pos_in_one_block =
ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
#ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos);
half4 output = read_imageh(bias, sampler, output_pos);
#else
half4 output = 0.0f;
half4 output = 0.0f;
#endif
const int filter_width = 3;
const int filter_height = 3;
int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height);
int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height);
int filter_x = pos_in_filter_block.x ;
int filter_y = pos_in_filter_block.y ;
half4 inputs[9];
inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15));
/*
if (output_pos.x == 112 && output_pos.y == 0) {
half4 input1 = inputs[3];
float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
printf(" input4 3 - %v4hlf \n", in);
printf(" --- %d ---\n", in_pos_in_one_block.x - 1);
}
*/
inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15));
inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
half4 filters[9];
filters[0] = read_imageh(filter, sampler,(int2)(filter_x,filter_y));
filters[1] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y));
filters[2] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y));
filters[3] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 1));
filters[4] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 1));
filters[5] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 1));
filters[6] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 2));
filters[7] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 2));
filters[8] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 2));
for(int i = 0 ;i < 9 ; i++){
output += inputs[i] * filters[i];
}
const int filter_width = 3;
const int filter_height = 3;
int2 pos_in_input_block =
(int2)(out_c * input_width, batch_index * input_height);
int2 pos_in_filter_block =
(int2)(out_c * filter_width, batch_index * filter_height);
int filter_x = pos_in_filter_block.x;
int filter_y = pos_in_filter_block.y;
half4 inputs[9];
inputs[0] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(half4)(0.0f), (ushort4)((in_pos_in_one_block.x - 1 < 0 ||
in_pos_in_one_block.y - 1 < 0 ||
in_pos_in_one_block.x - 1 >= input_width ||
in_pos_in_one_block.y - 1 >= input_height)
<< 15));
inputs[1] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - 1 >= input_height)
<< 15));
inputs[2] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(half4)(0.0f), (ushort4)((in_pos_in_one_block.x + 1 < 0 ||
in_pos_in_one_block.y - 1 < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
in_pos_in_one_block.y - 1 >= input_height)
<< 15));
inputs[3] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
pos_in_input_block.y + in_pos_in_one_block.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - 1 >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
/*
if (output_pos.x == 112 && output_pos.y == 0) {
half4 input1 = inputs[3];
float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
printf(" input4 3 - %v4hlf \n", in);
printf(" --- %d ---\n", in_pos_in_one_block.x - 1);
}
*/
inputs[4] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
pos_in_input_block.y + in_pos_in_one_block.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
inputs[5] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
inputs[6] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f), (ushort4)((in_pos_in_one_block.x - 1 < 0 ||
in_pos_in_one_block.y + 1 < 0 ||
in_pos_in_one_block.x - 1 >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
<< 15));
inputs[7] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
<< 15));
inputs[8] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f), (ushort4)((in_pos_in_one_block.x + 1 < 0 ||
in_pos_in_one_block.y + 1 < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
<< 15));
half4 filters[9];
filters[0] = read_imageh(filter, sampler, (int2)(filter_x, filter_y));
filters[1] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y));
filters[2] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y));
filters[3] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 1));
filters[4] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 1));
filters[5] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 1));
filters[6] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 2));
filters[7] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 2));
filters[8] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 2));
for (int i = 0; i < 9; i++) {
output += inputs[i] * filters[i];
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output = activation(output);
output = activation(output);
#endif
/*
if (output_pos.x == 112 && output_pos.y == 0) {
for (int i = 0; i < 9; ++i) {
half4 input1 = inputs[i];
float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
printf(" input4 %d - %v4hlf \n", i, in);
}
float4 out = (float4)(output.x, output.y, output.z, output.w);
printf(" depth wise output output4 = %v4hlf \n", out);
printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x);
printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y);
printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x);
printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y);
}
*/
/*
if (output_pos.x == 112 && output_pos.y == 0) {
for (int i = 0; i < 9; ++i) {
half4 input1 = inputs[i];
float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
printf(" input4 %d - %v4hlf \n", i, in);
}
float4 out = (float4)(output.x, output.y, output.z, output.w);
printf(" depth wise output output4 = %v4hlf \n", out);
printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x);
printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y);
printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x);
printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y);
}
*/
write_imageh(output_image, output_pos, output);
write_imageh(output_image, output_pos, output);
}
// Depthwise 3x3 convolution specialised for stride 1.
//
// Each work-item computes up to two horizontally adjacent output texels
// (w_blk_size == 2) and reuses a 4-wide x 3-tall window of input texels for
// both of them.
//
// Work-item mapping:
//   get_global_id(0) -> ou_ch_blk_id, selects the channel block
//   get_global_id(1) -> ou_w_blk_id,  selects a pair of output columns
//   get_global_id(2) -> ou_nh_id,     fused (batch, output row) index
//
// Image addressing used by the body:
//   input  : x = ou_ch_blk_id * in_w + column, y = batch_id * in_h + row
//   filter : x = ou_ch_blk_id * 3 + kx,        y = ky   (kx, ky in 0..2)
//   output : x = ou_ch_blk_id * ou_w + column, y = ou_nh_id
//
// Compile-time switches:
//   BIASE_CH   - accumulators start from one bias texel per channel block.
//   BIASE_ELE  - accumulators start from one bias texel per output position.
//   BATCH_NORM - result is scaled/shifted by new_scale / new_biase.
//   RELU       - result is passed through activation() (defined elsewhere).
//
// The parameters ou_ch_blk, ou_w_blk, ou_nh, stride, dilation and in_ch are
// part of the host-side argument list but are not read by this kernel.
__kernel void depth_conv_3x3s1(
    __private const int ou_ch_blk, __private const int ou_w_blk,
    __private const int ou_nh, __read_only image2d_t input,
    __read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
    __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
    __write_only image2d_t output_image, __private const int stride,
    __private const int pad, __private const int dilation,
    __private const int in_ch, __private const int in_w, /* of one block */
    __private const int in_h,                            /* of one block */
    __private const int ou_w, __private const int ou_h) {
  const int ou_ch_blk_id = get_global_id(0);
  const int ou_w_blk_id = get_global_id(1);
  const int ou_nh_id = get_global_id(2);
  const int w_blk_size = 2;

  const int batch_id = ou_nh_id / ou_h;
  int ou_col_id = ou_w_blk_id * w_blk_size;
  int ou_row_id = ou_nh_id % ou_h;
  int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id);

  // Input position inside one channel block / one batch. With stride 1 the
  // top-left tap of the 3x3 window is simply the output position minus pad.
  int col_id = ou_col_id - pad;
  int row_id = ou_row_id - pad;

  // All reads below use integer (int2) pixel coordinates. The OpenCL C
  // specification only defines the result of such reads for samplers created
  // with CLK_NORMALIZED_COORDS_FALSE, so the sampler must not request
  // normalized coordinates.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

#ifdef BIASE_CH
  half4 output[2];
  output[0] = read_imageh(bias, sampler, (int2)(ou_ch_blk_id, 0));
  output[1] = output[0];
#elif defined(BIASE_ELE)
  half4 output[2];
  output[0] = read_imageh(bias, sampler, (int2)(ou_x, ou_nh_id));
  // NOTE(review): when the second column is past the row end, output[1] is
  // left unset here; it is still accumulated into below but never written.
  if (ou_col_id + 1 < ou_w) {
    output[1] = read_imageh(bias, sampler, (int2)(ou_x + 1, ou_nh_id));
  }
#else
  half4 output[2] = {0.0f};
#endif

  half4 inputs[12];

  int filter_x = ou_ch_blk_id * 3;
  int filter_y = 0;
  half4 filters[9];

  // ---- filter row 0 ----
  filters[0] = read_imageh(filter, sampler, (int2)(filter_x, filter_y));
  filters[1] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y));
  filters[2] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y));

  int in_x = mad24(ou_ch_blk_id, in_w, col_id);
  int in_y = mad24(batch_id, in_h, row_id);

  // Padding: any tap that falls outside the current in_w x in_h block is
  // redirected to coordinate -1, i.e. outside the image, so that
  // CLK_ADDRESS_CLAMP supplies the border colour instead of a neighbouring
  // block's data (border is all zeros for RGBA images - presumably the
  // format used here; confirm against the host code).
  int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h);
  int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w);
  inputs[0] = read_imageh(input, sampler, (int2)(x0, y0));
  int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w);
  inputs[1] = read_imageh(input, sampler, (int2)(x1, y0));
  int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w);
  inputs[2] = read_imageh(input, sampler, (int2)(x2, y0));
  int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w);
  inputs[3] = read_imageh(input, sampler, (int2)(x3, y0));

  // output[0] uses window columns 0..2, output[1] uses columns 1..3.
  output[0] = mad(inputs[0], filters[0], output[0]);
  output[1] = mad(inputs[1], filters[0], output[1]);

  output[0] = mad(inputs[1], filters[1], output[0]);
  output[1] = mad(inputs[2], filters[1], output[1]);

  output[0] = mad(inputs[2], filters[2], output[0]);
  output[1] = mad(inputs[3], filters[2], output[1]);

  // ---- filter row 1 ----
  filters[3] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 1));
  filters[4] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 1));
  filters[5] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 1));

  int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h);
  inputs[4] = read_imageh(input, sampler, (int2)(x0, y1));
  inputs[5] = read_imageh(input, sampler, (int2)(x1, y1));
  inputs[6] = read_imageh(input, sampler, (int2)(x2, y1));
  inputs[7] = read_imageh(input, sampler, (int2)(x3, y1));

  output[0] = mad(inputs[4], filters[3], output[0]);
  output[1] = mad(inputs[5], filters[3], output[1]);

  output[0] = mad(inputs[5], filters[4], output[0]);
  output[1] = mad(inputs[6], filters[4], output[1]);

  output[0] = mad(inputs[6], filters[5], output[0]);
  output[1] = mad(inputs[7], filters[5], output[1]);

  // ---- filter row 2 ----
  filters[6] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 2));
  filters[7] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 2));
  filters[8] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 2));

  int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h);
  inputs[8] = read_imageh(input, sampler, (int2)(x0, y2));
  inputs[9] = read_imageh(input, sampler, (int2)(x1, y2));
  inputs[10] = read_imageh(input, sampler, (int2)(x2, y2));
  inputs[11] = read_imageh(input, sampler, (int2)(x3, y2));

  output[0] = mad(inputs[8], filters[6], output[0]);
  output[1] = mad(inputs[9], filters[6], output[1]);

  output[0] = mad(inputs[9], filters[7], output[0]);
  output[1] = mad(inputs[10], filters[7], output[1]);

  output[0] = mad(inputs[10], filters[8], output[0]);
  output[1] = mad(inputs[11], filters[8], output[1]);

#ifdef BATCH_NORM
  half4 scale = read_imageh(new_scale, sampler, (int2)(ou_ch_blk_id, 0));
  half4 biase = read_imageh(new_biase, sampler, (int2)(ou_ch_blk_id, 0));
  output[0] = mad(scale, output[0], biase);
  if (ou_col_id + 1 < ou_w) {
    output[1] = mad(scale, output[1], biase);
  }
#endif

#ifdef RELU
  output[0] = activation(output[0]);
  output[1] = activation(output[1]);
#endif

  write_imageh(output_image, (int2)(ou_x, ou_nh_id), output[0]);
  // The second texel only exists when the column pair does not run past the
  // end of the output row (odd ou_w).
  if (ou_col_id + 1 < ou_w) {
    write_imageh(output_image, (int2)(ou_x + 1, ou_nh_id), output[1]);
  }
}
__kernel void conv_1x1(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
__kernel void conv_1x1(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const uint kernelHXW = 1;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
int2 in_pos_in_one_block =
ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
#ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos);
half4 output = read_imageh(bias, sampler, output_pos);
#else
half4 output = 0.0f;
half4 output = 0.0f;
#endif
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
half4 input = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
/*
output.x = dot(input, weight0);
output.y = dot(input, weight1);
output.z = dot(input, weight2);
output.w = dot(input, weight3);
*/
for (int i = 0; i < input_c; ++i) {
int2 pos_in =
(int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
half4 input = read_imageh(input_image, sampler, pos_in);
output = mad(input.x, weight0, output);
output = mad(input.y, weight1, output);
output = mad(input.z, weight2, output);
output = mad(input.w, weight3, output);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
/*
output.x = dot(input, weight0);
output.y = dot(input, weight1);
output.z = dot(input, weight2);
output.w = dot(input, weight3);
*/
}
output = mad(input.x, weight0, output);
output = mad(input.y, weight1, output);
output = mad(input.z, weight2, output);
output = mad(input.w, weight3, output);
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
......@@ -1017,14 +1138,12 @@ __kernel void conv_1x1_simple(
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,__private const int input_c_origin,
__private const int dilation,
__private const int offset, __private const int input_c,
__private const int input_c_origin, __private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__private const int old_w
) {
__private const int output_width, __private const int output_height,
__private const int old_w) {
half zero = 0.0f;
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
......@@ -1035,7 +1154,7 @@ __kernel void conv_1x1_simple(
int out_w2 = out_w + global_size_dim1 * 2;
int out_w3 = out_w + global_size_dim1 * 3;
int outpos_main = mul24(out_c , old_w);
int outpos_main = mul24(out_c, old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
......@@ -1064,14 +1183,14 @@ __kernel void conv_1x1_simple(
#ifdef BIASE_CH
half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output1 = output0;
half4 output2 = output0;
half4 output3 = output0;
half4 output1 = output0;
half4 output2 = output0;
half4 output3 = output0;
#elif defined(BIASE_ELE)
half4 output0 = read_imageh(bias, sampler, output_pos0);
half4 output1 = output0;
half4 output2 = output0;
half4 output3 = output0;
half4 output1 = output0;
half4 output2 = output0;
half4 output3 = output0;
#else
half4 output0 = 0.0f;
......@@ -1082,7 +1201,8 @@ __kernel void conv_1x1_simple(
for (int i = 0; i < input_c; ++i) {
// ------------0---------------
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
in_pos_in_one_block0.y);
half4 input0 = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
......@@ -1095,7 +1215,8 @@ __kernel void conv_1x1_simple(
output0 = mad(input0.z, weight2, output0);
output0 = mad(input0.w, weight3, output0);
// -------------1--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
in_pos_in_one_block1.y);
half4 input1 = read_imageh(input_image, sampler, pos_in);
output1 = mad(input1.x, weight0, output1);
......@@ -1104,7 +1225,8 @@ __kernel void conv_1x1_simple(
output1 = mad(input1.w, weight3, output1);
// -------------2--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
in_pos_in_one_block2.y);
half4 input2 = read_imageh(input_image, sampler, pos_in);
output2 = mad(input2.x, weight0, output2);
......@@ -1113,7 +1235,8 @@ __kernel void conv_1x1_simple(
output2 = mad(input2.w, weight3, output2);
// -------------3--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
in_pos_in_one_block3.y);
half4 input3 = read_imageh(input_image, sampler, pos_in);
output3 = mad(input3.x, weight0, output3);
......@@ -1124,38 +1247,38 @@ __kernel void conv_1x1_simple(
#ifdef BATCH_NORM
output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output0 = activation(output0);
output1 = activation(output1);
output2 = activation(output2);
output3 = activation(output3);
output1 = activation(output1);
output2 = activation(output2);
output3 = activation(output3);
#endif
if (out_w0 < old_w) {
write_imageh(output_image, output_pos0, output0);
}
if (out_w1 < old_w){
if (out_w1 < old_w) {
write_imageh(output_image, output_pos1, output1);
}
if (out_w2 < old_w){
if (out_w2 < old_w) {
write_imageh(output_image, output_pos2, output2);
}
if (out_w3 < old_w){
if (out_w3 < old_w) {
write_imageh(output_image, output_pos3, output3);
}
}
......@@ -1170,14 +1293,12 @@ __kernel void conv_1x1_wrapped(
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,__private const int input_c_origin,
__private const int dilation,
__private const int offset, __private const int input_c,
__private const int input_c_origin, __private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__private const int old_w
) {
__private const int output_width, __private const int output_height,
__private const int old_w) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
......@@ -1188,7 +1309,7 @@ __kernel void conv_1x1_wrapped(
int out_w2 = out_w + global_size_dim1 * 2;
int out_w3 = out_w + global_size_dim1 * 3;
int outpos_main = mul24(out_c , old_w);
int outpos_main = mul24(out_c, old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
......@@ -1216,15 +1337,15 @@ __kernel void conv_1x1_wrapped(
ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
#ifdef BIASE_CH
half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
half4 output0 = read_imageh(bias, sampler, output_pos0);
half4 output1 = read_imageh(bias, sampler, output_pos1);
half4 output2 = read_imageh(bias, sampler, output_pos2);
half4 output3 = read_imageh(bias, sampler, output_pos3);
half4 output0 = read_imageh(bias, sampler, output_pos0);
half4 output1 = read_imageh(bias, sampler, output_pos1);
half4 output2 = read_imageh(bias, sampler, output_pos2);
half4 output3 = read_imageh(bias, sampler, output_pos3);
#else
half4 output0 = 0.0f;
......@@ -1237,7 +1358,8 @@ __kernel void conv_1x1_wrapped(
int burndary_index = input_c * 4 - input_c_origin;
for (int i = 0; i < input_c; ++i) {
// ------------0---------------
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
in_pos_in_one_block0.y);
half4 input0 = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
......@@ -1245,30 +1367,31 @@ __kernel void conv_1x1_wrapped(
half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
if ((max_w_bound - pos_in.x-1) < input_width && (max_w_bound - pos_in.x-1)>=0 ){
if (burndary_index==0){
if ((max_w_bound - pos_in.x - 1) < input_width &&
(max_w_bound - pos_in.x - 1) >= 0) {
if (burndary_index == 0) {
output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0);
output0 = mad(input0.z, weight2, output0);
output0 = mad(input0.w, weight3, output0);
} else if (burndary_index==1){
} else if (burndary_index == 1) {
output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0);
output0 = mad(input0.z, weight2, output0);
output0 = mad(0.0f, weight3, output0);
} else if (burndary_index==2){
} else if (burndary_index == 2) {
output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0);
output0 = mad(0.0f, weight2, output0);
output0 = mad(0.0f, weight3, output0);
} else if (burndary_index==3){
} else if (burndary_index == 3) {
output0 = mad(input0.x, weight0, output0);
output0 = mad(0.0f, weight1, output0);
output0 = mad(0.0f, weight2, output0);
output0 = mad(0.0f, weight3, output0);
}
}else {
} else {
output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0);
output0 = mad(input0.z, weight2, output0);
......@@ -1276,33 +1399,34 @@ __kernel void conv_1x1_wrapped(
}
// -------------1--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
in_pos_in_one_block1.y);
half4 input1 = read_imageh(input_image, sampler, pos_in);
if (abs(max_w_bound - pos_in.x) < input_width){
if (burndary_index==0){
if (abs(max_w_bound - pos_in.x) < input_width) {
if (burndary_index == 0) {
output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1);
output1 = mad(input1.z, weight2, output1);
output1 = mad(input1.w, weight3, output1);
} else if (burndary_index==1){
} else if (burndary_index == 1) {
output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1);
output1 = mad(input1.z, weight2, output1);
output1 = mad(0.0f, weight3, output1);
} else if (burndary_index==2){
} else if (burndary_index == 2) {
output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1);
output1 = mad(0.0f, weight2, output1);
output1 = mad(0.0f, weight3, output1);
} else if (burndary_index==3){
} else if (burndary_index == 3) {
output1 = mad(input1.x, weight0, output1);
output1 = mad(0.0f, weight1, output1);
output1 = mad(0.0f, weight2, output1);
output1 = mad(0.0f, weight3, output1);
}
}else {
} else {
output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1);
output1 = mad(input1.z, weight2, output1);
......@@ -1310,33 +1434,34 @@ __kernel void conv_1x1_wrapped(
}
// -------------2--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
in_pos_in_one_block2.y);
half4 input2 = read_imageh(input_image, sampler, pos_in);
if (abs(max_w_bound - pos_in.x) < input_width){
if (burndary_index==0){
if (abs(max_w_bound - pos_in.x) < input_width) {
if (burndary_index == 0) {
output2 = mad(input2.x, weight0, output2);
output2 = mad(input2.y, weight1, output2);
output2 = mad(input2.z, weight2, output2);
output2 = mad(input2.w, weight3, output2);
} else if (burndary_index==1){
} else if (burndary_index == 1) {
output2 = mad(input2.x, weight0, output2);
output2 = mad(input2.y, weight1, output2);
output2 = mad(input2.z, weight2, output2);
output2 = mad(0.0f, weight3, output2);
} else if (burndary_index==2){
} else if (burndary_index == 2) {
output2 = mad(input2.x, weight0, output2);
output2 = mad(input2.y, weight1, output2);
output2 = mad(0.0f, weight2, output2);
output2 = mad(0.0f, weight3, output2);
} else if (burndary_index==3){
} else if (burndary_index == 3) {
output2 = mad(input2.x, weight0, output2);
output2 = mad(0.0f, weight1, output2);
output2 = mad(0.0f, weight2, output2);
output2 = mad(0.0f, weight3, output2);
}
}else {
} else {
output2 = mad(input2.x, weight0, output2);
output2 = mad(input2.y, weight1, output2);
output2 = mad(input2.z, weight2, output2);
......@@ -1344,33 +1469,34 @@ __kernel void conv_1x1_wrapped(
}
// -------------3--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
in_pos_in_one_block3.y);
half4 input3 = read_imageh(input_image, sampler, pos_in);
if (abs(max_w_bound - pos_in.x) < input_width){
if (burndary_index==0){
if (abs(max_w_bound - pos_in.x) < input_width) {
if (burndary_index == 0) {
output3 = mad(input3.x, weight0, output3);
output3 = mad(input3.y, weight1, output3);
output3 = mad(input3.z, weight2, output3);
output3 = mad(input3.w, weight3, output3);
} else if (burndary_index==1){
} else if (burndary_index == 1) {
output3 = mad(input3.x, weight0, output3);
output3 = mad(input3.y, weight1, output3);
output3 = mad(input3.z, weight2, output3);
output3 = mad(0.0f, weight3, output3);
} else if (burndary_index==2){
} else if (burndary_index == 2) {
output3 = mad(input3.x, weight0, output3);
output3 = mad(input3.y, weight1, output3);
output3 = mad(0.0f, weight2, output3);
output3 = mad(0.0f, weight3, output3);
} else if (burndary_index==3){
} else if (burndary_index == 3) {
output3 = mad(input3.x, weight0, output3);
output3 = mad(0.0f, weight1, output3);
output3 = mad(0.0f, weight2, output3);
output3 = mad(0.0f, weight3, output3);
}
}else {
} else {
output3 = mad(input3.x, weight0, output3);
output3 = mad(input3.y, weight1, output3);
output3 = mad(input3.z, weight2, output3);
......@@ -1379,1015 +1505,1060 @@ __kernel void conv_1x1_wrapped(
}
#ifdef BATCH_NORM
output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output0 = activation(output0);
output1 = activation(output1);
output2 = activation(output2);
output3 = activation(output3);
output0 = activation(output0);
output1 = activation(output1);
output2 = activation(output2);
output3 = activation(output3);
#endif
if (out_w0 < old_w) {
write_imageh(output_image, output_pos0, output0);
}
if (out_w1 < old_w){
if (out_w1 < old_w) {
write_imageh(output_image, output_pos1, output1);
}
if (out_w2 < old_w){
if (out_w2 < old_w) {
write_imageh(output_image, output_pos2, output2);
}
if (out_w3 < old_w){
if (out_w3 < old_w) {
write_imageh(output_image, output_pos3, output3);
}
}
// conv_7x7: 7x7 convolution over image2d buffers. Each work-item produces one
// half4 written at output_pos = (out_c * global_size_dim1 + out_w, out_nh);
// its x/y/z/w lanes are accumulated from filter rows 4 * out_c + 0..3.
// Optional stages are selected by macros: BIASE_CH / BIASE_ELE seed the
// accumulator from `bias`, BATCH_NORM applies new_scale / new_biase, and RELU
// passes the result through activation().
// NOTE(review): this span is a rendered diff in which the pre- and
// post-reformat lines both appear, so most statements occur twice and the text
// is not compilable as shown. Comments are attached to one occurrence only.
__kernel void conv_7x7(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
__kernel void conv_7x7(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
// Destination texel: x is offset by out_c * global_size_dim1.
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
// Work-items outside the requested global size do nothing.
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
// The four filter rows that feed the four lanes of this work-item's output.
const int filter_n0 = 4 * out_c + 0;
const int filter_n1 = 4 * out_c + 1;
const int filter_n2 = 4 * out_c + 2;
const int filter_n3 = 4 * out_c + 3;
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const int filter_n0 = 4 * out_c + 0;
const int filter_n1 = 4 * out_c + 1;
const int filter_n2 = 4 * out_c + 2;
const int filter_n3 = 4 * out_c + 3;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
// NOTE(review): the sampler requests normalized coordinates, yet every
// read_imageh call in this kernel passes int2 coordinates. The OpenCL C
// specification requires CLK_NORMALIZED_COORDS_FALSE for integer
// coordinates - confirm this behaves as intended on the target devices.
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// Position of the centre tap inside one input block: out * stride + offset.
// The 7x7 window spans (j - 3, k - 3) * dilation around it.
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
// Accumulator start value: bias texel (out_c, 0), bias texel at output_pos,
// or zero, depending on which bias macro is defined.
#ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos);
half4 output = read_imageh(bias, sampler, output_pos);
#else
half4 output = 0.0f;
#endif
half4 input;
half4 filter[4];
int2 filter_pos0;
int2 filter_pos1;
int2 filter_pos2;
int2 filter_pos3;
// i walks the input blocks; block i starts at x = i * input_width.
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
for(int j = 0; j < 7; j++){
for(int k = 0; k < 7; k++){
// Taps outside [0, input_width) x [0, input_height) are replaced by zero:
// the out-of-range condition is shifted into bit 15 (the MSB of a ushort)
// so that select() returns its second operand for those taps.
input = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + (j - 3) * dilation, pos_in.y + (k - 3) * dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + (j - 3) * dilation < 0 || in_pos_in_one_block.y + (k - 3) * dilation < 0 || in_pos_in_one_block.x + (j - 3) * dilation >= input_width || in_pos_in_one_block.y + (k - 3) * dilation >= input_height) << 15));
int filter_h = k;
int filter_w = j;
int filter_c = i;
// Filter texel for tap (j, k) of input block i:
// x = i * 7 + j, y = filter_n * 7 + k.
filter_pos0.x = filter_c * 7 + filter_w;
filter_pos0.y = filter_n0 * 7 + filter_h;
filter_pos1.x = filter_c * 7 + filter_w;
filter_pos1.y = filter_n1 * 7 + filter_h;
filter_pos2.x = filter_c * 7 + filter_w;
filter_pos2.y = filter_n2 * 7 + filter_h;
filter_pos3.x = filter_c * 7 + filter_w;
filter_pos3.y = filter_n3 * 7 + filter_h;
filter[0] = read_imageh(filter_image, sampler, filter_pos0);
filter[1] = read_imageh(filter_image, sampler, filter_pos1);
filter[2] = read_imageh(filter_image, sampler, filter_pos2);
filter[3] = read_imageh(filter_image, sampler, filter_pos3);
// Each dot() sums over the four components packed in the half4 texels.
output.x += dot(input, filter[0]);
output.y += dot(input, filter[1]);
output.z += dot(input, filter[2]);
output.w += dot(input, filter[3]);
}
}
half4 output = 0.0f;
#endif
half4 input;
half4 filter[4];
int2 filter_pos0;
int2 filter_pos1;
int2 filter_pos2;
int2 filter_pos3;
for (int i = 0; i < input_c; ++i) {
int2 pos_in =
(int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
for (int j = 0; j < 7; j++) {
for (int k = 0; k < 7; k++) {
input = select(
read_imageh(input_image, sampler,
(int2)(pos_in.x + (j - 3) * dilation,
pos_in.y + (k - 3) * dilation)),
(half4)(0.0f),
(ushort4)(
(in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
in_pos_in_one_block.y + (k - 3) * dilation < 0 ||
in_pos_in_one_block.x + (j - 3) * dilation >= input_width ||
in_pos_in_one_block.y + (k - 3) * dilation >= input_height)
<< 15));
int filter_h = k;
int filter_w = j;
int filter_c = i;
filter_pos0.x = filter_c * 7 + filter_w;
filter_pos0.y = filter_n0 * 7 + filter_h;
filter_pos1.x = filter_c * 7 + filter_w;
filter_pos1.y = filter_n1 * 7 + filter_h;
filter_pos2.x = filter_c * 7 + filter_w;
filter_pos2.y = filter_n2 * 7 + filter_h;
filter_pos3.x = filter_c * 7 + filter_w;
filter_pos3.y = filter_n3 * 7 + filter_h;
filter[0] = read_imageh(filter_image, sampler, filter_pos0);
filter[1] = read_imageh(filter_image, sampler, filter_pos1);
filter[2] = read_imageh(filter_image, sampler, filter_pos2);
filter[3] = read_imageh(filter_image, sampler, filter_pos3);
output.x += dot(input, filter[0]);
output.y += dot(input, filter[1]);
output.z += dot(input, filter[2]);
output.w += dot(input, filter[3]);
}
}
}
// Per-channel scale and shift, both read at texel (out_c, 0).
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
// activation() is defined outside this span.
#ifdef RELU
output = activation(output);
output = activation(output);
#endif
write_imageh(output_image, output_pos, output);
write_imageh(output_image, output_pos, output);
}
// conv_7x7Pt1x2: 7x7 convolution in which each work-item produces two
// horizontally adjacent half4 results: output0 at output_pos and output1 at
// (output_pos.x + 1, output_pos.y), where output_pos.x is
// out_c * output_width + out_w and out_w = 2 * get_global_id(1).
// Optional stages are selected by macros: BIASE_CH / BIASE_ELE seed the
// accumulators from `bias`, BATCH_NORM applies new_scale / new_biase, and RELU
// passes the results through activation().
// NOTE(review): this span is a rendered diff in which the pre- and
// post-reformat lines both appear, so most statements occur twice and the text
// is not compilable as shown. Comments are attached to one occurrence only.
__kernel void conv_7x7Pt1x2(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
__kernel void conv_7x7Pt1x2(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w1 = get_global_id(1);
const int out_nh = get_global_id(2);
// Work-items outside the requested global size do nothing.
if (out_c >= global_size_dim0 ||
out_w1 >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
// Each work-item covers output columns out_w and out_w + 1.
const int out_w = out_w1 * 2;
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height) {
const int out_c = get_global_id(0);
const int out_w1 = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * output_width + out_w, out_nh);
if (out_c >= global_size_dim0 || out_w1 >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const int out_w = out_w1 * 2;
// The four filter rows that feed the four lanes of each output.
const int filter_n0 = 4 * out_c + 0;
const int filter_n1 = 4 * out_c + 1;
const int filter_n2 = 4 * out_c + 2;
const int filter_n3 = 4 * out_c + 3;
int2 output_pos = (int2)(out_c * output_width + out_w, out_nh);
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
const int filter_n0 = 4 * out_c + 0;
const int filter_n1 = 4 * out_c + 1;
const int filter_n2 = 4 * out_c + 2;
const int filter_n3 = 4 * out_c + 3;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
// NOTE(review): the sampler requests normalized coordinates, yet every
// read_imageh call in this kernel passes int2 coordinates. The OpenCL C
// specification requires CLK_NORMALIZED_COORDS_FALSE for integer
// coordinates - confirm this behaves as intended on the target devices.
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// Position of the centre tap for output0 inside one input block:
// out * stride + offset.
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
half4 output0 = 0.0f;
half4 output1 = 0.0f;
half4 output0 = 0.0f;
half4 output1 = 0.0f;
// Accumulator start values: the shared bias texel (out_c, 0), or the bias
// texels at output_pos and its right-hand neighbour, or zero.
#ifdef BIASE_CH
output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
output1 = output0;
output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
output1 = output0;
#elif defined(BIASE_ELE)
output0 = read_imageh(bias, sampler, output_pos);
output1 = read_imageh(bias, sampler, (int2)(output_pos.x + 1, output_pos.y));
output0 = read_imageh(bias, sampler, output_pos);
output1 = read_imageh(bias, sampler, (int2)(output_pos.x + 1, output_pos.y));
#else
// Both accumulators were already initialised to zero above; this branch
// repeats the assignment.
output0 = 0.0f;
output1 = 0.0f;
#endif
half4 input[8];
half4 filter0[4];
half4 filter1[4];
// NOTE(review): filter2 and filter3 are not referenced anywhere in the
// visible body.
half4 filter2[4];
half4 filter3[4];
int2 filter_pos0;
int2 filter_pos1;
int2 filter_pos2;
int2 filter_pos3;
// i walks the input blocks; block i starts at x = i * input_width.
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
for(int k = 0; k < 7; k++){
// Eight input columns are read per filter row: columns 0..6 feed output0
// and columns 1..7 feed output1.
for (int j = 0; j < 8; j++) {
// Taps outside [0, input_width) x [0, input_height) are replaced by zero:
// the out-of-range condition is shifted into bit 15 (the MSB of a ushort)
// so that select() returns its second operand for those taps.
input[j] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + (j - 3) * dilation, pos_in.y + (k - 3) * dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + (j - 3) * dilation < 0 || in_pos_in_one_block.y + (k - 3) * dilation < 0 || in_pos_in_one_block.x + (j - 3) * dilation >= input_width || in_pos_in_one_block.y + (k - 3) * dilation >= input_height) << 15));
int filter_h = k;
int filter_w = j;
int filter_c = i;
if (j < 7) {
// Filter texel for tap (j, k) of input block i:
// x = i * 7 + j, y = filter_n * 7 + k.
filter_pos0.x = filter_c * 7 + filter_w;
filter_pos0.y = filter_n0 * 7 + filter_h;
filter_pos1.x = filter_c * 7 + filter_w;
filter_pos1.y = filter_n1 * 7 + filter_h;
filter_pos2.x = filter_c * 7 + filter_w;
filter_pos2.y = filter_n2 * 7 + filter_h;
filter_pos3.x = filter_c * 7 + filter_w;
filter_pos3.y = filter_n3 * 7 + filter_h;
filter0[0] = read_imageh(filter_image, sampler, filter_pos0);
filter0[1] = read_imageh(filter_image, sampler, filter_pos1);
filter0[2] = read_imageh(filter_image, sampler, filter_pos2);
filter0[3] = read_imageh(filter_image, sampler, filter_pos3);
output0.x += dot(input[j], filter0[0]);
output0.y += dot(input[j], filter0[1]);
output0.z += dot(input[j], filter0[2]);
output0.w += dot(input[j], filter0[3]);
}
// output1 accumulates the current input column against filter1, which is
// only refreshed from filter0 after this block, so it still holds filter
// values read earlier.
if (j > 0) {
output1.x += dot(input[j], filter1[0]);
output1.y += dot(input[j], filter1[1]);
output1.z += dot(input[j], filter1[2]);
output1.w += dot(input[j], filter1[3]);
}
filter1[0] = filter0[0];
filter1[1] = filter0[1];
filter1[2] = filter0[2];
filter1[3] = filter0[3];
}
output0 = 0.0f;
output1 = 0.0f;
#endif
half4 input[8];
half4 filter0[4];
half4 filter1[4];
half4 filter2[4];
half4 filter3[4];
int2 filter_pos0;
int2 filter_pos1;
int2 filter_pos2;
int2 filter_pos3;
for (int i = 0; i < input_c; ++i) {
int2 pos_in =
(int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
for (int k = 0; k < 7; k++) {
for (int j = 0; j < 8; j++) {
input[j] = select(
read_imageh(input_image, sampler,
(int2)(pos_in.x + (j - 3) * dilation,
pos_in.y + (k - 3) * dilation)),
(half4)(0.0f),
(ushort4)(
(in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
in_pos_in_one_block.y + (k - 3) * dilation < 0 ||
in_pos_in_one_block.x + (j - 3) * dilation >= input_width ||
in_pos_in_one_block.y + (k - 3) * dilation >= input_height)
<< 15));
int filter_h = k;
int filter_w = j;
int filter_c = i;
if (j < 7) {
filter_pos0.x = filter_c * 7 + filter_w;
filter_pos0.y = filter_n0 * 7 + filter_h;
filter_pos1.x = filter_c * 7 + filter_w;
filter_pos1.y = filter_n1 * 7 + filter_h;
filter_pos2.x = filter_c * 7 + filter_w;
filter_pos2.y = filter_n2 * 7 + filter_h;
filter_pos3.x = filter_c * 7 + filter_w;
filter_pos3.y = filter_n3 * 7 + filter_h;
filter0[0] = read_imageh(filter_image, sampler, filter_pos0);
filter0[1] = read_imageh(filter_image, sampler, filter_pos1);
filter0[2] = read_imageh(filter_image, sampler, filter_pos2);
filter0[3] = read_imageh(filter_image, sampler, filter_pos3);
output0.x += dot(input[j], filter0[0]);
output0.y += dot(input[j], filter0[1]);
output0.z += dot(input[j], filter0[2]);
output0.w += dot(input[j], filter0[3]);
}
if (j > 0) {
output1.x += dot(input[j], filter1[0]);
output1.y += dot(input[j], filter1[1]);
output1.z += dot(input[j], filter1[2]);
output1.w += dot(input[j], filter1[3]);
}
}
filter1[0] = filter0[0];
filter1[1] = filter0[1];
filter1[2] = filter0[2];
filter1[3] = filter0[3];
}
}
}
// Per-channel scale and shift, read once at texel (out_c, 0) and applied to
// both results.
#ifdef BATCH_NORM
half4 s = read_imageh(new_scale, sampler, (int2)(out_c, 0));
half4 b = read_imageh(new_biase, sampler, (int2)(out_c, 0));
output0 = output0 * s + b;
output1 = output1 * s + b;
half4 s = read_imageh(new_scale, sampler, (int2)(out_c, 0));
half4 b = read_imageh(new_biase, sampler, (int2)(out_c, 0));
output0 = output0 * s + b;
output1 = output1 * s + b;
#endif
// activation() is defined outside this span.
#ifdef RELU
output0 = activation(output0);
output1 = activation(output1);
output0 = activation(output0);
output1 = activation(output1);
#endif
write_imageh(output_image, output_pos, output0);
// The second texel is skipped when output_pos.x + 1 is a multiple of
// output_width.
if ((output_pos.x + 1) % output_width != 0) {
write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output1);
}
write_imageh(output_image, output_pos, output0);
if ((output_pos.x + 1) % output_width != 0) {
write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output1);
}
}
// dilation == 1
__kernel void conv_7x7spl(__private const int item_ch,
__private const int item_w,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
__kernel void conv_7x7spl(
__private const int item_ch, __private const int item_w,
__private const int item_h, __read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int dilation,
__private const int in_ch,
__private const int in_w,
__private const int in_h,
__private const int out_w,
__private const int out_h) {
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
// filter
const int filter_w = 7;
const int filter_h = 7;
// item_id
const int item_ch_id = get_global_id(0);
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);
// out_width_id_per_blk and out_batch_id
int out_batch_id = item_h_id / in_h;
int out_w_base_id = item_ch_id * out_w;
int out_w_id0 = item_w_id;
int out_w_id1 = out_w_id0 + item_w;
int out_w_id2 = out_w_id1 + item_w;
int out_w_id3 = out_w_id2 + item_w;
int out_w_id4 = out_w_id3 + item_w;
// in_width_id_per_blk and in_height_id_per_batch
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int pad, __private const int dilation,
__private const int in_ch, __private const int in_w,
__private const int in_h, __private const int out_w,
__private const int out_h) {
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// filter
const int filter_w = 7;
const int filter_h = 7;
// item_id
const int item_ch_id = get_global_id(0);
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);
// out_width_id_per_blk and out_batch_id
int out_batch_id = item_h_id / in_h;
int out_w_base_id = item_ch_id * out_w;
int out_w_id0 = item_w_id;
int out_w_id1 = out_w_id0 + item_w;
int out_w_id2 = out_w_id1 + item_w;
int out_w_id3 = out_w_id2 + item_w;
int out_w_id4 = out_w_id3 + item_w;
// in_width_id_per_blk and in_height_id_per_batch
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
#ifdef BIASE_CH
half4 output[5];
output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0];
output[2] = output[0];
output[3] = output[0];
output[4] = output[0];
half4 output[5];
output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0];
output[2] = output[0];
output[3] = output[0];
output[4] = output[0];
#elif defined(BIASE_ELE)
half4 output[5];
output[0] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id));
if (out_w_id1 < out_w) {
output[1] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id1, item_h_id));
}
if (out_w_id2 < out_w) {
output[2] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id2, item_h_id));
}
if (out_w_id3 < out_w) {
output[3] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id3, item_h_id));
}
if (out_w_id4 < out_w) {
output[4] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id4, item_h_id));
}
half4 output[5];
output[0] =
read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id));
if (out_w_id1 < out_w) {
output[1] = read_imageh(bias, sampler,
(int2)(out_w_base_id + out_w_id1, item_h_id));
}
if (out_w_id2 < out_w) {
output[2] = read_imageh(bias, sampler,
(int2)(out_w_base_id + out_w_id2, item_h_id));
}
if (out_w_id3 < out_w) {
output[3] = read_imageh(bias, sampler,
(int2)(out_w_base_id + out_w_id3, item_h_id));
}
if (out_w_id4 < out_w) {
output[4] = read_imageh(bias, sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id));
}
#else
half4 output[5] = {0.0f};
#endif
half4 filter[4] = {0.0f};
half4 filter_trans[4] = {0.0f};
half4 input[5] = {0.0f};
int filter_h_val0 = item_ch_id * 4 * filter_h;
int filter_h_val1 = filter_h_val0 + filter_h;
int filter_h_val2 = filter_h_val1 + filter_h;
int filter_h_val3 = filter_h_val2 + filter_h;
for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
const int in_w_base_id = mul24(ch, in_w);
int filter_w_val = ch * filter_w;
for (int h = 0; h < filter_h; h++) {
int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
(out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h));
for (int w = 0; w < filter_w; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[1] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3
filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, filter[3].w); // in_ch:3,out_ch:0-3
input[0] = read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] = read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] = read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] = read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] = read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val));
output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]);
output[2] = mad(input[2].x, filter_trans[0], output[2]);
output[3] = mad(input[3].x, filter_trans[0], output[3]);
output[4] = mad(input[4].x, filter_trans[0], output[4]);
if (ch_surplus < 3) {
output[0] = mad(input[0].y, filter_trans[1], output[0]);
output[1] = mad(input[1].y, filter_trans[1], output[1]);
output[2] = mad(input[2].y, filter_trans[1], output[2]);
output[3] = mad(input[3].y, filter_trans[1], output[3]);
output[4] = mad(input[4].y, filter_trans[1], output[4]);
}
if (ch_surplus < 2) {
output[0] = mad(input[0].z, filter_trans[2], output[0]);
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]);
output[3] = mad(input[3].z, filter_trans[2], output[3]);
output[4] = mad(input[4].z, filter_trans[2], output[4]);
}
if (ch_surplus < 1) {
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]);
output[2] = mad(input[2].w, filter_trans[3], output[2]);
output[3] = mad(input[3].w, filter_trans[3], output[3]);
output[4] = mad(input[4].w, filter_trans[3], output[4]);
}
}
half4 output[5] = {0.0f};
#endif
half4 filter[4] = {0.0f};
half4 filter_trans[4] = {0.0f};
half4 input[5] = {0.0f};
int filter_h_val0 = item_ch_id * 4 * filter_h;
int filter_h_val1 = filter_h_val0 + filter_h;
int filter_h_val2 = filter_h_val1 + filter_h;
int filter_h_val3 = filter_h_val2 + filter_h;
for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
const int in_w_base_id = mul24(ch, in_w);
int filter_w_val = ch * filter_w;
for (int h = 0; h < filter_h; h++) {
int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
(out_batch_id * in_h + in_h_id + h < 0 ||
out_batch_id * in_h + in_h_id + h >= in_h));
for (int w = 0; w < filter_w; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] = read_imageh(
filter_image, sampler,
(int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[1] = read_imageh(
filter_image, sampler,
(int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter[2] = read_imageh(
filter_image, sampler,
(int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] = read_imageh(
filter_image, sampler,
(int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3
filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3
input[0] =
read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] =
read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] =
read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] =
read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] =
read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val));
output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]);
output[2] = mad(input[2].x, filter_trans[0], output[2]);
output[3] = mad(input[3].x, filter_trans[0], output[3]);
output[4] = mad(input[4].x, filter_trans[0], output[4]);
if (ch_surplus < 3) {
output[0] = mad(input[0].y, filter_trans[1], output[0]);
output[1] = mad(input[1].y, filter_trans[1], output[1]);
output[2] = mad(input[2].y, filter_trans[1], output[2]);
output[3] = mad(input[3].y, filter_trans[1], output[3]);
output[4] = mad(input[4].y, filter_trans[1], output[4]);
}
if (ch_surplus < 2) {
output[0] = mad(input[0].z, filter_trans[2], output[0]);
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]);
output[3] = mad(input[3].z, filter_trans[2], output[3]);
output[4] = mad(input[4].z, filter_trans[2], output[4]);
}
if (ch_surplus < 1) {
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]);
output[2] = mad(input[2].w, filter_trans[3], output[2]);
output[3] = mad(input[3].w, filter_trans[3], output[3]);
output[4] = mad(input[4].w, filter_trans[3], output[4]);
}
}
}
}
#ifdef BATCH_NORM
half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
output[0] = mad(scale, output[0], biase);
if (out_w_id1 < out_w) {
output[1] = mad(scale, output[1], biase);
}
if (out_w_id2 < out_w) {
output[2] = mad(scale, output[2], biase);
}
if (out_w_id3 < out_w) {
output[3] = mad(scale, output[3], biase);
}
if (out_w_id4 < out_w) {
output[4] = mad(scale, output[4], biase);
}
half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
output[0] = mad(scale, output[0], biase);
if (out_w_id1 < out_w) {
output[1] = mad(scale, output[1], biase);
}
if (out_w_id2 < out_w) {
output[2] = mad(scale, output[2], biase);
}
if (out_w_id3 < out_w) {
output[3] = mad(scale, output[3], biase);
}
if (out_w_id4 < out_w) {
output[4] = mad(scale, output[4], biase);
}
#endif
#ifdef RELU
output[0] = activation(output[0]);
output[1] = activation(output[1]);
output[2] = activation(output[2]);
output[3] = activation(output[3]);
output[4] = activation(output[4]);
#endif
write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), output[0]);
if (out_w_id1 < out_w) {
write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), output[1]);
}
if (out_w_id2 < out_w) {
write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), output[2]);
}
if (out_w_id3 < out_w) {
write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), output[3]);
}
if (out_w_id4 < out_w) {
write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]);
}
output[0] = activation(output[0]);
output[1] = activation(output[1]);
output[2] = activation(output[2]);
output[3] = activation(output[3]);
output[4] = activation(output[4]);
#endif
write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id),
output[0]);
if (out_w_id1 < out_w) {
write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id),
output[1]);
}
if (out_w_id2 < out_w) {
write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id),
output[2]);
}
if (out_w_id3 < out_w) {
write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id),
output[3]);
}
if (out_w_id4 < out_w) {
write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
}
}
__kernel void conv_5x5(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
__kernel void conv_5x5(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const filter_n0 = 4 * out_c + 0;
const filter_n1 = 4 * out_c + 1;
const filter_n2 = 4 * out_c + 2;
const filter_n3 = 4 * out_c + 3;
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height) {
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const filter_n0 = 4 * out_c + 0;
const filter_n1 = 4 * out_c + 1;
const filter_n2 = 4 * out_c + 2;
const filter_n3 = 4 * out_c + 3;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos);
half4 output = read_imageh(bias, sampler, output_pos);
#else
half4 output = 0.0f;
#endif
half4 input;
half4 filter[4];
int2 filter_pos0;
int2 filter_pos1;
int2 filter_pos2;
int2 filter_pos3;
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
for(int j = 0; j < 5; j++){
for(int k = 0; k < 5; k++){
input = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + (j - 2) * dilation, pos_in.y + (k - 2) * dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + (j - 2) * dilation < 0 || in_pos_in_one_block.y + (k - 2) * dilation < 0 || in_pos_in_one_block.x + (j - 2) * dilation >= input_width || in_pos_in_one_block.y + (k - 2) * dilation >= input_height) << 15));
int filter_h = k;
int filter_w = j;
int filter_c = i;
filter_pos0.x = filter_c * 5 + filter_w;
filter_pos0.y = filter_n0 * 5 + filter_h;
filter_pos1.x = filter_c * 5 + filter_w;
filter_pos1.y = filter_n1 * 5 + filter_h;
filter_pos2.x = filter_c * 5 + filter_w;
filter_pos2.y = filter_n2 * 5 + filter_h;
filter_pos3.x = filter_c * 5 + filter_w;
filter_pos3.y = filter_n3 * 5 + filter_h;
filter[0] = read_imageh(filter_image, sampler, filter_pos0);
filter[1] = read_imageh(filter_image, sampler, filter_pos1);
filter[2] = read_imageh(filter_image, sampler, filter_pos2);
filter[3] = read_imageh(filter_image, sampler, filter_pos3);
output.x += dot(input, filter[0]);
output.y += dot(input, filter[1]);
output.z += dot(input, filter[2]);
output.w += dot(input, filter[3]);
}
}
half4 output = 0.0f;
#endif
half4 input;
half4 filter[4];
int2 filter_pos0;
int2 filter_pos1;
int2 filter_pos2;
int2 filter_pos3;
for (int i = 0; i < input_c; ++i) {
int2 pos_in =
(int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
for (int j = 0; j < 5; j++) {
for (int k = 0; k < 5; k++) {
input = select(
read_imageh(input_image, sampler,
(int2)(pos_in.x + (j - 2) * dilation,
pos_in.y + (k - 2) * dilation)),
(half4)(0.0f),
(ushort4)(
(in_pos_in_one_block.x + (j - 2) * dilation < 0 ||
in_pos_in_one_block.y + (k - 2) * dilation < 0 ||
in_pos_in_one_block.x + (j - 2) * dilation >= input_width ||
in_pos_in_one_block.y + (k - 2) * dilation >= input_height)
<< 15));
int filter_h = k;
int filter_w = j;
int filter_c = i;
filter_pos0.x = filter_c * 5 + filter_w;
filter_pos0.y = filter_n0 * 5 + filter_h;
filter_pos1.x = filter_c * 5 + filter_w;
filter_pos1.y = filter_n1 * 5 + filter_h;
filter_pos2.x = filter_c * 5 + filter_w;
filter_pos2.y = filter_n2 * 5 + filter_h;
filter_pos3.x = filter_c * 5 + filter_w;
filter_pos3.y = filter_n3 * 5 + filter_h;
filter[0] = read_imageh(filter_image, sampler, filter_pos0);
filter[1] = read_imageh(filter_image, sampler, filter_pos1);
filter[2] = read_imageh(filter_image, sampler, filter_pos2);
filter[3] = read_imageh(filter_image, sampler, filter_pos3);
output.x += dot(input, filter[0]);
output.y += dot(input, filter[1]);
output.z += dot(input, filter[2]);
output.w += dot(input, filter[3]);
}
}
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output = activation(output);
output = activation(output);
#endif
write_imageh(output_image, output_pos, output);
write_imageh(output_image, output_pos, output);
}
__kernel void convBNAdd_3x3(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
__kernel void convBNAdd_3x3(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height) {
half4 output = (half4)0.0f;
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
half4 input[9];
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
input[0] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
input[1] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
input[2] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
input[3] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
input[4] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
input[5] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
half4 output = (half4)0.0f;
input[6] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
half4 input[9];
input[7] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
for (int i = 0; i < input_c; ++i) {
int2 pos_in =
(int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
input[0] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[1] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[2] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[3] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[4] = select(
read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[5] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[6] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[7] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[8] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
/*
for (int j = 0; j < 9; ++j) {
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
/*
for (int j = 0; j < 9; ++j) {
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
*/
int j = 0;
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 1;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 2;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 3;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 4;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 5;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 6;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 7;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 8;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
}
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
*/
int j = 0;
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 1;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 2;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 3;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 4;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 5;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 6;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 7;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 8;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef BIASE_CH
output += read_imageh(bias, sampler, (int2)(out_c, 0));
output += read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
output += read_imageh(bias, sampler, output_pos);
output += read_imageh(bias, sampler, output_pos);
#endif
#ifdef RELU
output = activation(output);
output = activation(output);
#endif
write_imageh(output_image, output_pos, output);
write_imageh(output_image, output_pos, output);
}
__kernel void convBNAdd_1x1(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
__kernel void convBNAdd_1x1(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const uint kernelHXW = 1;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
int2 in_pos_in_one_block =
ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
half4 output = 0.0f;
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
half4 input = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
/*
output.x = dot(input, weight0);
output.y = dot(input, weight1);
output.z = dot(input, weight2);
output.w = dot(input, weight3);
*/
for (int i = 0; i < input_c; ++i) {
int2 pos_in =
(int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
half4 input = read_imageh(input_image, sampler, pos_in);
output = mad(input.x, weight0, output);
output = mad(input.y, weight1, output);
output = mad(input.z, weight2, output);
output = mad(input.w, weight3, output);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
/*
output.x = dot(input, weight0);
output.y = dot(input, weight1);
output.z = dot(input, weight2);
output.w = dot(input, weight3);
*/
}
output = mad(input.x, weight0, output);
output = mad(input.y, weight1, output);
output = mad(input.z, weight2, output);
output = mad(input.w, weight3, output);
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef BIASE_CH
output += read_imageh(bias, sampler, (int2)(out_c, 0));
output += read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
output += read_imageh(bias, sampler, output_pos);
output += read_imageh(bias, sampler, output_pos);
#endif
#ifdef RELU
......@@ -2398,24 +2569,22 @@ __kernel void convBNAdd_1x1(__private const int global_size_dim0,
}
__kernel void convBNAdd_1x1_spl(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter,
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__private const int old_w
) {
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height,
__private const int old_w) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
......@@ -2426,33 +2595,32 @@ __kernel void convBNAdd_1x1_spl(
int out_w2 = out_w + global_size_dim1 * 2;
int out_w3 = out_w + global_size_dim1 * 3;
int outpos_main = mul24(out_c , old_w);
int outpos_main = mul24(out_c, old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
int2 in_pos_in_one_block0 =
ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh);
int2 in_pos_in_one_block1 =
ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh);
int2 in_pos_in_one_block2 =
ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh);
int2 in_pos_in_one_block3 =
ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
half4 output0 = 0.0f;
half4 output1 = 0.0f;
......@@ -2461,7 +2629,8 @@ __kernel void convBNAdd_1x1_spl(
for (int i = 0; i < input_c; ++i) {
// ------------0---------------
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
in_pos_in_one_block0.y);
half4 input0 = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
......@@ -2475,7 +2644,8 @@ __kernel void convBNAdd_1x1_spl(
output0 = mad(input0.w, weight3, output0);
// -------------1--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
in_pos_in_one_block1.y);
half4 input1 = read_imageh(input_image, sampler, pos_in);
//
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
......@@ -2490,7 +2660,8 @@ __kernel void convBNAdd_1x1_spl(
output1 = mad(input1.w, weight3, output1);
// -------------2--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
in_pos_in_one_block2.y);
half4 input2 = read_imageh(input_image, sampler, pos_in);
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
......@@ -2505,7 +2676,8 @@ __kernel void convBNAdd_1x1_spl(
output2 = mad(input2.w, weight3, output2);
// -------------3--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
in_pos_in_one_block3.y);
half4 input3 = read_imageh(input_image, sampler, pos_in);
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
......@@ -2521,29 +2693,29 @@ __kernel void convBNAdd_1x1_spl(
}
#ifdef BATCH_NORM
output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef BIASE_CH
output0 += read_imageh(bias, sampler, (int2)(out_c, 0));
output1 += read_imageh(bias, sampler, (int2)(out_c, 0));
output2 += read_imageh(bias, sampler, (int2)(out_c, 0));
output3 += read_imageh(bias, sampler, (int2)(out_c, 0));
output0 += read_imageh(bias, sampler, (int2)(out_c, 0));
output1 += read_imageh(bias, sampler, (int2)(out_c, 0));
output2 += read_imageh(bias, sampler, (int2)(out_c, 0));
output3 += read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
output0 += read_imageh(bias, sampler, output_pos0);
output1 += read_imageh(bias, sampler, output_pos1);
output2 += read_imageh(bias, sampler, output_pos2);
output3 += read_imageh(bias, sampler, output_pos3);
output0 += read_imageh(bias, sampler, output_pos0);
output1 += read_imageh(bias, sampler, output_pos1);
output2 += read_imageh(bias, sampler, output_pos2);
output3 += read_imageh(bias, sampler, output_pos3);
#endif
#ifdef RELU
......@@ -2557,22 +2729,108 @@ __kernel void convBNAdd_1x1_spl(
write_imageh(output_image, output_pos0, output0);
}
if (out_w1 < old_w){
if (out_w1 < old_w) {
write_imageh(output_image, output_pos1, output1);
}
if (out_w2 < old_w){
if (out_w2 < old_w) {
write_imageh(output_image, output_pos2, output2);
}
if (out_w3 < old_w){
if (out_w3 < old_w) {
write_imageh(output_image, output_pos3, output3);
}
}
// Generic depthwise convolution for an arbitrary filter_width x filter_height
// window, with optional fused bias, batch-norm and activation.
//
// Each work-item produces one half4 texel of output_image at
//   (out_c * global_size_dim1 + out_w, out_nh)
// where out_c / out_w / out_nh are global ids 0 / 1 / 2.
//
// Compile-time switches:
//   BIASE_CH   - accumulator starts from the bias texel at (out_c, 0)
//   BIASE_ELE  - accumulator starts from the bias texel at output_pos
//   BATCH_NORM - result is scaled by new_scale and shifted by new_biase,
//                both read at (out_c, 0)
//   RELU       - result is passed through activation() (defined elsewhere)
//
// Parameters read by the body: global_size_dim1, stride, offset, input_width,
// input_height, output_height, filter_width, filter_height.
// global_size_dim0, global_size_dim2, input_c, dilation and output_width are
// accepted but never referenced in this kernel.
// NOTE(review): because dilation is ignored, a dilated depthwise convolution
// would presumably be computed as if dilation == 1 - confirm against callers.
// NOTE(review): the sampler uses CLK_NORMALIZED_COORDS_TRUE together with
// integer coordinates; the OpenCL C specification describes integer-coordinate
// reads with a normalized-coordinate sampler as undefined. Every visible
// kernel in this file uses the same sampler, so it is left as is - confirm on
// the target drivers.
__kernel void depth_conv(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height,
__private const int filter_width, __private const int filter_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
// Destination texel: channel blocks are laid out side by side along x.
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// Split the y id into a batch index and a row inside that batch.
const int batch_index = out_nh / output_height;
const int out_nh_in_one_batch = out_nh % output_height;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
// Position of the window centre inside one input block.
int2 in_pos_in_one_block =
ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
// The accumulator is seeded with the bias (if any), so the bias is added
// before the optional batch-norm below is applied.
#ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos);
#else
half4 output = 0.0f;
#endif
// Origin of this channel block / batch inside the input image.
int2 pos_in_input_block =
(int2)(out_c * input_width, batch_index * input_height);
// NOTE(review): the filter row origin is also offset by batch_index; whether
// the filter image really has one row group per batch depends on the host
// side layout (InitDWImage), which is not visible here - confirm.
int2 pos_in_filter_block =
(int2)(out_c * filter_width, batch_index * filter_height);
int filter_x = pos_in_filter_block.x;
int filter_y = pos_in_filter_block.y;
int input_x_base = pos_in_input_block.x + in_pos_in_one_block.x;
int input_y_base = pos_in_input_block.y + in_pos_in_one_block.y;
// Half window size; tap (fx, fy) is displaced by (fx, fy) - align from the
// window centre.
int2 align = {filter_width / 2, filter_height / 2};
/* Disabled debug output:
if (output_pos.x == 0 && output_pos.y == 0){
printf("align.x=%d align.y=%d \n ",align.x,align.y);
printf("stride=%d \n ",stride);
}*/
for (int fy = 0; fy < filter_height; ++fy) {
for (int fx = 0; fx < filter_width; ++fx) {
int x_off = fx - align.x;
int y_off = fy - align.y;
/* Disabled debug output:
if (output_pos.x == 0 && output_pos.y == 0){
printf("fx=%d fy=%d \n ",fx,fy);
printf("x_off=%d y_off=%d \n ",x_off,y_off);
}*/
// Zero padding: the out-of-range test yields 0 or 1; shifting it left by 15
// sets the most significant bit of each ushort lane, which makes select()
// return the zero vector instead of the texel that was read. The image read
// itself is still issued for out-of-range taps and its value discarded.
half4 in = select(
read_imageh(input, sampler,
(int2)(input_x_base + x_off, input_y_base + y_off)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + x_off < 0 ||
in_pos_in_one_block.y + y_off < 0 ||
in_pos_in_one_block.x + x_off >= input_width ||
in_pos_in_one_block.y + y_off >= input_height)
<< 15));
half4 f =
read_imageh(filter, sampler, (int2)(filter_x + fx, filter_y + fy));
// Lane-wise multiply-accumulate: the four lanes of the texel are accumulated
// independently of each other.
output += in * f;
/* Disabled debug output:
if (output_pos.x ==111 && output_pos.y == 0){
printf("in={ %f , %f , %f , %f } \n
",convert_float(in.x),convert_float(in.y),convert_float(in.z),convert_float(in.w));
printf("filter={ %f , %f , %f , %f } \n
",convert_float(f.x),convert_float(f.y),convert_float(f.z),convert_float(f.w));
printf("output={ %f , %f , %f , %f } \n
",convert_float(output.x),convert_float(output.y),convert_float(output.z),convert_float(output.w));
}*/
}
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output = activation(output);
#endif
write_imageh(output_image, output_pos, output);
}
\ No newline at end of file
......@@ -13,33 +13,101 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
// Element-wise product of two images. The work-item with global id (x, y)
// reads the texel at (x, y) from both `input` and `bias`, multiplies them
// lane by lane and writes the product to (x, y) of `outputImage`.
__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) {
  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int col = get_global_id(0);
  const int row = get_global_id(1);
  const int2 texel = (int2)(col, row);

  const half4 lhs = read_imageh(input, sampler, texel);
  const half4 rhs = read_imageh(bias, sampler, texel);

  write_imageh(outputImage, texel, lhs * rhs);
}
__kernel void channel_mul(__global image2d_t input, __global image2d_t bias,__write_only
image2d_t outputImage, int w) {
__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias,
__write_only image2d_t outputImage) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
half4 in = read_imageh(input, sampler, coords);
half4 biase = read_imageh(bias, sampler, coords);
half4 output = in * biase;
write_imageh(outputImage, coords, output);
}
__kernel void channel_mul(__global image2d_t input, __global image2d_t bias,
__write_only image2d_t outputImage, int w) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias;
coords_bias.x = x/w;
coords_bias.x = x / w;
coords_bias.y = 0;
half4 in = read_imageh(input, sampler, coords);
half4 biase = read_imageh(bias, sampler, coords_bias);
half4 output = in * biase;
write_imageh(outputImage,coords,output);
write_imageh(outputImage, coords, output);
}
// Channel-wise multiply for a bias that reaches the kernel as a one-row image
// holding one scalar per texel (example from the original note: dims
// 1 1 1 72, stored at run time as 72 texels of the form [value, 0, 0, 0]).
//
// The work-item with global id (x, y) scales the input texel at (x, y). The
// four lanes of that texel are multiplied by the .x components of four
// consecutive bias texels, starting at column (x / w) * 4 of row 0. `w` is
// supplied by the host; it is used only as the divisor that maps x to a
// group of four bias texels.
__kernel void channel_mul_d2(__global image2d_t input, __global image2d_t bias,
                             __write_only image2d_t outputImage, int w) {
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int col = get_global_id(0);
  const int row = get_global_id(1);
  const int2 texel = (int2)(col, row);

  // Column of the first of the four bias texels used by this work-item.
  const int first = col / w * 4;

  const half4 b0 = read_imageh(bias, sampler, (int2)(first, 0));
  const half4 b1 = read_imageh(bias, sampler, (int2)(first + 1, 0));
  const half4 b2 = read_imageh(bias, sampler, (int2)(first + 2, 0));
  const half4 b3 = read_imageh(bias, sampler, (int2)(first + 3, 0));

  // Gather the four scalars into one vector so a single vector operation
  // scales all four lanes.
  const half4 scale = (half4)(b0.x, b1.x, b2.x, b3.x);

  const half4 pixel = read_imageh(input, sampler, texel);
  const half4 scaled = mad(pixel, scale, 0);
  write_imageh(outputImage, texel, scaled);
}
\ No newline at end of file
......@@ -174,6 +174,16 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
build_options);
}
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] != 3) {
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
// other depthwise not with filter 3x3
DLOG << "depth_conv basic ";
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] &&
......@@ -214,6 +224,7 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(),
param.NewScale(), param.NewBias());
break;
......
......@@ -71,6 +71,14 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
build_options);
}
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] != 3) {
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] &&
......@@ -124,6 +132,7 @@ void ConvAddKernel<GPU_CL, float>::Compute(
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
break;
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
......
......@@ -72,6 +72,14 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
build_options);
}
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] != 3) {
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
DLOG << "init depwise conv basic";
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] &&
......@@ -130,6 +138,7 @@ void ConvAddReluKernel<GPU_CL, float>::Compute(
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias());
break;
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
......
......@@ -129,6 +129,14 @@ bool ConvBNReluKernel<GPU_CL, float>::Init(
build_options);
}
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] != 3) {
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] &&
......@@ -168,6 +176,7 @@ void ConvBNReluKernel<GPU_CL, float>::Compute(
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, true, nullptr, param.NewScale(),
param.NewBias());
break;
......
......@@ -66,6 +66,14 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
}
DLOG << "depth_conv 3x3";
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] != 3) {
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
this->cl_helper_.AddKernel("depth_conv", conv_kernel_file);
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] &&
......@@ -115,6 +123,7 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param);
break;
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
......
......@@ -72,6 +72,14 @@ bool ConvReluKernel<GPU_CL, float>::Init(FusionConvReluParam<GPU_CL> *param) {
DLOG << "depth_conv 3x3";
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] != 3) {
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] &&
......@@ -120,6 +128,7 @@ void ConvReluKernel<GPU_CL, float>::Compute(
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, true);
break;
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#ifdef ELEMENTWISEMUL_OP
#include "operators/kernel/elementwise_mul_kernel.h"
#include <framework/cl/cl_half.h>
#include <iostream>
#include "framework/cl/cl_image.h"
namespace paddle_mobile {
......@@ -23,19 +25,24 @@ namespace operators {
// Selects and registers the OpenCL kernel used for element-wise
// multiplication, based on the dims of the second operand (InputY, named
// `bias` here):
//   - same dims as InputX -> "elementwise_mul"
//   - 1 dimension         -> "channel_mul"
//   - 2 dimensions        -> "channel_mul_d2"
//   - anything else       -> logged and rejected via PADDLE_MOBILE_ENFORCE
// Returns true on every path that reaches the end of the function.
template <>
bool ElementwiseMulKernel<GPU_CL, float>::Init(
ElementwiseMulParam<GPU_CL> *param) {
DLOG << "-----init add-----";
// InputY() is cast to a mutable CLImage pointer (const is cast away).
framework::CLImage *bias = reinterpret_cast<framework::CLImage *>(
const_cast<framework::CLImage *>(param->InputY()));
if (bias->dims() == param->InputX()->dims()) {
DLOG << "init element wise mul";
this->cl_helper_.AddKernel("elementwise_mul", "elementwise_mul_kernel.cl");
// NOTE(review): this branch is empty, so a 4-dimensional InputY whose dims
// differ from InputX registers no kernel and Init still returns true. It looks
// like a leftover line from the pre-change version of this condition - confirm
// and remove if so.
} else if (bias->dims().size() == 4) {
} else if (bias->dims().size() == 1) {
DLOG << "init channel_mul";
this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl");
} else if (bias->dims().size() == 2) {
// Example: input dims 1 72 28 28
//          with InputY dims 1 72
DLOG << "init channel_mul_d2";
this->cl_helper_.AddKernel("channel_mul_d2", "elementwise_mul_kernel.cl");
} else {
DLOG << "error:bias dims is error";
PADDLE_MOBILE_ENFORCE(false, "element mul not supported yet");
}
return true;
}
template <>
void ElementwiseMulKernel<GPU_CL, float>::Compute(
const ElementwiseMulParam<GPU_CL> &param) {
......@@ -64,8 +71,8 @@ void ElementwiseMulKernel<GPU_CL, float>::Compute(
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} else if (bias->dims().size() == 4) {
DLOG << "zp7 444";
} else if (bias->dims().size() == 1) {
DLOG << "channel mul";
cl_mem input_image = input->GetCLImage();
cl_mem bias_image = bias->GetCLImage();
cl_mem output_image = output->GetCLImage();
......@@ -84,14 +91,48 @@ void ElementwiseMulKernel<GPU_CL, float>::Compute(
CL_CHECK_ERRORS(status);
auto width = input->ImageWidth();
auto height = input->ImageHeight();
DLOG << "dede:" << width << "," << height;
size_t global_work_size[2] = {width, height};
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} else if (bias->dims().size() == 2) {
DLOG << "channel mul d2";
// etc. input 1 72 28 28
// filter 1 72 --> 1 1 1 72
DLOG << "input->ImageDims(): " << input->ImageDims();
DLOG << "bias->ImageDims(): " << bias->ImageDims();
DLOG << "out->ImageDims(): " << output->ImageDims();
DLOG << "channel mul d2";
cl_mem input_image = input->GetCLImage();
cl_mem bias_image = bias->GetCLImage();
cl_mem output_image = output->GetCLImage();
int tensor_w = input->dims()[input->dims().size() - 1];
status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
reinterpret_cast<void *>(&input_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
reinterpret_cast<void *>(&bias_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
reinterpret_cast<void *>(&output_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int),
reinterpret_cast<void *>(&tensor_w));
CL_CHECK_ERRORS(status);
auto width = input->ImageWidth();
auto height = input->ImageHeight();
size_t global_work_size[2] = {width, height};
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
// bias->PrintTensor(*bias);
} else {
DLOG << "error:bias dims is error";
PADDLE_MOBILE_ENFORCE(false, "element mul not support this situation yet")
}
}
......
......@@ -489,6 +489,7 @@ class ConvParam : public OpParam {
EXEC_SLIDINGWINDOW5x5_FLOAT,
EXEC_SLIDINGWINDOW7x7_FLOAT,
EXEC_GEMM1x1s1_FLOAT,
EXEC_DEPTHWISEBASIC_FLOAT,
};
ExecMode &ExecMode() const { return exec_mode_; }
......
......@@ -216,4 +216,6 @@ void test(int argc, char *argv[]) {
std::cout << std::endl;
}
}
#else
int main() {}
#endif
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册