未验证 提交 273b1fa2 编写于 作者: Y ysh329 提交者: GitHub

[cherry-pick][OPENCL] remove conv redundant's for opencl kernel. test… (#3938)

* [cherry-pick][OPENCL] remove conv redundant's for opencl kernel. test=develop
Co-authored-by: Nxiebaiyuan <xiebaiyuan@qq.com>
上级 594175af
...@@ -27,6 +27,28 @@ ...@@ -27,6 +27,28 @@
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "argmax"; } std::string DebugString() const override { return "argmax"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto input_dims = param_.X->dims();
auto output_dims = param_.Out->dims();
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
ch->remark = "axis" + std::to_string(param_.Axis);
auto axis = param_.Axis;
if (axis < 0) {
axis += input_dims.size();
}
int max_num = 1;
for (int64_t i = axis + 1; i < input_dims.size(); i++)
max_num *= input_dims[i];
float gops = 1.0f;
for (int i = 1; i <= max_num; i++) gops *= i;
ch->macs = gops * output_dims.production();
}
#endif
private: private:
mutable ArgmaxParam param_; mutable ArgmaxParam param_;
}; };
...@@ -85,6 +107,13 @@ ...@@ -85,6 +107,13 @@
using param_t = operators::ArgmaxParam; using param_t = operators::ArgmaxParam;
void Run() override; void Run() override;
virtual ~ArgmaxCompute() = default; virtual ~ArgmaxCompute() = default;
#ifdef LITE_WITH_PROFILE
virtual void SetProfileRuntimeKernelInfo(
paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
std::string kernel_func_name_{"NotImplForArgmax"};
#endif
}; };
``` ```
- 在paddlelite/lite/kernels/arm/目录下新建argmax_compute.cc文件,主要实现Run函数。`Run()`函数调用paddlelite/lite/bachends/arm/math/argmax.h中的`argmax_func()`函数,根据输入计算输出。最后在argmax_compute.cc文件中,我们绑定argmax的输入输出(为tensor的输入参数都需要绑定),代码如下: - 在paddlelite/lite/kernels/arm/目录下新建argmax_compute.cc文件,主要实现Run函数。`Run()`函数调用paddlelite/lite/bachends/arm/math/argmax.h中的`argmax_func()`函数,根据输入计算输出。最后在argmax_compute.cc文件中,我们绑定argmax的输入输出(为tensor的输入参数都需要绑定),代码如下:
...@@ -95,6 +124,9 @@ ...@@ -95,6 +124,9 @@
lite::Tensor* output = param.Out; lite::Tensor* output = param.Out;
int axis = param.Axis; int axis = param.Axis;
lite::arm::math::argmax_func(input, axis, output); lite::arm::math::argmax_func(input, axis, output);
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "argmax_func";
#endif
return; return;
} }
......
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <gflags/gflags.h>
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -25,6 +24,7 @@ ...@@ -25,6 +24,7 @@
#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PROFILE
#include "lite/core/profile/basic_profiler.h" #include "lite/core/profile/basic_profiler.h"
#endif // LITE_WITH_PROFILE #endif // LITE_WITH_PROFILE
#include <gflags/gflags.h>
using paddle::lite::profile::Timer; using paddle::lite::profile::Timer;
...@@ -34,6 +34,10 @@ DEFINE_string(input_shape, ...@@ -34,6 +34,10 @@ DEFINE_string(input_shape,
DEFINE_bool(use_optimize_nb, DEFINE_bool(use_optimize_nb,
false, false,
"optimized & naive buffer model for mobile devices"); "optimized & naive buffer model for mobile devices");
DEFINE_string(backend,
"arm_cpu",
"choose backend for valid_places: arm_cpu | opencl. Compile "
"OpenCL version if you choose opencl");
DEFINE_string(arg_name, "", "the arg name"); DEFINE_string(arg_name, "", "the arg name");
namespace paddle { namespace paddle {
...@@ -49,9 +53,19 @@ void OutputOptModel(const std::string& load_model_dir, ...@@ -49,9 +53,19 @@ void OutputOptModel(const std::string& load_model_dir,
Place{TARGET(kX86), PRECISION(kInt64)}, Place{TARGET(kX86), PRECISION(kInt64)},
Place{TARGET(kHost), PRECISION(kFloat)}}); Place{TARGET(kHost), PRECISION(kFloat)}});
#else #else
config.set_valid_places({ if (FLAGS_backend == "opencl") {
Place{TARGET(kARM), PRECISION(kFloat)}, config.set_valid_places({
}); Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
TARGET(kARM), // enable kARM CPU kernel when no opencl kernel
});
} else { // arm_cpu
config.set_valid_places({
Place{TARGET(kARM), PRECISION(kFloat)},
});
}
#endif #endif
auto predictor = lite_api::CreatePaddlePredictor(config); auto predictor = lite_api::CreatePaddlePredictor(config);
...@@ -117,16 +131,40 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes, ...@@ -117,16 +131,40 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
<< ", min time: " << ti.LapTimes().Min() << " ms" << ", min time: " << ti.LapTimes().Min() << " ms"
<< ", max time: " << ti.LapTimes().Max() << " ms."; << ", max time: " << ti.LapTimes().Max() << " ms.";
auto output = predictor->GetOutput(0); // output summary
auto out = output->data<float>(); size_t output_tensor_num = predictor->GetOutputNames().size();
LOG(INFO) << "out " << out[0]; LOG(INFO) << "output tensor num:" << output_tensor_num;
LOG(INFO) << "out " << out[1];
auto output_shape = output->shape(); for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
int output_num = 1; auto output_tensor = predictor->GetOutput(tidx);
for (int i = 0; i < output_shape.size(); ++i) { LOG(INFO) << "============= output tensor " << tidx << " =============";
output_num *= output_shape[i]; auto tensor_shape = output_tensor->shape();
std::string tensor_shape_str{""};
int output_tensor_numel = 1;
for (int i = 0; i < tensor_shape.size(); ++i) {
output_tensor_numel *= tensor_shape[i];
tensor_shape_str += std::to_string(tensor_shape[i]);
tensor_shape_str += (i < tensor_shape.size() - 1) ? "x" : "";
}
auto out_data = output_tensor->data<float>();
auto out_mean =
paddle::lite::compute_mean<float>(out_data, output_tensor_numel);
auto out_std_dev = paddle::lite::compute_standard_deviation<float>(
out_data, output_tensor_numel, true, out_mean);
VLOG(0) << "output tensor " << tidx << " dims:" << tensor_shape_str;
VLOG(0) << "output tensor " << tidx
<< " elements num:" << output_tensor_numel;
VLOG(0) << "output tensor " << tidx
<< " standard deviation:" << out_std_dev;
VLOG(0) << "output tensor " << tidx << " mean value:" << out_mean << "\n";
// print result
for (int i = 0; i < output_tensor_numel; ++i) {
VLOG(2) << "output_tensor->data<float>()[" << i
<< "]:" << output_tensor->data<float>()[i];
}
} }
LOG(INFO) << "output_num: " << output_num;
// please turn off memory_optimize_pass to use this feature. // please turn off memory_optimize_pass to use this feature.
if (FLAGS_arg_name != "") { if (FLAGS_arg_name != "") {
...@@ -162,6 +200,7 @@ int main(int argc, char** argv) { ...@@ -162,6 +200,7 @@ int main(int argc, char** argv) {
<< "--model_dir /path/to/your/model"; << "--model_dir /path/to/your/model";
exit(0); exit(0);
} }
std::string save_optimized_model_dir = ""; std::string save_optimized_model_dir = "";
if (FLAGS_use_optimize_nb) { if (FLAGS_use_optimize_nb) {
save_optimized_model_dir = FLAGS_model_dir; save_optimized_model_dir = FLAGS_model_dir;
......
...@@ -26,6 +26,7 @@ USE_MIR_PASS(argument_type_display_pass); ...@@ -26,6 +26,7 @@ USE_MIR_PASS(argument_type_display_pass);
USE_MIR_PASS(runtime_context_assign_pass); USE_MIR_PASS(runtime_context_assign_pass);
USE_MIR_PASS(graph_visualize_pass); USE_MIR_PASS(graph_visualize_pass);
USE_MIR_PASS(remove_tf_redundant_ops_pass);
USE_MIR_PASS(lite_conv_bn_fuse_pass); USE_MIR_PASS(lite_conv_bn_fuse_pass);
USE_MIR_PASS(lite_fc_fuse_pass); USE_MIR_PASS(lite_fc_fuse_pass);
USE_MIR_PASS(lite_shuffle_channel_fuse_pass); USE_MIR_PASS(lite_shuffle_channel_fuse_pass);
......
...@@ -1254,6 +1254,19 @@ void elementwise_max_relu_broadcast<float>(const float* dinx, ...@@ -1254,6 +1254,19 @@ void elementwise_max_relu_broadcast<float>(const float* dinx,
} }
} }
template <>
void elementwise_div<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
for (int i = 0; i < num; i++) {
*dout = *dinx / *diny;
dout++;
dinx++;
diny++;
}
}
template <> template <>
void elementwise_div<float>(const float* dinx, void elementwise_div<float>(const float* dinx,
const float* diny, const float* diny,
...@@ -1306,6 +1319,28 @@ void elementwise_div<float>(const float* dinx, ...@@ -1306,6 +1319,28 @@ void elementwise_div<float>(const float* dinx,
} }
} }
template <>
void elementwise_div_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* din_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int p = 0; p < num; p++) {
*dout_ptr = *din_ptr / diny_data;
dout_ptr++;
din_ptr++;
}
}
}
}
template <> template <>
void elementwise_div_broadcast<float>(const float* dinx, void elementwise_div_broadcast<float>(const float* dinx,
const float* diny, const float* diny,
......
...@@ -119,19 +119,13 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) { ...@@ -119,19 +119,13 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) {
} }
} }
cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size, cl::NDRange CLContext::LocalWorkSizeTune(cl::NDRange global_work_size,
size_t max_work_size, size_t max_work_size,
int divisor) { int divisor) {
int preferred_lws = 0; int preferred_lws = 0;
#if 1
auto gws0 = global_work_size[0]; auto gws0 = global_work_size[0];
auto gws1 = global_work_size[1]; auto gws1 = global_work_size[1];
auto gws2 = global_work_size[2]; auto gws2 = global_work_size[2];
#else
auto gws2 = global_work_size[0];
auto gws1 = global_work_size[1];
auto gws0 = global_work_size[2];
#endif
if (divisor > 1) { if (divisor > 1) {
max_work_size /= divisor; max_work_size /= divisor;
} }
...@@ -147,15 +141,40 @@ cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size, ...@@ -147,15 +141,40 @@ cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size,
while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) { while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) {
gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1; gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1;
} }
#if 1
return cl::NDRange{static_cast<size_t>(gws0), return cl::NDRange{static_cast<size_t>(gws0),
static_cast<size_t>(gws1), static_cast<size_t>(gws1),
static_cast<size_t>(gws2)}; static_cast<size_t>(gws2)};
#else }
cl::NDRange CLContext::LocalWorkSizeTuneReverse(cl::NDRange global_work_size,
size_t max_work_size,
int divisor) {
int preferred_lws = 0;
auto gws2 = global_work_size[0];
auto gws1 = global_work_size[1];
auto gws0 = global_work_size[2];
if (divisor > 1) {
max_work_size /= divisor;
}
if (preferred_lws > 0 && preferred_lws <= max_work_size) {
max_work_size = preferred_lws;
}
while (gws1 > max_work_size && max_work_size > 0) {
gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1;
}
while (gws2 * gws1 > max_work_size && max_work_size > 0) {
gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1;
}
while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) {
gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1;
}
return cl::NDRange{static_cast<size_t>(gws2), return cl::NDRange{static_cast<size_t>(gws2),
static_cast<size_t>(gws1), static_cast<size_t>(gws1),
static_cast<size_t>(gws0)}; static_cast<size_t>(gws0)};
#endif }
bool CLContext::IsArmMali() {
return CLRuntime::Global()->GetGpuType() == GpuType::ARM_MALI;
} }
cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size, cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size,
......
...@@ -63,11 +63,14 @@ class CLContext { ...@@ -63,11 +63,14 @@ class CLContext {
cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size); cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size);
cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size, cl::NDRange LocalWorkSizeTune(cl::NDRange global_work_size,
size_t max_work_size, size_t max_work_size,
int divitor = 2); int divitor = 2);
// cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size,
// size_t max_work_size); cl::NDRange LocalWorkSizeTuneReverse(cl::NDRange global_work_size,
size_t max_work_size,
int divitor = 2);
bool IsArmMali();
private: private:
std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_; std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_;
......
...@@ -6,9 +6,7 @@ __kernel void conv2d_1x1_opt( ...@@ -6,9 +6,7 @@ __kernel void conv2d_1x1_opt(
__private const int global_size_dim2, __private const int global_size_dim2,
__read_only image2d_t input_image, __read_only image2d_t input_image,
__read_only image2d_t filter, __read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale,
__read_only image2d_t new_biase, __read_only image2d_t new_biase,
...@@ -284,9 +282,7 @@ __kernel void conv2d_1x1_simple( ...@@ -284,9 +282,7 @@ __kernel void conv2d_1x1_simple(
__private const int global_size_dim2, __private const int global_size_dim2,
__read_only image2d_t input_image, __read_only image2d_t input_image,
__read_only image2d_t filter, __read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale,
__read_only image2d_t new_biase, __read_only image2d_t new_biase,
......
...@@ -19,9 +19,7 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, ...@@ -19,9 +19,7 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
__private const int global_size_dim2, __private const int global_size_dim2,
__read_only image2d_t input_image, __read_only image2d_t input_image,
__read_only image2d_t filter, __read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int stride, __private const int stride,
__private const int offset, __private const int offset,
......
...@@ -19,9 +19,7 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, ...@@ -19,9 +19,7 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
__private const int item_h, __private const int item_h,
__read_only image2d_t input_image, __read_only image2d_t input_image,
__read_only image2d_t filter_image, __read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int stride, __private const int stride,
__private const int pad, __private const int pad,
...@@ -264,9 +262,7 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch, ...@@ -264,9 +262,7 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch,
__private const int item_h, __private const int item_h,
__read_only image2d_t input_image, __read_only image2d_t input_image,
__read_only image2d_t filter_image, __read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int stride, __private const int stride,
__private const int pad, __private const int pad,
......
...@@ -5,9 +5,7 @@ __kernel void conv2d_5x5(__private const int global_size_dim0, ...@@ -5,9 +5,7 @@ __kernel void conv2d_5x5(__private const int global_size_dim0,
__private const int global_size_dim2, __private const int global_size_dim2,
__read_only image2d_t input_image, __read_only image2d_t input_image,
__read_only image2d_t filter_image, __read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale,
__read_only image2d_t new_biase, __read_only image2d_t new_biase,
......
...@@ -20,9 +20,7 @@ __kernel void conv2d_5x5_opt(__private const int item_ch, ...@@ -20,9 +20,7 @@ __kernel void conv2d_5x5_opt(__private const int item_ch,
__private const int item_h, __private const int item_h,
__read_only image2d_t input_image, __read_only image2d_t input_image,
__read_only image2d_t filter_image, __read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int stride, __private const int stride,
__private const int pad, __private const int pad,
...@@ -268,9 +266,7 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch, ...@@ -268,9 +266,7 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch,
__private const int item_h, __private const int item_h,
__read_only image2d_t input_image, __read_only image2d_t input_image,
__read_only image2d_t filter_image, __read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int stride, __private const int stride,
__private const int pad, __private const int pad,
...@@ -513,4 +509,4 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch, ...@@ -513,4 +509,4 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch,
(int2)(out_w_base_id + out_w_id4, item_h_id), (int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]); output[4]);
} }
} }
\ No newline at end of file
...@@ -5,9 +5,7 @@ __kernel void conv2d_7x7(__private const int global_size_dim0, ...@@ -5,9 +5,7 @@ __kernel void conv2d_7x7(__private const int global_size_dim0,
__private const int global_size_dim2, __private const int global_size_dim2,
__read_only image2d_t input_image, __read_only image2d_t input_image,
__read_only image2d_t filter_image, __read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale,
__read_only image2d_t new_biase, __read_only image2d_t new_biase,
......
...@@ -20,9 +20,7 @@ __kernel void conv2d_7x7_opt(__private const int item_ch, ...@@ -20,9 +20,7 @@ __kernel void conv2d_7x7_opt(__private const int item_ch,
__private const int item_h, __private const int item_h,
__read_only image2d_t input_image, __read_only image2d_t input_image,
__read_only image2d_t filter_image, __read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int stride, __private const int stride,
__private const int pad, __private const int pad,
...@@ -268,9 +266,7 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch, ...@@ -268,9 +266,7 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch,
__private const int item_h, __private const int item_h,
__read_only image2d_t input_image, __read_only image2d_t input_image,
__read_only image2d_t filter_image, __read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int stride, __private const int stride,
__private const int pad, __private const int pad,
...@@ -513,4 +509,4 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch, ...@@ -513,4 +509,4 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch,
(int2)(out_w_base_id + out_w_id4, item_h_id), (int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]); output[4]);
} }
} }
\ No newline at end of file
...@@ -19,9 +19,7 @@ __kernel void depth_conv2d(__private const int global_size_dim0, ...@@ -19,9 +19,7 @@ __kernel void depth_conv2d(__private const int global_size_dim0,
__private const int global_size_dim2, __private const int global_size_dim2,
__read_only image2d_t input, __read_only image2d_t input,
__read_only image2d_t filter, __read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale,
__read_only image2d_t new_biase, __read_only image2d_t new_biase,
......
...@@ -20,9 +20,7 @@ __kernel void depth_conv2d_3x3( ...@@ -20,9 +20,7 @@ __kernel void depth_conv2d_3x3(
__private const int global_size_dim2, __private const int global_size_dim2,
__read_only image2d_t input, __read_only image2d_t input,
__read_only image2d_t filter, __read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int stride, __private const int stride,
__private const int offset, __private const int offset,
...@@ -249,9 +247,7 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, ...@@ -249,9 +247,7 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
__private const int ou_nh, __private const int ou_nh,
__read_only image2d_t input, __read_only image2d_t input,
__read_only image2d_t filter, __read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int stride, __private const int stride,
__private const int pad, __private const int pad,
......
#include <cl_common.h>
__kernel void expend_c1(__private const int OUT_C,
__private const int OUT_W,
__private const int OUT_NH,
__private const int IN_C,
__private const int IN_W,
__private const int IN_NH,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__read_only image2d_t input,
__write_only image2d_t output,
__private const int n_times,
__private const int c_times,
__private const int h_times,
__private const int w_times) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
return;
}
const int out_n = out_nh / output_height;
const int out_h = out_nh % output_height;
const int in_c = 0;
const int in_w = out_w / w_times;
const int in_h = out_h / h_times;
const int in_n = out_n / n_times;
const int in_nh = in_n * input_height + in_h;
int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
int2 input_pos = (int2)(in_w, in_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
in.y = 0;
in.z = 0;
in.w = 0;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
}
__kernel void expend_c2(__private const int OUT_C,
__private const int OUT_W,
__private const int OUT_NH,
__private const int IN_C,
__private const int IN_W,
__private const int IN_NH,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__read_only image2d_t input,
__write_only image2d_t output,
__private const int n_times,
__private const int c_times,
__private const int h_times,
__private const int w_times) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
return;
}
const int out_n = out_nh / output_height;
const int out_h = out_nh % output_height;
const int in_c = 0;
const int in_w = out_w / w_times;
const int in_h = out_h / h_times;
const int in_n = out_n / n_times;
const int in_nh = in_n * input_height + in_h;
int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
int2 input_pos = (int2)(in_w, in_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
in.z = 0;
in.w = 0;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
}
__kernel void expend_c3(__private const int OUT_C,
__private const int OUT_W,
__private const int OUT_NH,
__private const int IN_C,
__private const int IN_W,
__private const int IN_NH,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__read_only image2d_t input,
__write_only image2d_t output,
__private const int n_times,
__private const int c_times,
__private const int h_times,
__private const int w_times) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
return;
}
const int out_n = out_nh / output_height;
const int out_h = out_nh % output_height;
const int in_c = 0;
const int in_w = out_w / w_times;
const int in_h = out_h / h_times;
const int in_n = out_n / n_times;
const int in_nh = in_n * input_height + in_h;
int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
int2 input_pos = (int2)(in_w, in_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
in.w = 0;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
}
__kernel void expend_c4(__private const int OUT_C,
__private const int OUT_W,
__private const int OUT_NH,
__private const int IN_C,
__private const int IN_W,
__private const int IN_NH,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__read_only image2d_t input,
__write_only image2d_t output,
__private const int n_times,
__private const int c_times,
__private const int h_times,
__private const int w_times) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
return;
}
const int out_n = out_nh / output_height;
const int out_h = out_nh % output_height;
const int in_c = 0;
const int in_w = out_w / w_times;
const int in_h = out_h / h_times;
const int in_n = out_n / n_times;
const int in_nh = in_n * input_height + in_h;
int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
int2 input_pos = (int2)(in_w, in_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
}
__kernel void expend_cn(__private const int OUT_C,
__private const int OUT_W,
__private const int OUT_NH,
__private const int IN_C,
__private const int IN_W,
__private const int IN_NH,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__read_only image2d_t input,
__write_only image2d_t output,
__private const int n_times,
__private const int c_times,
__private const int h_times,
__private const int w_times) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
return;
}
const int out_n = out_nh / output_height;
const int out_h = out_nh % output_height;
const int in_c = out_c;
const int in_w = out_w / w_times;
const int in_h = out_h / h_times;
const int in_n = out_n / n_times;
const int in_nh = in_n * input_height + in_h;
int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
int2 input_pos = (int2)(in_c * IN_W + in_w, in_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
}
\ No newline at end of file
...@@ -63,7 +63,10 @@ __kernel void grid_sampler(__read_only image2d_t input, ...@@ -63,7 +63,10 @@ __kernel void grid_sampler(__read_only image2d_t input,
if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){
input3 = (CL_DTYPE4)(0.0); input3 = (CL_DTYPE4)(0.0);
} }
CL_DTYPE4 out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys; CL_DTYPE4 out_val = input0 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ye) +
input1 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ye) +
input2 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ys) +
input3 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ys);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, outpoints, out_val); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, outpoints, out_val);
// y // y
...@@ -97,7 +100,10 @@ __kernel void grid_sampler(__read_only image2d_t input, ...@@ -97,7 +100,10 @@ __kernel void grid_sampler(__read_only image2d_t input,
input3 = (CL_DTYPE4)(0.0); input3 = (CL_DTYPE4)(0.0);
} }
out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys; out_val = input0 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ye) +
input1 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ye) +
input2 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ys) +
input3 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ys);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 1), out_val); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 1), out_val);
// z // z
...@@ -130,7 +136,10 @@ __kernel void grid_sampler(__read_only image2d_t input, ...@@ -130,7 +136,10 @@ __kernel void grid_sampler(__read_only image2d_t input,
if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){
input3 = (CL_DTYPE4)(0.0); input3 = (CL_DTYPE4)(0.0);
} }
out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys; out_val = input0 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ye) +
input1 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ye) +
input2 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ys) +
input3 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ys);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 2), out_val); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 2), out_val);
// w // w
...@@ -163,6 +172,9 @@ __kernel void grid_sampler(__read_only image2d_t input, ...@@ -163,6 +172,9 @@ __kernel void grid_sampler(__read_only image2d_t input,
if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){
input3 = (CL_DTYPE4)(0.0); input3 = (CL_DTYPE4)(0.0);
} }
out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys; out_val = input0 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ye) +
input1 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ye) +
input2 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ys) +
input3 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ys);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 3), out_val); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 3), out_val);
} }
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void pixel_shuffle(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int in_N,
__private const int in_C,
__private const int in_H,
__private const int in_W,
__private const int out_N,
__private const int out_C,
__private const int out_H,
__private const int out_W,
__private const int upscale_factor) {
const int out_c4 = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int out_h = out_nh % out_H;
int out_n = out_nh / out_H;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int in_h = out_h / upscale_factor;
int in_w = out_w / upscale_factor;
int in_nh = out_n * in_H + in_h;
CL_DTYPE4 res;
int out_c;
int in_c;
CL_DTYPE4 in;
int2 in_pos;
out_c = out_c4 * 4 + 0;
in_c = out_c * upscale_factor * upscale_factor +
(out_h % upscale_factor) * upscale_factor + (out_w % upscale_factor);
in_pos.x = (in_c / 4) * in_W + in_w;
in_pos.y = in_nh;
in = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, in_pos);
if (in_c % 4 == 0) {
res.x = in.x;
} else if (in_c % 4 == 1) {
res.x = in.y;
} else if (in_c % 4 == 2) {
res.x = in.z;
} else if (in_c % 4 == 3) {
res.x = in.w;
}
out_c = out_c4 * 4 + 1;
in_c = out_c * upscale_factor * upscale_factor +
(out_h % upscale_factor) * upscale_factor + (out_w % upscale_factor);
in_pos.x = (in_c / 4) * in_W + in_w;
in_pos.y = in_nh;
in = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, in_pos);
if (in_c % 4 == 0) {
res.y = in.x;
} else if (in_c % 4 == 1) {
res.y = in.y;
} else if (in_c % 4 == 2) {
res.y = in.z;
} else if (in_c % 4 == 3) {
res.y = in.w;
}
out_c = out_c4 * 4 + 2;
in_c = out_c * upscale_factor * upscale_factor +
(out_h % upscale_factor) * upscale_factor + (out_w % upscale_factor);
in_pos.x = (in_c / 4) * in_W + in_w;
in_pos.y = in_nh;
in = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, in_pos);
if (in_c % 4 == 0) {
res.z = in.x;
} else if (in_c % 4 == 1) {
res.z = in.y;
} else if (in_c % 4 == 2) {
res.z = in.z;
} else if (in_c % 4 == 3) {
res.z = in.w;
}
out_c = out_c4 * 4 + 3;
in_c = out_c * upscale_factor * upscale_factor +
(out_h % upscale_factor) * upscale_factor + (out_w % upscale_factor);
in_pos.x = (in_c / 4) * in_W + in_w;
in_pos.y = in_nh;
in = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, in_pos);
if (in_c % 4 == 0) {
res.w = in.x;
} else if (in_c % 4 == 1) {
res.w = in.y;
} else if (in_c % 4 == 2) {
res.w = in.z;
} else if (in_c % 4 == 3) {
res.w = in.w;
}
int2 out_pos;
out_pos.x = out_c4 * out_W + out_w;
out_pos.y = out_nh;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, out_pos, res);
}
...@@ -191,6 +191,9 @@ bool CLRuntime::InitializeDevice() { ...@@ -191,6 +191,9 @@ bool CLRuntime::InitializeDevice() {
} }
return t_str; return t_str;
}; };
const std::string device_version = device_->getInfo<CL_DEVICE_VERSION>();
LOG(INFO) << "device_version:" << device_version;
LOG(INFO) << "device_type:" << device_type_to_str(device_type); LOG(INFO) << "device_type:" << device_type_to_str(device_type);
device_info_["CL_DEVICE_TYPE"] = device_type; device_info_["CL_DEVICE_TYPE"] = device_type;
...@@ -317,6 +320,8 @@ std::map<std::string, size_t>& CLRuntime::GetDeviceInfo() { ...@@ -317,6 +320,8 @@ std::map<std::string, size_t>& CLRuntime::GetDeviceInfo() {
return device_info_; return device_info_;
} }
GpuType& CLRuntime::GetGpuType() { return gpu_type_; }
void CLRuntime::GetAdrenoContextProperties( void CLRuntime::GetAdrenoContextProperties(
std::vector<cl_context_properties>* properties, std::vector<cl_context_properties>* properties,
GPUPerfMode gpu_perf_mode, GPUPerfMode gpu_perf_mode,
...@@ -365,5 +370,26 @@ void CLRuntime::GetAdrenoContextProperties( ...@@ -365,5 +370,26 @@ void CLRuntime::GetAdrenoContextProperties(
properties->push_back(0); properties->push_back(0);
} }
double CLRuntime::GetCommandTime(const cl::Event& event) {
command_queue().finish();
auto start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
auto stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
return (stop_nanos - start_nanos) / 1000000.0;
}
double CLRuntime::GetQueuedTime(const cl::Event& event) {
command_queue().finish();
return (event.getProfilingInfo<CL_PROFILING_COMMAND_START>() -
event.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>()) /
1000000.0;
}
double CLRuntime::GetSubmitTime(const cl::Event& event) {
command_queue().finish();
return (event.getProfilingInfo<CL_PROFILING_COMMAND_START>() -
event.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>()) /
1000000.0;
}
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
...@@ -93,6 +93,14 @@ class CLRuntime { ...@@ -93,6 +93,14 @@ class CLRuntime {
std::map<std::string, size_t>& GetDeviceInfo(); std::map<std::string, size_t>& GetDeviceInfo();
GpuType& GetGpuType();
double GetCommandTime(const cl::Event& event);
double GetQueuedTime(const cl::Event& event);
double GetSubmitTime(const cl::Event& event);
private: private:
CLRuntime() { Init(); } CLRuntime() { Init(); }
......
...@@ -45,5 +45,18 @@ const char* opencl_error_to_str(cl_int error); ...@@ -45,5 +45,18 @@ const char* opencl_error_to_str(cl_int error);
#else #else
#define CL_CHECK_FATAL(err_code__) #define CL_CHECK_FATAL(err_code__)
#endif #endif
#ifdef LITE_WITH_PROFILE
#define EnqueueNDRangeKernel( \
context, kernel, gws_offset, gws, lws, event_wait_list, event) \
context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( \
kernel, gws_offset, gws, lws, event_wait_list, &event)
#else
#define EnqueueNDRangeKernel( \
context, kernel, gws_offset, gws, lws, event_wait_list, event) \
context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( \
kernel, gws_offset, gws, lws, event_wait_list, nullptr)
#endif
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
...@@ -62,6 +62,18 @@ class KernelBase { ...@@ -62,6 +62,18 @@ class KernelBase {
profiler_ = profiler; profiler_ = profiler;
profile_id_ = id; profile_id_ = id;
} }
virtual void SetProfileRuntimeKernelInfo(
paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = std::string("NotImpl");
#ifdef LITE_WITH_ARM
ch->cl_event = event_;
#endif
}
virtual void SetIsKernelTest(bool is_kernel_test) {
is_kernel_test_ = is_kernel_test;
}
#endif #endif
void Launch() { void Launch() {
...@@ -86,14 +98,24 @@ class KernelBase { ...@@ -86,14 +98,24 @@ class KernelBase {
#if defined(LITE_WITH_MLU) #if defined(LITE_WITH_MLU)
WorkSpace::Global_MLU().AllocReset(); WorkSpace::Global_MLU().AllocReset();
#endif #endif
#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PROFILE
profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get()); if (!is_kernel_test_) {
profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get());
profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get());
}
Run(); Run();
#ifdef LITE_WITH_OPENCL
CLRuntime::Global()->command_queue().finish(); if (is_first_epoch_for_profiler_ && (!is_kernel_test_)) {
#endif SetProfileRuntimeKernelInfo(profiler_->GetOpCharacter(profile_id_));
profiler_->StopTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); is_first_epoch_for_profiler_ = false;
}
if (!is_kernel_test_) {
profiler_->StopTiming(profile::Type::kDispatch, profile_id_, ctx_.get());
}
#else #else
Run(); Run();
#endif #endif
...@@ -185,6 +207,11 @@ class KernelBase { ...@@ -185,6 +207,11 @@ class KernelBase {
#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PROFILE
profile::Profiler* profiler_{nullptr}; profile::Profiler* profiler_{nullptr};
int profile_id_{-1}; int profile_id_{-1};
bool is_first_epoch_for_profiler_{true};
bool is_kernel_test_{true};
#ifdef LITE_WITH_OPENCL
cl::Event event_;
#endif
#endif #endif
}; };
......
...@@ -28,6 +28,7 @@ lite_cc_library(mir_passes ...@@ -28,6 +28,7 @@ lite_cc_library(mir_passes
elimination/identity_scale_eliminate_pass.cc elimination/identity_scale_eliminate_pass.cc
elimination/identity_dropout_eliminate_pass.cc elimination/identity_dropout_eliminate_pass.cc
elimination/elementwise_mul_constant_eliminate_pass.cc elimination/elementwise_mul_constant_eliminate_pass.cc
elimination/remove_tf_redundant_ops_pass.cc
static_kernel_pick_pass.cc static_kernel_pick_pass.cc
variable_place_inference_pass.cc variable_place_inference_pass.cc
type_target_cast_pass.cc type_target_cast_pass.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/elimination/remove_tf_redundant_ops_pass.h"
#include <unordered_set>
#include "lite/core/mir/graph_visualize_pass.h"
#include "lite/core/mir/pass.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher.h"
#include "lite/model_parser/cpp/var_desc.h"
namespace paddle {
namespace lite {
namespace mir {
void RemoveTFRedundantOpsPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
RemoveSqueeze2Reshape2Pattern(graph);
RemoveReshape2Pattern(graph);
}
void RemoveTFRedundantOpsPass::RemoveReshape2Pattern(
const std::unique_ptr<SSAGraph>& graph) {
bool found = false;
Node* softmax_node{nullptr};
Node* reshape2_node{nullptr};
std::string reshape2_out_arg_name;
Node* fetch_node{nullptr};
std::string fetch_in_arg_name;
DDim softmax_out_dims;
DDim reshape2_out_dims;
for (auto& op_node : graph->StmtTopologicalOrder()) {
if (op_node->AsStmt().picked_kernel().op_type() == "softmax") {
softmax_node = op_node;
} else if (op_node->AsStmt().picked_kernel().op_type() == "reshape2") {
reshape2_node = op_node;
} else if (op_node->AsStmt().picked_kernel().op_type() == "fetch") {
fetch_node = op_node;
fetch_in_arg_name = fetch_node->inlinks.front()->AsArg().name;
}
}
if (softmax_node == nullptr || reshape2_node == nullptr) {
return;
}
// Get out tensor dims of softmax, reshape2
auto* scope = softmax_node->AsStmt().op()->scope();
auto softmax_out_arg_name = softmax_node->outlinks.front()->AsArg().name;
auto softmax_out_tensor =
scope->FindVar(softmax_out_arg_name)->Get<lite::Tensor>();
softmax_out_dims = softmax_out_tensor.dims();
for (auto out_node : reshape2_node->outlinks) {
if (out_node->IsArg() && out_node->outlinks.size() != 0) {
reshape2_out_arg_name = reshape2_node->outlinks.front()->AsArg().name;
auto reshape2_out_tensor =
scope->FindVar(reshape2_out_arg_name)->Get<lite::Tensor>();
reshape2_out_dims = reshape2_out_tensor.dims();
}
}
VLOG(3) << "reshape2_out_dims:" << reshape2_out_dims;
VLOG(3) << "softmax_out_dims:" << softmax_out_dims;
VLOG(3) << "found:" << found;
if (softmax_out_dims == reshape2_out_dims &&
softmax_node->outlinks.front() == reshape2_node->inlinks.front() &&
reshape2_out_arg_name == fetch_in_arg_name) {
found = true;
}
if (found) {
// link out_arg to op
IR_NODE_LINK_TO(softmax_node->outlinks.front(), fetch_node);
// collect nodes to safe remove
std::unordered_set<const Node*> nodes_to_remove;
auto remove_inst_node_and_out_args_node = [&](Node* n) {
nodes_to_remove.insert(n);
for (auto& out : n->outlinks) {
nodes_to_remove.insert(out);
}
};
remove_inst_node_and_out_args_node(reshape2_node);
GraphSafeRemoveNodes(graph.get(), nodes_to_remove);
auto fetch_op_desc = fetch_node->AsStmt().mutable_op_info();
fetch_op_desc->SetInput("X",
{softmax_node->outlinks.front()->AsArg().name});
}
VLOG(5) << "\n" << Visualize(graph.get());
}
void RemoveTFRedundantOpsPass::RemoveSqueeze2Reshape2Pattern(
const std::unique_ptr<SSAGraph>& graph) {
VLOG(5) << Visualize(graph.get());
bool found = false;
// find out_arg->squeeze2
// find out_arg_dims of out_arg
Node* out_arg_node{nullptr};
DDim out_arg_dims;
Node* squeeze2_node{nullptr};
// find squeeze2->reshape2
// find output dims of squeeze2 and reshape2 nodes
DDim squeeze2_out_dims;
Node* reshape2_node{nullptr};
Node* reshape2_out_node{nullptr};
DDim reshape2_out_dims;
// find next inst node of reshape2
Node* next_inst_node_of_reshape2_out{nullptr};
for (auto& node : graph->StmtTopologicalOrder()) {
if (node->AsStmt().picked_kernel().op_type() != "squeeze2") continue;
auto* scope = node->AsStmt().op()->scope();
// find inlinks of squeeze2: out_arg_node
squeeze2_node = node;
auto squeeze2_inlinks = squeeze2_node->inlinks;
VLOG(5) << "squeeze2_inlinks.size():" << squeeze2_inlinks.size();
for (auto& in_link : squeeze2_inlinks) {
if (in_link->IsArg() && squeeze2_inlinks.size() == 1) {
out_arg_node = in_link;
auto* var = scope->FindVar(out_arg_node->AsArg().name);
out_arg_dims = var->Get<lite::Tensor>().dims();
VLOG(5) << "arg name:" << out_arg_node->AsArg().name
<< " dims:" << out_arg_dims;
} else {
// found mutli-input links
continue;
}
}
// find squeeze2->reshape2 pattern
// and output dims of squeeze2, reshape2 nodes
auto squeeze2_outlinks = squeeze2_node->outlinks;
for (auto& squeeze2_out_link : squeeze2_outlinks) {
if (squeeze2_out_link->IsArg() &&
squeeze2_out_link->outlinks.size() != 0) {
auto* squeeze2_out_var =
scope->FindVar(squeeze2_out_link->AsArg().name);
squeeze2_out_dims = squeeze2_out_var->Get<lite::Tensor>().dims();
VLOG(5) << "squeeze2_out_arg.name:" << squeeze2_out_link->AsArg().name
<< " squeeze2_out_dims:" << squeeze2_out_dims
<< " squeeze2_out_link->outlinks.size():"
<< squeeze2_out_link->outlinks.size();
for (auto& out2_link : squeeze2_out_link->outlinks) {
if (out2_link->IsStmt() &&
out2_link->AsStmt().picked_kernel().op_type() == "reshape2") {
reshape2_node = out2_link;
for (auto& reshape2_out_link : reshape2_node->outlinks) {
if (reshape2_out_link->IsArg() &&
reshape2_out_link->outlinks.size() != 0) {
reshape2_out_node = reshape2_out_link;
auto* reshape2_out_var =
scope->FindVar(reshape2_out_link->AsArg().name);
reshape2_out_dims =
reshape2_out_var->Get<lite::Tensor>().dims();
VLOG(5) << "reshape2_out_node:" << reshape2_out_node
<< " reshape2_out_name:"
<< reshape2_out_link->AsArg().name
<< " reshape2_out_dims:" << reshape2_out_dims;
}
}
}
}
}
}
// find next inst node of reshape2
VLOG(5) << "reshape2_out_node->outlinks.size():"
<< reshape2_out_node->outlinks.size()
<< " reshape2_out_node->IsStmt():" << reshape2_out_node->IsStmt();
VLOG(5) << "reshape2_out_node->AsArg().name:"
<< reshape2_out_node->AsArg().name;
if (reshape2_out_node != nullptr &&
reshape2_out_node->outlinks.size() == 1 &&
reshape2_out_node->outlinks.front()->IsStmt()) {
next_inst_node_of_reshape2_out = reshape2_out_node->outlinks.front();
found = true;
break;
VLOG(5)
<< "next_inst_node_of_reshape2_out->picked_kernel().op_type():"
<< next_inst_node_of_reshape2_out->AsStmt().picked_kernel().op_type();
}
VLOG(5) << "==============================";
VLOG(5) << "out_arg_dims:" << out_arg_dims;
VLOG(5) << "squeeze2_out_dims:" << squeeze2_out_dims;
VLOG(5) << "reshape2_out_dims:" << reshape2_out_dims;
VLOG(5) << "==============================";
}
// replace pattern
if (found && out_arg_dims[1] == squeeze2_out_dims[1] &&
out_arg_dims[1] == reshape2_out_dims[1] && out_arg_dims[1] == 1001 &&
out_arg_dims[2] == out_arg_dims[3] && out_arg_dims[2] == 1 &&
next_inst_node_of_reshape2_out->AsStmt().picked_kernel().op_type() ==
"softmax") {
// link out_arg to op
IR_NODE_LINK_TO(out_arg_node, next_inst_node_of_reshape2_out);
// collect nodes to safe remove
std::unordered_set<const Node*> nodes_to_remove;
auto remove_inst_node_and_out_args_node = [&](Node* n) {
nodes_to_remove.insert(n);
for (auto& out : n->outlinks) {
nodes_to_remove.insert(out);
}
};
remove_inst_node_and_out_args_node(squeeze2_node);
remove_inst_node_and_out_args_node(reshape2_node);
GraphSafeRemoveNodes(graph.get(), nodes_to_remove);
auto next_inst_op_desc =
next_inst_node_of_reshape2_out->AsStmt().mutable_op_info();
next_inst_op_desc->SetInput("X", {out_arg_node->AsArg().name});
VLOG(5) << Visualize(graph.get());
}
VLOG(5) << "replace pattern fininshed";
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(remove_tf_redundant_ops_pass,
paddle::lite::mir::RemoveTFRedundantOpsPass)
.BindTargets({TARGET(kOpenCL), TARGET(kARM)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <limits>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/mir/pass.h"
#include "lite/core/tensor.h"
#include "lite/core/types.h"
namespace paddle {
namespace lite {
namespace mir {
/*
* mir::RemoveTFRedundantOpsPass remove reshape2->squeeze2 pattern
* and last reshape2 op for tensorflow mobilenetv1/v2.
*/
class RemoveTFRedundantOpsPass : public mir::StmtPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
void RemoveReshape2Pattern(const std::unique_ptr<SSAGraph>& graph);
void RemoveSqueeze2Reshape2Pattern(const std::unique_ptr<SSAGraph>& graph);
};
} // namespace mir
} // namespace lite
} // namespace paddle
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include <memory> #include <memory>
#include <numeric> #include <numeric>
#include <set>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
......
...@@ -170,6 +170,9 @@ class VariablePlaceInferencePass : public DebugPass { ...@@ -170,6 +170,9 @@ class VariablePlaceInferencePass : public DebugPass {
// If is quantization, infer the Int8 type. // If is quantization, infer the Int8 type.
if (type->precision() == PRECISION(kInt8)) { if (type->precision() == PRECISION(kInt8)) {
x_out->AsArg().type = type; x_out->AsArg().type = type;
} else if (type->precision() == PRECISION(kFP16) &&
type->target() != TARGET(kOpenCL)) {
x_out->AsArg().type = type;
} else { } else {
PrecisionType tmp_ptype = x_out->AsArg().type->precision(); PrecisionType tmp_ptype = x_out->AsArg().type->precision();
x_out->AsArg().type = LiteType::GetTensorTy( x_out->AsArg().type = LiteType::GetTensorTy(
......
...@@ -73,6 +73,9 @@ class OpLite : public Registry { ...@@ -73,6 +73,9 @@ class OpLite : public Registry {
// Indicate whether the Op runs only once or not // Indicate whether the Op runs only once or not
virtual bool run_once() const { return false; } virtual bool run_once() const { return false; }
std::string Type() { return op_type_; } std::string Type() { return op_type_; }
#ifdef LITE_WITH_PROFILE
virtual void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {}
#endif
// Link the external execution environ to internal context. // Link the external execution environ to internal context.
bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope); bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope);
......
...@@ -92,7 +92,8 @@ class Optimizer { ...@@ -92,7 +92,8 @@ class Optimizer {
"bm_subgraph_pass", "bm_subgraph_pass",
"apu_subgraph_pass", "apu_subgraph_pass",
"rknpu_subgraph_pass", "rknpu_subgraph_pass",
"static_kernel_pick_pass", // pick original kernel from graph "static_kernel_pick_pass", // pick original kernel from graph
"remove_tf_redundant_ops_pass",
"variable_place_inference_pass", // inference arg/var's "variable_place_inference_pass", // inference arg/var's
// info(target/precision/layout/device) // info(target/precision/layout/device)
// using kernel info // using kernel info
...@@ -158,6 +159,55 @@ class Optimizer { ...@@ -158,6 +159,55 @@ class Optimizer {
const lite::Scope* exec_scope() const { return exec_scope_; } const lite::Scope* exec_scope() const { return exec_scope_; }
// Set shape(dims) infos of var descs to scope var.
// developer can write pass using input / output tensor dims of op.
//
// Example: If you have node `Node* softmax_node`,
// you can get dims of output tensor in passes:
//
// auto* scope = softmax_node->AsStmt().op()->scope();
// auto softmax_out_arg_name =
// softmax_node->outlinks.front()->AsArg().name;
// auto softmax_out_tensor =
// scope->FindVar(softmax_out_arg_name)->Get<lite::Tensor>();
// softmax_out_dims = softmax_out_tensor.dims();
void SetVarDescShapeToScopeVar() {
auto dims_to_str_func = [](std::vector<int64_t> shape) -> std::string {
std::string str_res;
for (size_t i = 0; i < shape.size(); ++i) {
str_res += std::to_string(shape[i]);
if (i != shape.size() - 1) {
str_res += "x";
}
}
return str_res;
};
auto* program_desc = program_->program_desc();
VLOG(5) << "program_desc->BlocksSize():" << program_desc->BlocksSize();
auto blocks_desc = program_desc->GetBlocks();
for (size_t bidx = 0; bidx < blocks_desc.size(); ++bidx) {
auto block_desc = blocks_desc[bidx];
auto vars_desc = block_desc.GetVars();
for (size_t vidx = 0; vidx < vars_desc.size(); ++vidx) {
auto var_desc = vars_desc[vidx];
VLOG(5) << var_desc.Name() << " "
<< dims_to_str_func(var_desc.GetShape());
if (var_desc.Name() == "feed" || var_desc.Name() == "fetch") continue;
auto* var = program_->exec_scope()->FindVar(var_desc.Name());
auto tensor = var->GetMutable<lite::Tensor>();
if (tensor->dims().size() == 0 && var_desc.GetShape().size() != 0) {
VLOG(5) << "var_desc.Name():" << var_desc.Name()
<< " shape:" << dims_to_str_func(var_desc.GetShape());
tensor->Resize(var_desc.GetShape());
}
VLOG(5) << "var_desc.Name():" << var_desc.Name()
<< " shape:" << dims_to_str_func(var_desc.GetShape())
<< " tensor:" << tensor->dims();
}
}
}
// Generate a new program based on the mir graph. // Generate a new program based on the mir graph.
std::unique_ptr<RuntimeProgram> GenRuntimeProgram() { std::unique_ptr<RuntimeProgram> GenRuntimeProgram() {
auto pass = mir::PassManager::Global().LookUp<mir::GenerateProgramPass>( auto pass = mir::PassManager::Global().LookUp<mir::GenerateProgramPass>(
...@@ -198,6 +248,7 @@ class Optimizer { ...@@ -198,6 +248,7 @@ class Optimizer {
// Specify the passes and run them. // Specify the passes and run them.
void RunPasses(const std::vector<std::string>& passes) { void RunPasses(const std::vector<std::string>& passes) {
SetVarDescShapeToScopeVar();
for (auto& x : passes) { for (auto& x : passes) {
LOG(INFO) << "== Running pass: " << x; LOG(INFO) << "== Running pass: " << x;
mir::Pass* pass = mir::PassManager::Global().LookUp(x); mir::Pass* pass = mir::PassManager::Global().LookUp(x);
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "lite/core/profile/profiler.h" #include "lite/core/profile/profiler.h"
#include <iomanip>
#include <map> #include <map>
#include <string> #include <string>
#include <utility> #include <utility>
...@@ -23,10 +24,11 @@ namespace profile { ...@@ -23,10 +24,11 @@ namespace profile {
namespace { namespace {
auto op_comp = [](const OpCharacter& c1, const OpCharacter& c2) { auto op_comp = [](const OpCharacter& c1, const OpCharacter& c2) {
return (c1.target < c2.target) || (c1.op_type < c2.op_type) || // compare for unique key of map
(c1.kernel_name < c2.kernel_name) || (c1.remark < c2.remark); return (c1.kernel_name + c1.kernel_func_name <
c2.kernel_name + c2.kernel_func_name);
}; };
} } // namespace
std::map<Type, std::string> TypeStr{ std::map<Type, std::string> TypeStr{
{Type::kUnk, "Unknown"}, {Type::kUnk, "Unknown"},
...@@ -64,22 +66,62 @@ int Profiler::NewTimer(const OpCharacter& ch) { ...@@ -64,22 +66,62 @@ int Profiler::NewTimer(const OpCharacter& ch) {
return units_.size() - 1; return units_.size() - 1;
} }
OpCharacter* Profiler::GetOpCharacter(const size_t index) {
CHECK_LT(index, units_.size())
<< "The timer index in the profiler is out of range.";
return &units_[index].Character();
}
void Profiler::StartTiming(Type type, const int index, KernelContext* ctx) { void Profiler::StartTiming(Type type, const int index, KernelContext* ctx) {
CHECK_LT(index, units_.size()) CHECK_LT(index, units_.size())
<< "The timer index in the profiler is out of range."; << "The timer index in the profiler is out of range.";
units_[index].Timer(type)->Start(ctx); units_[index].Timer(type)->Start(ctx);
} }
float Profiler::StopTiming(Type type, const int index, KernelContext* ctx) { void Profiler::StopTiming(Type type, const int index, KernelContext* ctx) {
CHECK_LT(index, units_.size()) CHECK_LT(index, units_.size())
<< "The timer index in the profiler is out of range."; << "The timer index in the profiler is out of range.";
return units_[index].Timer(type)->Stop(ctx); units_[index].Timer(type)->Stop(ctx);
#ifdef LITE_WITH_OPENCL
units_[index].Timer(type)->CLStop(units_[index].character.op_type,
units_[index].character.io_duration,
units_[index].character.cl_event);
#endif
}
int Profiler::GetKernelFuncCalledTimes(const std::string& op_type,
const std::string& kernel_attr,
const std::string& kernel_func_name) {
int count = 0;
for (size_t i = 0; i < units_.size(); ++i) {
if ((units_[i].character.kernel_func_name == kernel_func_name) &&
(units_[i].character.kernel_attr == kernel_attr) &&
(units_[i].character.op_type == op_type)) {
++count;
}
}
return count;
}
float Profiler::GetKernelFuncSummaryGOPs(const std::string& op_type,
const std::string& kernel_attr,
const std::string& kernel_func_name) {
float GOPs = 0;
for (size_t i = 0; i < units_.size(); ++i) {
if ((units_[i].character.kernel_func_name == kernel_func_name) &&
(units_[i].character.kernel_attr == kernel_attr) &&
(units_[i].character.op_type == op_type)) {
GOPs += units_[i].character.macs;
}
}
return GOPs * 1e-9f;
} }
std::string Profiler::Summary(Type type, bool concise, size_t w) { std::string Profiler::Summary(Type type, bool concise, size_t w) {
using std::setw; using std::setw;
using std::left; using std::left;
using std::fixed; using std::fixed;
using std::setprecision;
STL::stringstream ss; STL::stringstream ss;
std::string title; std::string title;
// Title. // Title.
...@@ -94,14 +136,41 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { ...@@ -94,14 +136,41 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) {
<< " Profiler Summary: " << name_ << ", Exclude " << w << " Profiler Summary: " << name_ << ", Exclude " << w
<< " warm-ups =====" << std::endl; << " warm-ups =====" << std::endl;
} }
ss << setw(25) << left << "Operator Type" ss << setw(20) << left << "OperatorType"
<< " " << setw(40) << left << "Kernel Name" << " " << setw(30) << left << "KerneAttr(Place)"
<< " " << setw(12) << left << "Remark" << " " << setw(24) << left << "KernelFuncName";
<< " " << setw(12) << left << "Avg (ms)" if (!concise) {
<< " " << setw(12) << left << "Min (ms)" ss << " " << setw(26) << left << "Remark"
<< " " << setw(12) << left << "Max (ms)" << " " << setw(15) << left << "InDim"
<< " " << setw(12) << left << "Last (ms)" << " " << setw(15) << left << "FilterDim"
<< " " << setw(12) << left << "Percent (%)" << std::endl; << " " << setw(15) << left << "OutDim";
}
ss << " " << setw(7) << left << "Avg(ms)"
<< " " << setw(7) << left << "Min(ms)"
<< " " << setw(7) << left << "Max(ms)";
if (!concise) {
ss << " " << setw(7) << left << "Last(ms)";
}
ss << " " << setw(7) << left << "Avg(%)"
<< " " << setw(7) << left << "GOPs";
if (!concise) {
ss << " " << setw(7) << left << "GOPS";
}
if (concise) {
ss << " " << setw(11) << left << "CalledTimes";
}
#ifdef LITE_WITH_OPENCL
ss << " " << setw(9) << left << "clAvg(ms)"
<< " " << setw(9) << left << "clMin(ms)"
<< " " << setw(9) << left << "clMax(ms)"
<< " " << setw(9) << left << "clAvg(%)";
if (!concise) {
ss << " " << setw(12) << left << "GlobalWorkSize"
<< " " << setw(12) << left << "LocalWorkSize";
}
#endif
ss << std::endl;
// Profile information. // Profile information.
if (concise) { if (concise) {
std::map<OpCharacter, TimeInfo, decltype(op_comp)> summary(op_comp); std::map<OpCharacter, TimeInfo, decltype(op_comp)> summary(op_comp);
...@@ -111,32 +180,75 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { ...@@ -111,32 +180,75 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) {
ch->second.avg += unit.Timer(type)->LapTimes().Avg(w); ch->second.avg += unit.Timer(type)->LapTimes().Avg(w);
ch->second.min += unit.Timer(type)->LapTimes().Min(w); ch->second.min += unit.Timer(type)->LapTimes().Min(w);
ch->second.max += unit.Timer(type)->LapTimes().Max(w); ch->second.max += unit.Timer(type)->LapTimes().Max(w);
#ifdef LITE_WITH_OPENCL
ch->second.cl_avg += unit.Timer(type)->CLLapTimes().Avg(w);
ch->second.cl_min += unit.Timer(type)->CLLapTimes().Min(w);
ch->second.cl_max += unit.Timer(type)->CLLapTimes().Max(w);
#endif
} else { } else {
TimeInfo info({unit.Timer(type)->LapTimes().Avg(w), TimeInfo info;
unit.Timer(type)->LapTimes().Min(w), info.avg = unit.Timer(type)->LapTimes().Avg(w);
unit.Timer(type)->LapTimes().Max(w)}); info.min = unit.Timer(type)->LapTimes().Min(w);
info.max = unit.Timer(type)->LapTimes().Max(w);
#ifdef LITE_WITH_OPENCL
info.cl_avg = unit.Timer(type)->CLLapTimes().Avg(w);
info.cl_min = unit.Timer(type)->CLLapTimes().Min(w);
info.cl_max = unit.Timer(type)->CLLapTimes().Max(w);
#endif
summary.insert({unit.Character(), info}); summary.insert({unit.Character(), info});
} }
} }
// compute total time // compute total time
float total = 0.0; float total = 0.0;
for (const auto& item : summary) { for (const auto& item : summary) {
total += item.second.avg; total += item.second.avg;
} }
#ifdef LITE_WITH_OPENCL
float cl_total = 0.0;
for (const auto& item : summary) {
cl_total += item.second.cl_avg;
}
#endif
for (const auto& item : summary) { for (const auto& item : summary) {
float percent = 0; float percent = 0;
if (total > 0) { if (total > 0) {
percent = 100 * (item.second.avg / total); percent = 100 * (item.second.avg / total);
} }
// clang-format off // clang-format off
ss << setw(25) << left << fixed << item.first.op_type \ ss << setw(20) << left << fixed << item.first.op_type
<< " " << setw(40) << left << fixed << item.first.kernel_name \ << " " << setw(30) << left << fixed << item.first.kernel_attr
<< " " << setw(12) << left << fixed << item.first.remark \ << " " << setw(24) << left << fixed << item.first.kernel_func_name
<< " " << setw(12) << left << fixed << item.second.avg \ << " " << setw(7) << left << fixed << setprecision(3)
<< " " << setw(12) << left << fixed << item.second.min \ << item.second.avg
<< " " << setw(12) << left << fixed << item.second.max \ << " " << setw(7) << left << fixed << setprecision(3)
<< " " << setw(12) << left << fixed << percent << "%" \ << item.second.min
<< " " << std::endl; << " " << setw(7) << left << fixed << setprecision(3)
<< item.second.max
<< " " << setprecision(2) << percent << "% "
<< " " << setw(7) << left << fixed << setprecision(3)
<< GetKernelFuncSummaryGOPs(item.first.op_type,
item.first.kernel_attr,
item.first.kernel_func_name)
<< " " << setw(11) << left << fixed
<< GetKernelFuncCalledTimes(item.first.op_type,
item.first.kernel_attr,
item.first.kernel_func_name);
#ifdef LITE_WITH_OPENCL
float cl_percent = 0;
if (cl_total > 0) {
cl_percent = 100 * (item.second.cl_avg / cl_total);
}
ss << " " << setw(9) << left << fixed << setprecision(3)
<< item.second.cl_avg
<< " " << setw(9) << left << fixed << setprecision(3)
<< item.second.cl_min
<< " " << setw(9) << left << fixed << setprecision(3)
<< item.second.cl_max
<< " " << left << fixed << setprecision(2) << cl_percent << "% ";
#endif
ss << std::endl;
// clang-format on // clang-format on
} }
} else { } else {
...@@ -145,6 +257,13 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { ...@@ -145,6 +257,13 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) {
const auto& times = unit.Timer(type)->LapTimes(); const auto& times = unit.Timer(type)->LapTimes();
total += times.Avg(w); total += times.Avg(w);
} }
#ifdef LITE_WITH_OPENCL
float cl_total = 0.0;
for (auto& unit : units_) {
const auto& cl_times = unit.Timer(type)->CLLapTimes();
cl_total += cl_times.Avg(w);
}
#endif
for (auto& unit : units_) { for (auto& unit : units_) {
const auto& times = unit.Timer(type)->LapTimes(); const auto& times = unit.Timer(type)->LapTimes();
float run = times.Avg(w); float run = times.Avg(w);
...@@ -152,17 +271,46 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { ...@@ -152,17 +271,46 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) {
if (total > 0) { if (total > 0) {
percent = 100 * (run / total); percent = 100 * (run / total);
} }
#ifdef LITE_WITH_OPENCL
const auto& cl_times = unit.Timer(type)->CLLapTimes();
float cl_run = cl_times.Avg(w);
float cl_percent = 0;
if (cl_total > 0) {
cl_percent = 100 * (cl_run / cl_total);
}
#endif
// clang-format off // clang-format off
ss << setw(25) << left << fixed << unit.Character().op_type \ ss << setw(20) << left << fixed << unit.Character().op_type
<< " " << setw(40) << left << fixed << unit.Character().kernel_name \ << " " << setw(30) << left << fixed << unit.Character().kernel_attr
<< " " << setw(12) << left << fixed << unit.Character().remark \ << " " << setw(24) << left << fixed
<< " " << setw(12) << left << fixed << times.Avg(w) \ << unit.Character().kernel_func_name
<< " " << setw(12) << left << fixed << times.Min(w) \ << " " << setw(26) << left << fixed << unit.Character().remark
<< " " << setw(12) << left << fixed << times.Max(w) \ << " " << setw(15) << left << fixed << unit.Character().input_shape
<< " " << setw(12) << left << fixed << times.Last(w) \ << " " << setw(15) << left << fixed << unit.Character().filter_shape
<< " " << setw(12) << left << fixed << percent << "%" \ << " " << setw(15) << left << fixed << unit.Character().output_shape
<< std::endl; << " " << setw(7) << left << fixed << setprecision(3) << times.Avg(w)
// clang-format on << " " << setw(7) << left << fixed << setprecision(3) << times.Min(w)
<< " " << setw(7) << left << fixed << setprecision(3) << times.Max(w)
<< " " << setw(7) << left << fixed << setprecision(3) << times.Last(w)
<< " " << left << setprecision(2) << percent << "% "
<< " " << setw(7) << left << fixed << setprecision(3)
<< 1e-9f * unit.Character().macs
<< " " << setw(7) << left << fixed << setprecision(2)
<< 1e-6f * unit.Character().macs / times.Avg(w);
// clang-format on
#ifdef LITE_WITH_OPENCL
ss << " " << setw(9) << left << fixed << setprecision(3)
<< cl_times.Avg(w) << " " << setw(9) << left << fixed
<< setprecision(3) << cl_times.Min(w) << " " << setw(9) << left
<< fixed << setprecision(3) << cl_times.Max(w) << " " << left
<< setprecision(2) << cl_percent << "% "
<< " " << setw(12) << left << fixed
<< unit.Character().global_work_size << " " << setw(12) << left
<< fixed << unit.Character().local_work_size;
#endif
ss << std::endl;
} }
} }
return ss.str(); return ss.str();
......
...@@ -18,6 +18,10 @@ ...@@ -18,6 +18,10 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "lite/core/profile/timer.h" #include "lite/core/profile/timer.h"
#include "lite/core/tensor.h"
#ifdef LITE_WITH_OPENCL
#include "lite/backends/opencl/cl_include.h"
#endif
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -35,25 +39,83 @@ struct TimeInfo { ...@@ -35,25 +39,83 @@ struct TimeInfo {
float avg; float avg;
float min; float min;
float max; float max;
#ifdef LITE_WITH_OPENCL
float cl_avg;
float cl_min;
float cl_max;
#endif
}; };
struct OpCharacter { struct OpCharacter {
TargetType target; TargetType target;
void* op_lite{nullptr};
std::string op_type{std::string("N/A")}; std::string op_type{std::string("N/A")};
std::string kernel_name{std::string("N/A")}; std::string kernel_name{std::string("N/A")};
std::string kernel_attr{std::string("N/A")};
std::string kernel_func_name{std::string("N/A")};
std::string remark{std::string("N/A")}; std::string remark{std::string("N/A")};
std::string input_shape{"N/A"};
std::string output_shape{"N/A"};
std::string filter_shape{"N/A"};
float macs{0};
float macs_ps{0};
float io_duration{0};
#ifdef LITE_WITH_OPENCL
cl::Event cl_event{};
std::string global_work_size{"N/A"};
std::string local_work_size{"N/A"};
std::string NDRangeToStr(const cl::NDRange& range) {
std::string range_str{""};
const size_t range_size = 3;
for (size_t i = 0; i < range_size /*range.size()*/; ++i) {
LOG(INFO) << "range[" << i << "]:" << std::to_string(range[i]);
range_str += std::to_string(range[i]);
if (i != range_size - 1) {
range_str += ",";
}
}
return range_str;
}
#else
void* cl_event{nullptr};
#endif
std::string DimToStr(const paddle::lite::DDimLite& dim) {
if (!dim.size()) return "NotImpl";
std::string dim_str{""};
for (size_t i = 0; i < dim.size(); ++i) {
dim_str += std::to_string(dim[i]);
if (i != dim.size() - 1) {
dim_str += "x";
}
}
return dim_str;
}
std::string str() {
std::string str{""};
str += kernel_name + "/" + kernel_func_name + "/" + remark + "/" +
input_shape + "/" + filter_shape + "/" + output_shape;
return str;
}
}; };
class StatisUnit final { class StatisUnit final {
public: public:
explicit StatisUnit(const OpCharacter& ch); explicit StatisUnit(const OpCharacter& ch);
lite::profile::Timer* Timer(Type type); lite::profile::Timer* Timer(Type type);
const OpCharacter& Character() const { return character; } OpCharacter& Character() { return character; }
OpCharacter character;
protected: protected:
std::unique_ptr<lite::profile::Timer> create_t; std::unique_ptr<lite::profile::Timer> create_t;
std::unique_ptr<lite::profile::Timer> dispatch_t; std::unique_ptr<lite::profile::Timer> dispatch_t;
OpCharacter character;
}; };
class Profiler final { class Profiler final {
...@@ -62,8 +124,15 @@ class Profiler final { ...@@ -62,8 +124,15 @@ class Profiler final {
explicit Profiler(const std::string& name) : name_(name) {} explicit Profiler(const std::string& name) : name_(name) {}
int NewTimer(const OpCharacter& ch); int NewTimer(const OpCharacter& ch);
void StartTiming(Type type, const int index, KernelContext* ctx); void StartTiming(Type type, const int index, KernelContext* ctx);
float StopTiming(Type type, const int index, KernelContext* ctx); void StopTiming(Type type, const int index, KernelContext* ctx);
std::string Summary(Type type, bool concise = true, size_t warm_up = 10); std::string Summary(Type type, bool concise = true, size_t warm_up = 10);
int GetKernelFuncCalledTimes(const std::string& op_type,
const std::string& kernel_attr,
const std::string& kernel_func_name);
float GetKernelFuncSummaryGOPs(const std::string& op_type,
const std::string& kernel_attr,
const std::string& kernel_func_name);
OpCharacter* GetOpCharacter(const size_t index);
private: private:
std::string name_{std::string("N/A")}; std::string name_{std::string("N/A")};
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <algorithm> #include <algorithm>
#include <chrono> // NOLINT #include <chrono> // NOLINT
#include <string>
#include <vector> #include <vector>
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
#include "lite/backends/cuda/cuda_utils.h" #include "lite/backends/cuda/cuda_utils.h"
...@@ -87,6 +88,22 @@ class Timer { ...@@ -87,6 +88,22 @@ class Timer {
this->laps_t_.Add(elapse_ms); this->laps_t_.Add(elapse_ms);
return elapse_ms; return elapse_ms;
} }
#ifdef LITE_WITH_OPENCL
float CLStop(const std::string& op_type, float io_duration, cl::Event event) {
float cl_kernel_elapse_ms = 0.0;
if (op_type != "io_copy") {
cl_kernel_elapse_ms =
CLRuntime::Global()->CLRuntime::GetCommandTime(event);
} else {
cl_kernel_elapse_ms = io_duration;
}
this->cl_laps_t_.Add(cl_kernel_elapse_ms);
return cl_kernel_elapse_ms;
}
const TimeList<float>& CLLapTimes() const { return cl_laps_t_; }
#endif
virtual void Start(KernelContext* ctx) { return Start(); } virtual void Start(KernelContext* ctx) { return Start(); }
virtual float Stop(KernelContext* ctx) { return Stop(); } virtual float Stop(KernelContext* ctx) { return Stop(); }
float AvgLapTimeMs() const { return laps_t_.Avg(); } float AvgLapTimeMs() const { return laps_t_.Avg(); }
...@@ -94,6 +111,9 @@ class Timer { ...@@ -94,6 +111,9 @@ class Timer {
protected: protected:
TimeList<float> laps_t_; TimeList<float> laps_t_;
#ifdef LITE_WITH_OPENCL
TimeList<float> cl_laps_t_;
#endif
private: private:
std::chrono::time_point<std::chrono::system_clock> t_start_, t_stop_; std::chrono::time_point<std::chrono::system_clock> t_start_, t_stop_;
......
...@@ -169,7 +169,7 @@ void RuntimeProgram::Run() { ...@@ -169,7 +169,7 @@ void RuntimeProgram::Run() {
#endif // LITE_WITH_PRECISION_PROFILE #endif // LITE_WITH_PRECISION_PROFILE
} }
#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PROFILE
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0); LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 1);
#endif #endif
#ifdef LITE_WITH_PRECISION_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE
LOG(INFO) << "\n" << precision_profiler_summary; LOG(INFO) << "\n" << precision_profiler_summary;
...@@ -299,6 +299,14 @@ void Instruction::Run() { ...@@ -299,6 +299,14 @@ void Instruction::Run() {
op_->InferShape(); op_->InferShape();
kernel_->Launch(); kernel_->Launch();
has_run_ = true; has_run_ = true;
#ifdef LITE_WITH_PROFILE
if (first_epoch_for_profiler_) {
kernel_->SetIsKernelTest(false);
SetProfileRuntimeOpInfo(profiler_->GetOpCharacter(profile_id_));
first_epoch_for_profiler_ = false;
}
#endif
} }
STL::ostream& operator<<(STL::ostream& os, const Instruction& other) { STL::ostream& operator<<(STL::ostream& os, const Instruction& other) {
......
...@@ -23,6 +23,9 @@ ...@@ -23,6 +23,9 @@
#include "lite/core/op_lite.h" #include "lite/core/op_lite.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/model_parser/cpp/program_desc.h" #include "lite/model_parser/cpp/program_desc.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -64,6 +67,8 @@ struct Program { ...@@ -64,6 +67,8 @@ struct Program {
lite::Scope* exec_scope() { return exec_scope_; } lite::Scope* exec_scope() { return exec_scope_; }
lite::Scope* scope() { return scope_.get(); } lite::Scope* scope() { return scope_.get(); }
cpp::ProgramDesc* program_desc() { return &desc_; }
const std::unordered_map<std::string, PrecisionType>& var_data_type() const { const std::unordered_map<std::string, PrecisionType>& var_data_type() const {
return var_data_type_; return var_data_type_;
} }
...@@ -125,13 +130,22 @@ struct Instruction { ...@@ -125,13 +130,22 @@ struct Instruction {
profiler_ = profiler; profiler_ = profiler;
if (op_->Type() != "feed" && op_->Type() != "fetch") { if (op_->Type() != "feed" && op_->Type() != "fetch") {
profile::OpCharacter ch; profile::OpCharacter ch;
ch.op_lite = static_cast<void*>(const_cast<paddle::lite::OpLite*>(op()));
ch.target = kernel()->target(); ch.target = kernel()->target();
ch.op_type = op_->Type(); ch.op_type = op_->Type();
ch.kernel_name = kernel()->name(); ch.kernel_name = kernel()->name();
ch.kernel_attr = kernel()->name().substr(ch.op_type.size() + 1,
kernel()->name().size());
// append `ch.kernel_func_name` in StopTiming
profile_id_ = profiler->NewTimer(ch); profile_id_ = profiler->NewTimer(ch);
kernel_->SetProfiler(profiler_, profile_id_); kernel_->SetProfiler(profiler_, profile_id_);
} }
} }
void SetProfileRuntimeOpInfo(paddle::lite::profile::OpCharacter* ch) {
auto* op_lite = static_cast<paddle::lite::OpLite*>(ch->op_lite);
op_lite->GetOpRuntimeInfo(ch);
}
#endif #endif
private: private:
...@@ -144,6 +158,7 @@ struct Instruction { ...@@ -144,6 +158,7 @@ struct Instruction {
#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PROFILE
profile::Profiler* profiler_; profile::Profiler* profiler_;
int profile_id_{-1}; int profile_id_{-1};
bool first_epoch_for_profiler_{true};
#endif // LITE_WITH_PROFILE #endif // LITE_WITH_PROFILE
}; };
......
...@@ -35,6 +35,9 @@ void ArgmaxCompute::Run() { ...@@ -35,6 +35,9 @@ void ArgmaxCompute::Run() {
} }
lite::arm::math::argmax_func(input, axis, output); lite::arm::math::argmax_func(input, axis, output);
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "argmax_func";
#endif
return; return;
} }
......
...@@ -16,6 +16,10 @@ ...@@ -16,6 +16,10 @@
#include <algorithm> #include <algorithm>
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/operators/argmax_op.h" #include "lite/operators/argmax_op.h"
#ifdef LITE_WITH_PROFILE
#include <string>
#include "lite/core/profile/profiler.h"
#endif
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -29,6 +33,14 @@ class ArgmaxCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> { ...@@ -29,6 +33,14 @@ class ArgmaxCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
void Run() override; void Run() override;
virtual ~ArgmaxCompute() = default; virtual ~ArgmaxCompute() = default;
#ifdef LITE_WITH_PROFILE
virtual void SetProfileRuntimeKernelInfo(
paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
std::string kernel_func_name_{"NotImplForArgmax"};
#endif
}; };
} // namespace arm } // namespace arm
......
...@@ -33,6 +33,17 @@ void CalibComputeFp32ToInt8<DLType>::Run() { ...@@ -33,6 +33,17 @@ void CalibComputeFp32ToInt8<DLType>::Run() {
din, dout, scale.data(), 1, 1, param.input->numel()); din, dout, scale.data(), 1, 1, param.input->numel());
} }
template <DataLayoutType DLType>
void CalibComputeInt64ToInt32<DLType>::Run() {
auto& param = this->template Param<operators::CalibParam>();
const auto* din = param.input->template data<int64_t>();
std::vector<float> scale = {param.scale};
auto* dout = param.output->template mutable_data<int32_t>();
for (auto i = 0; i < param.input->numel(); ++i) {
dout[i] = din[i];
}
}
template <DataLayoutType DLType> template <DataLayoutType DLType>
void CalibComputeInt8ToFp32<DLType>::Run() { void CalibComputeInt8ToFp32<DLType>::Run() {
auto& param = this->template Param<operators::CalibParam>(); auto& param = this->template Param<operators::CalibParam>();
...@@ -105,6 +116,23 @@ REGISTER_LITE_KERNEL( ...@@ -105,6 +116,23 @@ REGISTER_LITE_KERNEL(
DATALAYOUT(kNHWC))}) DATALAYOUT(kNHWC))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(
calib,
kARM,
kInt64,
kNCHW,
paddle::lite::kernels::arm::CalibComputeInt64ToInt32<DATALAYOUT(kNCHW)>,
int64_to_int32)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kARM),
PRECISION(kInt64),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kARM),
PRECISION(kInt32),
DATALAYOUT(kNCHW))})
.Finalize();
REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL(
calib_once, calib_once,
kARM, kARM,
...@@ -161,3 +189,20 @@ REGISTER_LITE_KERNEL( ...@@ -161,3 +189,20 @@ REGISTER_LITE_KERNEL(
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNHWC))}) DATALAYOUT(kNHWC))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(
calib_once,
kARM,
kInt64,
kNCHW,
paddle::lite::kernels::arm::CalibComputeInt64ToInt32<DATALAYOUT(kNCHW)>,
int64_to_int32)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kARM),
PRECISION(kInt64),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kARM),
PRECISION(kInt32),
DATALAYOUT(kNCHW))})
.Finalize();
...@@ -34,6 +34,19 @@ class CalibComputeFp32ToInt8 ...@@ -34,6 +34,19 @@ class CalibComputeFp32ToInt8
private: private:
}; };
template <DataLayoutType DLType>
class CalibComputeInt64ToInt32
: public KernelLite<TARGET(kARM), PRECISION(kInt64), DLType> {
public:
using param_t = operators::CalibParam;
void Run() override;
~CalibComputeInt64ToInt32() override{};
private:
};
template <DataLayoutType DLType> template <DataLayoutType DLType>
class CalibComputeInt8ToFp32 class CalibComputeInt8ToFp32
: public KernelLite<TARGET(kARM), PRECISION(kInt8), DLType> { : public KernelLite<TARGET(kARM), PRECISION(kInt8), DLType> {
......
...@@ -62,8 +62,19 @@ void CastCompute::Run() { ...@@ -62,8 +62,19 @@ void CastCompute::Run() {
int32_t* out_data = param.Out->mutable_data<int32_t>(); int32_t* out_data = param.Out->mutable_data<int32_t>();
std::transform( std::transform(
x_data_begin, x_data_end, out_data, TransOp<int64_t, int32_t>); x_data_begin, x_data_end, out_data, TransOp<int64_t, int32_t>);
} else if (param.in_dtype == 0 && param.out_dtype == 5) { // bool->fp32
const bool* x_data_begin = param.X->data<bool>();
const bool* x_data_end = x_data_begin + param.X->numel();
float* out_data = param.Out->mutable_data<float>();
std::transform(x_data_begin, x_data_end, out_data, TransOp<bool, float>);
} else if (param.in_dtype == 3 && param.out_dtype == 5) { // int64->fp32
const int64_t* x_data_begin = param.X->data<int64_t>();
const int64_t* x_data_end = x_data_begin + param.X->numel();
float* out_data = param.Out->mutable_data<float>();
std::transform(x_data_begin, x_data_end, out_data, TransOp<int64_t, float>);
} else { } else {
LOG(FATAL) << "other has not been implemented"; LOG(FATAL) << "other has not been implemented transform with dtype"
<< param.in_dtype << " X, dtype" << param.out_dtype << " Out";
} }
} }
......
...@@ -15,6 +15,9 @@ ...@@ -15,6 +15,9 @@
#pragma once #pragma once
#include "lite/backends/arm/math/funcs.h" #include "lite/backends/arm/math/funcs.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -36,6 +39,13 @@ class ConvCompute : public KernelLite<TARGET(kARM), Ptype> { ...@@ -36,6 +39,13 @@ class ConvCompute : public KernelLite<TARGET(kARM), Ptype> {
impl_->Run(); impl_->Run();
} }
#ifdef LITE_WITH_PROFILE
virtual void SetProfileRuntimeKernelInfo(
paddle::lite::profile::OpCharacter* ch) {
impl_->SetProfileRuntimeKernelInfo(ch);
}
#endif
~ConvCompute() { ~ConvCompute() {
if (impl_ != nullptr) { if (impl_ != nullptr) {
delete impl_; delete impl_;
......
...@@ -50,6 +50,9 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() { ...@@ -50,6 +50,9 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
flag_trans_weights_ = true; flag_trans_weights_ = true;
} }
impl_ = lite::arm::math::conv_depthwise_3x3_fp32; impl_ = lite::arm::math::conv_depthwise_3x3_fp32;
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_depthwise_3x3_fp32";
#endif
} else if (kw == 5) { } else if (kw == 5) {
// VLOG(5) << "invoke 5x5 dw conv fp32"; // VLOG(5) << "invoke 5x5 dw conv fp32";
auto strides = param.strides; auto strides = param.strides;
...@@ -67,6 +70,9 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() { ...@@ -67,6 +70,9 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
w_data_in, w_data, oc, 1, cblock, kh * kw); w_data_in, w_data, oc, 1, cblock, kh * kw);
flag_trans_weights_ = true; flag_trans_weights_ = true;
impl_ = lite::arm::math::conv_depthwise_5x5_fp32; impl_ = lite::arm::math::conv_depthwise_5x5_fp32;
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_depthwise_5x5_fp32";
#endif
} else { } else {
LOG(FATAL) LOG(FATAL)
<< "5x5 depthwise conv only support stride == 1 or stride == 2"; << "5x5 depthwise conv only support stride == 1 or stride == 2";
...@@ -103,6 +109,9 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() { ...@@ -103,6 +109,9 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() {
// trans weights // trans weights
// VLOG(5) << "invoke 3x3 dw conv int8 kernel fp32 out"; // VLOG(5) << "invoke 3x3 dw conv int8 kernel fp32 out";
impl_ = lite::arm::math::conv_depthwise_3x3_int8_fp32; impl_ = lite::arm::math::conv_depthwise_3x3_int8_fp32;
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_depthwise_3x3_int8_fp32";
#endif
int cround = ROUNDUP(w_dims[0], 8); int cround = ROUNDUP(w_dims[0], 8);
weights_.Resize({cround / 8, 1, kh * kw, 8}); weights_.Resize({cround / 8, 1, kh * kw, 8});
auto wptr = param.filter->data<int8_t>(); auto wptr = param.filter->data<int8_t>();
...@@ -113,6 +122,9 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() { ...@@ -113,6 +122,9 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() {
// trans weights // trans weights
// VLOG(5) << "invoke 5x5 dw conv int8 kernel fp32 out"; // VLOG(5) << "invoke 5x5 dw conv int8 kernel fp32 out";
impl_ = lite::arm::math::conv_depthwise_5x5_int8_fp32; impl_ = lite::arm::math::conv_depthwise_5x5_int8_fp32;
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_depthwise_5x5_int8_fp32";
#endif
int cround = ROUNDUP(w_dims[0], 8); int cround = ROUNDUP(w_dims[0], 8);
weights_.Resize({cround / 8, 1, kh * kw, 8}); weights_.Resize({cround / 8, 1, kh * kw, 8});
auto wptr = param.filter->data<int8_t>(); auto wptr = param.filter->data<int8_t>();
...@@ -162,6 +174,9 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() { ...@@ -162,6 +174,9 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
// trans weights // trans weights
// VLOG(5) << "invoke 3x3 dw conv int8 kernel int8 out"; // VLOG(5) << "invoke 3x3 dw conv int8 kernel int8 out";
impl_ = lite::arm::math::conv_depthwise_3x3_int8_int8; impl_ = lite::arm::math::conv_depthwise_3x3_int8_int8;
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_depthwise_3x3_int8_int8";
#endif
int cround = ROUNDUP(w_dims[0], 8); int cround = ROUNDUP(w_dims[0], 8);
weights_.Resize({cround / 8, 1, kh * kw, 8}); weights_.Resize({cround / 8, 1, kh * kw, 8});
auto wptr = param.filter->data<int8_t>(); auto wptr = param.filter->data<int8_t>();
...@@ -172,6 +187,9 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() { ...@@ -172,6 +187,9 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
// trans weights // trans weights
// VLOG(5) << "invoke 5x5 dw conv int8 kernel int8 out"; // VLOG(5) << "invoke 5x5 dw conv int8 kernel int8 out";
impl_ = lite::arm::math::conv_depthwise_5x5_int8_int8; impl_ = lite::arm::math::conv_depthwise_5x5_int8_int8;
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_depthwise_5x5_int8_int8";
#endif
int cround = ROUNDUP(w_dims[0], 8); int cround = ROUNDUP(w_dims[0], 8);
weights_.Resize({cround / 8, 1, kh * kw, 8}); weights_.Resize({cround / 8, 1, kh * kw, 8});
auto wptr = param.filter->data<int8_t>(); auto wptr = param.filter->data<int8_t>();
...@@ -183,6 +201,14 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() { ...@@ -183,6 +201,14 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
} }
} }
#ifdef LITE_WITH_PROFILE
template <>
void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::
SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
#endif
template <> template <>
void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() { void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
...@@ -225,6 +251,14 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() { ...@@ -225,6 +251,14 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
w_scale_.data()); w_scale_.data());
} }
#ifdef LITE_WITH_PROFILE
template <>
void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::
SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
#endif
template <> template <>
void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() { void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
...@@ -267,6 +301,14 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() { ...@@ -267,6 +301,14 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
w_scale_.data()); w_scale_.data());
} }
#ifdef LITE_WITH_PROFILE
template <>
void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::
SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
#endif
template <> template <>
void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() { void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <cmath> #include <cmath>
#include <string>
#include <vector> #include <vector>
#include "lite/backends/arm/math/conv_impl.h" #include "lite/backends/arm/math/conv_impl.h"
#include "lite/core/context.h" #include "lite/core/context.h"
...@@ -48,6 +49,15 @@ class DepthwiseConv : public KernelLite<TARGET(kARM), Ptype> { ...@@ -48,6 +49,15 @@ class DepthwiseConv : public KernelLite<TARGET(kARM), Ptype> {
virtual void PrepareForRun(); virtual void PrepareForRun();
virtual void Run(); virtual void Run();
#ifdef LITE_WITH_PROFILE
virtual void SetProfileRuntimeKernelInfo(
paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
std::string kernel_func_name_{"NotImplForConvDw"};
#endif
private: private:
using param_t = operators::ConvParam; using param_t = operators::ConvParam;
Tensor weights_; Tensor weights_;
......
...@@ -19,6 +19,14 @@ namespace lite { ...@@ -19,6 +19,14 @@ namespace lite {
namespace kernels { namespace kernels {
namespace arm { namespace arm {
#ifdef LITE_WITH_PROFILE
template <>
void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::
SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
#endif
template <> template <>
void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() { void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
...@@ -62,6 +70,9 @@ void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() { ...@@ -62,6 +70,9 @@ void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
b_data, b_data,
param, param,
&ctx); &ctx);
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_3x3s1_direct_fp32";
#endif
} else { } else {
lite::arm::math::conv_3x3s2_direct_fp32(i_data, lite::arm::math::conv_3x3s2_direct_fp32(i_data,
o_data, o_data,
...@@ -76,9 +87,20 @@ void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() { ...@@ -76,9 +87,20 @@ void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
b_data, b_data,
param, param,
&ctx); &ctx);
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_3x3s2_direct_fp32";
#endif
} }
} }
#ifdef LITE_WITH_PROFILE
template <>
void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::
SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
#endif
template <> template <>
void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() { void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
...@@ -117,6 +139,9 @@ void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() { ...@@ -117,6 +139,9 @@ void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
param, param,
&ctx, &ctx,
w_scale_.data()); w_scale_.data());
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_3x3s1_direct_int8";
#endif
} else { } else {
lite::arm::math::conv_3x3s2_direct_int8(i_data, lite::arm::math::conv_3x3s2_direct_int8(i_data,
o_data, o_data,
...@@ -132,9 +157,20 @@ void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() { ...@@ -132,9 +157,20 @@ void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
param, param,
&ctx, &ctx,
w_scale_.data()); w_scale_.data());
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_3x3s2_direct_int8";
#endif
} }
} }
#ifdef LITE_WITH_PROFILE
template <>
void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::
SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
#endif
template <> template <>
void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() { void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
...@@ -173,6 +209,9 @@ void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() { ...@@ -173,6 +209,9 @@ void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
param, param,
&ctx, &ctx,
w_scale_.data()); w_scale_.data());
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_3x3s1_direct_int8";
#endif
} else { } else {
lite::arm::math::conv_3x3s2_direct_int8(i_data, lite::arm::math::conv_3x3s2_direct_int8(i_data,
o_data, o_data,
...@@ -188,6 +227,9 @@ void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() { ...@@ -188,6 +227,9 @@ void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
param, param,
&ctx, &ctx,
w_scale_.data()); w_scale_.data());
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_3x3s2_direct_int8";
#endif
} }
} }
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <cmath> #include <cmath>
#include <string>
#include <vector> #include <vector>
#include "lite/backends/arm/math/funcs.h" #include "lite/backends/arm/math/funcs.h"
#include "lite/core/context.h" #include "lite/core/context.h"
...@@ -180,6 +181,15 @@ class DirectConv : public KernelLite<TARGET(kARM), Ptype> { ...@@ -180,6 +181,15 @@ class DirectConv : public KernelLite<TARGET(kARM), Ptype> {
virtual void Run(); virtual void Run();
#ifdef LITE_WITH_PROFILE
virtual void SetProfileRuntimeKernelInfo(
paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
std::string kernel_func_name_{"NotImplForConvDirect"};
#endif
/// todo, support inplace weights transform /// todo, support inplace weights transform
protected: protected:
Tensor weights_; Tensor weights_;
......
...@@ -81,6 +81,14 @@ void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() { ...@@ -81,6 +81,14 @@ void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
} }
} }
#ifdef LITE_WITH_PROFILE
template <>
void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::
SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
#endif
template <> template <>
void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() { void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
...@@ -111,12 +119,26 @@ void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() { ...@@ -111,12 +119,26 @@ void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
if (flag_1x1gemm_) { if (flag_1x1gemm_) {
lite::arm::math::conv1x1s1_gemm( lite::arm::math::conv1x1s1_gemm(
din, dout, bs, oc, oh, ow, ic, ih, iw, weights, bias, param, &ctx); din, dout, bs, oc, oh, ow, ic, ih, iw, weights, bias, param, &ctx);
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv1x1s1_gemm";
#endif
} else { } else {
lite::arm::math::conv_im2col_gemm( lite::arm::math::conv_im2col_gemm(
din, dout, bs, oc, oh, ow, ic, ih, iw, weights, bias, param, &ctx); din, dout, bs, oc, oh, ow, ic, ih, iw, weights, bias, param, &ctx);
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_im2col_gemm";
#endif
} }
} }
#ifdef LITE_WITH_PROFILE
template <>
void GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>::
SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
#endif
template <> template <>
void GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() { void GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
...@@ -159,6 +181,9 @@ void GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() { ...@@ -159,6 +181,9 @@ void GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
param, param,
&ctx, &ctx,
w_scale_.data()); w_scale_.data());
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv1x1s1_gemm_int8";
#endif
} else { } else {
lite::arm::math::conv_im2col_gemm_int8(din, lite::arm::math::conv_im2col_gemm_int8(din,
dout, dout,
...@@ -174,9 +199,20 @@ void GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() { ...@@ -174,9 +199,20 @@ void GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
param, param,
&ctx, &ctx,
w_scale_.data()); w_scale_.data());
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_im2col_gemm_int8";
#endif
} }
} }
#ifdef LITE_WITH_PROFILE
template <>
void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::
SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
#endif
template <> template <>
void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() { void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
...@@ -219,6 +255,9 @@ void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() { ...@@ -219,6 +255,9 @@ void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
param, param,
&ctx, &ctx,
w_scale_.data()); w_scale_.data());
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv1x1s1_gemm_int8";
#endif
} else { } else {
lite::arm::math::conv_im2col_gemm_int8(din, lite::arm::math::conv_im2col_gemm_int8(din,
dout, dout,
...@@ -234,6 +273,9 @@ void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() { ...@@ -234,6 +273,9 @@ void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
param, param,
&ctx, &ctx,
w_scale_.data()); w_scale_.data());
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_im2col_gemm_int8";
#endif
} }
} }
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <cmath> #include <cmath>
#include <string>
#include <vector> #include <vector>
#include "lite/backends/arm/math/conv_impl.h" #include "lite/backends/arm/math/conv_impl.h"
#include "lite/backends/arm/math/funcs.h" #include "lite/backends/arm/math/funcs.h"
...@@ -94,6 +95,15 @@ class GemmLikeConv : public KernelLite<TARGET(kARM), Ptype> { ...@@ -94,6 +95,15 @@ class GemmLikeConv : public KernelLite<TARGET(kARM), Ptype> {
virtual void PrepareForRun(); virtual void PrepareForRun();
virtual void Run(); virtual void Run();
#ifdef LITE_WITH_PROFILE
virtual void SetProfileRuntimeKernelInfo(
paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
std::string kernel_func_name_{"NotImplForConvGemm"};
#endif
/// todo, support inplace weights transform /// todo, support inplace weights transform
protected: protected:
using param_t = operators::ConvParam; using param_t = operators::ConvParam;
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <string>
#include "lite/backends/arm/math/funcs.h" #include "lite/backends/arm/math/funcs.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/operators/conv_transpose_op.h" #include "lite/operators/conv_transpose_op.h"
...@@ -33,6 +34,14 @@ class Conv2DTransposeCompute ...@@ -33,6 +34,14 @@ class Conv2DTransposeCompute
~Conv2DTransposeCompute() = default; ~Conv2DTransposeCompute() = default;
#ifdef LITE_WITH_PROFILE
virtual void SetProfileRuntimeKernelInfo(
paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
std::string kernel_func_name_{"NotImplForConvTranspose"};
#endif
protected: protected:
int workspace_size_{0}; int workspace_size_{0};
}; };
......
...@@ -93,6 +93,14 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() { ...@@ -93,6 +93,14 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
ReInitWhenNeeded(); ReInitWhenNeeded();
} }
#ifdef LITE_WITH_PROFILE
template <>
void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::
SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
#endif
template <> template <>
void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() { void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
...@@ -129,6 +137,9 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() { ...@@ -129,6 +137,9 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
b_data, b_data,
param, param,
&ctx); &ctx);
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_compute_6x6_3x3";
#endif
} else { } else {
int tile_block = 8; int tile_block = 8;
int block_count = int block_count =
...@@ -147,6 +158,9 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() { ...@@ -147,6 +158,9 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
b_data, b_data,
param, param,
&ctx); &ctx);
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_compute_2x2_3x3";
#endif
} else { } else {
lite::arm::math::conv_compute_2x2_3x3_small(i_data, lite::arm::math::conv_compute_2x2_3x3_small(i_data,
o_data, o_data,
...@@ -161,6 +175,9 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() { ...@@ -161,6 +175,9 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
b_data, b_data,
param, param,
&ctx); &ctx);
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_compute_2x2_3x3_small";
#endif
} }
} }
} }
......
...@@ -35,6 +35,13 @@ class WinogradConv : public KernelLite<TARGET(kARM), Ptype> { ...@@ -35,6 +35,13 @@ class WinogradConv : public KernelLite<TARGET(kARM), Ptype> {
virtual void PrepareForRun(); virtual void PrepareForRun();
virtual void ReInitWhenNeeded(); virtual void ReInitWhenNeeded();
virtual void Run(); virtual void Run();
#ifdef LITE_WITH_PROFILE
virtual void SetProfileRuntimeKernelInfo(
paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
std::string kernel_func_name_{"NotImplForConvWino"};
#endif
protected: protected:
using param_t = operators::ConvParam; using param_t = operators::ConvParam;
......
...@@ -300,11 +300,12 @@ void ElementwiseMaxActivationCompute::Run() { ...@@ -300,11 +300,12 @@ void ElementwiseMaxActivationCompute::Run() {
} }
} }
void ElementwiseDivCompute::Run() { template <typename T, PrecisionType PType>
auto& param = Param<operators::ElementwiseParam>(); void ElementwiseDivCompute<T, PType>::Run() {
const float* x_data = param.X->data<float>(); auto& param = this->template Param<operators::ElementwiseParam>();
const float* y_data = param.Y->data<float>(); auto* x_data = param.X->template data<T>();
float* out_data = param.Out->mutable_data<float>(); auto* y_data = param.Y->template data<T>();
auto* out_data = param.Out->template mutable_data<T>();
int axis = param.axis; int axis = param.axis;
auto x_dims = param.X->dims(); auto x_dims = param.X->dims();
auto y_dims = param.Y->dims(); auto y_dims = param.Y->dims();
...@@ -313,10 +314,10 @@ void ElementwiseDivCompute::Run() { ...@@ -313,10 +314,10 @@ void ElementwiseDivCompute::Run() {
LOG(FATAL) << "elewise div don't support x_dims size < y_dims size"; LOG(FATAL) << "elewise div don't support x_dims size < y_dims size";
} }
if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
lite::arm::math::elementwise_div_broadcast( lite::arm::math::elementwise_div_broadcast<T>(
x_data, y_data, out_data, pre, n, post); x_data, y_data, out_data, pre, n, post);
} else { } else {
lite::arm::math::elementwise_div( lite::arm::math::elementwise_div<T>(
x_data, y_data, out_data, x_dims.production()); x_data, y_data, out_data, x_dims.production());
} }
} }
...@@ -465,17 +466,27 @@ REGISTER_LITE_KERNEL( ...@@ -465,17 +466,27 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(elementwise_div, using elementwise_div_fp32 =
kARM, paddle::lite::kernels::arm::ElementwiseDivCompute<float, PRECISION(kFloat)>;
kFloat,
kNCHW, REGISTER_LITE_KERNEL(
paddle::lite::kernels::arm::ElementwiseDivCompute, elementwise_div, kARM, kFloat, kNCHW, elementwise_div_fp32, def)
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
using elementwise_div_int64 =
paddle::lite::kernels::arm::ElementwiseDivCompute<int64_t,
PRECISION(kInt64)>;
REGISTER_LITE_KERNEL(
elementwise_div, kARM, kInt64, kNCHW, elementwise_div_int64, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.Finalize();
REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL(
fusion_elementwise_div_activation, fusion_elementwise_div_activation,
kARM, kARM,
......
...@@ -86,8 +86,8 @@ class ElementwiseMaxActivationCompute ...@@ -86,8 +86,8 @@ class ElementwiseMaxActivationCompute
virtual ~ElementwiseMaxActivationCompute() = default; virtual ~ElementwiseMaxActivationCompute() = default;
}; };
class ElementwiseDivCompute template <typename T, PrecisionType PType>
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> { class ElementwiseDivCompute : public KernelLite<TARGET(kARM), PType> {
public: public:
void Run() override; void Run() override;
......
...@@ -73,7 +73,6 @@ void GatherCompute::Run() { ...@@ -73,7 +73,6 @@ void GatherCompute::Run() {
REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL(
gather, kARM, kAny, kNCHW, paddle::lite::kernels::arm::GatherCompute, def) gather, kARM, kAny, kNCHW, paddle::lite::kernels::arm::GatherCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindInput("Index", .BindInput("Index", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.Finalize(); .Finalize();
...@@ -34,6 +34,9 @@ add_kernel(instance_norm_opencl OPENCL basic SRCS instance_norm_image_compute.cc ...@@ -34,6 +34,9 @@ add_kernel(instance_norm_opencl OPENCL basic SRCS instance_norm_image_compute.cc
add_kernel(dropout_opencl OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(dropout_opencl OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pad2d_opencl OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(pad2d_opencl OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(box_coder_opencl OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(box_coder_opencl OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pixel_shuffle_opencl OPENCL basic SRCS pixel_shuffle_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(expand_opencl OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_kernel_deps})
# extra # extra
# wait to add ... # wait to add ...
...@@ -73,6 +76,12 @@ lite_cc_test(test_concat_image_opencl SRCS concat_image_compute_test.cc ...@@ -73,6 +76,12 @@ lite_cc_test(test_concat_image_opencl SRCS concat_image_compute_test.cc
lite_cc_test(test_layout_image_opencl SRCS layout_image_compute_test.cc lite_cc_test(test_layout_image_opencl SRCS layout_image_compute_test.cc
DEPS layout_opencl op_registry program context) DEPS layout_opencl op_registry program context)
lite_cc_test(test_pixel_shuffle_image_opencl SRCS pixel_shuffle_image_compute_test.cc
DEPS pixel_shuffle_opencl op_registry program context)
lite_cc_test(test_expand_image_opencl SRCS expand_image_compute_test.cc
DEPS expand_opencl op_registry program context)
lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_compute_test.cc lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_compute_test.cc
DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context) DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context)
lite_cc_test(test_elementwise_sub_image_opencl SRCS elementwise_sub_image_compute_test.cc lite_cc_test(test_elementwise_sub_image_opencl SRCS elementwise_sub_image_compute_test.cc
......
...@@ -18,6 +18,10 @@ ...@@ -18,6 +18,10 @@
#include "lite/kernels/opencl/image_helper.h" #include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h" #include "lite/utils/replace_stl/stream.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -63,16 +67,24 @@ class ReluCompute ...@@ -63,16 +67,24 @@ class ReluCompute
auto global_work_size = cl::NDRange{count}; auto global_work_size = cl::NDRange{count};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
} }
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
private: private:
std::string kernel_func_name_{"relu"}; std::string kernel_func_name_{"relu"};
std::string build_options_{"-DCL_DTYPE_float -DRELU"}; std::string build_options_{"-DCL_DTYPE_float -DRELU"};
...@@ -120,16 +132,24 @@ class SigmoidCompute ...@@ -120,16 +132,24 @@ class SigmoidCompute
auto global_work_size = cl::NDRange{count}; auto global_work_size = cl::NDRange{count};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
} }
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
private: private:
std::string kernel_func_name_{"sigmoid"}; std::string kernel_func_name_{"sigmoid"};
std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"}; std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"};
......
...@@ -19,6 +19,10 @@ ...@@ -19,6 +19,10 @@
#include "lite/kernels/opencl/image_helper.h" #include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h" #include "lite/utils/replace_stl/stream.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -148,16 +152,24 @@ class ActivationComputeImageDefault ...@@ -148,16 +152,24 @@ class ActivationComputeImageDefault
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size_, global_work_size_,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
} }
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
private: private:
param_t* act_param_{nullptr}; param_t* act_param_{nullptr};
DDim x_img_shape_ = DDim(std::vector<DDim::value_type>( DDim x_img_shape_ = DDim(std::vector<DDim::value_type>(
......
...@@ -23,6 +23,10 @@ ...@@ -23,6 +23,10 @@
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/logging.h" #include "lite/utils/logging.h"
#include "lite/utils/replace_stl/stream.h" #include "lite/utils/replace_stl/stream.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -142,13 +146,13 @@ class BilinearInterpImageCompute ...@@ -142,13 +146,13 @@ class BilinearInterpImageCompute
static_cast<cl::size_type>(default_work_size[1]), static_cast<cl::size_type>(default_work_size[1]),
static_cast<cl::size_type>(default_work_size[2])}; static_cast<cl::size_type>(default_work_size[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
#ifdef LITE_WITH_LOG #ifdef LITE_WITH_LOG
VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
......
...@@ -23,6 +23,10 @@ ...@@ -23,6 +23,10 @@
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/logging.h" #include "lite/utils/logging.h"
#include "lite/utils/replace_stl/stream.h" #include "lite/utils/replace_stl/stream.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -121,13 +125,13 @@ class BoxCoderComputeImage : public KernelLite<TARGET(kOpenCL), ...@@ -121,13 +125,13 @@ class BoxCoderComputeImage : public KernelLite<TARGET(kOpenCL),
cl::NDRange{static_cast<cl::size_type>(default_work_size[0]), cl::NDRange{static_cast<cl::size_type>(default_work_size[0]),
static_cast<cl::size_type>(default_work_size[2])}; static_cast<cl::size_type>(default_work_size[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
#ifdef LITE_WITH_LOG #ifdef LITE_WITH_LOG
...@@ -138,6 +142,14 @@ class BoxCoderComputeImage : public KernelLite<TARGET(kOpenCL), ...@@ -138,6 +142,14 @@ class BoxCoderComputeImage : public KernelLite<TARGET(kOpenCL),
} }
std::string doc() { return "Boxcoder using cl::Image, kFP16"; } std::string doc() { return "Boxcoder using cl::Image, kFP16"; }
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
param_t* boxcoder_param_{nullptr}; param_t* boxcoder_param_{nullptr};
std::string kernel_func_name_{}; std::string kernel_func_name_{};
std::string build_options_{" -DCL_DTYPE_half"}; std::string build_options_{" -DCL_DTYPE_half"};
......
...@@ -18,6 +18,10 @@ ...@@ -18,6 +18,10 @@
#include "lite/kernels/opencl/image_helper.h" #include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h" #include "lite/utils/replace_stl/stream.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -124,13 +128,13 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL), ...@@ -124,13 +128,13 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
status = kernel.setArg(++arg_idx, total1); status = kernel.setArg(++arg_idx, total1);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
} else { } else {
auto start = 0; auto start = 0;
...@@ -157,13 +161,13 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL), ...@@ -157,13 +161,13 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
status = kernel.setArg(++arg_idx, total0); status = kernel.setArg(++arg_idx, total0);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
start += size; start += size;
} }
...@@ -172,6 +176,14 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL), ...@@ -172,6 +176,14 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
std::string doc() { return "Concat using cl::Buffer, kFloat"; } std::string doc() { return "Concat using cl::Buffer, kFloat"; }
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
int axis_size_ = 1; int axis_size_ = 1;
int post_size_ = 1; int post_size_ = 1;
int pre_size_ = 1; int pre_size_ = 1;
......
...@@ -19,6 +19,10 @@ ...@@ -19,6 +19,10 @@
#include "lite/kernels/opencl/image_helper.h" #include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h" #include "lite/utils/replace_stl/stream.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -246,6 +250,14 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL), ...@@ -246,6 +250,14 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
std::string doc() { return "Concat using cl::Image, kFP16"; } std::string doc() { return "Concat using cl::Image, kFP16"; }
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
int axis_size_ = 1; int axis_size_ = 1;
int axis_ = 1; int axis_ = 1;
int flag_ = 1; int flag_ = 1;
......
...@@ -23,6 +23,10 @@ ...@@ -23,6 +23,10 @@
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h" #include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -39,6 +43,14 @@ class ConvCompute ...@@ -39,6 +43,14 @@ class ConvCompute
void Run() override; void Run() override;
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_names_[0];
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
private: private:
void GemmlikeConv2d(); void GemmlikeConv2d();
void Conv2d1x1(); void Conv2d1x1();
......
...@@ -24,11 +24,16 @@ ...@@ -24,11 +24,16 @@
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h" #include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
class ConvImageCompute : public KernelLite<TARGET(kOpenCL), class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16), PRECISION(kFP16),
DATALAYOUT(kImageDefault)> { DATALAYOUT(kImageDefault)> {
...@@ -38,20 +43,37 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -38,20 +43,37 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
void PrepareForRun() override; void PrepareForRun() override;
void ReInitWhenNeeded() override;
void Run() override; void Run() override;
double Turn(int times = 5);
double Tune(int times = 5);
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_names_[0];
ch->global_work_size = ch->NDRangeToStr(global_work_size_);
ch->local_work_size = ch->NDRangeToStr(local_work_size_);
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
private: private:
void Conv2d1x1opt(bool is_turn = false); void PrintConvInfo();
void Conv2d3x3(bool is_turn = false); void GetGlobalWorkSize();
void Conv2d3x3opt(bool is_turn = false); void Conv2d1x1opt(bool enable_tune = false);
void Conv2d5x5(bool is_turn = false); void Conv2d3x3(bool enable_tune = false);
void Conv2d5x5opt(bool is_turn = false); void Conv2d3x3opt(bool enable_tune = false);
void Conv2d7x7(bool is_turn = false); void Conv2d5x5(bool enable_tune = false);
void Conv2d7x7opt(bool is_turn = false); void Conv2d5x5opt(bool enable_tune = false);
void DepthwiseConv2d3x3s1(bool is_turn = false); void Conv2d7x7(bool enable_tune = false);
void DepthwiseConv2d3x3(bool is_turn = false); void Conv2d7x7opt(bool enable_tune = false);
void DepthwiseConv2d(bool is_turn = false); void DepthwiseConv2d3x3s1(bool enable_tune = false);
void DepthwiseConv2d3x3(bool enable_tune = false);
void DepthwiseConv2d(bool enable_tune = false);
param_t* conv_param_{nullptr};
kernel_t impl_; kernel_t impl_;
std::vector<std::string> kernel_func_names_{}; std::vector<std::string> kernel_func_names_{};
...@@ -65,19 +87,72 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -65,19 +87,72 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
std::unique_ptr<Tensor> tensor_hold_bias_image_{nullptr}; std::unique_ptr<Tensor> tensor_hold_bias_image_{nullptr};
cl::NDRange global_work_size_ = cl::NDRange{ cl::NDRange global_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)}; static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
// opencl kernel args
int c_blk_ = 1; int c_blk_ = 1;
int w_blk_ = 1; int w_blk_ = 1;
int nh_blk_ = 1; int nh_blk_ = 1;
const cl::Image2D* input_image_p_{nullptr};
const cl::Image2D* filter_image_p_{nullptr};
const cl::Image2D* bias_image_p_{nullptr};
const cl::Image2D* output_image_p_{nullptr};
int stride_h_{-1};
int stride_w_{-1};
int dilation_h_{-1};
int dilation_w_{-1};
int pad_up_{-1};
int pad_down_{-1};
int pad_left_{-1};
int pad_right_{-1};
int offset_{-1};
int groups_{-1};
bool relu_fused_{false};
bool has_bias_{false};
int input_tensor_n_{-1};
int input_tensor_c_{-1};
int input_tensor_h_{-1};
int input_tensor_w_{-1};
int input_image_h_{-1};
int input_image_w_{-1};
int input_c_block_{-1};
int output_tensor_n_{-1};
int output_tensor_c_{-1};
int output_tensor_h_{-1};
int output_tensor_w_{-1};
int output_image_h_{-1};
int output_image_w_{-1};
int filter_tensor_n_{-1};
int filter_tensor_c_{-1};
int filter_tensor_h_{-1};
int filter_tensor_w_{-1};
int filter_image_h_{-1};
int filter_image_w_{-1};
int bias_image_h_{-1};
int bias_image_w_{-1};
int default_c_blk_ = 1; int default_c_blk_ = 1;
int default_w_blk_ = 1; int default_w_blk_ = 1;
int default_nh_blk_ = 1; int default_nh_blk_ = 1;
// =================
DDim last_input_dims_{};
bool is_first_epoch_for_run_{true};
cl::Kernel kernel_; cl::Kernel kernel_;
cl_int status_;
cl::NDRange local_work_size_ = cl::NDRange{ cl::NDRange local_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)}; static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
bool use_lws_{true}; bool use_lws_{true};
bool use_turn_{false}; bool use_tune_{false};
}; };
} // namespace opencl } // namespace opencl
......
...@@ -20,6 +20,10 @@ ...@@ -20,6 +20,10 @@
#include "lite/kernels/opencl/image_helper.h" #include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h" #include "lite/utils/replace_stl/stream.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -119,6 +123,14 @@ class DepthwiseConv2dCompute ...@@ -119,6 +123,14 @@ class DepthwiseConv2dCompute
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
} }
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
private: private:
std::string kernel_func_name_{"depthwise_conv2d"}; std::string kernel_func_name_{"depthwise_conv2d"};
std::string build_options_{"-DCL_DTYPE_float"}; std::string build_options_{"-DCL_DTYPE_float"};
......
...@@ -21,6 +21,10 @@ ...@@ -21,6 +21,10 @@
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h" #include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h" #include "lite/utils/string.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -89,16 +93,24 @@ class DropoutComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -89,16 +93,24 @@ class DropoutComputeImage2D : public KernelLite<TARGET(kOpenCL),
static_cast<cl::size_type>(default_work_size.data()[1]), static_cast<cl::size_type>(default_work_size.data()[1]),
static_cast<cl::size_type>(default_work_size.data()[2])}; static_cast<cl::size_type>(default_work_size.data()[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
} }
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
private: private:
std::string kernel_func_name_{"dropout"}; std::string kernel_func_name_{"dropout"};
std::string build_options_{"-DCL_DTYPE_half"}; std::string build_options_{"-DCL_DTYPE_half"};
......
...@@ -19,6 +19,10 @@ ...@@ -19,6 +19,10 @@
#include "lite/kernels/opencl/image_helper.h" #include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -38,6 +42,14 @@ class ElementwiseAddCompute ...@@ -38,6 +42,14 @@ class ElementwiseAddCompute
return "ElementwiseAdd using cl::Buffer, kFloat"; return "ElementwiseAdd using cl::Buffer, kFloat";
} }
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
protected: protected:
void UpdateParams(); void UpdateParams();
......
...@@ -18,6 +18,8 @@ ...@@ -18,6 +18,8 @@
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/utils/replace_stl/stream.h" #include "lite/utils/replace_stl/stream.h"
#undef LITE_WITH_LOG
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace kernels {
...@@ -154,13 +156,13 @@ void ElementwiseAddImageCompute::Run() { ...@@ -154,13 +156,13 @@ void ElementwiseAddImageCompute::Run() {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size_, global_work_size_,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
} }
...@@ -196,3 +198,5 @@ REGISTER_LITE_KERNEL(elementwise_add, ...@@ -196,3 +198,5 @@ REGISTER_LITE_KERNEL(elementwise_add,
PRECISION(kFP16), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
#define LITE_WITH_LOG
...@@ -21,6 +21,10 @@ ...@@ -21,6 +21,10 @@
#include "lite/kernels/opencl/image_helper.h" #include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -42,6 +46,14 @@ class ElementwiseAddImageCompute ...@@ -42,6 +46,14 @@ class ElementwiseAddImageCompute
void Run() override; void Run() override;
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
std::string doc() const override { std::string doc() const override {
return "ElementwiseAdd using cl::Image2D, kFP16"; return "ElementwiseAdd using cl::Image2D, kFP16";
} }
......
...@@ -153,13 +153,13 @@ void ElementwiseMulFloatImageCompute::Run() { ...@@ -153,13 +153,13 @@ void ElementwiseMulFloatImageCompute::Run() {
auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width), auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width),
static_cast<cl::size_type>(x_img_height)}; static_cast<cl::size_type>(x_img_height)};
auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel auto status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
std::string time_stamp_{GetTimeStamp()}; std::string time_stamp_{GetTimeStamp()};
......
...@@ -23,6 +23,10 @@ ...@@ -23,6 +23,10 @@
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/logging.h" #include "lite/utils/logging.h"
#include "lite/utils/replace_stl/stream.h" #include "lite/utils/replace_stl/stream.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -186,13 +190,13 @@ class ElementwiseMulImageCompute ...@@ -186,13 +190,13 @@ class ElementwiseMulImageCompute
cl::NDRange{static_cast<cl::size_type>(x_img_width), cl::NDRange{static_cast<cl::size_type>(x_img_width),
static_cast<cl::size_type>(x_img_height)}; static_cast<cl::size_type>(x_img_height)};
auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( auto status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
#ifdef LITE_WITH_LOG #ifdef LITE_WITH_LOG
VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height; VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
......
...@@ -138,8 +138,13 @@ void ElementwiseSubImageCompute::Run() { ...@@ -138,8 +138,13 @@ void ElementwiseSubImageCompute::Run() {
VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height; VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
#endif #endif
auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( auto status = EnqueueNDRangeKernel(context,
kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr); kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
} }
......
...@@ -20,6 +20,10 @@ ...@@ -20,6 +20,10 @@
#include "lite/kernels/opencl/image_helper.h" #include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -41,6 +45,14 @@ class ElementwiseSubImageCompute ...@@ -41,6 +45,14 @@ class ElementwiseSubImageCompute
return "ElementwiseSub using cl::Image2D, kFP16"; return "ElementwiseSub using cl::Image2D, kFP16";
} }
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
protected: protected:
param_t* ele_param_{nullptr}; param_t* ele_param_{nullptr};
std::string kernel_func_name_{"elementwise_sub"}; std::string kernel_func_name_{"elementwise_sub"};
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
class ExpandComputeImage2D : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ExpandParam;
std::string doc() const override { return "expand using cl::Image2D, kFP16"; }
void PrepareForRun() override {
expand_param_ = param_.get_mutable<param_t>();
auto expand_times = expand_param_->expand_times;
auto in_dims = expand_param_->X->dims();
CHECK(in_dims.size() == 4) << "expand image now only support indims size 4";
CHECK(expand_times.size() == 4)
<< "expand image now only support in_expand_timesdims size 4";
CHECK(expand_times[1] == 1) << "expand image do not support expend c now";
// do not confuse with these cases.it is use to support expend c in future
if (in_dims[1] == 1) {
kernel_func_name_ = "expend_c1";
} else if (in_dims[1] == 2) {
kernel_func_name_ = "expend_c2";
} else if (in_dims[1] == 3) {
kernel_func_name_ = "expend_c3";
} else if (in_dims[1] == 4) {
kernel_func_name_ = "expend_c4";
} else {
kernel_func_name_ = "expend_cn";
}
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(kernel_func_name_,
"image/expand_kernel.cl",
build_options_,
time_stamp_);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
}
void ReInitWhenNeeded() override {
VLOG(1) << "ReInitWhenNeeded: " << kernel_func_name_;
auto x_dims = expand_param_->X->dims();
auto out_dims = expand_param_->Out->dims();
auto expand_times = expand_param_->expand_times;
VLOG(1) << "x_dims: " << x_dims;
VLOG(1) << "out_dims: " << out_dims;
VLOG(1) << "expand_times: " << expand_times[0] << " " << expand_times[1]
<< " " << expand_times[2] << " " << expand_times[3];
if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
first_epoch_for_reinit_) {
last_x_dims_ = x_dims;
first_epoch_for_reinit_ = false;
// compute image shape
paddle::lite::CLImageConverterDefault default_convertor;
out_img_shape_ = default_convertor.InitImageDimInfoWith(out_dims);
VLOG(1) << "out_img_shape_: " << out_img_shape_[0] << " "
<< out_img_shape_[1];
// compute global work size
auto image_width = out_dims[3] * ((out_dims[1] + 3) / 4);
size_t work_size_0 = image_width / out_dims[3];
size_t work_size_1 = out_dims[3];
size_t work_size_2 = out_dims[0] * out_dims[2];
global_work_size_ = cl::NDRange{work_size_0, work_size_1, work_size_2};
VLOG(1) << "global_work_size_: " << global_work_size_[0] << " "
<< global_work_size_[1] << " " << global_work_size_[2];
}
}
void Run() override {
auto* x_img = expand_param_->X->data<half_t, cl::Image2D>();
auto* out_img = expand_param_->Out->mutable_data<half_t, cl::Image2D>(
out_img_shape_[0], out_img_shape_[1]);
auto expand_times = expand_param_->expand_times;
auto x_dims = expand_param_->X->dims();
int in_n = x_dims[0];
int in_c = x_dims[1];
int in_h = x_dims[2];
int in_w = x_dims[3];
auto out_dims = expand_param_->Out->dims();
int out_n = out_dims[0];
int out_c = out_dims[1];
int out_h = out_dims[2];
int out_w = out_dims[3];
auto out_image_width = out_dims[3] * ((out_dims[1] + 3) / 4);
int out_c_block = out_image_width / out_dims[3];
int out_nh = out_dims[0] * out_dims[2];
auto in_image_width = x_dims[3] * ((x_dims[1] + 3) / 4);
int in_c_block = in_image_width / x_dims[3];
int in_nh = x_dims[0] * x_dims[2];
int expand_times_n = expand_times[0];
int expand_times_c = expand_times[1];
int expand_times_h = expand_times[2];
int expand_times_w = expand_times[3];
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto kernel = kernel_;
cl_int status;
status = kernel.setArg(0, out_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(1, out_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(2, out_nh);
CL_CHECK_FATAL(status);
status = kernel.setArg(3, in_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(4, in_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(5, in_nh);
CL_CHECK_FATAL(status);
status = kernel.setArg(6, in_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(7, in_h);
CL_CHECK_FATAL(status);
status = kernel.setArg(8, out_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(9, out_h);
CL_CHECK_FATAL(status);
status = kernel.setArg(10, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(11, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(12, expand_times_n);
CL_CHECK_FATAL(status);
status = kernel.setArg(13, expand_times_c);
CL_CHECK_FATAL(status);
status = kernel.setArg(14, expand_times_h);
CL_CHECK_FATAL(status);
status = kernel.setArg(15, expand_times_w);
CL_CHECK_FATAL(status);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
}
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
private:
std::string kernel_func_name_{};
std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
param_t* expand_param_{nullptr};
cl::Kernel kernel_;
bool first_epoch_for_reinit_{true};
DDim last_x_dims_;
DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
cl::NDRange global_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(expand,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::ExpandComputeImage2D,
image2d)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
此差异已折叠。
...@@ -20,6 +20,10 @@ ...@@ -20,6 +20,10 @@
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h" #include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h" #include "lite/utils/string.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -141,16 +145,24 @@ class FcCompute ...@@ -141,16 +145,24 @@ class FcCompute
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size_, global_work_size_,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
} }
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
private: private:
int m_, n_, k_; int m_, n_, k_;
param_t* fc_param_{nullptr}; param_t* fc_param_{nullptr};
......
...@@ -23,6 +23,10 @@ ...@@ -23,6 +23,10 @@
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/logging.h" #include "lite/utils/logging.h"
#include "lite/utils/replace_stl/stream.h" #include "lite/utils/replace_stl/stream.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -131,16 +135,24 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -131,16 +135,24 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size_, global_work_size_,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
} }
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
protected: protected:
param_t* grid_param_{nullptr}; param_t* grid_param_{nullptr};
bool first_epoch_for_reinit_{true}; bool first_epoch_for_reinit_{true};
......
...@@ -23,6 +23,10 @@ ...@@ -23,6 +23,10 @@
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/logging.h" #include "lite/utils/logging.h"
#include "lite/utils/replace_stl/stream.h" #include "lite/utils/replace_stl/stream.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -137,13 +141,13 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -137,13 +141,13 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
status = kernel.setArg(7, *out_img); status = kernel.setArg(7, *out_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
local_work_size, local_work_size,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
} }
...@@ -258,17 +262,25 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -258,17 +262,25 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
status = kernel.setArg(arg_idx++, in_w); status = kernel.setArg(arg_idx++, in_w);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
local_work_size, local_work_size,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
} }
#endif #endif
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
protected: protected:
param_t* instance_norm_param_{nullptr}; param_t* instance_norm_param_{nullptr};
std::string kernel_func_name_{"instance_norm_onnx"}; std::string kernel_func_name_{"instance_norm_onnx"};
......
...@@ -16,19 +16,46 @@ ...@@ -16,19 +16,46 @@
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#undef LITE_WITH_LOG
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
inline double GetCurrentUS() {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
}
// Host to OpenCL memory. // Host to OpenCL memory.
void CopyFromHostSync(void* target, const void* source, size_t size) { float CopyFromHostSync(void* target, const void* source, size_t size) {
#ifdef LITE_WITH_PROFILE
auto h2d_copy_start = GetCurrentUS();
#endif
TargetWrapperCL::MemcpySync(target, source, size, IoDirection::HtoD); TargetWrapperCL::MemcpySync(target, source, size, IoDirection::HtoD);
#ifdef LITE_WITH_PROFILE
auto h2d_duration = (GetCurrentUS() - h2d_copy_start) / 1000.0;
return h2d_duration;
#else
return 0.0;
#endif
} }
// Device to Host memory. // Device to Host memory.
void CopyToHostSync(void* target, const void* source, size_t size) { float CopyToHostSync(void* target, const void* source, size_t size) {
#ifdef LITE_WITH_PROFILE
auto d2h_copy_start = GetCurrentUS();
#endif
CLRuntime::Global()->command_queue().finish();
TargetWrapperCL::MemcpySync(target, source, size, IoDirection::DtoH); TargetWrapperCL::MemcpySync(target, source, size, IoDirection::DtoH);
#ifdef LITE_WITH_PROFILE
auto d2h_duration = (GetCurrentUS() - d2h_copy_start) / 1000.0;
return d2h_duration;
#else
return 0.0;
#endif
} }
/* /*
...@@ -37,6 +64,13 @@ void CopyToHostSync(void* target, const void* source, size_t size) { ...@@ -37,6 +64,13 @@ void CopyToHostSync(void* target, const void* source, size_t size) {
class IoCopyHostToOpenCLCompute class IoCopyHostToOpenCLCompute
: public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kAny)> { : public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kAny)> {
public: public:
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = "HostToOpenCL";
ch->io_duration = h2d_duration_;
}
#endif
void Run() override { void Run() override {
auto& param = Param<operators::IoCopyParam>(); auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kHost) || CHECK(param.x->target() == TARGET(kHost) ||
...@@ -50,7 +84,7 @@ class IoCopyHostToOpenCLCompute ...@@ -50,7 +84,7 @@ class IoCopyHostToOpenCLCompute
VLOG(2) << "param.y->dims():" << param.y->dims(); VLOG(2) << "param.y->dims():" << param.y->dims();
#endif #endif
auto* data = param.y->mutable_data(TARGET(kOpenCL), mem_size); auto* data = param.y->mutable_data(TARGET(kOpenCL), mem_size);
CopyFromHostSync(data, param.x->raw_data(), mem_size); h2d_duration_ = CopyFromHostSync(data, param.x->raw_data(), mem_size);
} }
std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override { std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
...@@ -74,6 +108,8 @@ class IoCopyHostToOpenCLCompute ...@@ -74,6 +108,8 @@ class IoCopyHostToOpenCLCompute
} }
std::string doc() const override { return "Copy IO from HOST to OpenCL"; } std::string doc() const override { return "Copy IO from HOST to OpenCL"; }
float h2d_duration_{0};
}; };
/* /*
...@@ -82,6 +118,13 @@ class IoCopyHostToOpenCLCompute ...@@ -82,6 +118,13 @@ class IoCopyHostToOpenCLCompute
class IoCopykOpenCLToHostCompute class IoCopykOpenCLToHostCompute
: public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kAny)> { : public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kAny)> {
public: public:
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = "OpenCLToHost";
ch->io_duration = d2h_duration_;
}
#endif
void Run() override { void Run() override {
auto& param = Param<operators::IoCopyParam>(); auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kOpenCL)); CHECK(param.x->target() == TARGET(kOpenCL));
...@@ -109,12 +152,13 @@ class IoCopykOpenCLToHostCompute ...@@ -109,12 +152,13 @@ class IoCopykOpenCLToHostCompute
#ifdef LITE_WITH_LOG #ifdef LITE_WITH_LOG
VLOG(2) << "--- Find the sync event for the target cl tensor. ---"; VLOG(2) << "--- Find the sync event for the target cl tensor. ---";
#endif #endif
CLRuntime::Global()->command_queue().finish();
CopyToHostSync(data, param.x->raw_data(), mem_size); d2h_duration_ = CopyToHostSync(data, param.x->raw_data(), mem_size);
} }
std::string doc() const override { return "Copy IO from OpenCL to HOST"; } std::string doc() const override { return "Copy IO from OpenCL to HOST"; }
float d2h_duration_{0};
}; };
} // namespace opencl } // namespace opencl
...@@ -161,3 +205,5 @@ REGISTER_LITE_KERNEL(io_copy_once, ...@@ -161,3 +205,5 @@ REGISTER_LITE_KERNEL(io_copy_once,
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize(); .Finalize();
#define LITE_WITH_LOG
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <string> #include <string>
#include "lite/api/paddle_place.h" #include "lite/api/paddle_place.h"
#include "lite/backends/opencl/cl_half.h" #include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_utility.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/core/target_wrapper.h" #include "lite/core/target_wrapper.h"
...@@ -24,6 +25,8 @@ ...@@ -24,6 +25,8 @@
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
#undef LITE_WITH_LOG
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace kernels {
...@@ -50,6 +53,14 @@ class LayoutComputeBufferChwToImageDefault ...@@ -50,6 +53,14 @@ class LayoutComputeBufferChwToImageDefault
time_stamp_); time_stamp_);
} }
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
void Run() override { void Run() override {
auto& param = Param<param_t>(); auto& param = Param<param_t>();
const cl::Buffer* x_data; const cl::Buffer* x_data;
...@@ -128,13 +139,13 @@ class LayoutComputeBufferChwToImageDefault ...@@ -128,13 +139,13 @@ class LayoutComputeBufferChwToImageDefault
static_cast<cl::size_type>(new_dims[3]), static_cast<cl::size_type>(new_dims[3]),
static_cast<cl::size_type>(new_dims[0] * new_dims[2])}; static_cast<cl::size_type>(new_dims[0] * new_dims[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
} }
...@@ -168,6 +179,14 @@ class LayoutComputeImageDefaultToBufferChw ...@@ -168,6 +179,14 @@ class LayoutComputeImageDefaultToBufferChw
time_stamp_); time_stamp_);
} }
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
void Run() override { void Run() override {
auto& param = Param<param_t>(); auto& param = Param<param_t>();
const cl::Buffer* y_data; const cl::Buffer* y_data;
...@@ -237,13 +256,13 @@ class LayoutComputeImageDefaultToBufferChw ...@@ -237,13 +256,13 @@ class LayoutComputeImageDefaultToBufferChw
static_cast<cl::size_type>(new_dims[3]), static_cast<cl::size_type>(new_dims[3]),
static_cast<cl::size_type>(new_dims[0] * new_dims[2])}; static_cast<cl::size_type>(new_dims[0] * new_dims[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
} }
...@@ -274,6 +293,14 @@ class LayoutComputeBufferChwToImage2DNw ...@@ -274,6 +293,14 @@ class LayoutComputeBufferChwToImage2DNw
time_stamp_); time_stamp_);
} }
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
void Run() override { void Run() override {
auto& param = Param<param_t>(); auto& param = Param<param_t>();
auto* x_data = param.x->data<float, cl::Buffer>(); auto* x_data = param.x->data<float, cl::Buffer>();
...@@ -333,13 +360,13 @@ class LayoutComputeBufferChwToImage2DNw ...@@ -333,13 +360,13 @@ class LayoutComputeBufferChwToImage2DNw
static_cast<cl::size_type>(out_W), // w static_cast<cl::size_type>(out_W), // w
static_cast<cl::size_type>(out_C * out_H)}; // ch static_cast<cl::size_type>(out_C * out_H)}; // ch
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
} }
...@@ -394,3 +421,4 @@ REGISTER_LITE_KERNEL( ...@@ -394,3 +421,4 @@ REGISTER_LITE_KERNEL(
PRECISION(kAny), PRECISION(kAny),
DATALAYOUT(kNCHW))}) DATALAYOUT(kNCHW))})
.Finalize(); .Finalize();
#define LITE_WITH_LOG
...@@ -23,6 +23,10 @@ ...@@ -23,6 +23,10 @@
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/logging.h" #include "lite/utils/logging.h"
#include "lite/utils/replace_stl/stream.h" #include "lite/utils/replace_stl/stream.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -128,13 +132,13 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -128,13 +132,13 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
static_cast<cl::size_type>(default_work_size[1]), static_cast<cl::size_type>(default_work_size[1]),
static_cast<cl::size_type>(default_work_size[2])}; static_cast<cl::size_type>(default_work_size[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
#ifdef LITE_WITH_LOG #ifdef LITE_WITH_LOG
VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
...@@ -142,6 +146,14 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -142,6 +146,14 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
#endif #endif
} }
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
protected: protected:
param_t* lrn_param_{nullptr}; param_t* lrn_param_{nullptr};
int n_{5}; int n_{5};
......
...@@ -20,6 +20,10 @@ ...@@ -20,6 +20,10 @@
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h" #include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h" #include "lite/utils/string.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -92,16 +96,24 @@ class MulCompute ...@@ -92,16 +96,24 @@ class MulCompute
auto global_work_size = cl::NDRange{static_cast<size_t>((m_ + 3) / 4), auto global_work_size = cl::NDRange{static_cast<size_t>((m_ + 3) / 4),
static_cast<size_t>((n_ + 3) / 4)}; static_cast<size_t>((n_ + 3) / 4)};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
} }
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
private: private:
int m_, n_, k_; int m_, n_, k_;
std::string kernel_func_name_{"mat_mul"}; std::string kernel_func_name_{"mat_mul"};
......
...@@ -19,6 +19,10 @@ ...@@ -19,6 +19,10 @@
#include "lite/kernels/opencl/image_helper.h" #include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h" #include "lite/utils/replace_stl/stream.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -110,16 +114,24 @@ class NearestInterpComputeImageDefault ...@@ -110,16 +114,24 @@ class NearestInterpComputeImageDefault
static_cast<cl::size_type>(default_work_size.data()[1]), static_cast<cl::size_type>(default_work_size.data()[1]),
static_cast<cl::size_type>(default_work_size.data()[2])}; static_cast<cl::size_type>(default_work_size.data()[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
} }
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
private: private:
std::string kernel_func_name_{"nearest_interp"}; std::string kernel_func_name_{"nearest_interp"};
std::string build_options_{" -DCL_DTYPE_half"}; std::string build_options_{" -DCL_DTYPE_half"};
......
...@@ -23,6 +23,10 @@ ...@@ -23,6 +23,10 @@
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/logging.h" #include "lite/utils/logging.h"
#include "lite/utils/replace_stl/stream.h" #include "lite/utils/replace_stl/stream.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -142,13 +146,13 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL), ...@@ -142,13 +146,13 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
static_cast<cl::size_type>(default_work_size[1]), static_cast<cl::size_type>(default_work_size[1]),
static_cast<cl::size_type>(default_work_size[2])}; static_cast<cl::size_type>(default_work_size[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
#ifdef LITE_WITH_LOG #ifdef LITE_WITH_LOG
VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
...@@ -156,6 +160,14 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL), ...@@ -156,6 +160,14 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
#endif #endif
} }
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
protected: protected:
param_t* pad2d_param_{nullptr}; param_t* pad2d_param_{nullptr};
std::string kernel_func_name_{}; std::string kernel_func_name_{};
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
class PixelShuffleComputeImage2D
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::PixelShuffleParam;
std::string doc() const override {
return "PixelShuffle using cl::Image2D, kFP16";
}
void PrepareForRun() override {
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(kernel_func_name_,
"image/pixel_shuffle_kernel.cl",
build_options_,
time_stamp_);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
}
void ReInitWhenNeeded() override {
VLOG(1) << "ReInitWhenNeeded: " << kernel_func_name_;
pixel_shuffle_param_ = param_.get_mutable<param_t>();
auto x_dims = pixel_shuffle_param_->x->dims();
auto out_dims = pixel_shuffle_param_->output->dims();
VLOG(1) << "x_dims: " << x_dims;
VLOG(1) << "out_dims: " << out_dims;
VLOG(1) << "upscale_factor: " << pixel_shuffle_param_->upscale_factor;
if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
first_epoch_for_reinit_) {
last_x_dims_ = x_dims;
first_epoch_for_reinit_ = false;
// compute image shape
paddle::lite::CLImageConverterDefault default_convertor;
out_img_shape_ = default_convertor.InitImageDimInfoWith(
pixel_shuffle_param_->output->dims());
VLOG(1) << "out_img_shape_: " << out_img_shape_[0] << " "
<< out_img_shape_[1];
// compute global work size
auto image_width = out_dims[3] * ((out_dims[1] + 3) / 4);
size_t work_size_0 = image_width / out_dims[3];
size_t work_size_1 = out_dims[3];
size_t work_size_2 = out_dims[0] * out_dims[2];
global_work_size_ = cl::NDRange{work_size_0, work_size_1, work_size_2};
VLOG(1) << "global_work_size_: " << global_work_size_[0] << " "
<< global_work_size_[1] << " " << global_work_size_[2];
}
}
void Run() override {
auto* x_img = pixel_shuffle_param_->x->data<half_t, cl::Image2D>();
auto* out_img =
pixel_shuffle_param_->output->mutable_data<half_t, cl::Image2D>(
out_img_shape_[0], out_img_shape_[1]);
auto x_dims = pixel_shuffle_param_->x->dims();
int in_n = x_dims[0];
int in_c = x_dims[1];
int in_h = x_dims[2];
int in_w = x_dims[3];
auto out_dims = pixel_shuffle_param_->output->dims();
int out_n = out_dims[0];
int out_c = out_dims[1];
int out_h = out_dims[2];
int out_w = out_dims[3];
const int upscale_factor = pixel_shuffle_param_->upscale_factor;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto kernel = kernel_;
cl_int status;
status = kernel.setArg(0, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(1, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(2, in_n);
CL_CHECK_FATAL(status);
status = kernel.setArg(3, in_c);
CL_CHECK_FATAL(status);
status = kernel.setArg(4, in_h);
CL_CHECK_FATAL(status);
status = kernel.setArg(5, in_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(6, out_n);
CL_CHECK_FATAL(status);
status = kernel.setArg(7, out_c);
CL_CHECK_FATAL(status);
status = kernel.setArg(8, out_h);
CL_CHECK_FATAL(status);
status = kernel.setArg(9, out_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(10, upscale_factor);
CL_CHECK_FATAL(status);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
}
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
private:
std::string kernel_func_name_{"pixel_shuffle"};
std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
param_t* pixel_shuffle_param_{nullptr};
cl::Kernel kernel_;
bool first_epoch_for_reinit_{true};
DDim last_x_dims_;
DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
cl::NDRange global_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(pixel_shuffle,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::PixelShuffleComputeImage2D,
image2d)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <random>
#include <gtest/gtest.h>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/test_helper.h"
#define FP16_MAX_DIFF (5e-1)
namespace paddle {
namespace lite {
TEST(pixel_shuffle_image2d, compute) {
LOG(INFO) << "create kernel ...";
auto kernels = KernelRegistry::Global().Create("pixel_shuffle",
TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
const int INPUT_N = 1;
const int INPUT_C = 4;
const int INPUT_H = 2;
const int INPUT_W = 2;
const int UPSCALE_FACTOR = 2;
auto kernel = std::move(kernels.front());
LOG(INFO) << "prepare to test kernel ====> " << kernel->doc();
lite::Tensor x, out;
operators::PixelShuffleParam param;
param.x = &x;
param.output = &out;
param.upscale_factor = UPSCALE_FACTOR;
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
kernel->SetParam(param);
std::unique_ptr<KernelContext> pixel_shuffle_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(pixel_shuffle_context->As<OpenCLContext>()));
kernel->SetContext(std::move(pixel_shuffle_context));
const DDim in_dim =
DDim(std::vector<DDim::value_type>{INPUT_N, INPUT_C, INPUT_H, INPUT_W});
const DDim out_dim = DDim(
std::vector<DDim::value_type>{INPUT_N,
INPUT_C / UPSCALE_FACTOR / UPSCALE_FACTOR,
INPUT_H * UPSCALE_FACTOR,
INPUT_W * UPSCALE_FACTOR});
LOG(INFO) << "in_dim: " << in_dim;
LOG(INFO) << "UPSCALE_FACTOR: " << UPSCALE_FACTOR;
LOG(INFO) << "out_dim: " << out_dim;
x.Resize(in_dim);
out.Resize(out_dim);
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-2, 2);
std::vector<float> input_v(INPUT_N * INPUT_C * INPUT_H * INPUT_W);
int index = 0;
for (auto& i : input_v) {
i = index++;
}
VLOG(1) << "input_v ..... ";
for (size_t i = 0; i < input_v.size(); i++) {
VLOG(10) << input_v[i];
}
LOG(INFO) << "prepare input";
CLImageConverterDefault* default_converter = new CLImageConverterDefault();
DDim x_image_shape = default_converter->InitImageDimInfoWith(in_dim);
LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " "
<< x_image_shape[1];
std::vector<half_t> x_image_data(x_image_shape.production() * 4); // 4 : RGBA
default_converter->NCHWToImage(input_v.data(), x_image_data.data(), in_dim);
auto* x_image = x.mutable_data<half_t, cl::Image2D>(
x_image_shape[0], x_image_shape[1], x_image_data.data());
VLOG(1) << "x_image_data ..... ";
for (size_t i = 0; i < x_image_data.size(); i++) {
VLOG(10) << Half2Float(x_image_data[i]);
}
DDim out_image_shape = default_converter->InitImageDimInfoWith(out_dim);
LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
<< out_image_shape[1];
auto* out_image = out.mutable_data<half_t, cl::Image2D>(out_image_shape[0],
out_image_shape[1]);
kernel->Launch();
CLRuntime::Global()->command_queue().finish();
std::vector<float> out_data_v{
0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15};
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
half_t* out_image_data = new half_t[out_image_shape.production() * 4];
TargetWrapperCL::ImgcpySync(out_image_data,
out_image,
out_image_shape[0],
out_image_shape[1],
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
VLOG(1) << "out_image_data ..... ";
for (size_t i = 0; i < out_image_shape.production() * 4; i++) {
VLOG(10) << Half2Float(out_image_data[i]);
}
float* out_data = new float[out_image_shape.production() * 4];
default_converter->ImageToNCHW(
out_image_data, out_data, out_image_shape, out_dim);
VLOG(1) << "out_data ..... ";
for (int i = 0; i < out_dim.production(); i++) {
VLOG(10) << out_data[i];
}
for (int i = 0; i < out_dim.production(); i++) {
auto abs_diff = abs(out_data[i] - out_data_v[i]);
auto relative_diff = COMPUTE_RELATIVE_DIFF(out_data[i], out_data_v[i]);
EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
true);
if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << i << " out_data[" << i
<< "]:" << out_data[i] << " "
"out_ref["
<< i << "]:" << out_data_v[i] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
}
}
}
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(pixel_shuffle, kOpenCL, kFP16, kImageDefault, image2d);
...@@ -20,6 +20,10 @@ ...@@ -20,6 +20,10 @@
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h" #include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h" #include "lite/utils/string.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -106,16 +110,24 @@ class PoolCompute ...@@ -106,16 +110,24 @@ class PoolCompute
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
auto global_work_size = cl::NDRange(static_cast<size_t>(numel)); auto global_work_size = cl::NDRange(static_cast<size_t>(numel));
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = EnqueueNDRangeKernel(context,
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
nullptr); event_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
} }
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
private: private:
std::string kernel_func_name_{"pool_"}; std::string kernel_func_name_{"pool_"};
std::string build_options_{"-DCL_DTYPE_float"}; std::string build_options_{"-DCL_DTYPE_float"};
......
此差异已折叠。
...@@ -108,6 +108,7 @@ add_operator(collect_fpn_proposals_op_lite extra SRCS collect_fpn_proposals_op.c ...@@ -108,6 +108,7 @@ add_operator(collect_fpn_proposals_op_lite extra SRCS collect_fpn_proposals_op.c
add_operator(distribute_fpn_proposals_op_lite extra SRCS distribute_fpn_proposals_op.cc DEPS ${op_DEPS}) add_operator(distribute_fpn_proposals_op_lite extra SRCS distribute_fpn_proposals_op.cc DEPS ${op_DEPS})
add_operator(crf_decoding_op_lite extra SRCS crf_decoding_op.cc DEPS ${op_DEPS}) add_operator(crf_decoding_op_lite extra SRCS crf_decoding_op.cc DEPS ${op_DEPS})
add_operator(ctc_align_op_lite extra SRCS ctc_align_op.cc DEPS ${op_DEPS}) add_operator(ctc_align_op_lite extra SRCS ctc_align_op.cc DEPS ${op_DEPS})
add_operator(pixel_shuffle_op extra SRCS pixel_shuffle_op.cc DEPS ${op_DEPS})
# for OCR specific # for OCR specific
add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS}) add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS})
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册