未验证 提交 3868be2c 编写于 作者: Y Yuan Shuai 提交者: GitHub

[LITE][OPENCL][PROFILE] Enhance precision profile & Clean opencl code (#3227)

* [LITE][OPENCL] clean code for opencl. test=develop

* [LITE][PROFILER] Enhance Precision Profiler. test=develop

* delete useless var in profiler. test=develop

* add ocl header. test=develop
上级 3896590b
...@@ -13,6 +13,5 @@ lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runt ...@@ -13,6 +13,5 @@ lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runt
lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image) lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image)
lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime) lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime)
lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper) lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper)
lite_cc_test(test_cl_im2col SRCS cl_im2col_test.cc DEPS tensor cl_context cl_wrapper cl_target_wrapper)
add_dependencies(cl_wrapper opencl_clhpp) add_dependencies(cl_wrapper opencl_clhpp)
...@@ -22,6 +22,12 @@ ...@@ -22,6 +22,12 @@
#include <vector> #include <vector>
#include "lite/core/program.h" #include "lite/core/program.h"
#ifdef LITE_WITH_OPENCL
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/kernels/opencl/image_helper.h"
#endif
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace profile { namespace profile {
...@@ -45,59 +51,195 @@ static void write_tensorfile(const Tensor* tensor, const std::string& locate) { ...@@ -45,59 +51,195 @@ static void write_tensorfile(const Tensor* tensor, const std::string& locate) {
class PrecisionProfiler { class PrecisionProfiler {
public: public:
explicit PrecisionProfiler(const Instruction* inst) : inst_(inst) {} // TODO(ysh329): need to remove `explicit PrecisionProfiler`
~PrecisionProfiler() { // keep this method only for arm/math/conditional
LOG(INFO) << ">> Running kernel: " << inst_->op()->op_info()->Repr() explicit PrecisionProfiler(const Instruction* inst) {
<< " on Target " << TargetToStr(inst_->kernel()->target()) << " " std::string inst_precison_str = GetInstPrecision(inst);
<< PrecisionToStr(inst_->kernel()->precision()); }
auto tensor_mean = [](const Tensor* in,
PrecisionType ptype, PrecisionProfiler() {}
std::string name = "inst") -> double {
if (!in->data<int8_t>()) { std::string GetSummaryHeader() {
return -99999; using std::setw;
} using std::left;
double sum = 0.; using std::fixed;
switch (ptype) { STL::stringstream ss;
ss << "========================================= "
<< "Detailed Precision Profiler Summary "
<< "=========================================" << std::endl;
ss << setw(45) << left << "operator:(kernel_info)"
<< " " << setw(70) << left << "output_tensor_name:(tensor_info)"
<< " " << setw(15) << left << "tensor_dims"
<< " " << setw(15) << left << "tensor_mean"
<< " " << setw(15) << left << "tensor_standard_deviation" << std::endl;
return ss.str();
}
// Computes the arithmetic mean of the `length` elements starting at `in`.
//
// Every element is widened to double before it is accumulated, so integer
// element types (e.g. int8_t, int32_t) are summed without wrapping.
//
// Params:
//   in      pointer to the first element; may be null only for empty input.
//   length  number of elements to read from `in`.
// Returns:
//   sum(in[0..length)) / length, or 0.0 for empty/null input. The guard keeps
//   the division from evaluating 0.0 / 0, which would yield NaN.
template <typename T>
double compute_mean(const T* in, const size_t length) {
  if (in == nullptr || length == 0) {
    return 0.;
  }
  double sum = 0.;
  for (size_t i = 0; i < length; ++i) {
    sum += static_cast<double>(in[i]);
  }
  return sum / static_cast<double>(length);
}
// Computes the population standard deviation (the squared deviations are
// divided by `length`, not `length - 1`) of the `length` elements at `in`.
//
// Params:
//   in        pointer to the first element; may be null only for empty input.
//   length    number of elements to read from `in`.
//   has_mean  when true, `mean` is trusted as the precomputed mean of the
//             data; when false, the mean is recomputed via compute_mean<T>.
//   mean      precomputed mean; only read when `has_mean` is true. The
//             default of 10000 is a placeholder that is overwritten whenever
//             `has_mean` is false.
// Returns:
//   sqrt(sum((in[i] - mean)^2) / length), or 0.0 for empty/null input. The
//   guard keeps the division from evaluating 0.0 / 0, which would yield NaN.
template <typename T>
double compute_standard_deviation(const T* in,
                                  const size_t length,
                                  bool has_mean = false,
                                  double mean = 10000) {
  if (in == nullptr || length == 0) {
    return 0.;
  }
  if (!has_mean) {
    mean = compute_mean<T>(in, length);
  }
  double squared_deviation_sum = 0.;
  for (size_t i = 0; i < length; ++i) {
    // A plain multiply replaces pow(x, 2): same value, no call into the
    // general-purpose power routine for every element.
    const double deviation = static_cast<double>(in[i]) - mean;
    squared_deviation_sum += deviation * deviation;
  }
  return sqrt(squared_deviation_sum / static_cast<double>(length));
}
// Reports whether an output tensor is unused: true exactly when
// `in->data<int8_t>()` evaluates to false (presumably a null data pointer,
// i.e. nothing was ever written to the tensor -- confirm against Tensor).
bool is_unused(const Tensor* in) {
  const bool lacks_data = !in->data<int8_t>();
  return lacks_data;
}
void compute_tensor_precision_info(const Tensor* in,
TargetType target_type,
PrecisionType precision_type,
DataLayoutType layout_type,
double* mean,
double* std_dev,
std::string name = "inst") {
std::string unsupported_error_log =
"Unsupported precision profile for kernel registered on" +
TargetToStr(target_type) + "/" + PrecisionToStr(precision_type) + "/" +
DataLayoutToStr(layout_type);
if (target_type == TARGET(kARM) || target_type == TARGET(kHost) ||
target_type == TARGET(kX86)) {
switch (precision_type) {
case PRECISION(kFloat): { case PRECISION(kFloat): {
auto ptr = in->data<float>(); auto ptr = in->data<float>();
// write_tensorfile<float>(in, name); // write_tensorfile<float>(in, name);
for (int i = 0; i < in->numel(); ++i) { *mean = compute_mean<float>(ptr, in->numel());
sum += ptr[i]; *std_dev =
} compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
return sum / in->numel(); return;
} }
case PRECISION(kAny): { case PRECISION(kAny): {
auto ptr = in->data<float>(); auto ptr = in->data<float>();
// write_tensorfile<float>(in, name); // write_tensorfile<float>(in, name);
for (int i = 0; i < in->numel(); ++i) { *mean = compute_mean<float>(ptr, in->numel());
sum += ptr[i]; *std_dev =
} compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
return sum / in->numel(); return;
} }
case PRECISION(kInt8): { case PRECISION(kInt8): {
auto ptr = in->data<int8_t>(); auto ptr = in->data<int8_t>();
// write_tensorfile<int8_t>(in, name); // write_tensorfile<int8_t>(in, name);
for (int i = 0; i < in->numel(); ++i) { *mean = compute_mean<int8_t>(ptr, in->numel());
sum += ptr[i]; *std_dev =
} compute_standard_deviation<int8_t>(ptr, in->numel(), true, *mean);
return sum / in->numel(); return;
} }
case PRECISION(kInt32): { case PRECISION(kInt32): {
auto ptr = in->data<int32_t>(); auto ptr = in->data<int32_t>();
// write_tensorfile<int32_t>(in, name); // write_tensorfile<int32_t>(in, name);
for (int i = 0; i < in->numel(); ++i) { *mean = compute_mean<int32_t>(ptr, in->numel());
sum += ptr[i]; *std_dev = compute_standard_deviation<int32_t>(
} ptr, in->numel(), true, *mean);
return sum / in->numel(); return;
}
default:
*mean = -333333333333;
*std_dev = -33333333333;
LOG(ERROR) << unsupported_error_log;
return;
}
#ifdef LITE_WITH_OPENCL
} else if (target_type == TARGET(kOpenCL)) {
switch (layout_type) {
case DATALAYOUT(kImageDefault): {
paddle::lite::CLImageConverterDefault default_convertor;
auto image_shape = default_convertor.InitImageDimInfoWith(in->dims());
size_t im_w = image_shape[0];
size_t im_h = image_shape[1];
VLOG(1) << "image shape(W,H) of " << name << ": " << im_w << " "
<< im_h;
std::vector<uint16_t> in_data_v(im_w * im_h * 4);
std::vector<float> real_out_v(in->numel());
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
TargetWrapperCL::ImgcpySync(in_data_v.data(),
in->data<uint16_t, cl::Image2D>(),
im_w,
im_h,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
default_convertor.ImageToNCHW(
in_data_v.data(), real_out_v.data(), image_shape, in->dims());
// write_tensorfile<float>(in, name);
CHECK(real_out_v.size() == in->numel());
*mean = compute_mean<float>(real_out_v.data(), real_out_v.size());
*std_dev = compute_standard_deviation<float>(
real_out_v.data(), in->numel(), true, *mean);
return;
}
case DATALAYOUT(kNCHW): {
std::vector<float> in_data_v(in->numel(), 0);
TargetWrapperCL::MemcpySync(in_data_v.data(),
in->data<float>(),
in->numel() * sizeof(float),
IoDirection::DtoH);
VLOG(1) << name << ":" << in->numel();
*mean = compute_mean<float>(in_data_v.data(), in->numel());
*std_dev = compute_standard_deviation<float>(
in_data_v.data(), in->numel(), true, *mean);
return;
} }
default: default:
LOG(INFO) << "unsupport data type: " << PrecisionToStr(ptype); *mean = -222222222222;
return 0.; *std_dev = -22222222222;
LOG(ERROR) << unsupported_error_log;
return;
} }
}; #endif
if (inst_->op()->op_info()->Type() != "fetch") { } else {
auto op = const_cast<lite::OpLite*>(inst_->op()); *mean = -111111111111;
auto kernel = inst_->kernel(); *std_dev = -11111111111;
LOG(ERROR) << unsupported_error_log;
return;
}
}
std::string GetInstPrecision(const Instruction* inst = nullptr) {
using std::setw;
using std::left;
using std::fixed;
STL::stringstream ss;
VLOG(1) << ">> Running kernel: " << inst->op()->op_info()->Repr()
<< " registered on " << TargetToStr(inst->kernel()->target()) << "/"
<< PrecisionToStr(inst->kernel()->precision()) << "/"
<< DataLayoutToStr(inst->kernel()->layout());
std::string kernel_repr = inst->op()->op_info()->Repr();
std::string kernel_place = TargetToStr(inst->kernel()->target()) + "/" +
PrecisionToStr(inst->kernel()->precision()) +
"/" + DataLayoutToStr(inst->kernel()->layout());
std::string op_name = inst->op()->op_info()->Type();
if (inst->op()->op_info()->Type() != "fetch") {
auto op = const_cast<lite::OpLite*>(inst->op());
auto kernel = inst->kernel();
auto op_scope = op->scope(); auto op_scope = op->scope();
auto out_names = op->op_info()->output_names(); auto out_names = op->op_info()->output_names();
for (auto& out_name : out_names) { for (auto& out_name : out_names) {
...@@ -106,32 +248,78 @@ class PrecisionProfiler { ...@@ -106,32 +248,78 @@ class PrecisionProfiler {
auto type = kernel->GetOutputDeclType(out_arg_name); auto type = kernel->GetOutputDeclType(out_arg_name);
if (type->IsTensor()) { if (type->IsTensor()) {
auto tout = op_scope->FindVar(out_name)->GetMutable<Tensor>(); const Tensor* tout =
double mean = tensor_mean(tout, type->precision(), out_name); op_scope->FindVar(out_name)->GetMutable<Tensor>();
LOG(INFO) << "output name: " << out_name << ", dims: " << tout->dims() double mean = -999999;
<< ", precision: " << PrecisionToStr(type->precision()) double std_dev = -100000;
<< ", mean value: " << mean << " shape:" << tout->dims(); std::string mean_str{"unused"};
std::string std_dev_str{"unused"};
if (!is_unused(tout)) {
compute_tensor_precision_info(tout,
type->target(),
type->precision(),
type->layout(),
&mean,
&std_dev,
out_name);
mean_str = std::to_string(mean);
std_dev_str = std::to_string(std_dev);
}
std::string kernel_info = op_name + ":" + kernel_place;
std::string output_arg_info = out_name + ":" +
TargetToStr(type->target()) + "/" +
PrecisionToStr(type->precision()) +
"/" + DataLayoutToStr(type->layout());
ss << setw(45) << left << kernel_info << " " << setw(70) << left
<< output_arg_info << " " << setw(15) << left << tout->dims()
<< " " << setw(15) << left << mean_str << " " << setw(15) << left
<< std_dev_str << std::endl;
} else if (type->IsTensorList()) { } else if (type->IsTensorList()) {
auto tout = auto touts =
op_scope->FindVar(out_name)->GetMutable<std::vector<Tensor>>(); op_scope->FindVar(out_name)->GetMutable<std::vector<Tensor>>();
for (auto& t : *tout) { for (auto t : *touts) {
double mean = tensor_mean(&t, type->precision(), out_name); const Tensor* tout = &t;
LOG(INFO) << "output name: " << out_name << ", dims: " << t.dims() double mean = -999999;
<< ", precision: " << PrecisionToStr(type->precision()) double std_dev = -100000;
<< ", mean value: " << mean; std::string mean_str{"unused"};
std::string std_dev_str{"unused"};
if (!is_unused(tout)) {
compute_tensor_precision_info(tout,
type->target(),
type->precision(),
type->layout(),
&mean,
&std_dev,
out_name);
mean_str = std::to_string(mean);
std_dev_str = std::to_string(std_dev);
}
std::string kernel_info = op_name + ":" + kernel_place;
std::string output_arg_info = out_name + ":" +
TargetToStr(type->target()) + "/" +
PrecisionToStr(type->precision()) +
"/" + DataLayoutToStr(type->layout());
ss << setw(45) << left << kernel_info << " " << setw(70) << left
<< output_arg_info << " " << setw(15) << left << tout->dims()
<< " " << setw(15) << left << mean_str << " " << setw(15) << left
<< std_dev_str << std::endl;
} }
} }
} }
} }
return ss.str();
} }
private:
const Instruction* inst_{nullptr};
}; };
} // namespace profile } // namespace profile
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// TODO(ysh329): need to remove.
// keep this method only for arm/math/conditional_block_compute
#define LITE_PRECISION_PROFILE(inst) \ #define LITE_PRECISION_PROFILE(inst) \
{ auto a = paddle::lite::profile::PrecisionProfiler(&inst); } { auto a = paddle::lite::profile::PrecisionProfiler(&inst); }
...@@ -136,6 +136,14 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { ...@@ -136,6 +136,14 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
} }
void RuntimeProgram::Run() { void RuntimeProgram::Run() {
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler();
std::string precision_profiler_summary =
inst_precision_profiler.GetSummaryHeader();
#endif
#endif
for (auto& inst : instructions_) { for (auto& inst : instructions_) {
#ifndef LITE_WITH_FPGA #ifndef LITE_WITH_FPGA
if (inst.is_feed_fetch_op()) continue; if (inst.is_feed_fetch_op()) continue;
...@@ -144,13 +152,17 @@ void RuntimeProgram::Run() { ...@@ -144,13 +152,17 @@ void RuntimeProgram::Run() {
#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE
#ifndef LITE_WITH_FPGA #ifndef LITE_WITH_FPGA
LITE_PRECISION_PROFILE(inst) precision_profiler_summary +=
inst_precision_profiler.GetInstPrecision(&inst);
#endif #endif
#endif // LITE_WITH_PRECISION_PROFILE #endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE #endif // LITE_WITH_PROFILE
} }
#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PROFILE
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0); LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0);
#ifdef LITE_WITH_PRECISION_PROFILE
LOG(INFO) << "\n" << precision_profiler_summary;
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE #endif // LITE_WITH_PROFILE
} }
......
...@@ -128,6 +128,9 @@ add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${ten ...@@ -128,6 +128,9 @@ add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${ten
#lite_cc_test(test_conv_buffer_opencl SRCS conv_buffer_compute_test.cc #lite_cc_test(test_conv_buffer_opencl SRCS conv_buffer_compute_test.cc
# DEPS conv_opencl op_registry program context) # DEPS conv_opencl op_registry program context)
#lite_cc_test(test_im2col_buffer_opencl SRCS im2col_buffer_test.cc
# DEPS tensor cl_context cl_wrapper cl_target_wrapper)
#lite_cc_test(test_depthwise_conv2d_buffer_opencl SRCS depthwise_conv2d_buffer_compute_test.cc #lite_cc_test(test_depthwise_conv2d_buffer_opencl SRCS depthwise_conv2d_buffer_compute_test.cc
# DEPS depthwise_conv2d_opencl op_registry program context) # DEPS depthwise_conv2d_opencl op_registry program context)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册