Commit e54825c5 authored by Liangliang He

Merge branch 'concat-dead' into 'master'

Fix the concat test's poor performance.

See merge request !358
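
This merge request replaces the per-iteration TuningOrRun3DKernel calls in the concat and slice OpenCL kernels with direct enqueues and, when command-queue profiling is enabled, folds the per-launch timings into a single CallStats that the returned future reports. On the test side, random inputs are generated on the host once and compared against those host buffers instead of mapping the OpenCL input tensors inside the verification loop. A minimal, self-contained sketch of the aggregation pattern used in the hunks below (CallStats is assumed to be the {start_micros, end_micros} pair from MACE's public headers):

// Sketch only: accumulate one CallStats across several kernel launches.
#include <algorithm>
#include <cstdint>
#include <vector>

struct CallStats { int64_t start_micros; int64_t end_micros; };  // assumed layout

CallStats Aggregate(const std::vector<CallStats> &per_launch) {
  CallStats total{INT64_MAX, 0};
  for (const CallStats &cur : per_launch) {
    total.start_micros = std::min<int64_t>(cur.start_micros, total.start_micros);
    total.end_micros += cur.end_micros - cur.start_micros;  // sum of durations
  }
  // The wait_fn in the hunks below reports start_micros plus this accumulated duration.
  return total;
}
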
@@ -236,7 +236,7 @@ void GetAdrenoContextProperties(std::vector<cl_context_properties> *properties,
OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
GPUPriorityHint gpu_priority_hint):
storage_(nullptr) {
storage_(nullptr), is_profiling_enabled_(false) {
LoadOpenCLLibrary();
std::vector<cl::Platform> all_platforms;
@@ -286,6 +286,7 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
if (Tuner<uint32_t>::Get()->IsTuning() ||
(profiling != nullptr && strlen(profiling) == 1 && profiling[0] == '1')) {
properties |= CL_QUEUE_PROFILING_ENABLE;
is_profiling_enabled_ = true;
}
cl_int err;
@@ -590,4 +591,8 @@ const bool OpenCLRuntime::IsOutOfRangeCheckEnabled() const {
return out_of_range_check_;
}
const bool OpenCLRuntime::is_profiling_enabled() const {
return is_profiling_enabled_;
}
} // namespace mace
@@ -77,6 +77,7 @@ class OpenCLRuntime {
const GPUType ParseGPUType(const std::string &device_name);
const std::string ParseDeviceVersion(const std::string &device_version);
void SaveBuiltCLProgram();
const bool is_profiling_enabled() const;
private:
OpenCLRuntime(GPUPerfHint, GPUPriorityHint);
@@ -116,6 +117,7 @@ class OpenCLRuntime {
std::string platform_info_;
bool program_map_changed_;
std::unique_ptr<KVStorage> storage_;
bool is_profiling_enabled_;
static GPUPerfHint kGPUPerfHint;
static GPUPriorityHint kGPUPriorityHint;
......
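
The new is_profiling_enabled_ flag simply records whether CL_QUEUE_PROFILING_ENABLE was added to the queue properties, so the kernel code further down can skip GetCallStats when no timing data would be available. GetCallStats itself does not appear in this diff; the sketch below shows what such a query typically does with the standard OpenCL profiling API and is an assumption, not MACE source:

// Sketch of reading start/end times from a profiled event via the standard
// clGetEventProfilingInfo API (GetCallStats is assumed to do something similar).
#include <CL/cl.h>
#include <cstdint>

void QueryEventMicros(cl_event event, int64_t *start_micros, int64_t *end_micros) {
  cl_ulong start_ns = 0, end_ns = 0;
  clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
                          sizeof(start_ns), &start_ns, nullptr);
  clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
                          sizeof(end_ns), &end_ns, nullptr);
  *start_micros = static_cast<int64_t>(start_ns) / 1000;  // ns -> us
  *end_micros = static_cast<int64_t>(end_ns) / 1000;
}
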
@@ -168,6 +168,11 @@ class OpenCLLibraryImpl final {
size_t,
void *,
size_t *);
using clGetEventInfoFunc = cl_int (*)(cl_event event,
cl_event_info param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret);
using clGetEventProfilingInfoFunc = cl_int (*)(cl_event event,
cl_profiling_info param_name,
size_t param_value_size,
@@ -221,6 +226,7 @@ class OpenCLLibraryImpl final {
MACE_CL_DEFINE_FUNC_PTR(clReleaseDevice);
MACE_CL_DEFINE_FUNC_PTR(clRetainEvent);
MACE_CL_DEFINE_FUNC_PTR(clGetKernelWorkGroupInfo);
MACE_CL_DEFINE_FUNC_PTR(clGetEventInfo);
MACE_CL_DEFINE_FUNC_PTR(clGetEventProfilingInfo);
MACE_CL_DEFINE_FUNC_PTR(clGetImageInfo);
@@ -344,6 +350,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
MACE_CL_ASSIGN_FROM_DLSYM(clReleaseDevice);
MACE_CL_ASSIGN_FROM_DLSYM(clRetainEvent);
MACE_CL_ASSIGN_FROM_DLSYM(clGetKernelWorkGroupInfo);
MACE_CL_ASSIGN_FROM_DLSYM(clGetEventInfo);
MACE_CL_ASSIGN_FROM_DLSYM(clGetEventProfilingInfo);
MACE_CL_ASSIGN_FROM_DLSYM(clGetImageInfo);
@@ -881,6 +888,21 @@ CL_API_ENTRY cl_int clReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 {
return func(event);
}
// Event API
CL_API_ENTRY cl_int clGetEventInfo(cl_event event,
cl_event_info param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret)
CL_API_SUFFIX__VERSION_1_0 {
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clGetEventInfo;
MACE_CHECK_NOTNULL(func);
MACE_LATENCY_LOGGER(3, "clGetEventInfo");
return func(event, param_name, param_value_size, param_value,
param_value_size_ret);
}
// Profiling APIs
CL_API_ENTRY cl_int clGetEventProfilingInfo(cl_event event,
cl_profiling_info param_name,
......
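
The dynamic-loading wrapper now resolves clGetEventInfo from the vendor library alongside clGetEventProfilingInfo. For reference, a common use of this standard OpenCL 1.x entry point is polling an event's execution status; the example below is illustrative and not taken from this merge request:

// Illustrative use of clGetEventInfo (standard OpenCL 1.x API, not from this MR).
#include <CL/cl.h>

bool EventCompleted(cl_event event) {
  cl_int status = CL_QUEUED;
  cl_int err = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
                              sizeof(status), &status, nullptr);
  return err == CL_SUCCESS && status == CL_COMPLETE;
}
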
@@ -137,6 +137,9 @@ static void ConcatN(cl::Kernel *kernel,
const int inputs_count = input_list.size();
index_t chan_blk_offset = 0;
cl::Event event;
CallStats call_stats{INT64_MAX, 0};
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
for (int i = 0; i < inputs_count; ++i) {
const Tensor *input = input_list[i];
index_t input_channel_blk = input->dim(3) / 4;
@@ -160,18 +163,45 @@ static void ConcatN(cl::Kernel *kernel,
kernel->setArg(idx++, *(output->opencl_image()));
chan_blk_offset += input_channel_blk;
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
std::stringstream ss;
ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_"
<< batch * height;
TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t j = 0; j < 3; ++j) {
roundup_gws[j] = RoundUp(gws[j], lws[j]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CHECK_CL_SUCCESS(error);
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
if (runtime->is_profiling_enabled()) {
CallStats tmp_stats;
runtime->GetCallStats(event, &tmp_stats);
call_stats.start_micros = std::min<int64_t>(tmp_stats.start_micros,
call_stats.start_micros);
call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
}
}
if (future != nullptr) {
future->wait_fn = [runtime, event, call_stats](CallStats *stats) {
event.wait();
if (stats != nullptr) {
stats->start_micros = call_stats.start_micros;
stats->end_micros = stats->start_micros + call_stats.end_micros;
}
};
}
}
......
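
When the device does not support non-uniform work-groups, both loops round the global work size up to a multiple of the local size before enqueueing. RoundUp is a MACE utility not shown in this diff; its assumed behaviour is the usual ceiling-to-a-multiple helper, sketched here under a hypothetical name:

// Assumed behaviour of RoundUp (hypothetical stand-in, not copied from MACE):
// round value up to the nearest multiple of factor.
#include <cstdint>

uint32_t RoundUpToMultiple(uint32_t value, uint32_t factor) {
  return (value + factor - 1) / factor * factor;
}
// e.g. RoundUpToMultiple(50, 8) == 56, so each roundup_gws[j] becomes a
// multiple of lws[j]; the extra out-of-range work items are expected to be
// guarded against inside the kernel.
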
@@ -63,13 +63,8 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
};
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
std::stringstream ss;
ss << "slice_opencl_kernel_"
<< input->dim(0) << "_"
<< input->dim(1) << "_"
<< input->dim(2) << "_"
<< input_channels << "_"
<< outputs_count;
cl::Event event;
CallStats call_stats{INT64_MAX, 0};
for (int i = 0; i < outputs_count; ++i) {
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
@@ -85,13 +80,45 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t j = 0; j < 3; ++j) {
roundup_gws[j] = RoundUp(gws[j], lws[j]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CHECK_CL_SUCCESS(error);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
if (runtime->is_profiling_enabled()) {
CallStats tmp_stats;
runtime->GetCallStats(event, &tmp_stats);
call_stats.start_micros = std::min<int64_t>(tmp_stats.start_micros,
call_stats.start_micros);
call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
}
}
if (future != nullptr) {
future->wait_fn = [runtime, event, call_stats](CallStats *stats) {
event.wait();
if (stats != nullptr) {
stats->start_micros = call_stats.start_micros;
stats->end_micros = stats->start_micros + call_stats.end_micros;
}
};
}
}
......
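
With the tuner out of the loop, the future returned by both functors waits only on the last event and reports the aggregated timings. A hypothetical caller-side helper (the StatsFuture/CallStats layout is assumed from how wait_fn is assigned above; the helper's name is made up):

// Hypothetical helper showing how the aggregated stats could be read back.
#include <cstdint>
#include <functional>

struct CallStats { int64_t start_micros; int64_t end_micros; };    // assumed
struct StatsFuture { std::function<void(CallStats *)> wait_fn; };  // assumed

int64_t WaitAndGetKernelMicros(const StatsFuture &future) {
  CallStats stats{0, 0};
  if (future.wait_fn) {
    future.wait_fn(&stats);  // blocks on the last event, fills aggregated stats
  }
  return stats.end_micros - stats.start_micros;  // summed kernel durations
}
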
@@ -151,12 +151,17 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
int num_inputs = shapes.size();
int concat_axis_size = 0;
// Construct graph
std::vector<std::vector<float>> inputs(num_inputs, std::vector<float>());
std::vector<const float*> input_ptrs(num_inputs);
OpsTestNet net;
for (int i = 0; i < num_inputs; ++i) {
const std::string input_name = MakeString("Input", i);
const std::string image_name = MakeString("InputImage", i);
concat_axis_size += shapes[i][axis];
net.AddRandomInput<DeviceType::OPENCL, float>(input_name, shapes[i]);
GenerateRandomRealTypeData(shapes[i], &inputs[i]);
input_ptrs[i] = inputs[i].data();
net.AddInputFromArray<DeviceType::OPENCL, float>(input_name,
shapes[i], inputs[i]);
BufferToImage<DeviceType::OPENCL, T>(&net, input_name, image_name,
kernels::BufferType::IN_OUT_CHANNEL);
}
@@ -186,17 +191,15 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
Tensor::MappingGuard output_mapper(output);
const float *output_ptr = output->data<float>();
const float *output_ptr_end = output_ptr + output->size();
int k = 0;
while (output_ptr != (output->data<float>() + output->size())) {
while (output_ptr != output_ptr_end) {
for (int i = 0; i < num_inputs; ++i) {
index_t num_elements =
std::accumulate(shapes[i].begin() + axis, shapes[i].end(), 1,
std::multiplies<index_t>());
const std::string input_name = MakeString("Input", i);
const Tensor *input_tensor = net.GetTensor(input_name.data());
Tensor::MappingGuard input_guard(input_tensor);
const float *input_ptr = input_tensor->data<float>() + k * num_elements;
const float *input_ptr = input_ptrs[i] + k * num_elements;
for (int j = 0; j < num_elements; ++j) {
EXPECT_NEAR(*(input_ptr + j), *output_ptr++, 1e-2)
<< "With index: " << i << ", " << j;
......