diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc index a02a0d585ec0f5ce82ed45c306c941a019925935..b4644e652f2c2a82660e6dfd31bd74f3208383b0 100644 --- a/mace/benchmark/benchmark_model.cc +++ b/mace/benchmark/benchmark_model.cc @@ -12,6 +12,7 @@ #include "gflags/gflags.h" #include "mace/public/mace.h" +#include "mace/public/mace_runtime.h" #include "mace/utils/logging.h" #include "mace/benchmark/stat_summarizer.h" @@ -95,9 +96,23 @@ inline int64_t NowMicros() { return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; } +DeviceType ParseDeviceType(const std::string &device_str) { + if (device_str.compare("CPU") == 0) { + return DeviceType::CPU; + } else if (device_str.compare("NEON") == 0) { + return DeviceType::NEON; + } else if (device_str.compare("OPENCL") == 0) { + return DeviceType::OPENCL; + } else if (device_str.compare("HEXAGON") == 0) { + return DeviceType::HEXAGON; + } else { + return DeviceType::CPU; + } +} + bool RunInference(MaceEngine *engine, - const std::vector &input_infos, - std::map *output_infos, + const std::map &input_infos, + std::map *output_infos, StatSummarizer *summarizer, int64_t *inference_time_us) { MACE_CHECK_NOTNULL(output_infos); @@ -106,28 +121,16 @@ bool RunInference(MaceEngine *engine, if (summarizer) { run_metadata_ptr = &run_metadata; } - if (input_infos.size() == 1 && output_infos->size() == 1) { - const int64_t start_time = NowMicros(); - bool s = engine->Run(input_infos[0].data, input_infos[0].shape, - output_infos->begin()->second, run_metadata_ptr); - const int64_t end_time = NowMicros(); - if (!s) { - LOG(ERROR) << "Error during inference."; - return s; - } - *inference_time_us = end_time - start_time; - } else { - const int64_t start_time = NowMicros(); - bool s = engine->Run(input_infos, *output_infos, run_metadata_ptr); - const int64_t end_time = NowMicros(); + const int64_t start_time = NowMicros(); + mace::MaceStatus s = engine->Run(input_infos, output_infos, run_metadata_ptr); + const int64_t end_time = NowMicros(); - if (!s) { - LOG(ERROR) << "Error during inference."; - return s; - } - *inference_time_us = end_time - start_time; + if (s != mace::MaceStatus::MACE_SUCCESS) { + LOG(ERROR) << "Error during inference."; + return false; } + *inference_time_us = end_time - start_time; if (summarizer != nullptr) { summarizer->ProcessMetadata(run_metadata); @@ -137,8 +140,8 @@ bool RunInference(MaceEngine *engine, } bool Run(MaceEngine *engine, - const std::vector &input_infos, - std::map *output_infos, + const std::map &input_infos, + std::map *output_infos, StatSummarizer *summarizer, int num_runs, double max_time_sec, @@ -261,12 +264,7 @@ int Main(int argc, char **argv) { stats_options.show_summary = FLAGS_show_summary; stats.reset(new StatSummarizer(stats_options)); - DeviceType device_type = CPU; - if (FLAGS_device == "OPENCL") { - device_type = OPENCL; - } else if (FLAGS_device == "NEON") { - device_type = NEON; - } + mace::DeviceType device_type = ParseDeviceType(FLAGS_device); // config runtime mace::ConfigOmpThreads(FLAGS_omp_num_threads); @@ -302,50 +300,45 @@ int Main(int argc, char **argv) { mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str()); NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data); - std::vector input_infos(input_count); - std::map output_infos; - std::vector> input_datas(input_count); - std::vector> output_datas(output_count); - + std::map inputs; + std::map outputs; for (size_t i = 0; i < input_count; ++i) { - int64_t input_size = std::accumulate(input_shape_vec[i].begin(), - input_shape_vec[i].end(), 1, - std::multiplies()); - input_datas[i].reset(new float[input_size]); + // Allocate input and output + int64_t input_size = + std::accumulate(input_shape_vec[i].begin(), input_shape_vec[i].end(), 1, + std::multiplies()); + auto buffer_in = std::shared_ptr(new float[input_size], + std::default_delete()); // load input std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]), std::ios::in | std::ios::binary); if (in_file.is_open()) { - in_file.read(reinterpret_cast(input_datas[i].get()), + in_file.read(reinterpret_cast(buffer_in.get()), input_size * sizeof(float)); in_file.close(); } else { LOG(INFO) << "Open input file failed"; return -1; } - - input_infos[i].name = input_names[i]; - input_infos[i].shape = input_shape_vec[i]; - input_infos[i].data = input_datas[i].get(); + inputs[input_names[i]] = mace::MaceTensor(input_shape_vec[i], buffer_in); } + for (size_t i = 0; i < output_count; ++i) { - int64_t output_size = std::accumulate(output_shape_vec[i].begin(), - output_shape_vec[i].end(), 1, - std::multiplies()); - output_datas[i].reset(new float[output_size]); - output_infos[output_names[i]] = output_datas[i].get(); + int64_t output_size = + std::accumulate(output_shape_vec[i].begin(), + output_shape_vec[i].end(), 1, + std::multiplies()); + auto buffer_out = std::shared_ptr(new float[output_size], + std::default_delete()); + outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i], + buffer_out); } // Init model LOG(INFO) << "Run init"; - std::unique_ptr engine_ptr; - if (input_count == 1 && output_count == 1) { - engine_ptr.reset(new mace::MaceEngine(&net_def, device_type)); - } else { - engine_ptr.reset(new mace::MaceEngine(&net_def, device_type, - input_names, output_names)); - } - if (device_type == DeviceType::OPENCL) { + std::unique_ptr engine_ptr( + new mace::MaceEngine(&net_def, device_type, input_names, output_names)); + if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { mace::MACE_MODEL_TAG::UnloadModelData(model_data); } @@ -355,7 +348,7 @@ int Main(int argc, char **argv) { int64_t num_warmup_runs = 0; if (FLAGS_warmup_runs > 0) { bool status = - Run(engine_ptr.get(), input_infos, &output_infos, nullptr, + Run(engine_ptr.get(), inputs, &outputs, nullptr, FLAGS_warmup_runs, -1.0, inter_inference_sleep_seconds, &warmup_time_us, &num_warmup_runs); if (!status) { @@ -370,7 +363,7 @@ int Main(int argc, char **argv) { int64_t no_stat_time_us = 0; int64_t no_stat_runs = 0; bool status = - Run(engine_ptr.get(), input_infos, &output_infos, + Run(engine_ptr.get(), inputs, &outputs, nullptr, FLAGS_max_num_runs, max_benchmark_time_seconds, inter_inference_sleep_seconds, &no_stat_time_us, &no_stat_runs); if (!status) { @@ -379,7 +372,7 @@ int Main(int argc, char **argv) { int64_t stat_time_us = 0; int64_t stat_runs = 0; - status = Run(engine_ptr.get(), input_infos, &output_infos, + status = Run(engine_ptr.get(), inputs, &outputs, stats.get(), FLAGS_max_num_runs, max_benchmark_time_seconds, inter_inference_sleep_seconds, &stat_time_us, &stat_runs); if (!status) { diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 01f370c51eb3a882d2c93fc5005e20e46d9b27f6..0728d5f0905af0b367fb77df4b986e3f9256ff74 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -480,12 +480,12 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) { } const bool OpenCLRuntime::IsNonUniformWorkgroupsSupported() { - if (gpu_type_ == GPUType::QUALCOMM_ADRENO && - opencl_version_ == "2.0") { - return true; - } else { - return false; - } + return (gpu_type_ == GPUType::QUALCOMM_ADRENO && + opencl_version_ == "2.0"); +} + +const GPUType OpenCLRuntime::gpu_type() const { + return gpu_type_; } const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName( diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index 3814eb41e8551363150f3ed3fb777f8d6ea73300..3f5261b860bf43a91867214b476edab4ff009e50 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -66,6 +66,7 @@ class OpenCLRuntime { uint64_t GetKernelWaveSize(const cl::Kernel &kernel); const bool IsNonUniformWorkgroupsSupported(); const GPUType ParseGPUTypeFromDeviceName(const std::string &device_name); + const GPUType gpu_type() const; cl::Kernel BuildKernel(const std::string &program_name, const std::string &kernel_name, const std::set &build_options); diff --git a/mace/kernels/cwise.h b/mace/kernels/cwise.h index 073f5c48dbbd6d576acc2e9c39492b7522af2b38..07e03e7ff2d77cf0b907c9d88ee5bff221a96f80 100644 --- a/mace/kernels/cwise.h +++ b/mace/kernels/cwise.h @@ -114,6 +114,7 @@ struct CWiseFunctor : CWiseFunctorBase { StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; std::vector input_shape_; }; diff --git a/mace/kernels/opencl/cl/activation.cl b/mace/kernels/opencl/cl/activation.cl index 42afc7012528242475b3fc61a8a9bdfdb5623772..2978f4022e67ffa13b3e318bcd75490ae66a8d1b 100644 --- a/mace/kernels/opencl/cl/activation.cl +++ b/mace/kernels/opencl/cl/activation.cl @@ -1,7 +1,6 @@ #include -__kernel void activation( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void activation(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, #ifdef USE_PRELU __read_only image2d_t alpha, diff --git a/mace/kernels/opencl/cl/addn.cl b/mace/kernels/opencl/cl/addn.cl index d0604f9ed074c5a5d2729fd8c66751d9ab7b751b..30f52247f22e95718239e60e956bf856eba65f39 100644 --- a/mace/kernels/opencl/cl/addn.cl +++ b/mace/kernels/opencl/cl/addn.cl @@ -1,7 +1,6 @@ #include -__kernel void addn( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void addn(GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input1, #if INPUT_NUM > 2 diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl index 0075932dbac599780803ac7041da293dfbbc1447..290b6c1a12216d0771bcfb65b6c81660e4e59833 100644 --- a/mace/kernels/opencl/cl/batch_norm.cl +++ b/mace/kernels/opencl/cl/batch_norm.cl @@ -1,7 +1,6 @@ #include // Supported data types: half/float -__kernel void batch_norm( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void batch_norm(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset, diff --git a/mace/kernels/opencl/cl/bias_add.cl b/mace/kernels/opencl/cl/bias_add.cl index a2d99abcc8e21e19e0710db8f752df3a6032d56f..64de2d77ff8371c6acafcea1bb4afdc00b105a8d 100644 --- a/mace/kernels/opencl/cl/bias_add.cl +++ b/mace/kernels/opencl/cl/bias_add.cl @@ -1,7 +1,6 @@ #include // Supported data types: half/float -__kernel void bias_add( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void bias_add(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t output) { diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl index 86071708117efe6a7d4f0580d0324e2ad0701962..e300bc51e2e429bd2c232068cba976269a2081d6 100644 --- a/mace/kernels/opencl/cl/buffer_to_image.cl +++ b/mace/kernels/opencl/cl/buffer_to_image.cl @@ -1,7 +1,6 @@ #include -__kernel void filter_buffer_to_image( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, /* h, w, oc, ic */ __private const int input_offset, __private const int filter_h, @@ -53,8 +52,7 @@ __kernel void filter_buffer_to_image( WRITE_IMAGET(output, coord, values); } -__kernel void filter_image_to_buffer( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, /* h, w, oc, ic */ __private const int filter_h, __private const int filter_w, @@ -102,8 +100,7 @@ __kernel void filter_image_to_buffer( } } -__kernel void dw_filter_buffer_to_image( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void dw_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, /* h, w, ic, m */ __private const int input_offset, __private const int filter_w, @@ -160,8 +157,7 @@ __kernel void dw_filter_buffer_to_image( WRITE_IMAGET(output, coord, values); } -__kernel void in_out_buffer_to_image( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void in_out_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, /* nhwc */ __private const int input_offset, __private const int height, @@ -202,8 +198,7 @@ __kernel void in_out_buffer_to_image( WRITE_IMAGET(output, coord, values); } -__kernel void in_out_image_to_buffer( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void in_out_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, /* nhwc */ __private const int height, __private const int width, @@ -242,8 +237,7 @@ __kernel void in_out_image_to_buffer( } } -__kernel void arg_buffer_to_image( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void arg_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, /* nhwc */ __private const int input_offset, __private const int count, @@ -278,8 +272,7 @@ __kernel void arg_buffer_to_image( WRITE_IMAGET(output, coord, values); } -__kernel void arg_image_to_buffer( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void arg_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, /* nhwc */ __private const int count, __read_only image2d_t input) { @@ -312,8 +305,7 @@ __kernel void arg_image_to_buffer( } -__kernel void in_out_height_buffer_to_image( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void in_out_height_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, //nhwc __private const int input_offset, __private const int height, @@ -355,8 +347,7 @@ __kernel void in_out_height_buffer_to_image( WRITE_IMAGET(output, coord, values); } -__kernel void in_out_height_image_to_buffer( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void in_out_height_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, //nhwc __private const int height, __private const int width, @@ -394,8 +385,7 @@ __kernel void in_out_height_image_to_buffer( } -__kernel void in_out_width_buffer_to_image( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void in_out_width_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, /* nhwc */ __private const int input_offset, __private const int height, @@ -437,8 +427,7 @@ __kernel void in_out_width_buffer_to_image( } // only support 3x3 now -__kernel void winograd_filter_buffer_to_image( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void winograd_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, //Oc, Ic, H, W __private const int input_offset, __private const int in_channels, @@ -529,8 +518,7 @@ __kernel void winograd_filter_buffer_to_image( } // only support 3x3 now -__kernel void winograd_filter_image_to_buffer( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void winograd_filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, //Oc, Ic, H, W __private const int height, __private const int width, diff --git a/mace/kernels/opencl/cl/channel_shuffle.cl b/mace/kernels/opencl/cl/channel_shuffle.cl index 3fa2894e8bf60b8e7528ccd2562fc179afd9f46e..92ff94473d7964925f67bfc3517194dc1111a4ee 100644 --- a/mace/kernels/opencl/cl/channel_shuffle.cl +++ b/mace/kernels/opencl/cl/channel_shuffle.cl @@ -1,8 +1,7 @@ #include // assume channes_per_group mod 4 = 0 && groups mod 4 == 0 -__kernel void channel_shuffle( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void channel_shuffle(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int groups, __private const int channels_per_group, diff --git a/mace/kernels/opencl/cl/common.h b/mace/kernels/opencl/cl/common.h index 6e698b5c61c7a7940f58440baf9e48f13c9f34da..b68bca075491fba94732c9ab998751284a232a55 100644 --- a/mace/kernels/opencl/cl/common.h +++ b/mace/kernels/opencl/cl/common.h @@ -19,18 +19,18 @@ #ifndef NON_UNIFORM_WORK_GROUP -#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 \ +#define GLOBAL_WORK_GROUP_SIZE_DIM2 \ __private const int global_size_dim0, \ __private const int global_size_dim1, -#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 \ +#define GLOBAL_WORK_GROUP_SIZE_DIM3 \ __private const int global_size_dim0, \ __private const int global_size_dim1, \ __private const int global_size_dim2, #else -#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 -#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +#define GLOBAL_WORK_GROUP_SIZE_DIM2 +#define GLOBAL_WORK_GROUP_SIZE_DIM3 #endif diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/kernels/opencl/cl/concat.cl index 3b7370a8a30ba21a0c22305d1ef84e66314d7153..0e171e0f4dac7bbd41856e1a43518aacbb6ffa12 100644 --- a/mace/kernels/opencl/cl/concat.cl +++ b/mace/kernels/opencl/cl/concat.cl @@ -22,8 +22,7 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left, } // Supported data type: half/float -__kernel void concat_channel( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void concat_channel(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input0, __read_only image2d_t input1, __private const int input0_chan, @@ -84,8 +83,7 @@ __kernel void concat_channel( } // Required: All input channels are divisible by 4 -__kernel void concat_channel_multi( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void concat_channel_multi(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int chan_blk_offset, __write_only image2d_t output) { diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl index 1383557d89d96b4a282773f16dabbae59b7b798b..f88885b06c7b6a4ac9efb91a950e6e94b4e54076 100644 --- a/mace/kernels/opencl/cl/conv_2d.cl +++ b/mace/kernels/opencl/cl/conv_2d.cl @@ -1,7 +1,6 @@ #include -__kernel void conv_2d( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void conv_2d(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */ #ifdef BIAS diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index e993a159e3e82fa5c110881647ffb290b75c4832..a5454a6700eeb78573994833f9fa1bc3bb0029ff 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -1,7 +1,6 @@ #include -__kernel void conv_2d_1x1( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void conv_2d_1x1(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * cin, cout/4 */ #ifdef BIAS diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index 8bc27b33569109fad1e9207c910299c6ebcaac0b..19a636bb109d6b9fb04577548c0c72e0824f0e88 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -1,7 +1,6 @@ #include -__kernel void conv_2d_3x3( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void conv_2d_3x3(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */ #ifdef BIAS diff --git a/mace/kernels/opencl/cl/cwise.cl b/mace/kernels/opencl/cl/cwise.cl index 16f1f0851f98abdae95fb936a7ee6c4f449d0b96..92cdaf7ea20ef7e77467a52b494b3c72506269c1 100644 --- a/mace/kernels/opencl/cl/cwise.cl +++ b/mace/kernels/opencl/cl/cwise.cl @@ -1,11 +1,16 @@ #include -__kernel void cwise(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ - __private const float value, - __write_only image2d_t output) { +__kernel void cwise(GLOBAL_WORK_GROUP_SIZE_DIM2 + __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ + __private const float value, + __write_only image2d_t output) { const int w = get_global_id(0); const int hb = get_global_id(1); +#ifndef NON_UNIFORM_WORK_GROUP + if (w >= global_size_dim0 || hb >= global_size_dim1) return; +#endif + DATA_TYPE4 in0 = READ_IMAGET(input, SAMPLER, (int2)(w, hb)); DATA_TYPE4 in1 = (DATA_TYPE4){value, value, value, value}; DATA_TYPE4 out; diff --git a/mace/kernels/opencl/cl/depth_to_space.cl b/mace/kernels/opencl/cl/depth_to_space.cl index 21045ec94fe2d2eac962294fb09bdf3041e20e49..8d989290b6496bd8e1f4797f711e6002fbc189d8 100644 --- a/mace/kernels/opencl/cl/depth_to_space.cl +++ b/mace/kernels/opencl/cl/depth_to_space.cl @@ -1,7 +1,6 @@ #include -__kernel void depth_to_space( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void depth_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int block_size, __private const int input_height, @@ -36,7 +35,7 @@ __kernel void depth_to_space( } __kernel void space_to_depth( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int block_size, __private const int input_height, diff --git a/mace/kernels/opencl/cl/depthwise_conv2d.cl b/mace/kernels/opencl/cl/depthwise_conv2d.cl index fff19613c9dfad3f3e4a80fed57c60e99d1ec43f..c71ec4049ab6218c4d22ee446e371ba8c3622cab 100644 --- a/mace/kernels/opencl/cl/depthwise_conv2d.cl +++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl @@ -1,8 +1,7 @@ #include // Only multiplier = 1 is supported -__kernel void depthwise_conv2d( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void depthwise_conv2d(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */ #ifdef BIAS @@ -138,8 +137,7 @@ __kernel void depthwise_conv2d( WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3); } -__kernel void depthwise_conv2d_s1( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void depthwise_conv2d_s1(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */ #ifdef BIAS diff --git a/mace/kernels/opencl/cl/eltwise.cl b/mace/kernels/opencl/cl/eltwise.cl index 8509dc38286454d26ae46d85f82407a9c346e84f..0b9647f50574c7522ce54631980cb21c01038361 100644 --- a/mace/kernels/opencl/cl/eltwise.cl +++ b/mace/kernels/opencl/cl/eltwise.cl @@ -1,7 +1,6 @@ #include -__kernel void eltwise( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void eltwise(GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input1, #ifdef COEFF_SUM diff --git a/mace/kernels/opencl/cl/fully_connected.cl b/mace/kernels/opencl/cl/fully_connected.cl index 057a66a4ca6514fb991870f65c813ffcedb75623..3205e4921b4da3e44bbb5aa7cfa871ef3bad2ec1 100644 --- a/mace/kernels/opencl/cl/fully_connected.cl +++ b/mace/kernels/opencl/cl/fully_connected.cl @@ -1,7 +1,8 @@ #include // output = weight * input + bias -__kernel void fully_connected(__read_only image2d_t input, +__kernel void fully_connected(GLOBAL_WORK_GROUP_SIZE_DIM2 + __read_only image2d_t input, __read_only image2d_t weight, #ifdef BIAS __read_only image2d_t bias, @@ -15,6 +16,10 @@ __kernel void fully_connected(__read_only image2d_t input, const int out_blk_idx = get_global_id(1); const int input_chan_blk = (input_channel + 3) >> 2; +#ifndef NON_UNIFORM_WORK_GROUP + if (batch_idx >= global_size_dim0 || out_blk_idx >= global_size_dim1) return; +#endif + float4 input_value; float4 w0, w1, w2, w3; @@ -57,7 +62,8 @@ __kernel void fully_connected(__read_only image2d_t input, } // output = weight * input + bias -__kernel void fully_connected_width(__read_only image2d_t input, +__kernel void fully_connected_width(GLOBAL_WORK_GROUP_SIZE_DIM3 + __read_only image2d_t input, __read_only image2d_t weight, #ifdef BIAS __read_only image2d_t bias, @@ -73,6 +79,7 @@ __kernel void fully_connected_width(__read_only image2d_t input, const int width_blk_idx = get_global_id(1); const int width_blk_count = get_global_size(1); const int batch_out_blk_idx = get_global_id(2); + const int batch_idx = batch_out_blk_idx / out_blks; const int out_blk_idx = batch_out_blk_idx % out_blks; @@ -115,6 +122,16 @@ __kernel void fully_connected_width(__read_only image2d_t input, short inter_idx = mad24((short)get_local_id(2), local_size, inter_out_offset); intermediate_output[inter_idx] = sum; +#ifdef NON_QUALCOMM_ADRENO + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#ifndef NON_UNIFORM_WORK_GROUP + if (batch_out_blk_idx >= global_size_dim2) { + return; + } +#endif + if (inter_out_offset == 0) { #ifdef BIAS DATA_TYPE4 result = READ_IMAGET(bias, SAMPLER, (int2)(out_blk_idx, 0)); @@ -122,7 +139,7 @@ __kernel void fully_connected_width(__read_only image2d_t input, DATA_TYPE4 result = (DATA_TYPE4)(0, 0, 0, 0); #endif - for(short i = 0; i < local_width_blk_size; ++i) { + for (short i = 0; i < local_width_blk_size; ++i) { result += vload4(0, intermediate_output+inter_idx); inter_idx += 4; } diff --git a/mace/kernels/opencl/cl/matmul.cl b/mace/kernels/opencl/cl/matmul.cl index fe260e7a22477ea958936b30378b439c8c94fb2f..82ccf6bad13a718f5777957fbebb44fd8e8b14df 100644 --- a/mace/kernels/opencl/cl/matmul.cl +++ b/mace/kernels/opencl/cl/matmul.cl @@ -1,8 +1,7 @@ #include // C = A * B -__kernel void matmul( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void matmul(GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t A, __read_only image2d_t B, __write_only image2d_t C, diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl index ead839940e5081e6d90e841f3eda569339a2ffa1..25785bb2a089eb4aed28e46fc434403fc365f1bc 100644 --- a/mace/kernels/opencl/cl/pooling.cl +++ b/mace/kernels/opencl/cl/pooling.cl @@ -19,8 +19,7 @@ inline int calculate_avg_block_size(const int pool_size, } // Supported data type: half/float -__kernel void pooling( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void pooling(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int in_height, __private const int in_width, diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl index 83e6df85c5c7c1c4b1ee9facf62d73c7cd0f5a58..2b0464c70d2042908345a1b360af8c5a4d91a15c 100644 --- a/mace/kernels/opencl/cl/resize_bilinear.cl +++ b/mace/kernels/opencl/cl/resize_bilinear.cl @@ -1,7 +1,6 @@ #include -__kernel void resize_bilinear_nocache( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void resize_bilinear_nocache(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __write_only image2d_t output, __private const float height_scale, diff --git a/mace/kernels/opencl/cl/slice.cl b/mace/kernels/opencl/cl/slice.cl index eccdd882c75a809804d61599b5288a432d2d432e..0692c62b70f4ed0933bb94627c16413d2602d2ab 100644 --- a/mace/kernels/opencl/cl/slice.cl +++ b/mace/kernels/opencl/cl/slice.cl @@ -1,7 +1,6 @@ #include -__kernel void slice( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void slice(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int chan_blk_offset, __write_only image2d_t output) { diff --git a/mace/kernels/opencl/cl/softmax.cl b/mace/kernels/opencl/cl/softmax.cl index 628d71cbb3a22b4ca240446c7e7c889dc1fc55ab..b5b99de6c0ef92064174a14e16014ddb22093eb9 100644 --- a/mace/kernels/opencl/cl/softmax.cl +++ b/mace/kernels/opencl/cl/softmax.cl @@ -1,7 +1,6 @@ #include -__kernel void softmax( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void softmax(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int channels, __private const int remain_channels, diff --git a/mace/kernels/opencl/cl/space_to_batch.cl b/mace/kernels/opencl/cl/space_to_batch.cl index 1e2024043f97f835c03c41ecc03fea7b86617ed2..431a599705b7a522ff4e366c5c73a173b38d9673 100644 --- a/mace/kernels/opencl/cl/space_to_batch.cl +++ b/mace/kernels/opencl/cl/space_to_batch.cl @@ -1,7 +1,6 @@ #include -__kernel void space_to_batch( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void space_to_batch(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t space_data, __write_only image2d_t batch_data, __private const int block_height, @@ -48,8 +47,7 @@ __kernel void space_to_batch( WRITE_IMAGET(batch_data, batch_coord, value); } -__kernel void batch_to_space( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void batch_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t batch_data, __write_only image2d_t space_data, __private const int block_height, diff --git a/mace/kernels/opencl/cl/winograd_transform.cl b/mace/kernels/opencl/cl/winograd_transform.cl index d447001e6ccd09f44f2d60c658be778c0e1fbff9..0cab37d750510f1f7bedb02ceddfb49577e4ee31 100644 --- a/mace/kernels/opencl/cl/winograd_transform.cl +++ b/mace/kernels/opencl/cl/winograd_transform.cl @@ -1,7 +1,6 @@ #include -__kernel void winograd_transform_2x2( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void winograd_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input, __write_only image2d_t output, __private const int in_height, @@ -116,8 +115,7 @@ __kernel void winograd_transform_2x2( } } -__kernel void winograd_inverse_transform_2x2( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void winograd_inverse_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input, #ifdef BIAS __read_only image2d_t bias, /* cout%4 * cout/4 */ diff --git a/mace/kernels/opencl/cwise_opencl.cc b/mace/kernels/opencl/cwise_opencl.cc index bd839c556ede14ffce77e689f0d9476f3134e40e..dce3d14d69f7a60f4a9cc928b7b2e12a1cbc8c73 100644 --- a/mace/kernels/opencl/cwise_opencl.cc +++ b/mace/kernels/opencl/cwise_opencl.cc @@ -23,8 +23,10 @@ void CWiseFunctor::operator()(const Tensor *input, const index_t width_pixels = channel_blocks * width; const index_t batch_height_pixels = batch * height; + auto runtime = OpenCLRuntime::Global(); + const uint32_t gws[2] = {static_cast(width_pixels), + static_cast(batch_height_pixels)}; if (kernel_.get() == nullptr) { - auto runtime = OpenCLRuntime::Global(); std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("cwise"); @@ -32,19 +34,27 @@ void CWiseFunctor::operator()(const Tensor *input, built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(MakeString("-DCWISE_TYPE=", type_)); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } kernel_ = runtime->BuildKernel("cwise", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel_.setArg(idx++, gws[0]); + kernel_.setArg(idx++, gws[1]); + } kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, static_cast(coeff_)); kernel_.setArg(idx++, *(output->opencl_image())); input_shape_ = input->shape(); } - const uint32_t gws[2] = {static_cast(width_pixels), - static_cast(batch_height_pixels)}; - const std::vector lws = {64, 16, 1}; + const std::vector lws = {kwg_size_ / 16, 16, 1}; std::stringstream ss; ss << "cwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); diff --git a/mace/kernels/opencl/fully_connected_opencl.cc b/mace/kernels/opencl/fully_connected_opencl.cc index 70af952ee8da6f7cd475f9c9aaa4020ac44f20e5..3178b8ae1125e1dcc6b5efd5daebe4fc73df2168 100644 --- a/mace/kernels/opencl/fully_connected_opencl.cc +++ b/mace/kernels/opencl/fully_connected_opencl.cc @@ -27,6 +27,10 @@ void FCWXKernel(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); if (kernel->get() == nullptr) { + const index_t batch = output->dim(0); + const index_t output_size = output->dim(3); + const index_t output_blocks = RoundUpDiv4(output_size); + std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected"); @@ -55,28 +59,47 @@ void FCWXKernel(cl::Kernel *kernel, default: LOG(FATAL) << "Unknown activation type: " << activation; } + if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { + built_options.emplace("-DNON_QUALCOMM_ADRENO"); + } + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } *kernel = runtime->BuildKernel("fully_connected", kernel_name, built_options); - const index_t batch = output->dim(0); - const index_t output_size = output->dim(3); - const index_t output_blocks = RoundUpDiv4(output_size); - const uint32_t wave_size = - static_cast(runtime->GetKernelWaveSize(*kernel)); + if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + const uint32_t wave_size = + static_cast(runtime->GetKernelWaveSize(*kernel)); - *gws = {4, (wave_size / 4), static_cast(batch * output_blocks)}; + *gws = {4, (wave_size / 4), static_cast(batch * output_blocks)}; - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]); - *lws = {(*gws)[0], (*gws)[1], inter_local_blks}; + const uint32_t kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); + const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]); + *lws = {(*gws)[0], (*gws)[1], inter_local_blks}; + } else { + *gws = {4, 8, static_cast(batch * output_blocks)}; + + const uint32_t kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); + const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]); + *lws = {(*gws)[0], (*gws)[1], inter_local_blks}; + } } if (!IsVecEqual(*prev_input_shape, input->shape())) { const index_t batch = output->dim(0); const index_t output_blocks = RoundUpDiv4(output->dim(3)); + (*gws)[2] = static_cast(batch * output_blocks); uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel->setArg(idx++, (*gws)[0]); + kernel->setArg(idx++, (*gws)[1]); + kernel->setArg(idx++, (*gws)[2]); + } kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(weight->opencl_image())); if (bias != nullptr) { @@ -91,15 +114,25 @@ void FCWXKernel(cl::Kernel *kernel, kernel->setArg(idx++, static_cast(output_blocks)); kernel->setArg(idx++, relux_max_limit); - (*gws)[2] = static_cast(batch * output_blocks); - *prev_input_shape = input->shape(); } cl::Event event; - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - *kernel, cl::NullRange, cl::NDRange((*gws)[0], (*gws)[1], (*gws)[2]), - cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event); - MACE_CHECK_CL_SUCCESS(error); + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + *kernel, cl::NullRange, cl::NDRange((*gws)[0], (*gws)[1], (*gws)[2]), + cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event); + } else { + std::vector roundup_gws(lws->size()); + for (size_t i = 0; i < lws->size(); ++i) { + roundup_gws[i] = RoundUp((*gws)[i], (*lws)[i]); + } + error = runtime->command_queue().enqueueNDRangeKernel( + *kernel, cl::NullRange, + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event); + } + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { @@ -125,8 +158,8 @@ void FCWTXKernel(cl::Kernel *kernel, StatsFuture *future) { MACE_CHECK_NOTNULL(gws); MACE_CHECK_NOTNULL(lws); + auto runtime = OpenCLRuntime::Global(); if (kernel->get() == nullptr) { - auto runtime = OpenCLRuntime::Global(); std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected"); @@ -136,6 +169,9 @@ void FCWTXKernel(cl::Kernel *kernel, if (bias != nullptr) { built_options.emplace("-DBIAS"); } + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } switch (activation) { case NOOP: break; @@ -157,10 +193,23 @@ void FCWTXKernel(cl::Kernel *kernel, *kernel = runtime->BuildKernel("fully_connected", kernel_name, built_options); - *lws = {16, 64, 1}; + uint32_t kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); + *lws = {16, kwg_size/16, 1}; } if (!IsVecEqual(*prev_input_shape, input->shape())) { + const index_t batch = output->dim(0); + const index_t output_blocks = RoundUpDiv4(output->dim(3)); + + *gws = { + static_cast(batch), static_cast(output_blocks), + }; + uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel->setArg(idx++, (*gws)[0]); + kernel->setArg(idx++, (*gws)[1]); + } kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(weight->opencl_image())); if (bias != nullptr) { @@ -173,12 +222,6 @@ void FCWTXKernel(cl::Kernel *kernel, // FIXME handle flexable data type: half not supported kernel->setArg(idx++, relux_max_limit); - const index_t batch = output->dim(0); - const index_t output_blocks = RoundUpDiv4(output->dim(3)); - - *gws = { - static_cast(batch), static_cast(output_blocks), - }; *prev_input_shape = input->shape(); } diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index d401da97f587fb9ea15fdc4bcac424f6a2830ae4..c3bca21ff9c2e34728a922be3644199288298485 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -57,7 +57,6 @@ void SimpleValidTest() { .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc index 26a893b25cf77f0a7af49b7379693729a2410de4..f839f95c975f94545d3661642de0ffc762c2a27d 100644 --- a/mace/ops/fully_connected_test.cc +++ b/mace/ops/fully_connected_test.cc @@ -225,7 +225,7 @@ void TestWXFormat(const index_t batch, kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Weight", "WeightImage", kernels::BufferType::WEIGHT_WIDTH); - BufferToImage(&net, "Bias", "BiasImage", + BufferToImage(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("FC", "FullyConnectedTest") @@ -236,7 +236,7 @@ void TestWXFormat(const index_t batch, .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - // Run on opencl + // Run net.RunOp(DeviceType::OPENCL); ImageToBuffer(&net, "OutputImage", "OPENCLOutput", diff --git a/tools/benchmark.sh b/tools/benchmark.sh index a546ca0e1281d301f0900f396377dee722bf100b..d4d8dcbfcfc44d39c369715086308574278a2282 100644 --- a/tools/benchmark.sh +++ b/tools/benchmark.sh @@ -1,6 +1,5 @@ #!/bin/bash -set -x Usage() { echo "Usage: bash tools/benchmark.sh target_soc model_output_dir option_args" }