Commit c8d5c88e authored by luxuhui

Optimize the performance of the ResizeNearestNeighbor & Deconv ops

N/A
Signed-off-by: Luxuhui <luxuhui@xiaomi.com>
Parent a2f49f02
@@ -114,7 +114,8 @@ HexagonDSPWrapper::HexagonDSPWrapper() {
   if (env_log_execute_time_str.empty()) {
     log_execute_time_ = false;
   } else {
-    log_execute_time_ = static_cast<bool>(std::stoi(env_log_execute_time_str));
+    log_execute_time_ = static_cast<bool>(
+        std::atoi(env_log_execute_time_str.c_str()));
   }
 }
...
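The std::stoi → std::atoi change above is defensive as well as a cleanup: std::stoi throws std::invalid_argument when the execute-time logging environment variable holds a non-numeric value, while std::atoi simply yields 0, which maps to "logging disabled". A minimal sketch of the difference, using a hypothetical environment value rather than MACE code:

```cpp
#include <cstdlib>   // std::atoi
#include <iostream>
#include <string>

int main() {
  // Hypothetical (non-numeric) value of the execute-time logging env var.
  std::string env_value = "yes";
  // std::stoi(env_value) would throw std::invalid_argument here;
  // std::atoi falls back to 0, i.e. "logging disabled".
  bool log_execute_time = static_cast<bool>(std::atoi(env_value.c_str()));
  std::cout << std::boolalpha << log_execute_time << std::endl;  // prints false
  return 0;
}
```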
@@ -74,8 +74,8 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
                                      const DataType dt,
                                      void **result) const {
   MACE_CHECK(image_shape.size() == 2, "Image shape's size must equal 2");
-  VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", "
-          << image_shape[1];
+  MACE_LATENCY_LOGGER(1, "Allocate OpenCL image: ",
+                      image_shape[0], ", ", image_shape[1]);
 
   if (ShouldMockRuntimeFailure()) {
     return MaceStatus::MACE_OUT_OF_RESOURCES;
@@ -109,7 +109,7 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
 }
 
 void OpenCLAllocator::Delete(void *buffer) const {
-  VLOG(3) << "Free OpenCL buffer";
+  MACE_LATENCY_LOGGER(1, "Free OpenCL buffer");
   if (buffer != nullptr) {
     cl::Buffer *cl_buffer = static_cast<cl::Buffer *>(buffer);
     delete cl_buffer;
@@ -117,7 +117,7 @@ void OpenCLAllocator::Delete(void *buffer) const {
 }
 
 void OpenCLAllocator::DeleteImage(void *buffer) const {
-  VLOG(3) << "Free OpenCL image";
+  MACE_LATENCY_LOGGER(1, "Free OpenCL image");
   if (buffer != nullptr) {
     cl::Image2D *cl_image = static_cast<cl::Image2D *>(buffer);
     delete cl_image;
@@ -125,7 +125,7 @@ void OpenCLAllocator::DeleteImage(void *buffer) const {
 }
 
 void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
-  VLOG(3) << "Map OpenCL buffer";
+  MACE_LATENCY_LOGGER(1, "Map OpenCL buffer");
   auto cl_buffer = static_cast<cl::Buffer *>(buffer);
   auto queue = opencl_runtime_->command_queue();
   // TODO(heliangliang) Non-blocking call
@@ -144,7 +144,7 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
 void *OpenCLAllocator::MapImage(void *buffer,
                                 const std::vector<size_t> &image_shape,
                                 std::vector<size_t> *mapped_image_pitch) const {
-  VLOG(3) << "Map OpenCL Image";
+  MACE_LATENCY_LOGGER(1, "Map OpenCL Image");
   MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image";
   auto cl_image = static_cast<cl::Image2D *>(buffer);
   std::array<size_t, 3> origin = {{0, 0, 0}};
@@ -164,7 +164,7 @@ void *OpenCLAllocator::MapImage(void *buffer,
 }
 
 void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
-  VLOG(3) << "Unmap OpenCL buffer/Image";
+  MACE_LATENCY_LOGGER(1, "Unmap OpenCL buffer/Image");
   auto cl_buffer = static_cast<cl::Buffer *>(buffer);
   auto queue = opencl_runtime_->command_queue();
   cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr,
...
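The hunks above replace plain VLOG(3) trace lines with MACE_LATENCY_LOGGER(1, ...), so each allocator call (NewImage, Delete, Map, Unmap, ...) is timed rather than merely mentioned. The snippet below only illustrates the RAII pattern such a latency-logging macro is commonly built on; it is not the actual MACE macro definition.

```cpp
#include <chrono>
#include <iostream>
#include <string>
#include <utility>

// Illustrative scoped timer (assumption: the real macro behaves similarly,
// gated by a verbosity level): it logs the elapsed time of the enclosing
// scope when the object is destroyed.
class ScopedLatencyLogger {
 public:
  explicit ScopedLatencyLogger(std::string tag)
      : tag_(std::move(tag)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedLatencyLogger() {
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::steady_clock::now() - start_).count();
    std::cout << tag_ << " took " << us << " us" << std::endl;
  }

 private:
  std::string tag_;
  std::chrono::steady_clock::time_point start_;
};

void MapBufferExample() {
  ScopedLatencyLogger logger("Map OpenCL buffer");  // timed like the real call
  // ... the blocking enqueueMapBuffer work would happen here ...
}

int main() {
  MapBufferExample();
  return 0;
}
```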
@@ -170,8 +170,8 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
 template<>
 class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
  public:
-  explicit Deconv2dOp(OpConstructContext *context)
-      : Deconv2dOpBase(context) {
+  explicit Deconv2dOp(OpConstructContext *context) : Deconv2dOpBase(context),
+      dim_(Operation::GetRepeatedArgs<index_t>("dim")) {
     MemoryType mem_type = MemoryType::GPU_IMAGE;
     if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
       kernel_ = make_unique<opencl::image::Deconv2dKernel>();
@@ -219,12 +219,16 @@ class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
     std::vector<index_t> out_shape;
     if (output_shape_tensor) {
-      Tensor::MappingGuard out_shape_guard(output_shape_tensor);
-      MACE_CHECK(output_shape_tensor->size() == 4,
-                 "output shape should be 4-dims");
-      out_shape =
-          std::vector<index_t>(output_shape_tensor->data<int32_t>(),
-                               output_shape_tensor->data<int32_t>() + 4);
+      if (dim_.size() < 2) {
+        Tensor::MappingGuard out_shape_guard(output_shape_tensor);
+        MACE_CHECK(output_shape_tensor->size() == 4,
+                   "output shape should be 4-dims");
+        out_shape =
+            std::vector<index_t>(output_shape_tensor->data<int32_t>(),
+                                 output_shape_tensor->data<int32_t>() + 4);
+      } else {
+        out_shape = dim_;
+      }
     }
     std::vector<int> in_paddings;
     std::vector<int> out_paddings;
@@ -249,6 +253,7 @@ class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
   }
 
  private:
+  std::vector<index_t> dim_;
   std::unique_ptr<OpenCLDeconv2dKernel> kernel_;
 };
 #endif // MACE_ENABLE_OPENCL
...
@@ -25,14 +25,22 @@ MaceStatus ResizeNearestNeighborKernel::Compute(
     OpContext *context,
     const Tensor *input,
     const Tensor *size,
+    const std::vector<index_t> &dims,
     Tensor *output) {
   const index_t batch = input->dim(0);
   const index_t in_height = input->dim(1);
   const index_t in_width = input->dim(2);
   const index_t channels = input->dim(3);
-  Tensor::MappingGuard size_mapper(size);
-  const index_t out_height = size->data<int32_t>()[0];
-  const index_t out_width = size->data<int32_t>()[1];
+  index_t out_height = 0;
+  index_t out_width = 0;
+  if (dims.size() < 2) {
+    Tensor::MappingGuard size_mapper(size);
+    out_height = size->data<int32_t>()[0];
+    out_width = size->data<int32_t>()[1];
+  } else {
+    out_height = dims[0];
+    out_width = dims[1];
+  }
 
   const index_t channel_blocks = RoundUpDiv4(channels);
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
...
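The new dims parameter carries the output size that the converter has already folded into the op's "dim" argument, so the kernel only falls back to mapping the size tensor (a blocking device-to-host synchronization on OpenCL) when that argument is absent; the Deconv2dOp change above follows the same pattern for its output_shape tensor. A minimal standalone sketch of the selection logic, with a hypothetical ReadSizeTensor() standing in for the Tensor::MappingGuard path:

```cpp
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

using index_t = int64_t;

// Hypothetical stand-in for the slow path: mapping the runtime size tensor
// and reading its two int32 values (Tensor::MappingGuard + data<int32_t>()
// in MACE), which stalls the GPU pipeline.
std::pair<index_t, index_t> ReadSizeTensor() { return {224, 224}; }

// Mirrors the selection added to the kernel: prefer the converter-baked
// "dim" argument, fall back to the runtime size tensor only when needed.
std::pair<index_t, index_t> OutputHeightWidth(const std::vector<index_t> &dims) {
  if (dims.size() >= 2) {
    return {dims[0], dims[1]};  // fast path: no device-to-host copy
  }
  return ReadSizeTensor();
}

int main() {
  auto hw = OutputHeightWidth({480, 640});
  std::cout << hw.first << "x" << hw.second << std::endl;  // prints 480x640
  return 0;
}
```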
@@ -73,6 +73,7 @@ class ResizeNearestNeighborKernel : public OpenCLResizeNearestNeighborKernel {
       OpContext *context,
       const Tensor *input,
       const Tensor *size,
+      const std::vector<index_t> &dims,
       Tensor *output) override;
 
  private:
...
@@ -15,6 +15,8 @@
 #ifndef MACE_OPS_OPENCL_RESIZE_NEAREST_NEIGHBOR_H_
 #define MACE_OPS_OPENCL_RESIZE_NEAREST_NEIGHBOR_H_
 
+#include <vector>
+
 #include "mace/core/types.h"
 #include "mace/public/mace.h"
 #include "mace/utils/math.h"
@@ -31,6 +33,7 @@ class OpenCLResizeNearestNeighborKernel {
       OpContext *context,
       const Tensor *input,
       const Tensor *size,
+      const std::vector<index_t> &dims,
       Tensor *output) = 0;
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLResizeNearestNeighborKernel);
 };
...
@@ -145,7 +145,7 @@ template<>
 class ResizeNearestNeighborOp<DeviceType::GPU, float> : public Operation {
  public:
   explicit ResizeNearestNeighborOp(OpConstructContext *context)
-      : Operation(context) {
+      : Operation(context), dim_(Operation::GetRepeatedArgs<index_t>("dim")) {
     bool align_corners = Operation::GetOptionalArg<bool>(
         "align_corners", false);
     if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
@@ -163,10 +163,11 @@ class ResizeNearestNeighborOp<DeviceType::GPU, float> : public Operation {
                "input must be 4-dimensional and size must be 1-dimensional.",
                input->dim_size(), size->dim_size());
-    return kernel_->Compute(context, input, size, output);
+    return kernel_->Compute(context, input, size, dim_, output);
   }
 
  private:
+  std::vector<index_t> dim_;
   std::unique_ptr<OpenCLResizeNearestNeighborKernel> kernel_;
 };
 #endif // MACE_ENABLE_OPENCL
...
@@ -144,8 +144,9 @@ def convert(model_file, output_dir, layers):
            output_info.data_format = data_format
            output_info.dims.extend(op.output_shape[i].dims)
            output_info.data_type = mace_pb2.DT_FLOAT
-            output_info.scale = op.quantize_info[0].scale
-            output_info.zero_point = op.quantize_info[0].zero_point
+            if is_quantize:
+                output_info.scale = op.quantize_info[0].scale
+                output_info.zero_point = op.quantize_info[0].zero_point
            # modify output op
            if is_quantize:
                output_name = op.output[i]
...
@@ -324,6 +324,7 @@ class TransformerRule(Enum):
     FP16_MATMUL_WEIGHT = 41
     FP16_GATHER_WEIGHT = 42
     QUANTIZE_LARGE_WEIGHTS = 43
+    TRANSPOSE_SHAPE_TENSOR_TO_PARAM = 44
 
 
 class ConverterInterface(object):
@@ -534,6 +535,7 @@ class ConverterOption(object):
                TransformerRule.TRANSFORM_LSTMCELL_ZEROSTATE,
                TransformerRule.TRANSFORM_BASIC_LSTMCELL,
                TransformerRule.TRANSPOSE_RESHAPE_AND_FLATTEN,
+                TransformerRule.TRANSPOSE_SHAPE_TENSOR_TO_PARAM,
                TransformerRule.FOLD_RESHAPE,
                TransformerRule.TRANSFORM_MATMUL_TO_FC,
                # For StoB -> conv -> BtoS -> BN pattern
...
@@ -99,6 +99,8 @@ class Transformer(base_converter.ConverterInterface):
            TransformerRule.UPDATE_DATA_FORMAT: self.update_data_format,
            TransformerRule.TRANSPOSE_RESHAPE_AND_FLATTEN:
                self.transform_reshape_and_flatten,
+            TransformerRule.TRANSPOSE_SHAPE_TENSOR_TO_PARAM:
+                self.transform_shape_tensor_to_param,
            TransformerRule.TRANSPOSE_DATA_FORMAT: self.transpose_data_format,
            TransformerRule.CHECK_QUANTIZE_INFO:
                self.check_quantize_info,
@@ -2119,9 +2121,21 @@ class Transformer(base_converter.ConverterInterface):
                mace_check(False, "Only support reshape and flatten")
            shape_tensor.int32_data.extend(dims)
            op.input.append(shape_tensor.name)
-            if len(op.input) == 2 and dim_arg is None:
-                if shape_tensor is None and op.input[1] in self._consts:
-                    shape_tensor = self._consts[op.input[1]]
+
+    def transform_shape_tensor_to_param(self):
+        kOpTypeInputIdxMap = {
+            MaceOp.ResizeNearestNeighbor.name: 1,
+            MaceOp.Deconv2D.name: 2,
+            MaceOp.Reshape.name: 1,
+        }
+        net = self._model
+        for op in net.op:
+            if op.type not in kOpTypeInputIdxMap:
+                continue
+            shape_idx = kOpTypeInputIdxMap[op.type]
+            dim_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_dim_str)
+            if len(op.input) > shape_idx and dim_arg is None:
+                shape_tensor = self._consts[op.input[shape_idx]]
                if shape_tensor is not None:
                    dim_arg = op.arg.add()
                    dim_arg.name = MaceKeyword.mace_dim_str
...