提交 c8d5c88e 编写于 作者: luxuhui

Optimize the performance of the ResizeNearestNeighbor and Deconv ops

N/A
Signed-off-by: Luxuhui <luxuhui@xiaomi.com>
上级 a2f49f02
......@@ -114,7 +114,8 @@ HexagonDSPWrapper::HexagonDSPWrapper() {
if (env_log_execute_time_str.empty()) {
log_execute_time_ = false;
} else {
log_execute_time_ = static_cast<bool>(std::stoi(env_log_execute_time_str));
log_execute_time_ = static_cast<bool>(
std::atoi(env_log_execute_time_str.c_str()));
}
}
......
......@@ -74,8 +74,8 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
const DataType dt,
void **result) const {
MACE_CHECK(image_shape.size() == 2, "Image shape's size must equal 2");
VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", "
<< image_shape[1];
MACE_LATENCY_LOGGER(1, "Allocate OpenCL image: ",
image_shape[0], ", ", image_shape[1]);
if (ShouldMockRuntimeFailure()) {
return MaceStatus::MACE_OUT_OF_RESOURCES;
......@@ -109,7 +109,7 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
}
void OpenCLAllocator::Delete(void *buffer) const {
VLOG(3) << "Free OpenCL buffer";
MACE_LATENCY_LOGGER(1, "Free OpenCL buffer");
if (buffer != nullptr) {
cl::Buffer *cl_buffer = static_cast<cl::Buffer *>(buffer);
delete cl_buffer;
......@@ -117,7 +117,7 @@ void OpenCLAllocator::Delete(void *buffer) const {
}
void OpenCLAllocator::DeleteImage(void *buffer) const {
VLOG(3) << "Free OpenCL image";
MACE_LATENCY_LOGGER(1, "Free OpenCL image");
if (buffer != nullptr) {
cl::Image2D *cl_image = static_cast<cl::Image2D *>(buffer);
delete cl_image;
......@@ -125,7 +125,7 @@ void OpenCLAllocator::DeleteImage(void *buffer) const {
}
void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
VLOG(3) << "Map OpenCL buffer";
MACE_LATENCY_LOGGER(1, "Map OpenCL buffer");
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = opencl_runtime_->command_queue();
// TODO(heliangliang) Non-blocking call
......@@ -144,7 +144,7 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
void *OpenCLAllocator::MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> *mapped_image_pitch) const {
VLOG(3) << "Map OpenCL Image";
MACE_LATENCY_LOGGER(1, "Map OpenCL Image");
MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image";
auto cl_image = static_cast<cl::Image2D *>(buffer);
std::array<size_t, 3> origin = {{0, 0, 0}};
......@@ -164,7 +164,7 @@ void *OpenCLAllocator::MapImage(void *buffer,
}
void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
VLOG(3) << "Unmap OpenCL buffer/Image";
MACE_LATENCY_LOGGER(1, "Unmap OpenCL buffer/Image");
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = opencl_runtime_->command_queue();
cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr,
......
......@@ -170,8 +170,8 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
template<>
class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
public:
explicit Deconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) {
explicit Deconv2dOp(OpConstructContext *context) : Deconv2dOpBase(context),
dim_(Operation::GetRepeatedArgs<index_t>("dim")) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::Deconv2dKernel>();
......@@ -219,12 +219,16 @@ class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
std::vector<index_t> out_shape;
if (output_shape_tensor) {
Tensor::MappingGuard out_shape_guard(output_shape_tensor);
MACE_CHECK(output_shape_tensor->size() == 4,
"output shape should be 4-dims");
out_shape =
std::vector<index_t>(output_shape_tensor->data<int32_t>(),
output_shape_tensor->data<int32_t>() + 4);
if (dim_.size() < 2) {
Tensor::MappingGuard out_shape_guard(output_shape_tensor);
MACE_CHECK(output_shape_tensor->size() == 4,
"output shape should be 4-dims");
out_shape =
std::vector<index_t>(output_shape_tensor->data<int32_t>(),
output_shape_tensor->data<int32_t>() + 4);
} else {
out_shape = dim_;
}
}
std::vector<int> in_paddings;
std::vector<int> out_paddings;
......@@ -249,6 +253,7 @@ class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
}
private:
std::vector<index_t> dim_;
std::unique_ptr<OpenCLDeconv2dKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
......
......@@ -25,14 +25,22 @@ MaceStatus ResizeNearestNeighborKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *size,
const std::vector<index_t> &dims,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t channels = input->dim(3);
Tensor::MappingGuard size_mapper(size);
const index_t out_height = size->data<int32_t>()[0];
const index_t out_width = size->data<int32_t>()[1];
index_t out_height = 0;
index_t out_width = 0;
if (dims.size() < 2) {
Tensor::MappingGuard size_mapper(size);
out_height = size->data<int32_t>()[0];
out_width = size->data<int32_t>()[1];
} else {
out_height = dims[0];
out_width = dims[1];
}
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
......@@ -73,6 +73,7 @@ class ResizeNearestNeighborKernel : public OpenCLResizeNearestNeighborKernel {
OpContext *context,
const Tensor *input,
const Tensor *size,
const std::vector<index_t> &dims,
Tensor *output) override;
private:
......
......@@ -15,6 +15,8 @@
#ifndef MACE_OPS_OPENCL_RESIZE_NEAREST_NEIGHBOR_H_
#define MACE_OPS_OPENCL_RESIZE_NEAREST_NEIGHBOR_H_
#include <vector>
#include "mace/core/types.h"
#include "mace/public/mace.h"
#include "mace/utils/math.h"
......@@ -31,6 +33,7 @@ class OpenCLResizeNearestNeighborKernel {
OpContext *context,
const Tensor *input,
const Tensor *size,
const std::vector<index_t> &dims,
Tensor *output) = 0;
MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLResizeNearestNeighborKernel);
};
......
......@@ -145,7 +145,7 @@ template<>
class ResizeNearestNeighborOp<DeviceType::GPU, float> : public Operation {
public:
explicit ResizeNearestNeighborOp(OpConstructContext *context)
: Operation(context) {
: Operation(context), dim_(Operation::GetRepeatedArgs<index_t>("dim")) {
bool align_corners = Operation::GetOptionalArg<bool>(
"align_corners", false);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
......@@ -163,10 +163,11 @@ class ResizeNearestNeighborOp<DeviceType::GPU, float> : public Operation {
"input must be 4-dimensional and size must be 1-dimensional.",
input->dim_size(), size->dim_size());
return kernel_->Compute(context, input, size, output);
return kernel_->Compute(context, input, size, dim_, output);
}
private:
std::vector<index_t> dim_;
std::unique_ptr<OpenCLResizeNearestNeighborKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
......
......@@ -144,8 +144,9 @@ def convert(model_file, output_dir, layers):
output_info.data_format = data_format
output_info.dims.extend(op.output_shape[i].dims)
output_info.data_type = mace_pb2.DT_FLOAT
output_info.scale = op.quantize_info[0].scale
output_info.zero_point = op.quantize_info[0].zero_point
if is_quantize:
output_info.scale = op.quantize_info[0].scale
output_info.zero_point = op.quantize_info[0].zero_point
# modify output op
if is_quantize:
output_name = op.output[i]
......
......@@ -324,6 +324,7 @@ class TransformerRule(Enum):
FP16_MATMUL_WEIGHT = 41
FP16_GATHER_WEIGHT = 42
QUANTIZE_LARGE_WEIGHTS = 43
TRANSPOSE_SHAPE_TENSOR_TO_PARAM = 44
class ConverterInterface(object):
......@@ -534,6 +535,7 @@ class ConverterOption(object):
TransformerRule.TRANSFORM_LSTMCELL_ZEROSTATE,
TransformerRule.TRANSFORM_BASIC_LSTMCELL,
TransformerRule.TRANSPOSE_RESHAPE_AND_FLATTEN,
TransformerRule.TRANSPOSE_SHAPE_TENSOR_TO_PARAM,
TransformerRule.FOLD_RESHAPE,
TransformerRule.TRANSFORM_MATMUL_TO_FC,
# For StoB -> conv -> BtoS -> BN pattern
......
......@@ -99,6 +99,8 @@ class Transformer(base_converter.ConverterInterface):
TransformerRule.UPDATE_DATA_FORMAT: self.update_data_format,
TransformerRule.TRANSPOSE_RESHAPE_AND_FLATTEN:
self.transform_reshape_and_flatten,
TransformerRule.TRANSPOSE_SHAPE_TENSOR_TO_PARAM:
self.transform_shape_tensor_to_param,
TransformerRule.TRANSPOSE_DATA_FORMAT: self.transpose_data_format,
TransformerRule.CHECK_QUANTIZE_INFO:
self.check_quantize_info,
......@@ -2119,9 +2121,21 @@ class Transformer(base_converter.ConverterInterface):
mace_check(False, "Only support reshape and flatten")
shape_tensor.int32_data.extend(dims)
op.input.append(shape_tensor.name)
if len(op.input) == 2 and dim_arg is None:
if shape_tensor is None and op.input[1] in self._consts:
shape_tensor = self._consts[op.input[1]]
def transform_shape_tensor_to_param(self):
kOpTypeInputIdxMap = {
MaceOp.ResizeNearestNeighbor.name: 1,
MaceOp.Deconv2D.name: 2,
MaceOp.Reshape.name: 1,
}
net = self._model
for op in net.op:
if op.type not in kOpTypeInputIdxMap:
continue
shape_idx = kOpTypeInputIdxMap[op.type]
dim_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_dim_str)
if len(op.input) > shape_idx and dim_arg is None:
shape_tensor = self._consts[op.input[shape_idx]]
if shape_tensor is not None:
dim_arg = op.arg.add()
dim_arg.name = MaceKeyword.mace_dim_str
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册