提交 d63e484f 编写于 作者: L liutuo

Deconv2d: support variable output shape

上级 c2b06975
......@@ -231,6 +231,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
MaceStatus operator()(const Tensor *input, // NCHW
const Tensor *filter, // OIHW
const Tensor *bias,
const Tensor *output_shape_tensor,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
......@@ -240,10 +241,20 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
if (!from_caffe_) { // tensorflow
std::vector<index_t> output_shape(4);
output_shape[0] = output_shape_[0];
output_shape[1] = output_shape_[3];
output_shape[2] = output_shape_[1];
output_shape[3] = output_shape_[2];
if (output_shape_.size() == 4) {
output_shape[0] = output_shape_[0];
output_shape[1] = output_shape_[3];
output_shape[2] = output_shape_[1];
output_shape[3] = output_shape_[2];
} else {
MACE_CHECK_NOTNULL(output_shape_tensor);
MACE_CHECK(output_shape_tensor->size() == 4);
Tensor::MappingGuard output_shape_mapper(output_shape_tensor);
auto output_shape_data =
output_shape_tensor->data<int32_t>();
output_shape =
std::vector<index_t>(output_shape_data, output_shape_data + 4);
}
paddings_.clear();
paddings_ = std::vector<int>(2, 0);
CalcDeconvPaddingAndInputSize(
......@@ -326,6 +337,7 @@ struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase {
MaceStatus operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const Tensor *output_shape_tensor,
Tensor *output,
StatsFuture *future);
......
......@@ -167,6 +167,7 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const Tensor *output_shape_tensor,
Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input);
......@@ -174,6 +175,15 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
MACE_CHECK_NOTNULL(output);
if (!from_caffe_) {
if (output_shape_.size() != 4) {
MACE_CHECK_NOTNULL(output_shape_tensor);
MACE_CHECK(output_shape_tensor->size() == 4);
Tensor::MappingGuard output_shape_mapper(output_shape_tensor);
auto output_shape_data =
output_shape_tensor->data<int32_t>();
output_shape_ =
std::vector<index_t>(output_shape_data, output_shape_data + 4);
}
paddings_.clear();
paddings_ = std::vector<int>(2, 0);
CalcDeconvPaddingAndInputSize(input->shape().data(), filter->shape().data(),
......
......@@ -40,17 +40,19 @@ class Deconv2dOp : public ConvPool2dOpBase<D, T> {
MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
const Tensor *filter = this->Input(FILTER);
const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr;
const Tensor *output_shape =
this->InputSize() >= 3 ? this->Input(OUTPUT_SHAPE) : nullptr;
const Tensor *bias = this->InputSize() >= 4 ? this->Input(BIAS) : nullptr;
Tensor *output = this->Output(OUTPUT);
return functor_(input, filter, bias, output, future);
return functor_(input, filter, bias, output_shape, output, future);
}
private:
kernels::Deconv2dFunctor<D, T> functor_;
protected:
MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
MACE_OP_INPUT_TAGS(INPUT, FILTER, OUTPUT_SHAPE, BIAS);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
......
......@@ -23,6 +23,19 @@ void Register_Shape(OperatorRegistryBase *op_registry) {
.TypeConstraint<float>("T")
.Build(),
ShapeOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Shape")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
ShapeOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Shape")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
ShapeOp<DeviceType::GPU, half>);
#endif
}
} // namespace ops
......
......@@ -151,6 +151,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
TFOpType.Div.name: self.convert_elementwise,
TFOpType.Min.name: self.convert_elementwise,
TFOpType.Max.name: self.convert_elementwise,
TFOpType.Maximum.name: self.convert_elementwise,
TFOpType.Neg.name: self.convert_elementwise,
TFOpType.Abs.name: self.convert_elementwise,
TFOpType.Pow.name: self.convert_elementwise,
......@@ -372,19 +373,21 @@ class TensorflowConverter(base_converter.ConverterInterface):
dilation_val = [1, 1]
dilation_arg.ints.extend(dilation_val)
else:
mace_check(len(tf_op.inputs) >= 3,
"deconv should have (>=) 3 inputs.")
output_shape_arg = op.arg.add()
output_shape_arg.name = MaceKeyword.mace_output_shape_str
if len(tf_op.inputs) >= 3:
del op.input[1:]
output_shape_value =\
if tf_op.inputs[0].op.type == TFOpType.Const.name:
output_shape_value = \
tf_op.inputs[0].eval().astype(np.int32).flat
output_shape_arg.ints.extend(output_shape_value)
self._skip_tensor.add(tf_op.inputs[0].name)
del op.input[0]
op.input.extend([tf_op.inputs[2].name, tf_op.inputs[1].name])
else:
output_shape_value = tf_op.get_attr(tf_strides_str)
output_shape_value = {}
output_shape_arg.ints.extend(output_shape_value)
del op.input[:]
op.input.extend([tf_op.inputs[2].name,
tf_op.inputs[1].name,
tf_op.inputs[0].name])
def convert_elementwise(self, tf_op):
op = self.convert_general_op(tf_op)
......
......@@ -980,8 +980,10 @@ class Transformer(base_converter.ConverterInterface):
if op.type == MaceOp.Conv2D.name \
or op.type == MaceOp.Deconv2D.name:
self.buffer_to_image(op, 1, OpenCLBufferType.CONV2D_FILTER)
if len(op.input) >= 3:
if len(op.input) >= 3 and op.type == MaceOp.Conv2D.name:
self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT)
elif len(op.input) >= 4 and op.type == MaceOp.Deconv2D.name:
self.buffer_to_image(op, 3, OpenCLBufferType.ARGUMENT)
elif op.type == MaceOp.DepthwiseConv2d.name:
self.buffer_to_image(op, 1, OpenCLBufferType.DW_CONV2D_FILTER)
if len(op.input) >= 3:
......
......@@ -24,6 +24,7 @@ class MemoryOptimizer(object):
self.op_mem = {} # op_name->mem_id
self.mem_block = {} # mem_id->[size] or mem_id->[x, y]
self.total_mem_count = 0
self.total_cpu_mem_count = 0
self.input_ref_counter = {}
self.mem_ref_counter = {}
......@@ -184,6 +185,15 @@ class GPUMemoryOptimizer(MemoryOptimizer):
for arg in op.arg:
if arg.name == 'mode' and arg.i == 0:
return False
elif op.type == 'Shape':
for i in range(len(op.output)):
mem_id = self.total_cpu_mem_count
self.total_cpu_mem_count += 1
op_mem_block = self.get_op_mem_block(
op.type,
op.output_shape[i].dims)
self.mem_block[mem_id] = op_mem_block
return False
return op.type != 'ImageToBuffer'
def get_op_mem_block(self, op_type, output_shape):
......@@ -191,13 +201,18 @@ class GPUMemoryOptimizer(MemoryOptimizer):
if op_type == 'WinogradTransform' or op_type == 'MatMul':
mem_block[0] = output_shape[2]
mem_block[1] = output_shape[0] * int((output_shape[1] + 3) / 4)
elif op_type == 'Shape':
mem_block[0] = output_shape[0]
mem_block[1] = 1
else:
if len(output_shape) == 2: # only support fc/softmax
mem_block[0] = int((output_shape[1] + 3) / 4)
mem_block[1] = output_shape[0]
else:
elif len(output_shape) == 4:
mem_block[0] = output_shape[2] * int((output_shape[3] + 3) / 4)
mem_block[1] = output_shape[0] * output_shape[1]
else:
raise Exception('output shape dim size is not 2 or 4.')
return mem_block
def mem_size(self, memory_block):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册