diff --git a/mace/kernels/deconv_2d.h b/mace/kernels/deconv_2d.h
index 3444449d4c8b59d2a8a256e51ad351d2c78c2709..3369f4f4619b7541c4f1203cb2144f01e3642985 100644
--- a/mace/kernels/deconv_2d.h
+++ b/mace/kernels/deconv_2d.h
@@ -231,6 +231,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
   MaceStatus operator()(const Tensor *input,   // NCHW
                         const Tensor *filter,  // OIHW
                         const Tensor *bias,
+                        const Tensor *output_shape_tensor,
                         Tensor *output,
                         StatsFuture *future) {
     MACE_UNUSED(future);
@@ -240,10 +241,20 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
 
     if (!from_caffe_) {  // tensorflow
       std::vector<index_t> output_shape(4);
-      output_shape[0] = output_shape_[0];
-      output_shape[1] = output_shape_[3];
-      output_shape[2] = output_shape_[1];
-      output_shape[3] = output_shape_[2];
+      if (output_shape_.size() == 4) {
+        output_shape[0] = output_shape_[0];
+        output_shape[1] = output_shape_[3];
+        output_shape[2] = output_shape_[1];
+        output_shape[3] = output_shape_[2];
+      } else {
+        MACE_CHECK_NOTNULL(output_shape_tensor);
+        MACE_CHECK(output_shape_tensor->size() == 4);
+        Tensor::MappingGuard output_shape_mapper(output_shape_tensor);
+        auto output_shape_data =
+            output_shape_tensor->data<int32_t>();
+        output_shape =
+            std::vector<index_t>(output_shape_data, output_shape_data + 4);
+      }
       paddings_.clear();
       paddings_ = std::vector<int>(2, 0);
       CalcDeconvPaddingAndInputSize(
@@ -326,6 +337,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
   MaceStatus operator()(const Tensor *input,
                         const Tensor *filter,
                         const Tensor *bias,
+                        const Tensor *output_shape_tensor,
                         Tensor *output,
                         StatsFuture *future);
 
diff --git a/mace/kernels/opencl/deconv_2d_opencl.cc b/mace/kernels/opencl/deconv_2d_opencl.cc
index db5353d634614129beb0ac3ce497e1fcf2c8ce9b..e40ac3b3a4a53f67adcf7bef695015b99768e78c 100644
--- a/mace/kernels/opencl/deconv_2d_opencl.cc
+++ b/mace/kernels/opencl/deconv_2d_opencl.cc
@@ -167,6 +167,7 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input,
     const Tensor *filter,
     const Tensor *bias,
+    const Tensor *output_shape_tensor,
     Tensor *output,
     StatsFuture *future) {
   MACE_CHECK_NOTNULL(input);
@@ -174,6 +175,15 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
   MACE_CHECK_NOTNULL(output);
 
   if (!from_caffe_) {
+    if (output_shape_.size() != 4) {
+      MACE_CHECK_NOTNULL(output_shape_tensor);
+      MACE_CHECK(output_shape_tensor->size() == 4);
+      Tensor::MappingGuard output_shape_mapper(output_shape_tensor);
+      auto output_shape_data =
+          output_shape_tensor->data<int32_t>();
+      output_shape_ =
+          std::vector<index_t>(output_shape_data, output_shape_data + 4);
+    }
     paddings_.clear();
     paddings_ = std::vector<int>(2, 0);
     CalcDeconvPaddingAndInputSize(input->shape().data(), filter->shape().data(),
diff --git a/mace/ops/deconv_2d.h b/mace/ops/deconv_2d.h
index cd4a44e621730d22875b610d160007e0da96298d..fae87ce9ecb6dd86eb1e866226d6b12ae678c78b 100644
--- a/mace/ops/deconv_2d.h
+++ b/mace/ops/deconv_2d.h
@@ -40,17 +40,19 @@ class Deconv2dOp : public ConvPool2dOpBase<D, T> {
 
   MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     const Tensor *filter = this->Input(FILTER);
-    const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr;
+    const Tensor *output_shape =
+        this->InputSize() >= 3 ? this->Input(OUTPUT_SHAPE) : nullptr;
+    const Tensor *bias = this->InputSize() >= 4 ? this->Input(BIAS) : nullptr;
     Tensor *output = this->Output(OUTPUT);
 
-    return functor_(input, filter, bias, output, future);
+    return functor_(input, filter, bias, output_shape, output, future);
   }
 
  private:
  kernels::Deconv2dFunctor<D, T> functor_;
 
 protected:
-  MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
+  MACE_OP_INPUT_TAGS(INPUT, FILTER, OUTPUT_SHAPE, BIAS);
   MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
diff --git a/mace/ops/shape.cc b/mace/ops/shape.cc
index 7014aa8d8ee86cc55ca2023354cd7971444eb5bf..6815496fe3bd5a801b4881922e38dc515b7a877c 100644
--- a/mace/ops/shape.cc
+++ b/mace/ops/shape.cc
@@ -23,6 +23,19 @@ void Register_Shape(OperatorRegistryBase *op_registry) {
                     .TypeConstraint<float>("T")
                     .Build(),
                     ShapeOp<DeviceType::CPU, float>);
+#ifdef MACE_ENABLE_OPENCL
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Shape")
+                                          .Device(DeviceType::GPU)
+                                          .TypeConstraint<float>("T")
+                                          .Build(),
+                         ShapeOp<DeviceType::GPU, float>);
+
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Shape")
+                                          .Device(DeviceType::GPU)
+                                          .TypeConstraint<half>("T")
+                                          .Build(),
+                         ShapeOp<DeviceType::GPU, half>);
+#endif
 }
 
 }  // namespace ops
diff --git a/mace/python/tools/converter_tool/tensorflow_converter.py b/mace/python/tools/converter_tool/tensorflow_converter.py
index 9af8a48758f6242ab37bcb0d28254f7f611b4b75..1818070aa08433db815262240f641a4853f99451 100644
--- a/mace/python/tools/converter_tool/tensorflow_converter.py
+++ b/mace/python/tools/converter_tool/tensorflow_converter.py
@@ -151,6 +151,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
             TFOpType.Div.name: self.convert_elementwise,
             TFOpType.Min.name: self.convert_elementwise,
             TFOpType.Max.name: self.convert_elementwise,
+            TFOpType.Maximum.name: self.convert_elementwise,
             TFOpType.Neg.name: self.convert_elementwise,
             TFOpType.Abs.name: self.convert_elementwise,
             TFOpType.Pow.name: self.convert_elementwise,
@@ -372,19 +373,21 @@ class TensorflowConverter(base_converter.ConverterInterface):
             dilation_val = [1, 1]
             dilation_arg.ints.extend(dilation_val)
         else:
+            mace_check(len(tf_op.inputs) >= 3,
+                       "deconv should have (>=) 3 inputs.")
             output_shape_arg = op.arg.add()
             output_shape_arg.name = MaceKeyword.mace_output_shape_str
-            if len(tf_op.inputs) >= 3:
-                del op.input[1:]
-                output_shape_value =\
+            if tf_op.inputs[0].op.type == TFOpType.Const.name:
+                output_shape_value = \
                     tf_op.inputs[0].eval().astype(np.int32).flat
                 output_shape_arg.ints.extend(output_shape_value)
-                self._skip_tensor.add(tf_op.inputs[0].name)
-                del op.input[0]
-                op.input.extend([tf_op.inputs[2].name, tf_op.inputs[1].name])
             else:
-                output_shape_value = tf_op.get_attr(tf_strides_str)
+                output_shape_value = {}
                 output_shape_arg.ints.extend(output_shape_value)
+            del op.input[:]
+            op.input.extend([tf_op.inputs[2].name,
+                             tf_op.inputs[1].name,
+                             tf_op.inputs[0].name])
 
     def convert_elementwise(self, tf_op):
         op = self.convert_general_op(tf_op)
diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py
index 152995ec65c13392b2c95d56ed9fbdd1cc73971b..a6295b5280e12f5bc84654fb7c91a37954a3ccec 100644
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -980,8 +980,10 @@ class Transformer(base_converter.ConverterInterface):
             if op.type == MaceOp.Conv2D.name \
                     or op.type == MaceOp.Deconv2D.name:
                 self.buffer_to_image(op, 1, OpenCLBufferType.CONV2D_FILTER)
-                if len(op.input) >= 3:
+                if len(op.input) >= 3 and op.type == MaceOp.Conv2D.name:
                     self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT)
+                elif len(op.input) >= 4 and op.type == MaceOp.Deconv2D.name:
+                    self.buffer_to_image(op, 3, OpenCLBufferType.ARGUMENT)
             elif op.type == MaceOp.DepthwiseConv2d.name:
                 self.buffer_to_image(op, 1, OpenCLBufferType.DW_CONV2D_FILTER)
                 if len(op.input) >= 3:
diff --git a/mace/python/tools/memory_optimizer.py b/mace/python/tools/memory_optimizer.py
index 6a82f8adc32ed5945cfe438eb37e688640346890..5b1d3d34dab1229512bfb6984a3406ae8f7cf433 100644
--- a/mace/python/tools/memory_optimizer.py
+++ b/mace/python/tools/memory_optimizer.py
@@ -24,6 +24,7 @@ class MemoryOptimizer(object):
         self.op_mem = {}  # op_name->mem_id
         self.mem_block = {}  # mem_id->[size] or mem_id->[x, y]
         self.total_mem_count = 0
+        self.total_cpu_mem_count = 0
         self.input_ref_counter = {}
         self.mem_ref_counter = {}
 
@@ -184,6 +185,15 @@ class GPUMemoryOptimizer(MemoryOptimizer):
             for arg in op.arg:
                 if arg.name == 'mode' and arg.i == 0:
                     return False
+        elif op.type == 'Shape':
+            for i in range(len(op.output)):
+                mem_id = self.total_cpu_mem_count
+                self.total_cpu_mem_count += 1
+                op_mem_block = self.get_op_mem_block(
+                    op.type,
+                    op.output_shape[i].dims)
+                self.mem_block[mem_id] = op_mem_block
+            return False
         return op.type != 'ImageToBuffer'
 
     def get_op_mem_block(self, op_type, output_shape):
@@ -191,13 +201,18 @@ def get_op_mem_block(self, op_type, output_shape):
         if op_type == 'WinogradTransform' or op_type == 'MatMul':
             mem_block[0] = output_shape[2]
             mem_block[1] = output_shape[0] * int((output_shape[1] + 3) / 4)
+        elif op_type == 'Shape':
+            mem_block[0] = output_shape[0]
+            mem_block[1] = 1
         else:
             if len(output_shape) == 2:  # only support fc/softmax
                 mem_block[0] = int((output_shape[1] + 3) / 4)
                 mem_block[1] = output_shape[0]
-            else:
+            elif len(output_shape) == 4:
                 mem_block[0] = output_shape[2] * int((output_shape[3] + 3) / 4)
                 mem_block[1] = output_shape[0] * output_shape[1]
+            else:
+                raise Exception('output shape dim size is not 2 or 4.')
         return mem_block
 
     def mem_size(self, memory_block):