diff --git a/mace/ops/delay.cc b/mace/ops/delay.cc deleted file mode 100644 index db99723ddd84766128eea0ec56e266d527e2532f..0000000000000000000000000000000000000000 --- a/mace/ops/delay.cc +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This Op is for IfDefined descriptor in Kaldi. -// It defines time offset. -// If time index <= offset, using zeros as output. - -#include -#include - -#include "mace/core/operator.h" - -namespace mace { -namespace ops { - -template -class DelayOp; - -template -class DelayOp : public Operation { - public: - explicit DelayOp(OpConstructContext *context) - : Operation(context), - offset_(Operation::GetOptionalArg("offset", 0)) {} - - MaceStatus Run(OpContext *context) override { - MACE_UNUSED(context); - const Tensor *input = this->Input(0); - Tensor *output = this->Output(0); - MACE_CHECK(offset_ < 0, "offset param should be negative."); - - index_t rank = input->dim_size(); - MACE_CHECK(rank >= 2, "input's rank should >= 2."); - const std::vector &input_shape = input->shape(); - const index_t batch = - std::accumulate(input_shape.begin(), input_shape.end() - 2, 1, - std::multiplies()); - const index_t chunk = input_shape[rank - 2]; - const index_t dim = input_shape[rank - 1]; - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - output->Clear(); - - if (chunk <= -offset_) - return MaceStatus::MACE_SUCCESS; - - Tensor::MappingGuard input_guard(input); - Tensor::MappingGuard output_guard(output); - const T *input_data = input->data(); - T *output_data = output->mutable_data(); - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { - for (index_t i = start0; i < end0; i += step0) { - for (index_t j = start1; j < end1; j += step1) { - memcpy(output_data + (i * chunk + j - offset_) * dim, - input_data + (i * chunk + j) * dim, - dim * sizeof(T)); - } - } - }, 0, batch, 1, 0, chunk + offset_, 1); - - return MaceStatus::MACE_SUCCESS; - } - - private: - int offset_; -}; - -void RegisterDelay(OpRegistryBase *op_registry) { - MACE_REGISTER_OP(op_registry, "Delay", DelayOp, - DeviceType::CPU, float); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/dynamic_lstm.cc b/mace/ops/dynamic_lstm.cc index 9ef15cccdd0005ff1c2621820137c31045307129..fc226c08d112edea0e13d19cc44aa76c1432ea7f 100644 --- a/mace/ops/dynamic_lstm.cc +++ b/mace/ops/dynamic_lstm.cc @@ -14,6 +14,21 @@ // This Op is for Fused-LstmNonlinearityComponent // with prev cell states as inputs in Kaldi. +// prev_out_delay: the IfDefined component's delay value. +// It indicates which previous frame's output will +// be used here as an input. +// prev_cell_delay: similar to prev_out_delay. +// prev_out_offset: output offset. +// prev_out_dim: prev output's dim. +// prev_cell_dim: prev cell's dim. 
+// bias_a: flag for the first affine's bias, 1: has bias; 0: no bias. +// bias_b: similar to bias_a. +// scale: scale value of previous output and cell. +// forward_indexes: contains the indexes of the frames that will be used for computation. +// This is pre-computed in the kaldi-onnx converter. +// cell_cache_indexes: indicates which frames' cells will be cached for the next +// computation. +// out_cache_indexes: similar to cell_cache_indexes. // http://kaldi-asr.org/doc/nnet-combined-component_8h_source.html#l00255 // More details are in docs/development/dynamic_lstm.md @@ -50,7 +65,44 @@ class DynamicLSTMOp : public Operation { prev_cell_dim_(Operation::GetOptionalArg("prev_cell_dim", 0)), has_bias_a_(Operation::GetOptionalArg("bias_a", 1)), has_bias_b_(Operation::GetOptionalArg("bias_b", 1)), - scale_(Operation::GetOptionalArg("scale", 1.0f)) {} + scale_(Operation::GetOptionalArg("scale", 1.0f)), + subsample_factor_( + Operation::GetOptionalArg("subsample_factor", 1)), + forward_indexes_( + Operation::GetRepeatedArgs("forward_indexes")), + cell_cache_indexes_( + Operation::GetRepeatedArgs("cell_cache_indexes")), + out_cache_indexes_( + Operation::GetRepeatedArgs("out_cache_indexes")) {} + + inline void Validate() { + const Tensor *input = this->Input(0); + const unsigned int rank = static_cast(input->dim_size()); + MACE_CHECK(rank >= 2, "DynamicLSTM's input should have at least 2 dims."); + const index_t input_chunk = input->dim(rank - 2); + for (size_t i = 0; i < forward_indexes_.size(); ++i) { + MACE_CHECK(forward_indexes_[i] < input_chunk && forward_indexes_[i] >= 0, + "index is over range."); + } + + MACE_CHECK(this->InputSize() >= 6, + "DynamicLSTM should have at least six inputs.", + "But has only ", this->InputSize(), " inputs."); + MACE_CHECK(prev_cell_delay_ < 0 && prev_out_delay_ < 0, + "prev_cell_delay(", prev_cell_delay_, + ") and prev_out_delay(", prev_out_delay_, + ") should be less than zero."); + MACE_CHECK(prev_cell_delay_ % subsample_factor_ == 0 && + prev_out_delay_ % subsample_factor_ == 0, + "prev_cell_delay(", prev_cell_delay_, + ") and prev_out_delay(", prev_out_delay_, + ") should be multiples of subsample_factor(", + subsample_factor_, ")."); + MACE_CHECK(prev_out_dim_ > 0 && prev_cell_dim_ > 0, + "prev_out_dim(", prev_out_dim_, + ") and prev_cell_dim(", prev_cell_dim_, + ") should be greater than zero."); + } void UpdateCell(float *cell_data, const index_t cell_dim, @@ -65,7 +117,7 @@ class DynamicLSTMOp : public Operation { in_vec = vmulq_f32(in_vec, scale_vec); vst1q_f32(cell_data + i, in_vec); #else - for (int j = 0; j < 4; ++j) { + for (index_t j = 0; j < 4; ++j) { cell_data[i + j] *= scale; } #endif @@ -92,7 +144,7 @@ class DynamicLSTMOp : public Operation { in_vec = vmulq_f32(in_vec, scale_vec); vst1q_f32(cell_data + i, in_vec); #else - for (int j = 0; j < 4; ++j) { + for (index_t j = 0; j < 4; ++j) { cell_data[i + j] = src_data[i + j] * scale; } #endif @@ -104,32 +156,26 @@ class DynamicLSTMOp : public Operation { MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); - int max_input_num = 4; - MACE_CHECK(this->InputSize() >= max_input_num, - "DynamicLSTM has at least four inputs."); - MACE_CHECK(prev_cell_delay_ < 0 && prev_out_delay_ < 0); - MACE_CHECK(prev_out_dim_ > 0 && prev_cell_dim_ > 0); + Validate(); const Tensor *input = this->Input(INPUT); + const Tensor *prev_out = this->Input(PREV_OUT); + const Tensor *prev_cell = this->Input(PREV_CELL); const Tensor *weights_a = this->Input(WEIGHTS_A); const Tensor *lstm_params = this->Input(PARAMS); const Tensor 
*weights_b = this->Input(WEIGHTS_B); - if (has_bias_a_) { - max_input_num++; - MACE_CHECK(this->InputSize() >= max_input_num, - "The first affine needs a bias input."); - } + int max_input_num = 6; + max_input_num = has_bias_a_ ? max_input_num + 1 : max_input_num; + MACE_CHECK(this->InputSize() >= max_input_num, + "The first affine needs a bias input."); const Tensor *bias_a = has_bias_a_ ? this->Input(max_input_num - 1) : nullptr; - if (has_bias_b_) { - max_input_num++; - MACE_CHECK(this->InputSize() >= max_input_num, - "The second affine needs a bias input."); - } + max_input_num = has_bias_b_ ? max_input_num + 1 : max_input_num; + MACE_CHECK(this->InputSize() >= max_input_num, + "The second affine needs a bias input."); const Tensor *bias_b = has_bias_b_ ? this->Input(max_input_num - 1) : nullptr; - const index_t input_rank = input->dim_size(); MACE_CHECK(input_rank >= 2, "Dynamic LSTM Cell's input dim size should be >= 2."); @@ -150,12 +196,15 @@ class DynamicLSTMOp : public Operation { const index_t lstm_input_dim = affine_a_out_dim + prev_cell_dim_; const index_t lstm_cell_dim = lstm_input_dim / 5; const index_t params_stride = lstm_params->dim(1); - MACE_CHECK(lstm_input_dim == (lstm_cell_dim * 5)); + MACE_CHECK(lstm_input_dim == (lstm_cell_dim * 5), + "lstm_input_dim(", lstm_input_dim, + ") should be 5 times of lstm_cell_dim(", + lstm_cell_dim, ")."); MACE_CHECK(lstm_params->dim(0) == 3 && params_stride == lstm_cell_dim && lstm_cell_dim == prev_cell_dim_) - << "lstm params rows:" << lstm_params->dim(0) - << "params_stride:" << params_stride - << "!=" << "cell_dim:" << lstm_cell_dim << std::endl; + << " lstm params rows: " << lstm_params->dim(0) + << " params_stride: " << params_stride + << " != " << " cell_dim: " << lstm_cell_dim << std::endl; const index_t affine_b_out_dim = weights_b->dim(0); const index_t affine_b_depth = weights_b->dim(1); const index_t affine_b_in_dim = lstm_cell_dim; @@ -164,7 +213,10 @@ class DynamicLSTMOp : public Operation { << "!=" << "affine_b's weights' depth:" << affine_b_depth << std::endl; const index_t output_dim = affine_b_out_dim; - MACE_CHECK(prev_out_offset_ + prev_out_dim_ <= output_dim); + MACE_CHECK(prev_out_offset_ + prev_out_dim_ <= output_dim) + << " prev_out_offset: " << prev_out_offset_ + << " prev_out_dim: " << prev_out_dim_ + << " output_dim: " << output_dim; const index_t affine_a_in_size = PadAlignSize(affine_a_in_dim * sizeof(float)); @@ -175,8 +227,8 @@ class DynamicLSTMOp : public Operation { const index_t affine_b_out_size = PadAlignSize(affine_b_out_dim * sizeof(float)); - const int out_buf_chunk = abs(prev_out_delay_); - const int cell_buf_chunk = abs(prev_cell_delay_); + const int out_buf_chunk = abs(prev_out_delay_ / subsample_factor_); + const int cell_buf_chunk = abs(prev_cell_delay_ / subsample_factor_); const index_t out_buf_size = PadAlignSize(out_buf_chunk * prev_out_dim_ * sizeof(float)); const index_t cell_buf_size = @@ -187,13 +239,13 @@ class DynamicLSTMOp : public Operation { + affine_b_in_size + affine_b_out_size + out_buf_size + cell_buf_size); - Tensor prev_out(scratch->Scratch(out_buf_size), DT_FLOAT); - prev_out.Reshape({out_buf_chunk, prev_out_dim_}); - float *prev_out_data = prev_out.mutable_data(); + Tensor prev_out_buf(scratch->Scratch(out_buf_size), DT_FLOAT); + prev_out_buf.Reshape({out_buf_chunk, prev_out_dim_}); + float *prev_out_buf_data = prev_out_buf.mutable_data(); - Tensor prev_cell(scratch->Scratch(cell_buf_size), DT_FLOAT); - prev_cell.Reshape({cell_buf_chunk, prev_cell_dim_}); - float 
*prev_cell_data = prev_cell.mutable_data(); + Tensor prev_cell_buf(scratch->Scratch(cell_buf_size), DT_FLOAT); + prev_cell_buf.Reshape({cell_buf_chunk, prev_cell_dim_}); + float *prev_cell_buf_data = prev_cell_buf.mutable_data(); Tensor affine_a_in(scratch->Scratch(affine_a_in_size), DT_FLOAT); affine_a_in.Reshape({1, affine_a_in_dim}); @@ -212,38 +264,57 @@ class DynamicLSTMOp : public Operation { float *affine_b_out_data = affine_b_out.mutable_data(); Tensor *output = this->Output(OUTPUT); + Tensor *out_cache = this->Output(OUT_CACHE); + Tensor *cell_cache = this->Output(CELL_CACHE); std::vector output_shape = input->shape(); + const index_t out_chunk = forward_indexes_.size(); output_shape[input_rank - 1] = output_dim; + std::vector prev_out_shape = input->shape(); + prev_out_shape[input_rank - 1] = prev_out_dim_; + prev_out_shape[input_rank - 2] = out_buf_chunk; + std::vector prev_cell_shape = input->shape(); + prev_cell_shape[input_rank - 1] = prev_cell_dim_; + prev_cell_shape[input_rank - 2] = cell_buf_chunk; MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + MACE_RETURN_IF_ERROR(out_cache->Resize(prev_out_shape)); + MACE_RETURN_IF_ERROR(cell_cache->Resize(prev_cell_shape)); Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard prev_out_guard(prev_out); + Tensor::MappingGuard prev_cell_guard(prev_cell); Tensor::MappingGuard lstm_params_guard(lstm_params); + Tensor::MappingGuard output_guard(output); + Tensor::MappingGuard out_cache_guard(out_cache); + Tensor::MappingGuard cell_cache_guard(cell_cache); + const float *input_data = input->data(); + const float *prev_out_data = prev_out->data(); + const float *prev_cell_data = prev_cell->data(); const float *lstm_params_data = lstm_params->data(); float *output_data = output->mutable_data(); + float *out_cache_data = out_cache->mutable_data(); + float *cell_cache_data = cell_cache->mutable_data(); for (int b = 0; b < batch; ++b) { - int prev_out_idx = prev_out_delay_; - int prev_cell_idx = prev_cell_delay_; - prev_cell.Clear(); - prev_out.Clear(); - affine_a_in.Clear(); - affine_a_out.Clear(); - affine_b_in.Clear(); - affine_b_out.Clear(); - for (int i = 0; i < chunk; ++i) { - const float *input_ptr = input_data + (b * chunk + i) * input_dim; - float *output_ptr = output_data + (b * chunk + i) * output_dim; + memcpy(prev_out_buf_data, + prev_out_data + b * out_buf_chunk * prev_out_dim_, + sizeof(float) * out_buf_chunk * prev_out_dim_); + memcpy(prev_cell_buf_data, + prev_cell_data + b * cell_buf_chunk * prev_cell_dim_, + sizeof(float) * cell_buf_chunk * prev_cell_dim_); + + for (index_t i = 0; i < out_chunk; ++i) { + const float *input_ptr = + input_data + (b * chunk + forward_indexes_[i]) * input_dim; + float *output_ptr = output_data + (b * out_chunk + i) * output_dim; // Append memcpy(affine_a_in_data, input_ptr, input_dim * sizeof(float)); - if (prev_out_idx >= 0) { - memcpy(affine_a_in_data + input_dim, - prev_out_data + prev_out_idx % out_buf_chunk * prev_out_dim_, - prev_out_dim_ * sizeof(float)); - } + memcpy(affine_a_in_data + input_dim, + prev_out_buf_data + i % out_buf_chunk * prev_out_dim_, + prev_out_dim_ * sizeof(float)); // Affine gemv_.Compute(context, weights_a, @@ -256,15 +327,13 @@ class DynamicLSTMOp : public Operation { false, &affine_a_out); // Prepare LSTMNonlinear input and output pointer - float *prev_cell_ptr = - prev_cell_idx < 0 ? 
nullptr : - prev_cell_data + prev_cell_idx % cell_buf_chunk * prev_cell_dim_; - float *curr_cell_ptr = - prev_cell_data + i % cell_buf_chunk * prev_cell_dim_; + float *lstm_cell_ptr = + prev_cell_buf_data + i % cell_buf_chunk * prev_cell_dim_; + float *curr_cell_ptr = lstm_cell_ptr; // LSTMNonlinear LSTMNonlinearKernel(context, affine_a_out_data, - prev_cell_ptr, + lstm_cell_ptr, nullptr, lstm_params_data, false, @@ -289,16 +358,36 @@ class DynamicLSTMOp : public Operation { affine_b_out_data, output_dim * sizeof(float)); // Update - float *curr_out_ptr = prev_out_data + i % out_buf_chunk * prev_out_dim_; + float *curr_out_ptr = + prev_out_buf_data + i % out_buf_chunk * prev_out_dim_; CopyAndUpdateCell(affine_b_out_data + prev_out_offset_, prev_out_dim_, scale_, curr_out_ptr); - prev_out_idx++; - prev_cell_idx++; + + for (size_t k = 0; k < out_cache_indexes_.size(); ++k) { + if (i == out_cache_indexes_[k]) { + const index_t idx = b * out_buf_chunk + k; + float *out_cache_ptr = + out_cache_data + idx * prev_out_dim_; + memcpy(out_cache_ptr, + curr_out_ptr, + sizeof(float) * prev_out_dim_); + } + } + + for (size_t k = 0; k < cell_cache_indexes_.size(); ++k) { + if (i == cell_cache_indexes_[k]) { + const index_t idx = b * cell_buf_chunk + k; + float *cell_cache_ptr = + cell_cache_data + idx * prev_cell_dim_; + memcpy(cell_cache_ptr, + curr_cell_ptr, + sizeof(float) * prev_cell_dim_); + } + } } } - return MaceStatus::MACE_SUCCESS; } @@ -311,6 +400,10 @@ class DynamicLSTMOp : public Operation { int has_bias_a_; int has_bias_b_; float scale_; + int subsample_factor_; + std::vector forward_indexes_; + std::vector cell_cache_indexes_; + std::vector out_cache_indexes_; #ifdef MACE_ENABLE_NEON arm::fp32::Gemv gemv_; @@ -318,8 +411,8 @@ class DynamicLSTMOp : public Operation { ref::Gemv gemv_; #endif // MACE_ENABLE_NEON - MACE_OP_INPUT_TAGS(INPUT, WEIGHTS_A, PARAMS, WEIGHTS_B); - MACE_OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT, PREV_OUT, PREV_CELL, WEIGHTS_A, PARAMS, WEIGHTS_B); + MACE_OP_OUTPUT_TAGS(OUTPUT, OUT_CACHE, CELL_CACHE); }; void RegisterDynamicLSTM(OpRegistryBase *op_registry) { diff --git a/mace/ops/extract_pooling.cc b/mace/ops/extract_pooling.cc index 3908ceaf391fd88c45db6defbb1a45e2031b88a7..87264f4f66ff04c2bd0c17959450cf9add9532de 100644 --- a/mace/ops/extract_pooling.cc +++ b/mace/ops/extract_pooling.cc @@ -12,17 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -// This Op is for fused StatisticsExtraction, StatisticsPooling and -// Round Components in Kaldi. +// This Op is for fused StatisticsExtraction and StatisticsPooling +// Components in Kaldi. // This op is used to extract moving-average mean and standard-deviation // statistics of input data. -// 'input_indexes' indicates which frames will be used for extract statistics. -// 'output_indexes' indicates which frames of outputs will be used to +// 'forward_indexes' indicates which frames of input will be used for +// extraction. // save statistics results. -// 'modulus' will be used for extent results to all frames. -// 'start_index' and 'end_index' indicate time indexes of output frames. // 'forward_indexes' and 'count' were from precomputed index in kaldi. 
-// Reference to +// Reference to tools/extract_pooling.py and // http://kaldi-asr.org/doc/nnet-general-component_8h_source.html#l00158 #include @@ -42,7 +40,6 @@ class ExtractPoolingOp : public Operation { public: explicit ExtractPoolingOp(OpConstructContext *context) : Operation(context), - modulus_(Operation::GetOptionalArg("modulus", 1)), include_variance_( static_cast( Operation::GetOptionalArg("include_variance", 0))), @@ -50,39 +47,36 @@ class ExtractPoolingOp : public Operation { Operation::GetOptionalArg("num_log_count", 0)), variance_floor_( Operation::GetOptionalArg("variance_floor", 1.0e-10)), - input_indexes_(Operation::GetRepeatedArgs("input_indexes")), - output_indexes_(Operation::GetRepeatedArgs("output_indexes")), forward_indexes_(Operation::GetRepeatedArgs("forward_indexes")), - counts_(Operation::GetRepeatedArgs("counts")), - input_time_range_(Operation::GetRepeatedArgs("input_time_range")), - output_time_range_( - Operation::GetRepeatedArgs("output_time_range")) {} + counts_(Operation::GetRepeatedArgs("counts")) {} + + inline void Validate() { + const Tensor *input = this->Input(0); + const unsigned int rank = static_cast(input->dim_size()); + MACE_CHECK(rank >= 2, + "ExtractPooling only supports input dim size >= 2"); + MACE_CHECK(counts_.size() * 2 == forward_indexes_.size(), + "counts length(", counts_.size(), + ") should be 2 times of forward_indexes length(", + forward_indexes_.size(), ")."); + for (size_t i = 0; i < counts_.size(); ++i) { + MACE_CHECK(static_cast(counts_[i]) == + forward_indexes_[2 * i + 1] - forward_indexes_[2 * i], + "invalid forward indexes and counts values"); + } + } MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); const Tensor *input = this->Input(0); Tensor *output = this->Output(0); - + Validate(); const std::vector &input_shape = input->shape(); - const index_t dim_size = input_shape.size(); - MACE_CHECK(dim_size >= 2, - "ExtractPooling only supports input dim size >= 2"); - MACE_CHECK(modulus_ >= 1, - "ExtractPooling's pooling size should be greater than zero."); - MACE_CHECK(input_time_range_.size() == 2 && output_time_range_.size() == 2 - && counts_.size() * 2 == forward_indexes_.size() - && counts_.size() == output_indexes_.size()); - int in_start_index = input_time_range_[0]; - int out_start_index = output_time_range_[0]; - int out_end_index = output_time_range_[1]; - MACE_CHECK(out_end_index >= out_start_index - && input_time_range_[1] >= input_time_range_[0], - "end index should be greater than start index."); - const index_t output_chunk = out_end_index - out_start_index + 1; + const unsigned int dim_size = static_cast(input->dim_size()); + const index_t input_dim = input_shape[dim_size - 1]; const index_t chunk = input_shape[dim_size - 2]; - MACE_CHECK(chunk == input_time_range_[1] - input_time_range_[0] + 1, - "input chunk should be equal to end - start + 1."); + const index_t output_chunk = counts_.size(); const index_t batch = std::accumulate(input_shape.begin(), input_shape.end() - 2, 1, std::multiplies()); @@ -94,10 +88,6 @@ class ExtractPoolingOp : public Operation { output_shape[dim_size - 2] = output_chunk; MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - const index_t num_input_indexes = input_indexes_.size(); - const index_t num_output_indexes = output_indexes_.size(); - MACE_CHECK(num_input_indexes > 0 && num_output_indexes > 0, - "ExtractPooling's input_indexes or output_indexes is empty."); const index_t extract_out_size = PadAlignSize(output_dim * sizeof(float)); ScratchBuffer *scratch = 
context->device()->scratch_buffer(); scratch->Rewind(); @@ -117,7 +107,7 @@ class ExtractPoolingOp : public Operation { &thread_pool = context->device()->cpu_runtime()->thread_pool(); for (index_t b = 0; b < batch; ++b) { - for (index_t i = 0; i < num_output_indexes; ++i) { + for (index_t i = 0; i < output_chunk; ++i) { int start = forward_indexes_[2 * i]; int end = forward_indexes_[2 * i + 1]; float count = counts_[i]; @@ -139,7 +129,7 @@ class ExtractPoolingOp : public Operation { float variance = 0.f; for (int t = start; t < end; ++t) { index_t input_index = - (b * chunk + input_indexes_[t] - in_start_index) + (b * chunk + t) * input_dim; float x = input_data[input_index + d]; mean += x; @@ -163,30 +153,15 @@ class ExtractPoolingOp : public Operation { float mean = 0.f; for (int t = start; t < end; ++t) { index_t input_index = - (b * chunk + input_indexes_[t] - in_start_index) - * input_dim; + (b * chunk + t) * input_dim; mean += input_data[input_index + d]; } extract_out_data[d + num_log_count_] = mean * mean_scale; } }, 0, input_dim, 1); } - - int output_start = output_indexes_[i] < out_start_index ? - out_start_index : output_indexes_[i]; - int output_end = output_indexes_[i] + modulus_; - output_end = output_end > out_end_index ? - out_end_index + 1 : - output_end; - thread_pool.Compute1D([=](index_t start0, - index_t end0, - index_t step0) { - for (index_t idx = start0; idx < end0; idx += step0) { - memcpy(output_data + (b * output_chunk + idx - out_start_index) - * output_dim, - extract_out_data, output_dim * sizeof(float)); - } - }, output_start, output_end, 1); + memcpy(output_data + (b * output_chunk + i) * output_dim, + extract_out_data, output_dim * sizeof(float)); } } @@ -194,16 +169,11 @@ class ExtractPoolingOp : public Operation { } private: - int modulus_; bool include_variance_; int num_log_count_; float variance_floor_; - std::vector input_indexes_; - std::vector output_indexes_; std::vector forward_indexes_; std::vector counts_; - std::vector input_time_range_; - std::vector output_time_range_; }; void RegisterExtractPooling(OpRegistryBase *op_registry) { diff --git a/mace/ops/ifdefined.cc b/mace/ops/ifdefined.cc new file mode 100644 index 0000000000000000000000000000000000000000..f0367d20f08d76250bb426da24d5882e6229ab48 --- /dev/null +++ b/mace/ops/ifdefined.cc @@ -0,0 +1,171 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This Op is for IfDefined descriptor in Kaldi. +// It defines time offset. +// If time index <= offset, using zeros as output. +// forward_indexes: indicates which frames will be used for computation. +// Because of the model's subsampling, this is pre-computed +// in kaldi-onnx. +// cache_forward_indexes: indicates which frames of cached previous output +// will be used here. If there is only one input, +// this parameter will be empty. 
+ +#include +#include + +#include "mace/core/operator.h" + +namespace mace { +namespace ops { + +template +class IfDefinedOp; + +template +class IfDefinedOp : public Operation { + public: + explicit IfDefinedOp(OpConstructContext *context) + : Operation(context), + forward_indexes_( + Operation::GetRepeatedArgs("forward_indexes")), + cache_forward_indexes_( + Operation::GetRepeatedArgs("cache_forward_indexes")) {} + + inline void Validate() { + MACE_CHECK(this->InputSize() <= 2, + "IfDefined Op should have at most 2 inputs."); + const Tensor *input = this->Input(INPUT); + const unsigned int rank = static_cast(input->dim_size()); + MACE_CHECK(rank >= 2, "IfDefined's input should have at least 2 dims."); + const index_t input_chunk = input->dim(rank - 2); + for (size_t i = 0; i < forward_indexes_.size(); ++i) { + MACE_CHECK(forward_indexes_[i] < input_chunk, + "forward index is over range."); + } + for (size_t i = 0; i < cache_forward_indexes_.size(); ++i) { + MACE_CHECK(cache_forward_indexes_[i] < input_chunk && + cache_forward_indexes_[i] >= 0 , + "index is over range."); + } + + if (this->InputSize() == 2) { + size_t cache_count = 0; + for (size_t i = 0; i < forward_indexes_.size(); ++i) { + if (forward_indexes_[i] < 0) + cache_count++; + else + break; + } + MACE_CHECK(cache_forward_indexes_.size() == cache_count, + "IfDefined's cache forward index size:", + cache_forward_indexes_.size(), + " != forward indexes' negative part length:", + cache_count); + for (size_t i = 0; i < cache_forward_indexes_.size(); ++i) { + MACE_CHECK(cache_forward_indexes_[i] < input_chunk && + cache_forward_indexes_[i] >= 0, + "cache forward index is over range."); + } + const Tensor *cache_input = this->Input(CACHE_INPUT); + MACE_CHECK(cache_input->dim_size() == input->dim_size(), + "two inputs should have the same rank"); + for (unsigned int k = 0; k < rank; ++k) { + MACE_CHECK(input->dim(k) == cache_input->dim(k), + "Two inputs should have the same shape"); + } + } + } + + void DelayCopy(OpContext *context, + const T *input_data, + const index_t batch, + const index_t chunk, + const index_t dim, + const std::vector &fwd_idxs, + T *output_data) { + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + if (fwd_idxs[j] >= 0) { + memcpy(output_data + (i * chunk + j) * dim, + input_data + (i * chunk + fwd_idxs[j]) * dim, + dim * sizeof(T)); + } + } + } + }, 0, batch, 1, 0, fwd_idxs.size(), 1); + } + + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(INPUT); + Tensor *output = this->Output(OUTPUT); + Validate(); + index_t rank = input->dim_size(); + const std::vector &input_shape = input->shape(); + const index_t batch = + std::accumulate(input_shape.begin(), input_shape.end() - 2, 1, + std::multiplies()); + const index_t chunk = input_shape[rank - 2]; + const index_t dim = input_shape[rank - 1]; + std::vector output_shape(input->shape()); + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + output->Clear(); + + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard output_guard(output); + const T *input_data = input->data(); + T *output_data = output->mutable_data(); + DelayCopy(context, + input_data, + batch, + chunk, + dim, + forward_indexes_, + output_data); + + if (this->InputSize() == 2 && 
cache_forward_indexes_.size() > 0) { + const Tensor *cache_input = this->Input(CACHE_INPUT); + Tensor::MappingGuard cache_input_guard(cache_input); + const T *cache_input_data = cache_input->data(); + DelayCopy(context, + cache_input_data, + batch, + chunk, + dim, + cache_forward_indexes_, + output_data); + } + return MaceStatus::MACE_SUCCESS; + } + + private: + std::vector forward_indexes_; + std::vector cache_forward_indexes_; + + private: + MACE_OP_INPUT_TAGS(INPUT, CACHE_INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); +}; + +void RegisterIfDefined(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "IfDefined", IfDefinedOp, + DeviceType::CPU, float); +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/registry/ops_registry.cc b/mace/ops/registry/ops_registry.cc index e2dcc276b6a4c38497ec00d77b7eb6e851cc40b4..1af424f1b3eaf742c29788746c20bc0eb2d5de4d 100644 --- a/mace/ops/registry/ops_registry.cc +++ b/mace/ops/registry/ops_registry.cc @@ -42,7 +42,7 @@ extern void RegisterFill(OpRegistryBase *op_registry); extern void RegisterFullyConnected(OpRegistryBase *op_registry); extern void RegisterGather(OpRegistryBase *op_registry); extern void RegisterIdentity(OpRegistryBase *op_registry); -extern void RegisterDelay(OpRegistryBase *op_registry); +extern void RegisterIfDefined(OpRegistryBase *op_registry); extern void RegisterInferConv2dShape(OpRegistryBase *op_registry); extern void RegisterKaldiBatchNorm(OpRegistryBase *op_registry); extern void RegisterLocalResponseNorm(OpRegistryBase *op_registry); @@ -56,6 +56,7 @@ extern void RegisterPadContext(OpRegistryBase *op_registry); extern void RegisterPNorm(OpRegistryBase *op_registry); extern void RegisterPooling(OpRegistryBase *op_registry); extern void RegisterReduce(OpRegistryBase *op_registry); +extern void RegisterReplaceIndex(OpRegistryBase *op_registry); extern void RegisterPriorBox(OpRegistryBase *op_registry); extern void RegisterReshape(OpRegistryBase *op_registry); extern void RegisterResizeBicubic(OpRegistryBase *op_registry); @@ -74,6 +75,7 @@ extern void RegisterSqrDiffMean(OpRegistryBase *op_registry); extern void RegisterSqueeze(OpRegistryBase *op_registry); extern void RegisterStack(OpRegistryBase *op_registry); extern void RegisterStridedSlice(OpRegistryBase *op_registry); +extern void RegisterSubsample(OpRegistryBase *op_registry); extern void RegisterSumGroup(OpRegistryBase *op_registry); extern void RegisterTargetRMSNorm(OpRegistryBase *op_registry); extern void RegisterTile(OpRegistryBase *op_registry); @@ -119,7 +121,7 @@ OpRegistry::OpRegistry() : OpRegistryBase() { ops::RegisterFullyConnected(this); ops::RegisterGather(this); ops::RegisterIdentity(this); - ops::RegisterDelay(this); + ops::RegisterIfDefined(this); ops::RegisterInferConv2dShape(this); ops::RegisterKaldiBatchNorm(this); ops::RegisterLocalResponseNorm(this); @@ -133,6 +135,7 @@ OpRegistry::OpRegistry() : OpRegistryBase() { ops::RegisterPNorm(this); ops::RegisterPooling(this); ops::RegisterReduce(this); + ops::RegisterReplaceIndex(this); ops::RegisterPriorBox(this); ops::RegisterReshape(this); ops::RegisterResizeBicubic(this); @@ -151,6 +154,7 @@ OpRegistry::OpRegistry() : OpRegistryBase() { ops::RegisterStridedSlice(this); ops::RegisterSqrDiffMean(this); ops::RegisterSqueeze(this); + ops::RegisterSubsample(this); ops::RegisterSumGroup(this); ops::RegisterTargetRMSNorm(this); ops::RegisterTile(this); diff --git a/mace/ops/replace_index.cc b/mace/ops/replace_index.cc new file mode 100644 index 
0000000000000000000000000000000000000000..d4f95323f84b70815ed7850c8593cd8d7f40c4a3 --- /dev/null +++ b/mace/ops/replace_index.cc @@ -0,0 +1,103 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This Op is for ReplaceIndex in Kaldi. +// Usually used for ivector inputs. +// It copies ivector to each frame of the output. +// forward_indexes: is the pre-computed indexes for output frames. + +#include +#include + +#include "mace/core/operator.h" + +namespace mace { +namespace ops { + +template +class ReplaceIndexOp; + +template +class ReplaceIndexOp : public Operation { + public: + explicit ReplaceIndexOp(OpConstructContext *context) + : Operation(context), + forward_indexes_( + Operation::GetRepeatedArgs("forward_indexes")) {} + + inline void Validate() { + const Tensor *input = this->Input(0); + const unsigned int rank = static_cast(input->dim_size()); + MACE_CHECK(rank >= 2, "ReplaceIndex's input should have at least 2 dims."); + + const index_t input_chunk = input->dim(rank - 2); + for (size_t i = 0; i < forward_indexes_.size(); ++i) { + MACE_CHECK(forward_indexes_[i] < input_chunk && forward_indexes_[i] >= 0 , + "index is over range."); + } + } + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + Validate(); + const std::vector &input_shape = input->shape(); + const index_t batch = + std::accumulate(input->shape().begin(), input->shape().end() - 2, 1, + std::multiplies()); + const index_t rank = input->dim_size(); + const index_t num_ivectors = input_shape[rank - 2]; + const index_t dim = input_shape[rank - 1]; + const index_t input_stride = num_ivectors * dim; + + const index_t out_chunk = forward_indexes_.size(); + const index_t output_stride = out_chunk * dim; + + std::vector output_shape = input->shape(); + output_shape[rank - 2] = out_chunk; + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard output_guard(output); + const T *input_data = input->data(); + T *output_data = output->mutable_data(); + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t i = start1; i < end1; i += step1) { + memcpy(output_data + b * output_stride + i * dim, + input_data + b * input_stride + forward_indexes_[i] * dim, + dim * sizeof(T)); + } + } + }, 0, batch, 1, 0, out_chunk, 1); + + return MaceStatus::MACE_SUCCESS; + } + + private: + std::vector forward_indexes_; +}; + +void RegisterReplaceIndex(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "ReplaceIndex", ReplaceIndexOp, + DeviceType::CPU, float); +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/splice.cc b/mace/ops/splice.cc index 
f63e9e5e4c2555ba3436f1775b83014e0517251b..8f9198c00079f1c364bbc49b7b7c011cd384dd3d 100644 --- a/mace/ops/splice.cc +++ b/mace/ops/splice.cc @@ -22,6 +22,9 @@ // if const_component_dim_ != 0, const_dim_ will be used to determine which // row of "in" we copy the last part of each row of "out" from (this part is // not subject to splicing, it's assumed constant for each frame of "input". +// forward_indexes and forward_const_indexes indicate which frames will +// be used for computation, and they are precomputed in the kaldi-onnx converter +// because of subsampling support. #include #include @@ -40,21 +43,45 @@ class SpliceOp : public Operation { public: explicit SpliceOp(OpConstructContext *context) : Operation(context), - context_(Operation::GetRepeatedArgs("context")), + context_(Operation::GetRepeatedArgs("context")), const_dim_( - Operation::GetOptionalArg("const_component_dim", 0)) {} + Operation::GetOptionalArg("const_component_dim", 0)), + forward_indexes_( + Operation::GetRepeatedArgs("forward_indexes")), + forward_const_indexes_( + Operation::GetRepeatedArgs("forward_const_indexes")) {} + + inline void Validate() { + MACE_CHECK(context_.size() > 0) + << "The context param should not be empty in Splice Op."; + MACE_CHECK(forward_indexes_.size() % context_.size() == 0, + "Splice's forward_indexes size should be a multiple of the context size."); + const Tensor *input = this->Input(0); + const unsigned int rank = static_cast(input->dim_size()); + MACE_CHECK(rank >= 2, "Splice's input should have at least 2 dims."); + MACE_CHECK(input->dim(rank - 1) > const_dim_, + "input dim:", input->dim(rank - 1), + "should be greater than const dim:", const_dim_); + + const index_t input_chunk = input->dim(rank - 2); + for (size_t i = 0; i < forward_indexes_.size(); ++i) { + MACE_CHECK(forward_indexes_[i] < input_chunk && forward_indexes_[i] >= 0) + << " forward index:" << forward_indexes_[i] << " input shape:" + << input->dim(0) << "," << input->dim(1) << "," << input->dim(2); + } + for (size_t i = 0; i < forward_const_indexes_.size(); ++i) { + MACE_CHECK(forward_const_indexes_[i] < input_chunk && + forward_const_indexes_[i] >= 0 , + "index is over range."); + } + } MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); const Tensor *input = this->Input(0); - MACE_CHECK(context_.size() > 0) - << "The context param should not be empty in Splice Op."; - MACE_CHECK(input->dim_size() >= 2) - << "Splice's input's rank should be greater than 2."; - Tensor *output = this->Output(0); + Validate(); const std::vector &input_shape = input->shape(); - const index_t batch = std::accumulate(input->shape().begin(), input->shape().end() - 2, 1, std::multiplies()); @@ -65,14 +92,10 @@ class SpliceOp : public Operation { const index_t num_splice = static_cast(context_.size()); const index_t dim = input_dim - const_dim_; - const index_t left_context = context_[0]; - const index_t right_context = context_[num_splice -1]; - - const index_t out_chunk = chunk - (right_context - left_context); + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); - MACE_CHECK(input_dim > const_dim_, - "input dim:", input_dim, - "should be greater than const dim:", const_dim_); + const index_t out_chunk = forward_indexes_.size() / num_splice; const index_t output_dim = dim * num_splice + const_dim_; const index_t output_stride = out_chunk * output_dim; @@ -86,38 +109,48 @@ class SpliceOp : public Operation { const T *input_data = input->data(); T *output_data = output->mutable_data(); - for (int b = 0; b < 
batch; ++b) { - for (index_t i = 0; i < out_chunk; ++i) { - for (index_t c = 0; c < num_splice; ++c) { - const index_t offset = i + context_[c] - left_context; - T *output_base = - output_data + b * output_stride + i * output_dim + c * dim; - const T *input_base = - input_data + b * input_stride + offset * input_dim; - memcpy(output_base, input_base, dim * sizeof(T)); + thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t i = start1; i < end1; i += step1) { + for (index_t c = start2; c < end2; c += step2) { + const index_t pos = forward_indexes_[i * num_splice + c]; + T *output_base = + output_data + b * output_stride + i * output_dim + c * dim; + const T *input_base = + input_data + b * input_stride + pos * input_dim; + memcpy(output_base, input_base, dim * sizeof(T)); + } } } - } + }, 0, batch, 1, 0, out_chunk, 1, 0, num_splice, 1); if (const_dim_ > 0) { const index_t output_offset = output_dim - const_dim_; - const index_t input_offset = dim; - for (int b = 0; b < batch; ++b) { - for (index_t i = 0; i < out_chunk; ++i) { - T *output_base = output_data + b * output_stride + i * output_dim; - const T *input_base = input_data + b * input_stride + i * input_dim; - memcpy(output_base + output_offset, - input_base + input_offset, - const_dim_ * sizeof(T)); + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t i = start1; i < end1; i += step1) { + T *output_base = output_data + b * output_stride + + i * output_dim + output_offset; + const T *input_base = + input_data + b * input_stride + + forward_const_indexes_[i] * input_dim + dim; + memcpy(output_base, input_base, + const_dim_ * sizeof(T)); + } } - } + }, 0, batch, 1, 0, out_chunk, 1); } return MaceStatus::MACE_SUCCESS; } private: - std::vector context_; + std::vector context_; int const_dim_; + std::vector forward_indexes_; + std::vector forward_const_indexes_; }; void RegisterSplice(OpRegistryBase *op_registry) { diff --git a/mace/ops/subsample.cc b/mace/ops/subsample.cc new file mode 100644 index 0000000000000000000000000000000000000000..11835ac9987df4499d1686d0b03d547a3cbfd336 --- /dev/null +++ b/mace/ops/subsample.cc @@ -0,0 +1,109 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This Op is created for subsample frames for Kaldi model's inference. +// forward_indexes: indicates which frames will be selected as output. 
+ +#include +#include + +#include "mace/core/operator.h" +#include "mace/utils/math.h" + +namespace mace { +namespace ops { + +template +class SubsampleOp; + +template +class SubsampleOp : public Operation { + public: + explicit SubsampleOp(OpConstructContext *context) + : Operation(context), + forward_indexes_( + Operation::GetRepeatedArgs("forward_indexes")) {} + + inline void Validate() { + const Tensor *input = this->Input(0); + const unsigned int rank = + static_cast(input->dim_size()); + MACE_CHECK(rank >= 2, + "Subsample's input should have at least 2 dims."); + + const index_t input_chunk = input->dim(rank - 2); + for (size_t i = 0; i < forward_indexes_.size(); ++i) { + MACE_CHECK(forward_indexes_[i] < input_chunk && + forward_indexes_[i] >= 0 , + "index is over range."); + } + } + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + Validate(); + const std::vector &input_shape = input->shape(); + + const index_t batch = + std::accumulate(input->shape().begin(), + input->shape().end() - 2, 1, + std::multiplies()); + const index_t rank = input->dim_size(); + const index_t chunk = input_shape[rank - 2]; + const index_t dim = input_shape[rank - 1]; + const index_t input_stride = chunk * dim; + const index_t out_chunk = forward_indexes_.size(); + + const index_t output_stride = out_chunk * dim; + + std::vector output_shape = input->shape(); + output_shape[rank - 2] = out_chunk; + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard output_guard(output); + const T *input_data = input->data(); + T *output_data = output->mutable_data(); + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t i = start1; i < end1; i += step1) { + T *output_base = + output_data + b * output_stride + i * dim; + const T *input_base = + input_data + b * input_stride + forward_indexes_[i] * dim; + memcpy(output_base, input_base, dim * sizeof(T)); + } + } + }, 0, batch, 1, 0, out_chunk, 1); + + return MaceStatus::MACE_SUCCESS; + } + + private: + std::vector forward_indexes_; +}; + +void RegisterSubsample(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Subsample", SubsampleOp, + DeviceType::CPU, float); +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/target_rms_norm.cc b/mace/ops/target_rms_norm.cc index eab76620bc07518f889ef4cc2c73e8e2c24076f0..23535e15804b476b4b979810f8a3f7663b96b266 100644 --- a/mace/ops/target_rms_norm.cc +++ b/mace/ops/target_rms_norm.cc @@ -71,7 +71,6 @@ class TargetRMSNormOp : public Operation { return result; } - void NormalizePerRow(const float *data, const index_t data_len, float d_scale, @@ -105,9 +104,9 @@ class TargetRMSNormOp : public Operation { std::multiplies()); if (block_dim_ == 0) block_dim_ = static_cast(input_dim); MACE_CHECK(input_dim % block_dim_ == 0, "block_dim must divide input_dim!"); - const index_t output_dim = add_log_stddev_ ? + const index_t output_dim = add_log_stddev_ > 0 ? 
input_dim + (input_dim / block_dim_) : input_dim; - std::vector output_shape = input->shape(); + std::vector output_shape(input_shape); output_shape[dim_size - 1] = output_dim; MACE_RETURN_IF_ERROR(output->Resize(output_shape)); @@ -140,7 +139,6 @@ class TargetRMSNormOp : public Operation { } }, 0, num_rows, 1); - return MaceStatus::MACE_SUCCESS; } diff --git a/test/ccunit/mace/ops/extract_pooling_test.cc b/test/ccunit/mace/ops/extract_pooling_test.cc index c36e38d2e54e06c13af3c4157a731cb6d099fa2d..978f2e20d0ac23f7a9b34cfa50a944a46b6fc7a8 100644 --- a/test/ccunit/mace/ops/extract_pooling_test.cc +++ b/test/ccunit/mace/ops/extract_pooling_test.cc @@ -28,12 +28,8 @@ void TestExtractPooling(const std::vector &input_shape, const int modulus, const int num_log_count, const int include_variance, - const std::vector &input_time_range, - const std::vector &input_indexes, const std::vector &forward_indexes, const std::vector &counts, - const std::vector &output_indexes, - const std::vector &output_time_range, const std::vector &output_shape, const std::vector &output_value) { // Construct graph @@ -44,12 +40,8 @@ void TestExtractPooling(const std::vector &input_shape, .AddIntArg("modulus", modulus) .AddIntArg("include_variance", include_variance) .AddIntArg("num_log_count", num_log_count) - .AddIntsArg("input_indexes", input_indexes) - .AddIntsArg("output_indexes", output_indexes) .AddIntsArg("forward_indexes", forward_indexes) .AddFloatsArg("counts", counts) - .AddIntsArg("input_time_range", input_time_range) - .AddIntsArg("output_time_range", output_time_range) .Output("Output") .Finalize(net.NewOperatorDef()); // Run @@ -63,123 +55,52 @@ void TestExtractPooling(const std::vector &input_shape, TEST_F(ExtractPoolingTest, SimpleCPU) { TestExtractPooling( {3, 20, 3}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, - 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, - 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, - 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, - 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60}, + 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, + 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, + 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, + 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, + 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, + 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179}, 9, 0, 0, - {-2, 17}, - {0, 3, 6, 9, 12, 15}, {0, 6, 2, 6}, {6, 4}, - {0, 9}, - {0, 17}, - {3, 18, 3}, - {29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5, - 29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5, - 29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5, - 38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5, - 38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5, - 38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 
38.5, 39.5, 40.5, - 29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5, - 29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5, - 29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5, - 38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5, - 38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5, - 38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5, - 29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5, - 29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5, - 29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5, - 38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5, - 38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5, - 38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5}); + {3, 2, 3}, + {7.5, 8.5, 9.5, 10.5, 11.5, 12.5, + 67.5, 68.5, 69.5, 70.5, 71.5, 72.5, + 127.5, 128.5, 129.5, 130.5, 131.5, 132.5}); } TEST_F(ExtractPoolingTest, SimpleCPUWithVariance) { TestExtractPooling( {3, 20, 3}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, - 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, - 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, - 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, - 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60}, + 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, + 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, + 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, + 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, + 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, + 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179}, 9, 1, 1, - {-2, 17}, - {0, 3, 6, 9, 12, 15}, {0, 6, 2, 6}, {6, 4}, - {0, 9}, - {0, 17}, - {3, 18, 7}, - {1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 
29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623, - 1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623}); + {3, 2, 7}, + {1.7917595, 7.5, 8.5, 9.5, 5.1234756, 5.1234756, 5.1234756, + 1.3862944, 10.5, 11.5, 12.5, 3.354102, 3.354102, 3.354102, + 1.7917595, 67.5, 68.5, 69.5, 5.1234756, 5.1234756, 5.1234756, + 1.3862944, 70.5, 71.5, 72.5, 3.354102, 3.354102, 3.354102, + 1.7917595, 127.5, 128.5, 129.5, 5.1234756, 5.1234756, 5.1234756, + 1.3862944, 130.5, 131.5, 132.5, 3.354102, 3.354102, 3.354102}); } } // namespace test diff --git a/test/ccunit/mace/ops/splice_test.cc b/test/ccunit/mace/ops/splice_test.cc index b6bc3d32c179a4475f0c58c921138be901ee8c2b..07be7cca8bac7e246fc0c63f7b66bba8fda05b7f 100644 --- a/test/ccunit/mace/ops/splice_test.cc +++ b/test/ccunit/mace/ops/splice_test.cc @@ -26,6 +26,8 @@ void TestSplice(const std::vector &input_shape, const std::vector &input, const std::vector &context, const int const_dim, + const std::vector &forward_indexes, + const std::vector &forward_const_indexes, const std::vector &output_shape, const std::vector &output) { OpsTestNet net; @@ -38,6 +40,8 @@ void TestSplice(const std::vector &input_shape, .Output("Output") .AddIntsArg("context", context) .AddIntArg("const_component_dim", const_dim) + .AddIntsArg("forward_indexes", forward_indexes) + .AddIntsArg("forward_const_indexes", forward_const_indexes) .Finalize(net.NewOperatorDef()); net.RunOp(); @@ -53,6 +57,8 @@ TEST_F(SpliceOpTest, WithoutConstDim) { {1, 7, 2}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, {-2, -1, 0, 1, 2}, 0, + {0, 1, 2, 3, 4, 1, 2, 3, 4, 5, 2, 3, 4, 5, 6}, + {}, {1, 3, 10}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, @@ -68,8 +74,10 @@ TEST_F(SpliceOpTest, WithConstDim) { 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 5, 6, 7, 8, 9, 10, 11, 12, 13, 
14}, {-2, -1, 0, 1, 2}, 7, + {0, 1, 2, 3, 4}, + {2}, {1, 1, 22}, - {1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10}); + {1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 6, 7, 8, 9, 10, 11, 12}); } // namespace test } // namespace ops diff --git a/tools/extract_pooling.py b/tools/extract_pooling.py new file mode 100644 index 0000000000000000000000000000000000000000..54518787e136098158bc27aad1bdaa2773892016 --- /dev/null +++ b/tools/extract_pooling.py @@ -0,0 +1,70 @@ +# Copyright 2019 The MACE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import math + +variance_floor = 1.0e-10 + +input_data = np.arange(180).reshape(3, 20, 3).astype(np.float32) +print("input data:", input_data) +num_log_count = 0 +include_var = 0 +forward_indexes = [0, 6, 2, 6] +counts = [6, 4] + +input_dim = input_data.shape[-1] +input_chunk = input_data.shape[-2] + +out_chunk = len(counts) +batch = input_data.size // (input_dim * input_chunk) +input_data = input_data.reshape(batch, input_chunk, input_dim) + +output_dim = input_dim + +if include_var > 0: + output_dim += input_dim +if num_log_count > 0: + output_dim += num_log_count + +output_data = np.zeros((batch, out_chunk, output_dim), dtype=np.float32) + +for b in range(0, batch): + for i in range(0, out_chunk): + start = forward_indexes[2 * i] + end = forward_indexes[2 * i + 1] + count = counts[i] + mean_scale = 1.0 / count + log_count = math.log(count) + if num_log_count > 0: + for n in range(0, num_log_count): + output_data[b, i, n] = log_count + for d in range(0, input_dim): + mean = 0.0 + variance = 0.0 + for t in range(start, end): + x = input_data[b, t, d] + mean += x + variance += x * x + mean = mean * mean_scale + output_data[b, i, d + num_log_count] = mean + if include_var > 0: + variance = variance * mean_scale - mean * mean + idx = d + input_dim + num_log_count + if variance < variance_floor: + output_data[b, i, idx] = math.sqrt(variance_floor) + else: + output_data[b, i, idx] = math.sqrt(variance) +print("output data:", output_data) +print("output data shape:", output_data.shape) diff --git a/tools/python/transform/base_converter.py b/tools/python/transform/base_converter.py index c8aca0377d020ed6329639442c79fb3df120a74b..43edb9a643e68d613948ae2fbab4554f7843920b 100644 --- a/tools/python/transform/base_converter.py +++ b/tools/python/transform/base_converter.py @@ -100,7 +100,6 @@ MaceSupportedOps = [ 'Conv2D', 'Crop', 'Deconv2D', - 'Delay', 'DepthToSpace', 'DepthwiseConv2d', 'DepthwiseDeconv2d', @@ -112,6 +111,7 @@ MaceSupportedOps = [ 'FullyConnected', 'Gather', 'Identity', + 'IfDefined', 'InferConv2dShape', 'KaldiBatchNorm', 'LocalResponseNorm', @@ -128,6 +128,7 @@ MaceSupportedOps = [ 'Proposal', 'Quantize', 'Reduce', + 'ReplaceIndex', 'Reshape', 'ResizeBicubic', 'ResizeBilinear', @@ -147,6 +148,7 @@ MaceSupportedOps = [ 'SpaceToBatchND', 'SpaceToDepth', 'SqrDiffMean', + 'Subsample', 'SumGroup', 'TargetRMSNorm', 'Transpose', @@ -269,6 +271,8 @@ class MaceKeyword(object): mace_reverse_str = 
diff --git a/tools/python/transform/base_converter.py b/tools/python/transform/base_converter.py
index c8aca0377d020ed6329639442c79fb3df120a74b..43edb9a643e68d613948ae2fbab4554f7843920b 100644
--- a/tools/python/transform/base_converter.py
+++ b/tools/python/transform/base_converter.py
@@ -100,7 +100,6 @@ MaceSupportedOps = [
     'Conv2D',
     'Crop',
     'Deconv2D',
-    'Delay',
     'DepthToSpace',
     'DepthwiseConv2d',
     'DepthwiseDeconv2d',
@@ -112,6 +111,7 @@ MaceSupportedOps = [
     'FullyConnected',
     'Gather',
     'Identity',
+    'IfDefined',
     'InferConv2dShape',
     'KaldiBatchNorm',
     'LocalResponseNorm',
@@ -128,6 +128,7 @@ MaceSupportedOps = [
     'Proposal',
     'Quantize',
     'Reduce',
+    'ReplaceIndex',
     'Reshape',
     'ResizeBicubic',
     'ResizeBilinear',
@@ -147,6 +148,7 @@ MaceSupportedOps = [
     'SpaceToBatchND',
     'SpaceToDepth',
     'SqrDiffMean',
+    'Subsample',
     'SumGroup',
     'TargetRMSNorm',
     'Transpose',
@@ -269,6 +271,8 @@ class MaceKeyword(object):
     mace_reverse_str = 'reverse'
     mace_const_data_num_arg_str = 'const_data_num'
     mace_coeff_str = 'coeff'
+    mace_input_indexes_str = 'input_indexes'
+    mace_output_indexes_str = 'output_indexes'
     mace_p_str = 'p'
     mace_nor_var_str = 'normalize_variance'
     mace_across_ch_str = 'across_channels'
diff --git a/tools/python/transform/onnx_converter.py b/tools/python/transform/onnx_converter.py
index aa780d0b0dc082ea27bebccf6a0c33d9577b9cdb..9efaa5eec36abbe595168666aede83362bcbd3ce 100644
--- a/tools/python/transform/onnx_converter.py
+++ b/tools/python/transform/onnx_converter.py
@@ -152,7 +152,9 @@ OnnxSupportedOps = [
     # 'ReduceSum',
     # 'ReduceSumSquare',
     'Relu',
+    'ReplaceIndex',
     'Reshape',
+    'Round',
     'Scale',
     # 'Scan',
     # 'Selu',
@@ -171,6 +173,7 @@ OnnxSupportedOps = [
     'Sqrt',
     'Squeeze',
     'Sub',
+    'Subsample',
     'Sum',
     'SumGroup',
     # 'Tan',
@@ -363,7 +366,7 @@ class OnnxConverter(base_converter.ConverterInterface):
             OnnxOpType.Mul.name: self.convert_eltwise,
             OnnxOpType.Neg.name: self.convert_eltwise,
             OnnxOpType.Normalize: self.convert_normalize,
-            OnnxOpType.Offset.name: self.convert_identity,
+            OnnxOpType.Offset.name: self.convert_subsample,
             OnnxOpType.Pad.name: self.convert_pad,
             OnnxOpType.PadContext.name: self.convert_pad_context,
             OnnxOpType.PNorm.name: self.convert_pnorm,
@@ -376,6 +379,8 @@ class OnnxConverter(base_converter.ConverterInterface):
             OnnxOpType.ReduceMean.name: self.convert_reduce,
             OnnxOpType.ReduceMin.name: self.convert_reduce,
             OnnxOpType.ReduceProd.name: self.convert_reduce,
+            OnnxOpType.ReplaceIndex.name: self.convert_replaceindex,
+            OnnxOpType.Round.name: self.convert_replaceindex,
             OnnxOpType.Scale.name: self.convert_eltwise,
             OnnxOpType.Shape.name: self.convert_shape,
             OnnxOpType.Sigmoid.name: self.convert_activation,
@@ -387,6 +392,7 @@ class OnnxConverter(base_converter.ConverterInterface):
             OnnxOpType.Sqrt.name: self.convert_eltwise,
             OnnxOpType.Squeeze.name: self.convert_squeeze,
             OnnxOpType.Sub.name: self.convert_eltwise,
+            OnnxOpType.Subsample.name: self.convert_subsample,
             OnnxOpType.Sum.name: self.convert_eltwise,
             OnnxOpType.SumGroup.name: self.convert_sum_group,
             OnnxOpType.Tanh.name: self.convert_activation,
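Several of the handlers registered above (Offset, Subsample, and the new ReplaceIndex/Round entries) only attach a precomputed forward_indexes list to the converted op; the actual frame selection happens in the corresponding C++ kernels, which this diff does not show. Purely as an assumed illustration of that selection semantics — output frame i copies input frame forward_indexes[i] — and not the real kernel:

import numpy as np

def select_frames(x, forward_indexes):
    # x: (batch, chunk, dim) -> (batch, len(forward_indexes), dim)
    return x[:, np.asarray(forward_indexes, dtype=np.int64), :]

x = np.arange(24, dtype=np.float32).reshape(1, 8, 3)
print(select_frames(x, [0, 3, 6]))  # e.g. keep every third frame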
@@ -839,56 +845,30 @@ class OnnxConverter(base_converter.ConverterInterface):
         op = self.convert_general_op(node)
         op.type = MaceOp.DynamicLSTM.name

-        if 'prev_out_delay' in node.attrs:
-            prev_out_delay = node.attrs['prev_out_delay']
-            mace_check(prev_out_delay < 0,
-                       "dynamic's prev_out_delay should <= 0.")
-            prev_out_delay_arg = op.arg.add()
-            prev_out_delay_arg.name = 'prev_out_delay'
-            prev_out_delay_arg.i = prev_out_delay
-        if 'prev_cell_delay' in node.attrs:
-            prev_cell_delay = node.attrs['prev_cell_delay']
-            mace_check(prev_cell_delay < 0,
-                       "dynamic's prev_cell_delay should < 0.")
-            prev_cell_delay_arg = op.arg.add()
-            prev_cell_delay_arg.name = 'prev_cell_delay'
-            prev_cell_delay_arg.i = prev_cell_delay
-        if 'prev_out_offset' in node.attrs:
-            prev_out_offset = node.attrs['prev_out_offset']
-            mace_check(prev_out_offset >= 0,
-                       "dynamic's prev_out_offset should >= 0.")
-            prev_out_offset_arg = op.arg.add()
-            prev_out_offset_arg.name = 'prev_out_offset'
-            prev_out_offset_arg.i = prev_out_offset
-        if 'prev_out_dim' in node.attrs:
-            prev_out_dim = node.attrs['prev_out_dim']
-            mace_check(prev_out_dim > 0,
-                       "dynamic's prev_out_dim should > 0.")
-            prev_out_dim_arg = op.arg.add()
-            prev_out_dim_arg.name = 'prev_out_dim'
-            prev_out_dim_arg.i = prev_out_dim
-        if 'prev_cell_dim' in node.attrs:
-            prev_cell_dim = node.attrs['prev_cell_dim']
-            mace_check(prev_cell_dim > 0,
-                       "dynamic's prev_cell_dim should > 0.")
-            prev_cell_dim_arg = op.arg.add()
-            prev_cell_dim_arg.name = 'prev_cell_dim'
-            prev_cell_dim_arg.i = prev_cell_dim
-        if 'bias_a' in node.attrs:
-            bias_a = node.attrs['bias_a']
-            bias_a_arg = op.arg.add()
-            bias_a_arg.name = 'bias_a'
-            bias_a_arg.i = bias_a
-        if 'bias_b' in node.attrs:
-            bias_b = node.attrs['bias_b']
-            bias_b_arg = op.arg.add()
-            bias_b_arg.name = 'bias_b'
-            bias_b_arg.i = bias_b
-        if 'scale' in node.attrs:
-            scale = node.attrs['scale']
-            scale_arg = op.arg.add()
-            scale_arg.name = 'scale'
-            scale_arg.f = scale
+        self.copy_node_attr(op, node, 'prev_out_delay',
+                            AttributeType.INT)
+        self.copy_node_attr(op, node, 'prev_cell_delay',
+                            AttributeType.INT)
+        self.copy_node_attr(op, node, 'prev_out_offset',
+                            AttributeType.INT)
+        self.copy_node_attr(op, node, 'prev_out_dim',
+                            AttributeType.INT)
+        self.copy_node_attr(op, node, 'prev_cell_dim',
+                            AttributeType.INT)
+        self.copy_node_attr(op, node, 'bias_a',
+                            AttributeType.INT)
+        self.copy_node_attr(op, node, 'bias_b',
+                            AttributeType.INT)
+        self.copy_node_attr(op, node, 'scale',
+                            AttributeType.FLOAT)
+        self.copy_node_attr(op, node, 'subsample_factor',
+                            AttributeType.INT, default=1)
+        self.copy_node_attr(op, node, 'cell_cache_indexes',
+                            AttributeType.INTS, default=[])
+        self.copy_node_attr(op, node, 'out_cache_indexes',
+                            AttributeType.INTS, default=[])
+        self.copy_node_attr(op, node, 'forward_indexes',
+                            AttributeType.INTS)

     def convert_clip(self, node):
         # If clip's min value is zero,
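The DynamicLSTM conversion above replaces the per-attribute boilerplate with copy_node_attr calls. The helper itself is outside this diff; judging only from how it is called here (op, node, attribute name, AttributeType, optional default), it plausibly behaves like the hypothetical stand-in below. This sketch assumes the converter's AttributeType enum and the mace_pb2 operator proto are in scope, and the real method in onnx_converter.py may differ:

def copy_node_attr(op, node, name, attr_type, default=None):
    # Copy one ONNX node attribute into a MACE op argument,
    # falling back to 'default' when the attribute is absent.
    if name in node.attrs:
        value = node.attrs[name]
    elif default is not None:
        value = default
    else:
        return None
    arg = op.arg.add()
    arg.name = name
    if attr_type == AttributeType.INT:
        arg.i = int(value)
    elif attr_type == AttributeType.FLOAT:
        arg.f = float(value)
    elif attr_type == AttributeType.INTS:
        arg.ints.extend(value)
    elif attr_type == AttributeType.FLOATS:
        arg.floats.extend(value)
    return value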
@@ -1019,73 +999,8 @@ class OnnxConverter(base_converter.ConverterInterface):
         self.copy_node_attr(op, node, 'include_variance', AttributeType.INT)
         self.copy_node_attr(op, node, 'num_log_count', AttributeType.INT)
         self.copy_node_attr(op, node, 'variance_floor', AttributeType.FLOAT)
-        self.copy_node_attr(op, node, 'input_time_range', AttributeType.INTS)
-        self.copy_node_attr(op, node, 'input_indexes', AttributeType.INTS)
-
-        if 'output_time_range' in node.attrs:
-            output_time_range = node.attrs['output_time_range']
-            mace_check(len(output_time_range) == 2,
-                       "output time range should have two values.")
-            out_start_index = output_time_range[0]
-            out_end_index = output_time_range[1]
-        else:
-            mace_check('start_index' in node.attrs and
-                       'end_index' in node.attrs,
-                       "'start_index' and 'end_index'"
-                       " are required in ExtractPooling.")
-            out_start_index = node.attrs['start_index']
-            out_end_index = node.attrs['end_index']
-            output_time_range = [out_start_index, out_end_index]
-
-        output_time_range_arg = op.arg.add()
-        output_time_range_arg.name = 'output_time_range'
-        output_time_range_arg.ints.extend(output_time_range)
-
-        mace_check('modulus' in node.attrs,
-                   "'modulus' is required in ExtractPooling.")
-        mace_check('output_indexes' in node.attrs,
-                   "'output_indexes' is required in ExtractPooling.")
-        mace_check('counts' in node.attrs,
-                   "'counts' is required in ExtractPooling.")
-        mace_check('forward_indexes' in node.attrs,
-                   "'forward_indexes' is required in ExtractPooling.")
-        modulus = node.attrs['modulus']
-        output_indexes = node.attrs['output_indexes']
-        counts = node.attrs['counts']
-        forward_indexes = node.attrs['forward_indexes']
-
-        mace_check(len(counts) == len(output_indexes) and
-                   len(forward_indexes) == 2 * len(output_indexes),
-                   "output_indexes length:%s "
-                   "counts length:%s "
-                   "forward_indexes length:%s"
-                   % (len(output_indexes), len(counts), len(forward_indexes)))
-
-        new_output_indexes = []
-        new_forward_indexes = []
-        new_counts = []
-        for i in range(len(output_indexes)):
-            if output_indexes[i] + modulus > out_start_index and\
-                    output_indexes[i] <= out_end_index:
-                new_output_indexes.append(output_indexes[i])
-                new_counts.append(counts[i])
-                new_forward_indexes.append(forward_indexes[2 * i])
-                new_forward_indexes.append(forward_indexes[2 * i + 1])
-        modulus_arg = op.arg.add()
-        modulus_arg.name = 'modulus'
-        modulus_arg.i = modulus
-
-        counts_arg = op.arg.add()
-        counts_arg.name = 'counts'
-        counts_arg.floats.extend(new_counts)
-
-        forward_indexes_arg = op.arg.add()
-        forward_indexes_arg.name = 'forward_indexes'
-        forward_indexes_arg.ints.extend(new_forward_indexes)
-
-        output_indexes_arg = op.arg.add()
-        output_indexes_arg.name = 'output_indexes'
-        output_indexes_arg.ints.extend(new_output_indexes)
+        self.copy_node_attr(op, node, 'counts', AttributeType.FLOATS)
+        self.copy_node_attr(op, node, 'forward_indexes', AttributeType.INTS)

     def convert_flatten(self, node):
         op = self.convert_general_op(node)
@@ -1104,19 +1019,14 @@
     def convert_kaldi_batchnorm(self, node):
         op = self.convert_general_op(node)
         op.type = MaceOp.KaldiBatchNorm.name
-        dim = self.copy_node_attr(op, node,
-                                  'dim', AttributeType.INT, -1)
-        block_dim = self.copy_node_attr(op, node,
-                                        'block_dim',
+        dim = self.copy_node_attr(op, node, 'dim', AttributeType.INT, -1)
+        block_dim = self.copy_node_attr(op, node, 'block_dim',
                                         AttributeType.INT, -1)
-        epsilon = self.copy_node_attr(op, node,
-                                      'epsilon',
+        epsilon = self.copy_node_attr(op, node, 'epsilon',
                                       AttributeType.FLOAT, 1e-3)
-        target_rms = self.copy_node_attr(op, node,
-                                         'target_rms',
+        target_rms = self.copy_node_attr(op, node, 'target_rms',
                                          AttributeType.FLOAT, 1.0)
-        test_mode = self.copy_node_attr(op, node,
-                                        'test_mode',
+        test_mode = self.copy_node_attr(op, node, 'test_mode',
                                         AttributeType.INT, 0)

         mace_check(block_dim > 0 and dim % block_dim == 0 and
@@ -1165,8 +1075,7 @@
         scale_name = node.name + 'scale'
         offset_name = node.name + 'offset'
-        scale_value = (
-            (1.0 / np.sqrt(
+        scale_value = ((1.0 / np.sqrt(
             var_value + epsilon_value)) * gamma_value)
         offset_value = (-mean_value * scale_value) + beta_value
         self.add_tensor(scale_name, scale_value.shape, mace_pb2.DT_FLOAT,
@@ -1267,10 +1176,11 @@
         if offset == 0:
             op.type = MaceOp.Identity.name
         else:
-            op.type = MaceOp.Delay.name
-            offset_arg = op.arg.add()
-            offset_arg.name = 'offset'
-            offset_arg.i = node.attrs['offset']
+            op.type = MaceOp.IfDefined.name
+            self.copy_node_attr(op, node, 'forward_indexes',
+                                AttributeType.INTS)
+            self.copy_node_attr(op, node, 'cache_forward_indexes',
+                                AttributeType.INTS)

     def convert_imagescaler(self, node):
         op = self.convert_general_op(node)
@@ -1282,10 +1192,10 @@
         scale_name = node.name + "_scale"
         bias_name = node.name + "_bias"
-        self.add_tensor(scale_name, scale_value.shape, mace_pb2.DT_FLOAT,
-                        scale_value)
-        self.add_tensor(bias_name, bias_value.shape, mace_pb2.DT_FLOAT,
-                        bias_value)
+        self.add_tensor(scale_name, scale_value.shape,
+                        mace_pb2.DT_FLOAT, scale_value)
+        self.add_tensor(bias_name, bias_value.shape,
+                        mace_pb2.DT_FLOAT, bias_value)
         op.input.extend([scale_name, bias_name])

     def convert_lstm(self, node):
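The scale_value/offset_value hunk a few hunks above only reflows the expression; the constant folding itself is unchanged: scale = gamma / sqrt(var + eps) and offset = beta - mean * scale. A quick numerical check (not part of the patch) that this folding reproduces the usual batch-norm inference formula:

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((4, 8))
gamma, beta = rng.standard_normal(8), rng.standard_normal(8)
mean, var = rng.standard_normal(8), rng.random(8) + 0.5
epsilon_value = 1e-3

scale_value = (1.0 / np.sqrt(var + epsilon_value)) * gamma
offset_value = (-mean * scale_value) + beta

reference = gamma * (x - mean) / np.sqrt(var + epsilon_value) + beta
assert np.allclose(x * scale_value + offset_value, reference)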
@@ -1399,6 +1309,12 @@
         keep_dims_arg.name = MaceKeyword.mace_keepdims_str
         keep_dims_arg.i = keep_dims

+    def convert_replaceindex(self, node):
+        op = self.convert_general_op(node)
+        op.type = MaceOp.ReplaceIndex.name
+        self.copy_node_attr(op, node, 'forward_indexes',
+                            AttributeType.INTS)
+
     def convert_reshape(self, node):
         op = self.convert_general_op(node)
         op.type = MaceOp.Reshape.name
@@ -1460,11 +1376,17 @@
             context_arg.ints.extend(context)
         if 'const_component_dim' in node.attrs:
             const_dim = node.attrs['const_component_dim']
-        else:
-            const_dim = 0
-        const_dim_arg = op.arg.add()
-        const_dim_arg.name = 'const_component_dim'
-        const_dim_arg.i = const_dim
+            const_dim_arg = op.arg.add()
+            const_dim_arg.name = 'const_component_dim'
+            const_dim_arg.i = const_dim
+            self.copy_node_attr(op, node,
+                                'forward_const_indexes',
+                                AttributeType.INTS)
+
+        self.copy_node_attr(op, node, 'subsample_factor',
+                            AttributeType.INT, default=1)
+        self.copy_node_attr(op, node, 'forward_indexes',
+                            AttributeType.INTS)

     def convert_split(self, node):
         op = self.convert_general_op(node)
@@ -1516,6 +1438,12 @@
         axis_arg.name = MaceKeyword.mace_axis_str
         axis_arg.ints.extend(axis_value)

+    def convert_subsample(self, node):
+        op = self.convert_general_op(node)
+        op.type = MaceOp.Subsample.name
+        self.copy_node_attr(op, node, 'forward_indexes',
+                            AttributeType.INTS)
+
     def convert_sum_group(self, node):
         op = self.convert_general_op(node)
         op.type = MaceOp.SumGroup.name
@@ -1524,11 +1452,12 @@
         op = self.convert_general_op(node)
         op.type = MaceOp.TargetRMSNorm.name

-        self.copy_node_attr(op, node, 'target_rms', AttributeType.FLOAT)
-        self.copy_node_attr(op, node, 'add_log_stddev', AttributeType.INT,
-                            default=0)
-        self.copy_node_attr(op, node, 'block_dim', AttributeType.INT,
-                            default=0)
+        self.copy_node_attr(op, node, 'target_rms',
+                            AttributeType.FLOAT)
+        self.copy_node_attr(op, node, 'add_log_stddev',
+                            AttributeType.INT, default=0)
+        self.copy_node_attr(op, node, 'block_dim',
+                            AttributeType.INT, default=0)

     def convert_transpose(self, node):
         op = self.convert_general_op(node)