提交 8bc14517 编写于 作者: 叶剑武

Merge branch 'support-ivector-input' into 'master'

support kaldi ivector input and subsample

See merge request !1210
......@@ -14,6 +14,21 @@
// This Op is for Fused-LstmNonlinearityComponent
// with prev cell states as inputs in Kaldi.
// prev_out_delay: is the IfDefined component's delay value.
// It indicates which previous frame's output will
// be used here as an input.
// prev_cell_delay: similar as prev_out_delay.
// prev_out_offset: output offset.
// prev_out_dim: prev output's dim.
// prev_cell_dim: prev cell's dim.
// bias_a: the first affine's bias' flag, 1:has bias; 0:no bias.
// bias_b: similar to bias_a.
// scale: scale value of previous output and cell.
// forward_indexes: contains the indexes of the frames that will be used
// for computation. This is pre-computed in the kaldi-onnx converter.
// cell_cache_indexes: indicates which frame's cell will be cached for next
// computation.
// out_cache_indexes: similar to cell_cache_indexes.
// http://kaldi-asr.org/doc/nnet-combined-component_8h_source.html#l00255
// More details are in docs/development/dynamic_lstm.md
......@@ -50,7 +65,44 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
prev_cell_dim_(Operation::GetOptionalArg<int>("prev_cell_dim", 0)),
has_bias_a_(Operation::GetOptionalArg<int>("bias_a", 1)),
has_bias_b_(Operation::GetOptionalArg<int>("bias_b", 1)),
scale_(Operation::GetOptionalArg<float>("scale", 1.0f)) {}
scale_(Operation::GetOptionalArg<float>("scale", 1.0f)),
subsample_factor_(
Operation::GetOptionalArg<int>("subsample_factor", 1)),
forward_indexes_(
Operation::GetRepeatedArgs<index_t>("forward_indexes")),
cell_cache_indexes_(
Operation::GetRepeatedArgs<index_t>("cell_cache_indexes")),
out_cache_indexes_(
Operation::GetRepeatedArgs<index_t>("out_cache_indexes")) {}
inline void Validate() {
const Tensor *input = this->Input(0);
const unsigned int rank = static_cast<unsigned int>(input->dim_size());
MACE_CHECK(rank >= 2, "DynamicLSTM's input should have at least 2 dims.");
const index_t input_chunk = input->dim(rank - 2);
for (size_t i = 0; i < forward_indexes_.size(); ++i) {
MACE_CHECK(forward_indexes_[i] < input_chunk && forward_indexes_[i] >= 0,
"index is over range.");
}
MACE_CHECK(this->InputSize() >= 6,
"DynamicLSTM should have at least six inputs.",
"But has only ", this->InputSize(), " inputs.");
MACE_CHECK(prev_cell_delay_ < 0 && prev_out_delay_ < 0,
"prev_cell_delay(", prev_cell_delay_,
") and prev_out_delay(", prev_out_delay_,
") should be less than zero.");
MACE_CHECK(prev_cell_delay_ % subsample_factor_ == 0 &&
prev_out_delay_ % subsample_factor_ == 0,
"prev_cell_delay(", prev_cell_delay_,
") and prev_out_delay(", prev_out_delay_,
") should be multiples of subsample_factor(",
subsample_factor_, ").");
MACE_CHECK(prev_out_dim_ > 0 && prev_cell_dim_ > 0,
"prev_out_dim(", prev_out_dim_,
") and prev_cell_dim(", prev_cell_dim_,
") should be greater than zero.");
}
void UpdateCell(float *cell_data,
const index_t cell_dim,
......@@ -65,7 +117,7 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
in_vec = vmulq_f32(in_vec, scale_vec);
vst1q_f32(cell_data + i, in_vec);
#else
for (int j = 0; j < 4; ++j) {
for (index_t j = 0; j < 4; ++j) {
cell_data[i + j] *= scale;
}
#endif
......@@ -92,7 +144,7 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
in_vec = vmulq_f32(in_vec, scale_vec);
vst1q_f32(cell_data + i, in_vec);
#else
for (int j = 0; j < 4; ++j) {
for (index_t j = 0; j < 4; ++j) {
cell_data[i + j] = src_data[i + j] * scale;
}
#endif
......@@ -104,32 +156,26 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
int max_input_num = 4;
MACE_CHECK(this->InputSize() >= max_input_num,
"DynamicLSTM has at least four inputs.");
MACE_CHECK(prev_cell_delay_ < 0 && prev_out_delay_ < 0);
MACE_CHECK(prev_out_dim_ > 0 && prev_cell_dim_ > 0);
Validate();
const Tensor *input = this->Input(INPUT);
const Tensor *prev_out = this->Input(PREV_OUT);
const Tensor *prev_cell = this->Input(PREV_CELL);
const Tensor *weights_a = this->Input(WEIGHTS_A);
const Tensor *lstm_params = this->Input(PARAMS);
const Tensor *weights_b = this->Input(WEIGHTS_B);
if (has_bias_a_) {
max_input_num++;
MACE_CHECK(this->InputSize() >= max_input_num,
"The first affine needs a bias input.");
}
int max_input_num = 6;
max_input_num = has_bias_a_ ? max_input_num + 1 : max_input_num;
MACE_CHECK(this->InputSize() >= max_input_num,
"The first affine needs a bias input.");
const Tensor *bias_a = has_bias_a_ ?
this->Input(max_input_num - 1) :
nullptr;
if (has_bias_b_) {
max_input_num++;
MACE_CHECK(this->InputSize() >= max_input_num,
"The second affine needs a bias input.");
}
max_input_num = has_bias_b_ ? max_input_num + 1 : max_input_num;
MACE_CHECK(this->InputSize() >= max_input_num,
"The second affine needs a bias input.");
const Tensor *bias_b = has_bias_b_ ?
this->Input(max_input_num - 1) :
nullptr;
const index_t input_rank = input->dim_size();
MACE_CHECK(input_rank >= 2,
"Dynamic LSTM Cell's input dim size should be >= 2.");
......@@ -150,12 +196,15 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
const index_t lstm_input_dim = affine_a_out_dim + prev_cell_dim_;
const index_t lstm_cell_dim = lstm_input_dim / 5;
const index_t params_stride = lstm_params->dim(1);
MACE_CHECK(lstm_input_dim == (lstm_cell_dim * 5));
MACE_CHECK(lstm_input_dim == (lstm_cell_dim * 5),
"lstm_input_dim(", lstm_input_dim,
") should be 5 times of lstm_cell_dim(",
lstm_cell_dim, ").");
MACE_CHECK(lstm_params->dim(0) == 3 &&
params_stride == lstm_cell_dim && lstm_cell_dim == prev_cell_dim_)
<< "lstm params rows:" << lstm_params->dim(0)
<< "params_stride:" << params_stride
<< "!=" << "cell_dim:" << lstm_cell_dim << std::endl;
<< " lstm params rows: " << lstm_params->dim(0)
<< " params_stride: " << params_stride
<< " != " << " cell_dim: " << lstm_cell_dim << std::endl;
const index_t affine_b_out_dim = weights_b->dim(0);
const index_t affine_b_depth = weights_b->dim(1);
const index_t affine_b_in_dim = lstm_cell_dim;
......@@ -164,7 +213,10 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
<< "!=" << "affine_b's weights' depth:" << affine_b_depth << std::endl;
const index_t output_dim = affine_b_out_dim;
MACE_CHECK(prev_out_offset_ + prev_out_dim_ <= output_dim);
MACE_CHECK(prev_out_offset_ + prev_out_dim_ <= output_dim)
<< " prev_out_offset: " << prev_out_offset_
<< " prev_out_dim: " << prev_out_dim_
<< " output_dim: " << output_dim;
const index_t affine_a_in_size =
PadAlignSize(affine_a_in_dim * sizeof(float));
......@@ -175,8 +227,8 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
const index_t affine_b_out_size =
PadAlignSize(affine_b_out_dim * sizeof(float));
const int out_buf_chunk = abs(prev_out_delay_);
const int cell_buf_chunk = abs(prev_cell_delay_);
const int out_buf_chunk = abs(prev_out_delay_ / subsample_factor_);
const int cell_buf_chunk = abs(prev_cell_delay_ / subsample_factor_);
const index_t out_buf_size =
PadAlignSize(out_buf_chunk * prev_out_dim_ * sizeof(float));
const index_t cell_buf_size =
......@@ -187,13 +239,13 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
+ affine_b_in_size + affine_b_out_size
+ out_buf_size + cell_buf_size);
Tensor prev_out(scratch->Scratch(out_buf_size), DT_FLOAT);
prev_out.Reshape({out_buf_chunk, prev_out_dim_});
float *prev_out_data = prev_out.mutable_data<float>();
Tensor prev_out_buf(scratch->Scratch(out_buf_size), DT_FLOAT);
prev_out_buf.Reshape({out_buf_chunk, prev_out_dim_});
float *prev_out_buf_data = prev_out_buf.mutable_data<float>();
Tensor prev_cell(scratch->Scratch(cell_buf_size), DT_FLOAT);
prev_cell.Reshape({cell_buf_chunk, prev_cell_dim_});
float *prev_cell_data = prev_cell.mutable_data<float>();
Tensor prev_cell_buf(scratch->Scratch(cell_buf_size), DT_FLOAT);
prev_cell_buf.Reshape({cell_buf_chunk, prev_cell_dim_});
float *prev_cell_buf_data = prev_cell_buf.mutable_data<float>();
Tensor affine_a_in(scratch->Scratch(affine_a_in_size), DT_FLOAT);
affine_a_in.Reshape({1, affine_a_in_dim});
......@@ -212,38 +264,57 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
float *affine_b_out_data = affine_b_out.mutable_data<float>();
Tensor *output = this->Output(OUTPUT);
Tensor *out_cache = this->Output(OUT_CACHE);
Tensor *cell_cache = this->Output(CELL_CACHE);
std::vector<index_t> output_shape = input->shape();
const index_t out_chunk = forward_indexes_.size();
output_shape[input_rank - 1] = output_dim;
std::vector<index_t> prev_out_shape = input->shape();
prev_out_shape[input_rank - 1] = prev_out_dim_;
prev_out_shape[input_rank - 2] = out_buf_chunk;
std::vector<index_t> prev_cell_shape = input->shape();
prev_cell_shape[input_rank - 1] = prev_cell_dim_;
prev_cell_shape[input_rank - 2] = cell_buf_chunk;
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
MACE_RETURN_IF_ERROR(out_cache->Resize(prev_out_shape));
MACE_RETURN_IF_ERROR(cell_cache->Resize(prev_cell_shape));
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard prev_out_guard(prev_out);
Tensor::MappingGuard prev_cell_guard(prev_cell);
Tensor::MappingGuard lstm_params_guard(lstm_params);
Tensor::MappingGuard output_guard(output);
Tensor::MappingGuard out_cache_guard(out_cache);
Tensor::MappingGuard cell_cache_guard(cell_cache);
const float *input_data = input->data<float>();
const float *prev_out_data = prev_out->data<float>();
const float *prev_cell_data = prev_cell->data<float>();
const float *lstm_params_data = lstm_params->data<float>();
float *output_data = output->mutable_data<float>();
float *out_cache_data = out_cache->mutable_data<float>();
float *cell_cache_data = cell_cache->mutable_data<float>();
for (int b = 0; b < batch; ++b) {
int prev_out_idx = prev_out_delay_;
int prev_cell_idx = prev_cell_delay_;
prev_cell.Clear();
prev_out.Clear();
affine_a_in.Clear();
affine_a_out.Clear();
affine_b_in.Clear();
affine_b_out.Clear();
for (int i = 0; i < chunk; ++i) {
const float *input_ptr = input_data + (b * chunk + i) * input_dim;
float *output_ptr = output_data + (b * chunk + i) * output_dim;
memcpy(prev_out_buf_data,
prev_out_data + b * out_buf_chunk * prev_out_dim_,
sizeof(float) * out_buf_chunk * prev_out_dim_);
memcpy(prev_cell_buf_data,
prev_cell_data + b * cell_buf_chunk * prev_cell_dim_,
sizeof(float) * cell_buf_chunk * prev_cell_dim_);
for (index_t i = 0; i < out_chunk; ++i) {
const float *input_ptr =
input_data + (b * chunk + forward_indexes_[i]) * input_dim;
float *output_ptr = output_data + (b * out_chunk + i) * output_dim;
// Append
memcpy(affine_a_in_data, input_ptr, input_dim * sizeof(float));
if (prev_out_idx >= 0) {
memcpy(affine_a_in_data + input_dim,
prev_out_data + prev_out_idx % out_buf_chunk * prev_out_dim_,
prev_out_dim_ * sizeof(float));
}
memcpy(affine_a_in_data + input_dim,
prev_out_buf_data + i % out_buf_chunk * prev_out_dim_,
prev_out_dim_ * sizeof(float));
// Affine
gemv_.Compute(context,
weights_a,
......@@ -256,15 +327,13 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
false,
&affine_a_out);
// Prepare LSTMNonlinear input and output pointer
float *prev_cell_ptr =
prev_cell_idx < 0 ? nullptr :
prev_cell_data + prev_cell_idx % cell_buf_chunk * prev_cell_dim_;
float *curr_cell_ptr =
prev_cell_data + i % cell_buf_chunk * prev_cell_dim_;
float *lstm_cell_ptr =
prev_cell_buf_data + i % cell_buf_chunk * prev_cell_dim_;
float *curr_cell_ptr = lstm_cell_ptr;
// LSTMNonlinear
LSTMNonlinearKernel(context,
affine_a_out_data,
prev_cell_ptr,
lstm_cell_ptr,
nullptr,
lstm_params_data,
false,
......@@ -289,16 +358,36 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
affine_b_out_data,
output_dim * sizeof(float));
// Update
float *curr_out_ptr = prev_out_data + i % out_buf_chunk * prev_out_dim_;
float *curr_out_ptr =
prev_out_buf_data + i % out_buf_chunk * prev_out_dim_;
CopyAndUpdateCell(affine_b_out_data + prev_out_offset_,
prev_out_dim_,
scale_,
curr_out_ptr);
prev_out_idx++;
prev_cell_idx++;
for (size_t k = 0; k < out_cache_indexes_.size(); ++k) {
if (i == out_cache_indexes_[k]) {
const index_t idx = b * out_buf_chunk + k;
float *out_cache_ptr =
out_cache_data + idx * prev_out_dim_;
memcpy(out_cache_ptr,
curr_out_ptr,
sizeof(float) * prev_out_dim_);
}
}
for (size_t k = 0; k < cell_cache_indexes_.size(); ++k) {
if (i == cell_cache_indexes_[k]) {
const index_t idx = b * cell_buf_chunk + k;
float *cell_cache_ptr =
cell_cache_data + idx * prev_cell_dim_;
memcpy(cell_cache_ptr,
curr_cell_ptr,
sizeof(float) * prev_cell_dim_);
}
}
}
}
return MaceStatus::MACE_SUCCESS;
}
......@@ -311,6 +400,10 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
int has_bias_a_;
int has_bias_b_;
float scale_;
int subsample_factor_;
std::vector<index_t> forward_indexes_;
std::vector<index_t> cell_cache_indexes_;
std::vector<index_t> out_cache_indexes_;
#ifdef MACE_ENABLE_NEON
arm::fp32::Gemv gemv_;
......@@ -318,8 +411,8 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
ref::Gemv<float> gemv_;
#endif // MACE_ENABLE_NEON
MACE_OP_INPUT_TAGS(INPUT, WEIGHTS_A, PARAMS, WEIGHTS_B);
MACE_OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT, PREV_OUT, PREV_CELL, WEIGHTS_A, PARAMS, WEIGHTS_B);
MACE_OP_OUTPUT_TAGS(OUTPUT, OUT_CACHE, CELL_CACHE);
};
void RegisterDynamicLSTM(OpRegistryBase *op_registry) {
......
......@@ -12,17 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// This Op is for fused StatisticsExtraction, StatisticsPooling and
// Round Components in Kaldi.
// This Op is for fused StatisticsExtraction and StatisticsPooling
// Components in Kaldi.
// This op is used to extract moving-average mean and standard-deviation
// statistics of input data.
// 'input_indexes' indicates which frames will be used for extract statistics.
// 'output_indexes' indicates which frames of outputs will be used to
// 'forward_indexes' indicates which frames of input will be used for
// extraction.
// save statistics results.
// 'modulus' will be used for extent results to all frames.
// 'start_index' and 'end_index' indicate time indexes of output frames.
// 'forward_indexes' and 'count' were from precomputed index in kaldi.
// Reference to
// Reference to tools/extract_pooling.py and
// http://kaldi-asr.org/doc/nnet-general-component_8h_source.html#l00158
#include <functional>
......@@ -42,7 +40,6 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
public:
explicit ExtractPoolingOp(OpConstructContext *context)
: Operation(context),
modulus_(Operation::GetOptionalArg<int>("modulus", 1)),
include_variance_(
static_cast<bool>(
Operation::GetOptionalArg<int>("include_variance", 0))),
......@@ -50,39 +47,36 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
Operation::GetOptionalArg<int>("num_log_count", 0)),
variance_floor_(
Operation::GetOptionalArg<float>("variance_floor", 1.0e-10)),
input_indexes_(Operation::GetRepeatedArgs<int>("input_indexes")),
output_indexes_(Operation::GetRepeatedArgs<int>("output_indexes")),
forward_indexes_(Operation::GetRepeatedArgs<int>("forward_indexes")),
counts_(Operation::GetRepeatedArgs<float>("counts")),
input_time_range_(Operation::GetRepeatedArgs<int>("input_time_range")),
output_time_range_(
Operation::GetRepeatedArgs<int>("output_time_range")) {}
counts_(Operation::GetRepeatedArgs<float>("counts")) {}
inline void Validate() {
const Tensor *input = this->Input(0);
const unsigned int rank = static_cast<unsigned int>(input->dim_size());
MACE_CHECK(rank >= 2,
"ExtractPooling only supports input dim size >= 2");
MACE_CHECK(counts_.size() * 2 == forward_indexes_.size(),
"counts length(", counts_.size(),
") should be 2 times of forward_indexes length(",
forward_indexes_.size(), ").");
for (size_t i = 0; i < counts_.size(); ++i) {
MACE_CHECK(static_cast<index_t>(counts_[i]) ==
forward_indexes_[2 * i + 1] - forward_indexes_[2 * i],
"invalid forward indexes and counts values");
}
}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
Validate();
const std::vector<index_t> &input_shape = input->shape();
const index_t dim_size = input_shape.size();
MACE_CHECK(dim_size >= 2,
"ExtractPooling only supports input dim size >= 2");
MACE_CHECK(modulus_ >= 1,
"ExtractPooling's pooling size should be greater than zero.");
MACE_CHECK(input_time_range_.size() == 2 && output_time_range_.size() == 2
&& counts_.size() * 2 == forward_indexes_.size()
&& counts_.size() == output_indexes_.size());
int in_start_index = input_time_range_[0];
int out_start_index = output_time_range_[0];
int out_end_index = output_time_range_[1];
MACE_CHECK(out_end_index >= out_start_index
&& input_time_range_[1] >= input_time_range_[0],
"end index should be greater than start index.");
const index_t output_chunk = out_end_index - out_start_index + 1;
const unsigned int dim_size = static_cast<unsigned int>(input->dim_size());
const index_t input_dim = input_shape[dim_size - 1];
const index_t chunk = input_shape[dim_size - 2];
MACE_CHECK(chunk == input_time_range_[1] - input_time_range_[0] + 1,
"input chunk should be equal to end - start + 1.");
const index_t output_chunk = counts_.size();
const index_t batch =
std::accumulate(input_shape.begin(), input_shape.end() - 2, 1,
std::multiplies<index_t>());
......@@ -94,10 +88,6 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
output_shape[dim_size - 2] = output_chunk;
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
const index_t num_input_indexes = input_indexes_.size();
const index_t num_output_indexes = output_indexes_.size();
MACE_CHECK(num_input_indexes > 0 && num_output_indexes > 0,
"ExtractPooling's input_indexes or output_indexes is empty.");
const index_t extract_out_size = PadAlignSize(output_dim * sizeof(float));
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
......@@ -117,7 +107,7 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
&thread_pool = context->device()->cpu_runtime()->thread_pool();
for (index_t b = 0; b < batch; ++b) {
for (index_t i = 0; i < num_output_indexes; ++i) {
for (index_t i = 0; i < output_chunk; ++i) {
int start = forward_indexes_[2 * i];
int end = forward_indexes_[2 * i + 1];
float count = counts_[i];
......@@ -139,7 +129,7 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
float variance = 0.f;
for (int t = start; t < end; ++t) {
index_t input_index =
(b * chunk + input_indexes_[t] - in_start_index)
(b * chunk + t)
* input_dim;
float x = input_data[input_index + d];
mean += x;
......@@ -163,30 +153,15 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
float mean = 0.f;
for (int t = start; t < end; ++t) {
index_t input_index =
(b * chunk + input_indexes_[t] - in_start_index)
* input_dim;
(b * chunk + t) * input_dim;
mean += input_data[input_index + d];
}
extract_out_data[d + num_log_count_] = mean * mean_scale;
}
}, 0, input_dim, 1);
}
int output_start = output_indexes_[i] < out_start_index ?
out_start_index : output_indexes_[i];
int output_end = output_indexes_[i] + modulus_;
output_end = output_end > out_end_index ?
out_end_index + 1 :
output_end;
thread_pool.Compute1D([=](index_t start0,
index_t end0,
index_t step0) {
for (index_t idx = start0; idx < end0; idx += step0) {
memcpy(output_data + (b * output_chunk + idx - out_start_index)
* output_dim,
extract_out_data, output_dim * sizeof(float));
}
}, output_start, output_end, 1);
memcpy(output_data + (b * output_chunk + i) * output_dim,
extract_out_data, output_dim * sizeof(float));
}
}
......@@ -194,16 +169,11 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
}
private:
int modulus_;
bool include_variance_;
int num_log_count_;
float variance_floor_;
std::vector<int> input_indexes_;
std::vector<int> output_indexes_;
std::vector<int> forward_indexes_;
std::vector<float> counts_;
std::vector<int> input_time_range_;
std::vector<int> output_time_range_;
};
void RegisterExtractPooling(OpRegistryBase *op_registry) {
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This Op is for IfDefined descriptor in Kaldi.
// It defines time offset.
// If time index <= offset, using zeros as output.
// forward_indexes: indicates which frames will be used for computation.
// Because of the model's subsampling, this is pre-computed
// in kaldi-onnx.
// cache_forward_indexes: indicates which frames of cached previous output
// will be used here. If there is only one input,
// this parameter will be empty.
#include <functional>
#include <memory>
#include "mace/core/operator.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class IfDefinedOp;

// CPU kernel for Kaldi's IfDefined descriptor.
//
// The output has the same shape as the first input and starts zero-filled.
// For output frame j, forward_indexes_[j] names the frame of the first input
// that is copied into it; a negative entry leaves the frame as zeros.
// When a second (cache) input is given, cache_forward_indexes_[j] names the
// frame of the cache input that is copied into output frame j.
template <typename T>
class IfDefinedOp<DeviceType::CPU, T> : public Operation {
 public:
  explicit IfDefinedOp(OpConstructContext *context)
      : Operation(context),
        forward_indexes_(
            Operation::GetRepeatedArgs<index_t>("forward_indexes")),
        cache_forward_indexes_(
            Operation::GetRepeatedArgs<index_t>("cache_forward_indexes")) {}

  // Checks input count, ranks, shapes and index ranges; every failed
  // condition aborts through MACE_CHECK.
  inline void Validate() {
    MACE_CHECK(this->InputSize() <= 2,
               "IfDefined Op should have at most 2 inputs.");
    const Tensor *input = this->Input(INPUT);
    const unsigned int rank = static_cast<unsigned int>(input->dim_size());
    MACE_CHECK(rank >= 2, "IfDefined's input should have at least 2 dims.");
    const index_t input_chunk = input->dim(rank - 2);
    // The output is resized to the input's shape and DelayCopy() writes
    // output frame j for every j < forward_indexes_.size(); more indexes
    // than frames would write past the end of the output buffer.
    MACE_CHECK(static_cast<index_t>(forward_indexes_.size()) <= input_chunk,
               "IfDefined's forward indexes size:", forward_indexes_.size(),
               " should not be greater than input chunk:", input_chunk);
    // Negative forward indexes are allowed: they mark frames left as zeros.
    for (size_t i = 0; i < forward_indexes_.size(); ++i) {
      MACE_CHECK(forward_indexes_[i] < input_chunk,
                 "forward index is over range.");
    }
    for (size_t i = 0; i < cache_forward_indexes_.size(); ++i) {
      MACE_CHECK(cache_forward_indexes_[i] < input_chunk &&
                     cache_forward_indexes_[i] >= 0,
                 "cache forward index is over range.");
    }
    if (this->InputSize() == 2) {
      // The cache fills exactly the leading frames that the main input
      // cannot provide, i.e. the leading negative forward indexes.
      size_t cache_count = 0;
      for (size_t i = 0; i < forward_indexes_.size(); ++i) {
        if (forward_indexes_[i] < 0)
          cache_count++;
        else
          break;
      }
      MACE_CHECK(cache_forward_indexes_.size() == cache_count,
                 "IfDefined's cache forward index size:",
                 cache_forward_indexes_.size(),
                 " != forward indexes' negative part length:",
                 cache_count);
      const Tensor *cache_input = this->Input(CACHE_INPUT);
      MACE_CHECK(cache_input->dim_size() == input->dim_size(),
                 "two inputs should have the same rank");
      for (unsigned int k = 0; k < rank; ++k) {
        MACE_CHECK(input->dim(k) == cache_input->dim(k),
                   "Two inputs should have the same shape");
      }
    }
  }

  // Copies, for every batch i and every j < fwd_idxs.size() with
  // fwd_idxs[j] >= 0, frame fwd_idxs[j] of input_data into frame j of
  // output_data. Both buffers are laid out as [batch, chunk, dim].
  void DelayCopy(OpContext *context,
                 const T *input_data,
                 const index_t batch,
                 const index_t chunk,
                 const index_t dim,
                 const std::vector<index_t> &fwd_idxs,
                 T *output_data) {
    // Capture a raw pointer instead of the vector itself: a by-copy capture
    // of the reference parameter would duplicate the whole vector per call.
    const index_t *fwd = fwd_idxs.data();
    const index_t num_frames = static_cast<index_t>(fwd_idxs.size());
    utils::ThreadPool
        &thread_pool = context->device()->cpu_runtime()->thread_pool();
    thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
                              index_t start1, index_t end1, index_t step1) {
      for (index_t i = start0; i < end0; i += step0) {
        for (index_t j = start1; j < end1; j += step1) {
          const index_t src_frame = fwd[j];
          if (src_frame >= 0) {
            memcpy(output_data + (i * chunk + j) * dim,
                   input_data + (i * chunk + src_frame) * dim,
                   dim * sizeof(T));
          }
        }
      }
    }, 0, batch, 1, 0, num_frames, 1);
  }

  MaceStatus Run(OpContext *context) override {
    const Tensor *input = this->Input(INPUT);
    Tensor *output = this->Output(OUTPUT);
    Validate();

    const index_t rank = input->dim_size();
    const std::vector<index_t> &input_shape = input->shape();
    // All leading dimensions are folded into a single batch dimension.
    const index_t batch =
        std::accumulate(input_shape.begin(), input_shape.end() - 2, 1,
                        std::multiplies<index_t>());
    const index_t chunk = input_shape[rank - 2];
    const index_t dim = input_shape[rank - 1];

    MACE_RETURN_IF_ERROR(output->Resize(input_shape));
    // Frames that no index maps to must read as zeros.
    output->Clear();

    Tensor::MappingGuard input_guard(input);
    Tensor::MappingGuard output_guard(output);
    const T *input_data = input->data<T>();
    T *output_data = output->mutable_data<T>();

    DelayCopy(context,
              input_data,
              batch,
              chunk,
              dim,
              forward_indexes_,
              output_data);

    if (this->InputSize() == 2 && cache_forward_indexes_.size() > 0) {
      const Tensor *cache_input = this->Input(CACHE_INPUT);
      Tensor::MappingGuard cache_input_guard(cache_input);
      const T *cache_input_data = cache_input->data<T>();
      DelayCopy(context,
                cache_input_data,
                batch,
                chunk,
                dim,
                cache_forward_indexes_,
                output_data);
    }

    return MaceStatus::MACE_SUCCESS;
  }

 private:
  std::vector<index_t> forward_indexes_;
  std::vector<index_t> cache_forward_indexes_;

  MACE_OP_INPUT_TAGS(INPUT, CACHE_INPUT);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
// Registers IfDefinedOp for DeviceType::CPU with float data under the op
// name "IfDefined" in the given registry.
void RegisterIfDefined(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "IfDefined", IfDefinedOp,
DeviceType::CPU, float);
}
} // namespace ops
} // namespace mace
......@@ -42,7 +42,7 @@ extern void RegisterFill(OpRegistryBase *op_registry);
extern void RegisterFullyConnected(OpRegistryBase *op_registry);
extern void RegisterGather(OpRegistryBase *op_registry);
extern void RegisterIdentity(OpRegistryBase *op_registry);
extern void RegisterDelay(OpRegistryBase *op_registry);
extern void RegisterIfDefined(OpRegistryBase *op_registry);
extern void RegisterInferConv2dShape(OpRegistryBase *op_registry);
extern void RegisterKaldiBatchNorm(OpRegistryBase *op_registry);
extern void RegisterLocalResponseNorm(OpRegistryBase *op_registry);
......@@ -56,6 +56,7 @@ extern void RegisterPadContext(OpRegistryBase *op_registry);
extern void RegisterPNorm(OpRegistryBase *op_registry);
extern void RegisterPooling(OpRegistryBase *op_registry);
extern void RegisterReduce(OpRegistryBase *op_registry);
extern void RegisterReplaceIndex(OpRegistryBase *op_registry);
extern void RegisterPriorBox(OpRegistryBase *op_registry);
extern void RegisterReshape(OpRegistryBase *op_registry);
extern void RegisterResizeBicubic(OpRegistryBase *op_registry);
......@@ -74,6 +75,7 @@ extern void RegisterSqrDiffMean(OpRegistryBase *op_registry);
extern void RegisterSqueeze(OpRegistryBase *op_registry);
extern void RegisterStack(OpRegistryBase *op_registry);
extern void RegisterStridedSlice(OpRegistryBase *op_registry);
extern void RegisterSubsample(OpRegistryBase *op_registry);
extern void RegisterSumGroup(OpRegistryBase *op_registry);
extern void RegisterTargetRMSNorm(OpRegistryBase *op_registry);
extern void RegisterTile(OpRegistryBase *op_registry);
......@@ -119,7 +121,7 @@ OpRegistry::OpRegistry() : OpRegistryBase() {
ops::RegisterFullyConnected(this);
ops::RegisterGather(this);
ops::RegisterIdentity(this);
ops::RegisterDelay(this);
ops::RegisterIfDefined(this);
ops::RegisterInferConv2dShape(this);
ops::RegisterKaldiBatchNorm(this);
ops::RegisterLocalResponseNorm(this);
......@@ -133,6 +135,7 @@ OpRegistry::OpRegistry() : OpRegistryBase() {
ops::RegisterPNorm(this);
ops::RegisterPooling(this);
ops::RegisterReduce(this);
ops::RegisterReplaceIndex(this);
ops::RegisterPriorBox(this);
ops::RegisterReshape(this);
ops::RegisterResizeBicubic(this);
......@@ -151,6 +154,7 @@ OpRegistry::OpRegistry() : OpRegistryBase() {
ops::RegisterStridedSlice(this);
ops::RegisterSqrDiffMean(this);
ops::RegisterSqueeze(this);
ops::RegisterSubsample(this);
ops::RegisterSumGroup(this);
ops::RegisterTargetRMSNorm(this);
ops::RegisterTile(this);
......
......@@ -12,9 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// This Op is for IfDefined descriptor in Kaldi.
// It defines time offset.
// If time index <= offset, using zeros as output.
// This Op is for ReplaceIndex in Kaldi.
// Usually used for ivector inputs.
// It copies ivector to each frame of the output.
// forward_indexes: is the pre-computed indexes for output frames.
#include <functional>
#include <memory>
......@@ -24,62 +25,77 @@
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class DelayOp;
template<DeviceType D, typename T>
class ReplaceIndexOp;
template <typename T>
class DelayOp<DeviceType::CPU, T> : public Operation {
template<typename T>
class ReplaceIndexOp<DeviceType::CPU, T> : public Operation {
public:
explicit DelayOp(OpConstructContext *context)
explicit ReplaceIndexOp(OpConstructContext *context)
: Operation(context),
offset_(Operation::GetOptionalArg<int>("offset", 0)) {}
forward_indexes_(
Operation::GetRepeatedArgs<index_t>("forward_indexes")) {}
// Checks that the input has at least two dims and that every forward index
// selects an existing frame of the input (0 <= index < input chunk).
// Every failed condition goes through MACE_CHECK.
inline void Validate() {
const Tensor *input = this->Input(0);
const unsigned int rank = static_cast<unsigned int>(input->dim_size());
MACE_CHECK(rank >= 2, "ReplaceIndex's input should have at least 2 dims.");
// The second-to-last dim is the number of frames available in the input.
const index_t input_chunk = input->dim(rank - 2);
for (size_t i = 0; i < forward_indexes_.size(); ++i) {
MACE_CHECK(forward_indexes_[i] < input_chunk && forward_indexes_[i] >= 0 ,
"index is over range.");
}
}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
MACE_CHECK(offset_ < 0, "offset param should be negative.");
index_t rank = input->dim_size();
MACE_CHECK(rank >= 2, "input's rank should >= 2.");
Validate();
const std::vector<index_t> &input_shape = input->shape();
const index_t batch =
std::accumulate(input_shape.begin(), input_shape.end() - 2, 1,
std::accumulate(input->shape().begin(), input->shape().end() - 2, 1,
std::multiplies<index_t>());
const index_t chunk = input_shape[rank - 2];
const index_t rank = input->dim_size();
const index_t num_ivectors = input_shape[rank - 2];
const index_t dim = input_shape[rank - 1];
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
output->Clear();
const index_t input_stride = num_ivectors * dim;
const index_t out_chunk = forward_indexes_.size();
const index_t output_stride = out_chunk * dim;
if (chunk <= -offset_)
return MaceStatus::MACE_SUCCESS;
std::vector<index_t> output_shape = input->shape();
output_shape[rank - 2] = out_chunk;
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) {
for (index_t i = start0; i < end0; i += step0) {
for (index_t j = start1; j < end1; j += step1) {
memcpy(output_data + (i * chunk + j - offset_) * dim,
input_data + (i * chunk + j) * dim,
for (index_t b = start0; b < end0; b += step0) {
for (index_t i = start1; i < end1; i += step1) {
memcpy(output_data + b * output_stride + i * dim,
input_data + b * input_stride + forward_indexes_[i] * dim,
dim * sizeof(T));
}
}
}, 0, batch, 1, 0, chunk + offset_, 1);
}, 0, batch, 1, 0, out_chunk, 1);
return MaceStatus::MACE_SUCCESS;
}
private:
int offset_;
std::vector<index_t> forward_indexes_;
};
void RegisterDelay(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Delay", DelayOp,
void RegisterReplaceIndex(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "ReplaceIndex", ReplaceIndexOp,
DeviceType::CPU, float);
}
......
......@@ -22,6 +22,9 @@
// if const_component_dim_ != 0, const_dim_ will be used to determine which
// row of "in" we copy the last part of each row of "out" from (this part is
// not subject to splicing, it's assumed constant for each frame of "input".
// forward_indexes and forward_const_indexes indicate which frames will
// be used for computation. They are precomputed in the kaldi-onnx converter
// because subsampling is supported.
#include <functional>
#include <memory>
......@@ -40,21 +43,45 @@ class SpliceOp<DeviceType::CPU, T> : public Operation {
public:
explicit SpliceOp(OpConstructContext *context)
: Operation(context),
context_(Operation::GetRepeatedArgs<int>("context")),
context_(Operation::GetRepeatedArgs<index_t>("context")),
const_dim_(
Operation::GetOptionalArg<int>("const_component_dim", 0)) {}
Operation::GetOptionalArg<int>("const_component_dim", 0)),
forward_indexes_(
Operation::GetRepeatedArgs<index_t>("forward_indexes")),
forward_const_indexes_(
Operation::GetRepeatedArgs<index_t>("forward_const_indexes")) {}
inline void Validate() {
MACE_CHECK(context_.size() > 0)
<< "The context param should not be empty in Splice Op.";
MACE_CHECK(forward_indexes_.size() % context_.size() == 0,
"Splice's forward indexes should be multiply of num splice.");
const Tensor *input = this->Input(0);
const unsigned int rank = static_cast<unsigned int>(input->dim_size());
MACE_CHECK(rank >= 2, "Splice's input should have at least 2 dims.");
MACE_CHECK(input->dim(rank - 1) > const_dim_,
"input dim:", input->dim(rank - 1),
"should be greater than const dim:", const_dim_);
const index_t input_chunk = input->dim(rank - 2);
for (size_t i = 0; i < forward_indexes_.size(); ++i) {
MACE_CHECK(forward_indexes_[i] < input_chunk && forward_indexes_[i] >= 0)
<< " forward index:" << forward_indexes_[i] << " input shape:"
<< input->dim(0) << "," << input->dim(1) << "," << input->dim(2);
}
for (size_t i = 0; i < forward_const_indexes_.size(); ++i) {
MACE_CHECK(forward_const_indexes_[i] < input_chunk &&
forward_const_indexes_[i] >= 0 ,
"index is over range.");
}
}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
const Tensor *input = this->Input(0);
MACE_CHECK(context_.size() > 0)
<< "The context param should not be empty in Splice Op.";
MACE_CHECK(input->dim_size() >= 2)
<< "Splice's input's rank should be greater than 2.";
Tensor *output = this->Output(0);
Validate();
const std::vector<index_t> &input_shape = input->shape();
const index_t batch =
std::accumulate(input->shape().begin(), input->shape().end() - 2, 1,
std::multiplies<index_t>());
......@@ -65,14 +92,10 @@ class SpliceOp<DeviceType::CPU, T> : public Operation {
const index_t num_splice = static_cast<index_t>(context_.size());
const index_t dim = input_dim - const_dim_;
const index_t left_context = context_[0];
const index_t right_context = context_[num_splice -1];
const index_t out_chunk = chunk - (right_context - left_context);
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
MACE_CHECK(input_dim > const_dim_,
"input dim:", input_dim,
"should be greater than const dim:", const_dim_);
const index_t out_chunk = forward_indexes_.size() / num_splice;
const index_t output_dim = dim * num_splice + const_dim_;
const index_t output_stride = out_chunk * output_dim;
......@@ -86,38 +109,48 @@ class SpliceOp<DeviceType::CPU, T> : public Operation {
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
for (int b = 0; b < batch; ++b) {
for (index_t i = 0; i < out_chunk; ++i) {
for (index_t c = 0; c < num_splice; ++c) {
const index_t offset = i + context_[c] - left_context;
T *output_base =
output_data + b * output_stride + i * output_dim + c * dim;
const T *input_base =
input_data + b * input_stride + offset * input_dim;
memcpy(output_base, input_base, dim * sizeof(T));
thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1,
index_t start2, index_t end2, index_t step2) {
for (index_t b = start0; b < end0; b += step0) {
for (index_t i = start1; i < end1; i += step1) {
for (index_t c = start2; c < end2; c += step2) {
const index_t pos = forward_indexes_[i * num_splice + c];
T *output_base =
output_data + b * output_stride + i * output_dim + c * dim;
const T *input_base =
input_data + b * input_stride + pos * input_dim;
memcpy(output_base, input_base, dim * sizeof(T));
}
}
}
}
}, 0, batch, 1, 0, out_chunk, 1, 0, num_splice, 1);
if (const_dim_ > 0) {
const index_t output_offset = output_dim - const_dim_;
const index_t input_offset = dim;
for (int b = 0; b < batch; ++b) {
for (index_t i = 0; i < out_chunk; ++i) {
T *output_base = output_data + b * output_stride + i * output_dim;
const T *input_base = input_data + b * input_stride + i * input_dim;
memcpy(output_base + output_offset,
input_base + input_offset,
const_dim_ * sizeof(T));
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) {
for (index_t i = start1; i < end1; i += step1) {
T *output_base = output_data + b * output_stride +
i * output_dim + output_offset;
const T *input_base =
input_data + b * input_stride +
forward_const_indexes_[i] * input_dim + dim;
memcpy(output_base, input_base,
const_dim_ * sizeof(T));
}
}
}
}, 0, batch, 1, 0, out_chunk, 1);
}
return MaceStatus::MACE_SUCCESS;
}
private:
std::vector<int> context_;
std::vector<index_t> context_;
int const_dim_;
std::vector<index_t> forward_indexes_;
std::vector<index_t> forward_const_indexes_;
};
void RegisterSplice(OpRegistryBase *op_registry) {
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This Op subsamples frames for Kaldi model inference.
// forward_indexes: indicates which frames will be selected as output.
#include <functional>
#include <memory>
#include "mace/core/operator.h"
#include "mace/utils/math.h"
namespace mace {
namespace ops {
template<DeviceType D, typename T>
class SubsampleOp;

// CPU implementation of the Subsample op.
//
// The input is treated as [..., chunk, dim]: all leading dimensions are
// folded into a single batch dimension. For every batch entry, output frame
// `i` is a copy of input frame `forward_indexes[i]`, so the output shape is
// the input shape with the second-to-last dimension replaced by
// forward_indexes.size().
template<typename T>
class SubsampleOp<DeviceType::CPU, T> : public Operation {
 public:
  // Reads the repeated "forward_indexes" argument from the op definition.
  explicit SubsampleOp(OpConstructContext *context)
      : Operation(context),
        forward_indexes_(
            Operation::GetRepeatedArgs<index_t>("forward_indexes")) {}

  // Checks that the input has at least two dims and that every forward index
  // addresses an existing frame of the input (0 <= index < chunk).
  inline void Validate() {
    const Tensor *input = this->Input(0);
    const unsigned int rank =
        static_cast<unsigned int>(input->dim_size());
    MACE_CHECK(rank >= 2,
               "Subsample's input should have at least 2 dims.");
    const index_t input_chunk = input->dim(rank - 2);
    for (size_t i = 0; i < forward_indexes_.size(); ++i) {
      MACE_CHECK(forward_indexes_[i] < input_chunk &&
                 forward_indexes_[i] >= 0 ,
                 "index is over range.");
    }
  }

  // Resizes the output to [..., forward_indexes.size(), dim] and copies the
  // selected frames, parallelised over (batch, output frame).
  MaceStatus Run(OpContext *context) override {
    const Tensor *input = this->Input(0);
    Tensor *output = this->Output(0);
    Validate();

    const std::vector<index_t> &input_shape = input->shape();
    // The init value fixes the accumulator type of std::accumulate, so it
    // must be index_t; a plain `1` would accumulate (and truncate) in int.
    const index_t batch =
        std::accumulate(input_shape.begin(),
                        input_shape.end() - 2,
                        static_cast<index_t>(1),
                        std::multiplies<index_t>());
    const index_t rank = input->dim_size();
    const index_t chunk = input_shape[rank - 2];
    const index_t dim = input_shape[rank - 1];
    const index_t input_stride = chunk * dim;
    const index_t out_chunk = static_cast<index_t>(forward_indexes_.size());
    const index_t output_stride = out_chunk * dim;

    std::vector<index_t> output_shape = input->shape();
    output_shape[rank - 2] = out_chunk;
    MACE_RETURN_IF_ERROR(output->Resize(output_shape));

    Tensor::MappingGuard input_guard(input);
    Tensor::MappingGuard output_guard(output);
    const T *input_data = input->data<T>();
    T *output_data = output->mutable_data<T>();
    // Capture a plain pointer instead of relying on the implicit `this`
    // capture of [=] to reach the member vector.
    const index_t *indexes = forward_indexes_.data();

    utils::ThreadPool
        &thread_pool = context->device()->cpu_runtime()->thread_pool();
    thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
                              index_t start1, index_t end1, index_t step1) {
      for (index_t b = start0; b < end0; b += step0) {
        for (index_t i = start1; i < end1; i += step1) {
          T *output_base =
              output_data + b * output_stride + i * dim;
          const T *input_base =
              input_data + b * input_stride + indexes[i] * dim;
          memcpy(output_base, input_base, dim * sizeof(T));
        }
      }
    }, 0, batch, 1, 0, out_chunk, 1);

    return MaceStatus::MACE_SUCCESS;
  }

 private:
  // Index of the input frame copied into each output frame.
  std::vector<index_t> forward_indexes_;
};
// Registers SubsampleOp with the given op registry under the name
// "Subsample", for DeviceType::CPU with float data.
void RegisterSubsample(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Subsample", SubsampleOp,
DeviceType::CPU, float);
}
} // namespace ops
} // namespace mace
......@@ -71,7 +71,6 @@ class TargetRMSNormOp<DeviceType::CPU, T> : public Operation {
return result;
}
void NormalizePerRow(const float *data,
const index_t data_len,
float d_scale,
......@@ -105,9 +104,9 @@ class TargetRMSNormOp<DeviceType::CPU, T> : public Operation {
std::multiplies<index_t>());
if (block_dim_ == 0) block_dim_ = static_cast<int>(input_dim);
MACE_CHECK(input_dim % block_dim_ == 0, "block_dim must divide input_dim!");
const index_t output_dim = add_log_stddev_ ?
const index_t output_dim = add_log_stddev_ > 0 ?
input_dim + (input_dim / block_dim_) : input_dim;
std::vector<index_t> output_shape = input->shape();
std::vector<index_t> output_shape(input_shape);
output_shape[dim_size - 1] = output_dim;
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
......@@ -140,7 +139,6 @@ class TargetRMSNormOp<DeviceType::CPU, T> : public Operation {
}
}, 0, num_rows, 1);
return MaceStatus::MACE_SUCCESS;
}
......
......@@ -28,12 +28,8 @@ void TestExtractPooling(const std::vector<index_t> &input_shape,
const int modulus,
const int num_log_count,
const int include_variance,
const std::vector<int> &input_time_range,
const std::vector<int> &input_indexes,
const std::vector<int> &forward_indexes,
const std::vector<float> &counts,
const std::vector<int> &output_indexes,
const std::vector<int> &output_time_range,
const std::vector<index_t> &output_shape,
const std::vector<float> &output_value) {
// Construct graph
......@@ -44,12 +40,8 @@ void TestExtractPooling(const std::vector<index_t> &input_shape,
.AddIntArg("modulus", modulus)
.AddIntArg("include_variance", include_variance)
.AddIntArg("num_log_count", num_log_count)
.AddIntsArg("input_indexes", input_indexes)
.AddIntsArg("output_indexes", output_indexes)
.AddIntsArg("forward_indexes", forward_indexes)
.AddFloatsArg("counts", counts)
.AddIntsArg("input_time_range", input_time_range)
.AddIntsArg("output_time_range", output_time_range)
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
......@@ -63,123 +55,52 @@ void TestExtractPooling(const std::vector<index_t> &input_shape,
TEST_F(ExtractPoolingTest, SimpleCPU) {
TestExtractPooling<DeviceType::CPU, float>(
{3, 20, 3},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60},
61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135,
136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150,
151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165,
166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179},
9, 0, 0,
{-2, 17},
{0, 3, 6, 9, 12, 15},
{0, 6, 2, 6},
{6, 4},
{0, 9},
{0, 17},
{3, 18, 3},
{29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5,
29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5,
29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5,
38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5,
38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5,
38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5,
29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5,
29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5,
29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5,
38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5,
38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5,
38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5,
29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5,
29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5,
29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5,
38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5,
38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5,
38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5});
{3, 2, 3},
{7.5, 8.5, 9.5, 10.5, 11.5, 12.5,
67.5, 68.5, 69.5, 70.5, 71.5, 72.5,
127.5, 128.5, 129.5, 130.5, 131.5, 132.5});
}
TEST_F(ExtractPoolingTest, SimpleCPUWithVariance) {
TestExtractPooling<DeviceType::CPU, float>(
{3, 20, 3},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60},
61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135,
136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150,
151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165,
166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179},
9, 1, 1,
{-2, 17},
{0, 3, 6, 9, 12, 15},
{0, 6, 2, 6},
{6, 4},
{0, 9},
{0, 17},
{3, 18, 7},
{1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623});
{3, 2, 7},
{1.7917595, 7.5, 8.5, 9.5, 5.1234756, 5.1234756, 5.1234756,
1.3862944, 10.5, 11.5, 12.5, 3.354102, 3.354102, 3.354102,
1.7917595, 67.5, 68.5, 69.5, 5.1234756, 5.1234756, 5.1234756,
1.3862944, 70.5, 71.5, 72.5, 3.354102, 3.354102, 3.354102,
1.7917595, 127.5, 128.5, 129.5, 5.1234756, 5.1234756, 5.1234756,
1.3862944, 130.5, 131.5, 132.5, 3.354102, 3.354102, 3.354102});
}
} // namespace test
......
......@@ -26,6 +26,8 @@ void TestSplice(const std::vector<index_t> &input_shape,
const std::vector<T> &input,
const std::vector<int> &context,
const int const_dim,
const std::vector<int> &forward_indexes,
const std::vector<int> &forward_const_indexes,
const std::vector<index_t> &output_shape,
const std::vector<T> &output) {
OpsTestNet net;
......@@ -38,6 +40,8 @@ void TestSplice(const std::vector<index_t> &input_shape,
.Output("Output")
.AddIntsArg("context", context)
.AddIntArg("const_component_dim", const_dim)
.AddIntsArg("forward_indexes", forward_indexes)
.AddIntsArg("forward_const_indexes", forward_const_indexes)
.Finalize(net.NewOperatorDef());
net.RunOp();
......@@ -53,6 +57,8 @@ TEST_F(SpliceOpTest, WithoutConstDim) {
{1, 7, 2},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
{-2, -1, 0, 1, 2}, 0,
{0, 1, 2, 3, 4, 1, 2, 3, 4, 5, 2, 3, 4, 5, 6},
{},
{1, 3, 10},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
......@@ -68,8 +74,10 @@ TEST_F(SpliceOpTest, WithConstDim) {
4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
{-2, -1, 0, 1, 2}, 7,
{0, 1, 2, 3, 4},
{2},
{1, 1, 22},
{1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10});
{1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 6, 7, 8, 9, 10, 11, 12});
}
} // namespace test
} // namespace ops
......
# Copyright 2019 The MACE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import math
# Reference implementation used to generate expected values for the
# ExtractPooling op tests: for each output frame it pools (mean, and
# optionally standard deviation) over a window of input frames.

# Lower bound applied to the variance before taking its square root.
variance_floor = 1.0e-10

input_data = np.arange(180).reshape(3, 20, 3).astype(np.float32)
print("input data:", input_data)

# Number of leading output columns filled with log(count); 0 disables them.
num_log_count = 0
# When > 0, the standard deviation of each input column is appended.
include_var = 0
# Flattened (start, end) pairs, one pair per output frame; end is exclusive.
forward_indexes = [0, 6, 2, 6]
# Number of input frames pooled for each output frame.
counts = [6, 4]

input_dim = input_data.shape[-1]
input_chunk = input_data.shape[-2]
out_chunk = len(counts)
# Floor division keeps `batch` an int on Python 3; true division would yield
# a float, which reshape(), np.zeros() and range() all reject.
batch = input_data.size // (input_dim * input_chunk)
# reshape() returns a new array rather than modifying in place, so the result
# has to be assigned back.
input_data = input_data.reshape(batch, input_chunk, input_dim)

output_dim = input_dim
if include_var > 0:
    output_dim += input_dim
if num_log_count > 0:
    output_dim += num_log_count
output_data = np.zeros((batch, out_chunk, output_dim), dtype=np.float32)

for b in range(0, batch):
    for i in range(0, out_chunk):
        start = forward_indexes[2 * i]
        end = forward_indexes[2 * i + 1]
        count = counts[i]
        mean_scale = 1.0 / count
        log_count = math.log(count)
        # Output layout: [log(count) * num_log_count | means | stddevs].
        if num_log_count > 0:
            for n in range(0, num_log_count):
                output_data[b, i, n] = log_count
        for d in range(0, input_dim):
            mean = 0.0
            variance = 0.0
            for t in range(start, end):
                x = input_data[b, t, d]
                mean += x
                variance += x * x
            mean = mean * mean_scale
            output_data[b, i, d + num_log_count] = mean
            if include_var > 0:
                # E[x^2] - E[x]^2, floored before the square root.
                variance = variance * mean_scale - mean * mean
                idx = d + input_dim + num_log_count
                if variance < variance_floor:
                    output_data[b, i, idx] = math.sqrt(variance_floor)
                else:
                    output_data[b, i, idx] = math.sqrt(variance)

print("output data:", output_data)
print("output data shape:", output_data.shape)
......@@ -100,7 +100,6 @@ MaceSupportedOps = [
'Conv2D',
'Crop',
'Deconv2D',
'Delay',
'DepthToSpace',
'DepthwiseConv2d',
'DepthwiseDeconv2d',
......@@ -112,6 +111,7 @@ MaceSupportedOps = [
'FullyConnected',
'Gather',
'Identity',
'IfDefined',
'InferConv2dShape',
'KaldiBatchNorm',
'LocalResponseNorm',
......@@ -128,6 +128,7 @@ MaceSupportedOps = [
'Proposal',
'Quantize',
'Reduce',
'ReplaceIndex',
'Reshape',
'ResizeBicubic',
'ResizeBilinear',
......@@ -147,6 +148,7 @@ MaceSupportedOps = [
'SpaceToBatchND',
'SpaceToDepth',
'SqrDiffMean',
'Subsample',
'SumGroup',
'TargetRMSNorm',
'Transpose',
......@@ -269,6 +271,8 @@ class MaceKeyword(object):
mace_reverse_str = 'reverse'
mace_const_data_num_arg_str = 'const_data_num'
mace_coeff_str = 'coeff'
mace_input_indexes_str = 'input_indexes'
mace_output_indexes_str = 'output_indexes'
mace_p_str = 'p'
mace_nor_var_str = 'normalize_variance'
mace_across_ch_str = 'across_channels'
......
......@@ -152,7 +152,9 @@ OnnxSupportedOps = [
# 'ReduceSum',
# 'ReduceSumSquare',
'Relu',
'ReplaceIndex',
'Reshape',
'Round',
'Scale',
# 'Scan',
# 'Selu',
......@@ -171,6 +173,7 @@ OnnxSupportedOps = [
'Sqrt',
'Squeeze',
'Sub',
'Subsample',
'Sum',
'SumGroup',
# 'Tan',
......@@ -363,7 +366,7 @@ class OnnxConverter(base_converter.ConverterInterface):
OnnxOpType.Mul.name: self.convert_eltwise,
OnnxOpType.Neg.name: self.convert_eltwise,
OnnxOpType.Normalize: self.convert_normalize,
OnnxOpType.Offset.name: self.convert_identity,
OnnxOpType.Offset.name: self.convert_subsample,
OnnxOpType.Pad.name: self.convert_pad,
OnnxOpType.PadContext.name: self.convert_pad_context,
OnnxOpType.PNorm.name: self.convert_pnorm,
......@@ -376,6 +379,8 @@ class OnnxConverter(base_converter.ConverterInterface):
OnnxOpType.ReduceMean.name: self.convert_reduce,
OnnxOpType.ReduceMin.name: self.convert_reduce,
OnnxOpType.ReduceProd.name: self.convert_reduce,
OnnxOpType.ReplaceIndex.name: self.convert_replaceindex,
OnnxOpType.Round.name: self.convert_replaceindex,
OnnxOpType.Scale.name: self.convert_eltwise,
OnnxOpType.Shape.name: self.convert_shape,
OnnxOpType.Sigmoid.name: self.convert_activation,
......@@ -387,6 +392,7 @@ class OnnxConverter(base_converter.ConverterInterface):
OnnxOpType.Sqrt.name: self.convert_eltwise,
OnnxOpType.Squeeze.name: self.convert_squeeze,
OnnxOpType.Sub.name: self.convert_eltwise,
OnnxOpType.Subsample.name: self.convert_subsample,
OnnxOpType.Sum.name: self.convert_eltwise,
OnnxOpType.SumGroup.name: self.convert_sum_group,
OnnxOpType.Tanh.name: self.convert_activation,
......@@ -839,56 +845,30 @@ class OnnxConverter(base_converter.ConverterInterface):
op = self.convert_general_op(node)
op.type = MaceOp.DynamicLSTM.name
if 'prev_out_delay' in node.attrs:
prev_out_delay = node.attrs['prev_out_delay']
mace_check(prev_out_delay < 0,
"dynamic's prev_out_delay should <= 0.")
prev_out_delay_arg = op.arg.add()
prev_out_delay_arg.name = 'prev_out_delay'
prev_out_delay_arg.i = prev_out_delay
if 'prev_cell_delay' in node.attrs:
prev_cell_delay = node.attrs['prev_cell_delay']
mace_check(prev_cell_delay < 0,
"dynamic's prev_cell_delay should < 0.")
prev_cell_delay_arg = op.arg.add()
prev_cell_delay_arg.name = 'prev_cell_delay'
prev_cell_delay_arg.i = prev_cell_delay
if 'prev_out_offset' in node.attrs:
prev_out_offset = node.attrs['prev_out_offset']
mace_check(prev_out_offset >= 0,
"dynamic's prev_out_offset should >= 0.")
prev_out_offset_arg = op.arg.add()
prev_out_offset_arg.name = 'prev_out_offset'
prev_out_offset_arg.i = prev_out_offset
if 'prev_out_dim' in node.attrs:
prev_out_dim = node.attrs['prev_out_dim']
mace_check(prev_out_dim > 0,
"dynamic's prev_out_dim should > 0.")
prev_out_dim_arg = op.arg.add()
prev_out_dim_arg.name = 'prev_out_dim'
prev_out_dim_arg.i = prev_out_dim
if 'prev_cell_dim' in node.attrs:
prev_cell_dim = node.attrs['prev_cell_dim']
mace_check(prev_cell_dim > 0,
"dynamic's prev_cell_dim should > 0.")
prev_cell_dim_arg = op.arg.add()
prev_cell_dim_arg.name = 'prev_cell_dim'
prev_cell_dim_arg.i = prev_cell_dim
if 'bias_a' in node.attrs:
bias_a = node.attrs['bias_a']
bias_a_arg = op.arg.add()
bias_a_arg.name = 'bias_a'
bias_a_arg.i = bias_a
if 'bias_b' in node.attrs:
bias_b = node.attrs['bias_b']
bias_b_arg = op.arg.add()
bias_b_arg.name = 'bias_b'
bias_b_arg.i = bias_b
if 'scale' in node.attrs:
scale = node.attrs['scale']
scale_arg = op.arg.add()
scale_arg.name = 'scale'
scale_arg.f = scale
self.copy_node_attr(op, node, 'prev_out_delay',
AttributeType.INT)
self.copy_node_attr(op, node, 'prev_cell_delay',
AttributeType.INT)
self.copy_node_attr(op, node, 'prev_out_offset',
AttributeType.INT)
self.copy_node_attr(op, node, 'prev_out_dim',
AttributeType.INT)
self.copy_node_attr(op, node, 'prev_cell_dim',
AttributeType.INT)
self.copy_node_attr(op, node, 'bias_a',
AttributeType.INT)
self.copy_node_attr(op, node, 'bias_b',
AttributeType.INT)
self.copy_node_attr(op, node, 'scale',
AttributeType.FLOAT)
self.copy_node_attr(op, node, 'subsample_factor',
AttributeType.INT, default=1)
self.copy_node_attr(op, node, 'cell_cache_indexes',
AttributeType.INTS, default=[])
self.copy_node_attr(op, node, 'out_cache_indexes',
AttributeType.INTS, default=[])
self.copy_node_attr(op, node, 'forward_indexes',
AttributeType.INTS)
def convert_clip(self, node):
# If clip's min value is zero,
......@@ -1019,73 +999,8 @@ class OnnxConverter(base_converter.ConverterInterface):
self.copy_node_attr(op, node, 'include_variance', AttributeType.INT)
self.copy_node_attr(op, node, 'num_log_count', AttributeType.INT)
self.copy_node_attr(op, node, 'variance_floor', AttributeType.FLOAT)
self.copy_node_attr(op, node, 'input_time_range', AttributeType.INTS)
self.copy_node_attr(op, node, 'input_indexes', AttributeType.INTS)
if 'output_time_range' in node.attrs:
output_time_range = node.attrs['output_time_range']
mace_check(len(output_time_range) == 2,
"output time range should have two values.")
out_start_index = output_time_range[0]
out_end_index = output_time_range[1]
else:
mace_check('start_index' in node.attrs and
'end_index' in node.attrs,
"'start_index' and 'end_index'"
" are required in ExtractPooling.")
out_start_index = node.attrs['start_index']
out_end_index = node.attrs['end_index']
output_time_range = [out_start_index, out_end_index]
output_time_range_arg = op.arg.add()
output_time_range_arg.name = 'output_time_range'
output_time_range_arg.ints.extend(output_time_range)
mace_check('modulus' in node.attrs,
"'modulus' is required in ExtractPooling.")
mace_check('output_indexes' in node.attrs,
"'output_indexes' is required in ExtractPooling.")
mace_check('counts' in node.attrs,
"'counts' is required in ExtractPooling.")
mace_check('forward_indexes' in node.attrs,
"'forward_indexes' is required in ExtractPooling.")
modulus = node.attrs['modulus']
output_indexes = node.attrs['output_indexes']
counts = node.attrs['counts']
forward_indexes = node.attrs['forward_indexes']
mace_check(len(counts) == len(output_indexes) and
len(forward_indexes) == 2 * len(output_indexes),
"output_indexes length:%s "
"counts length:%s "
"forward_indexes length:%s"
% (len(output_indexes), len(counts), len(forward_indexes)))
new_output_indexes = []
new_forward_indexes = []
new_counts = []
for i in range(len(output_indexes)):
if output_indexes[i] + modulus > out_start_index and\
output_indexes[i] <= out_end_index:
new_output_indexes.append(output_indexes[i])
new_counts.append(counts[i])
new_forward_indexes.append(forward_indexes[2 * i])
new_forward_indexes.append(forward_indexes[2 * i + 1])
modulus_arg = op.arg.add()
modulus_arg.name = 'modulus'
modulus_arg.i = modulus
counts_arg = op.arg.add()
counts_arg.name = 'counts'
counts_arg.floats.extend(new_counts)
forward_indexes_arg = op.arg.add()
forward_indexes_arg.name = 'forward_indexes'
forward_indexes_arg.ints.extend(new_forward_indexes)
output_indexes_arg = op.arg.add()
output_indexes_arg.name = 'output_indexes'
output_indexes_arg.ints.extend(new_output_indexes)
self.copy_node_attr(op, node, 'counts', AttributeType.FLOATS)
self.copy_node_attr(op, node, 'forward_indexes', AttributeType.INTS)
def convert_flatten(self, node):
op = self.convert_general_op(node)
......@@ -1104,19 +1019,14 @@ class OnnxConverter(base_converter.ConverterInterface):
def convert_kaldi_batchnorm(self, node):
op = self.convert_general_op(node)
op.type = MaceOp.KaldiBatchNorm.name
dim = self.copy_node_attr(op, node,
'dim', AttributeType.INT, -1)
block_dim = self.copy_node_attr(op, node,
'block_dim',
dim = self.copy_node_attr(op, node, 'dim', AttributeType.INT, -1)
block_dim = self.copy_node_attr(op, node, 'block_dim',
AttributeType.INT, -1)
epsilon = self.copy_node_attr(op, node,
'epsilon',
epsilon = self.copy_node_attr(op, node, 'epsilon',
AttributeType.FLOAT, 1e-3)
target_rms = self.copy_node_attr(op, node,
'target_rms',
target_rms = self.copy_node_attr(op, node, 'target_rms',
AttributeType.FLOAT, 1.0)
test_mode = self.copy_node_attr(op, node,
'test_mode',
test_mode = self.copy_node_attr(op, node, 'test_mode',
AttributeType.INT, 0)
mace_check(block_dim > 0 and
dim % block_dim == 0 and
......@@ -1165,8 +1075,7 @@ class OnnxConverter(base_converter.ConverterInterface):
scale_name = node.name + 'scale'
offset_name = node.name + 'offset'
scale_value = (
(1.0 / np.sqrt(
scale_value = ((1.0 / np.sqrt(
var_value + epsilon_value)) * gamma_value)
offset_value = (-mean_value * scale_value) + beta_value
self.add_tensor(scale_name, scale_value.shape, mace_pb2.DT_FLOAT,
......@@ -1267,10 +1176,11 @@ class OnnxConverter(base_converter.ConverterInterface):
if offset == 0:
op.type = MaceOp.Identity.name
else:
op.type = MaceOp.Delay.name
offset_arg = op.arg.add()
offset_arg.name = 'offset'
offset_arg.i = node.attrs['offset']
op.type = MaceOp.IfDefined.name
self.copy_node_attr(op, node, 'forward_indexes',
AttributeType.INTS)
self.copy_node_attr(op, node, 'cache_forward_indexes',
AttributeType.INTS)
def convert_imagescaler(self, node):
op = self.convert_general_op(node)
......@@ -1282,10 +1192,10 @@ class OnnxConverter(base_converter.ConverterInterface):
scale_name = node.name + "_scale"
bias_name = node.name + "_bias"
self.add_tensor(scale_name, scale_value.shape, mace_pb2.DT_FLOAT,
scale_value)
self.add_tensor(bias_name, bias_value.shape, mace_pb2.DT_FLOAT,
bias_value)
self.add_tensor(scale_name, scale_value.shape,
mace_pb2.DT_FLOAT, scale_value)
self.add_tensor(bias_name, bias_value.shape,
mace_pb2.DT_FLOAT, bias_value)
op.input.extend([scale_name, bias_name])
def convert_lstm(self, node):
......@@ -1399,6 +1309,12 @@ class OnnxConverter(base_converter.ConverterInterface):
keep_dims_arg.name = MaceKeyword.mace_keepdims_str
keep_dims_arg.i = keep_dims
def convert_replaceindex(self, node):
    """Build a ReplaceIndex op from ``node``.

    Starts from the generic op returned by ``convert_general_op``, sets
    its type to ``MaceOp.ReplaceIndex`` and hands the node's
    ``forward_indexes`` attribute to ``copy_node_attr`` as an INTS
    attribute.
    """
    replace_index_op = self.convert_general_op(node)
    replace_index_op.type = MaceOp.ReplaceIndex.name
    attr_name = 'forward_indexes'
    self.copy_node_attr(replace_index_op, node, attr_name,
                        AttributeType.INTS)

def convert_reshape(self, node):
op = self.convert_general_op(node)
op.type = MaceOp.Reshape.name
......@@ -1460,11 +1376,17 @@ class OnnxConverter(base_converter.ConverterInterface):
context_arg.ints.extend(context)
if 'const_component_dim' in node.attrs:
const_dim = node.attrs['const_component_dim']
else:
const_dim = 0
const_dim_arg = op.arg.add()
const_dim_arg.name = 'const_component_dim'
const_dim_arg.i = const_dim
const_dim_arg = op.arg.add()
const_dim_arg.name = 'const_component_dim'
const_dim_arg.i = const_dim
self.copy_node_attr(op, node,
'forward_const_indexes',
AttributeType.INTS)
self.copy_node_attr(op, node, 'subsample_factor',
AttributeType.INT, default=1)
self.copy_node_attr(op, node, 'forward_indexes',
AttributeType.INTS)
def convert_split(self, node):
op = self.convert_general_op(node)
......@@ -1516,6 +1438,12 @@ class OnnxConverter(base_converter.ConverterInterface):
axis_arg.name = MaceKeyword.mace_axis_str
axis_arg.ints.extend(axis_value)
def convert_subsample(self, node):
    """Build a Subsample op from ``node``.

    Starts from the generic op returned by ``convert_general_op``, sets
    its type to ``MaceOp.Subsample`` and hands the node's
    ``forward_indexes`` attribute to ``copy_node_attr`` as an INTS
    attribute.
    """
    subsample_op = self.convert_general_op(node)
    subsample_op.type = MaceOp.Subsample.name
    attr_name = 'forward_indexes'
    self.copy_node_attr(subsample_op, node, attr_name,
                        AttributeType.INTS)

def convert_sum_group(self, node):
op = self.convert_general_op(node)
op.type = MaceOp.SumGroup.name
......@@ -1524,11 +1452,12 @@ class OnnxConverter(base_converter.ConverterInterface):
op = self.convert_general_op(node)
op.type = MaceOp.TargetRMSNorm.name
self.copy_node_attr(op, node, 'target_rms', AttributeType.FLOAT)
self.copy_node_attr(op, node, 'add_log_stddev', AttributeType.INT,
default=0)
self.copy_node_attr(op, node, 'block_dim', AttributeType.INT,
default=0)
self.copy_node_attr(op, node, 'target_rms',
AttributeType.FLOAT)
self.copy_node_attr(op, node, 'add_log_stddev',
AttributeType.INT, default=0)
self.copy_node_attr(op, node, 'block_dim',
AttributeType.INT, default=0)
def convert_transpose(self, node):
op = self.convert_general_op(node)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册