Merge branch 'support-ivector-input' into 'master'

support kaldi ivector input and subsample See merge request !1210

Merge branch 'support-ivector-input' into 'master'
support kaldi ivector input and subsample See merge request !1210
8bc14517 · 叶剑武 · 472640ca · d70c7295 · 8bc14517 · 8bc14517
13 changed files
--- a/mace/ops/dynamic_lstm.cc
+++ b/mace/ops/dynamic_lstm.cc
@@ -14,6 +14,21 @@

 // This Op is for Fused-LstmNonlinearityComponent
 // with prev cell states as inputs in Kaldi.
+// prev_out_delay: the IfDefined component's delay value.
+//                 It indicates which previous frame's output will
+//                 be used here as an input.
+// prev_cell_delay: similar to prev_out_delay.
+// prev_out_offset: output offset.
+// prev_out_dim: prev output's dim.
+// prev_cell_dim: prev cell's dim.
+// bias_a: flag for the first affine's bias, 1: has bias; 0: no bias.
+// bias_b: similar to bias_a.
+// scale: scale value of previous output and cell.
+// forward_indexes: indexes of the frames that will be used for computation.
+//                  This is pre-computed in the kaldi-onnx converter.
+// cell_cache_indexes: indicates which frame's cell will be cached for next
+//                     computation.
+// out_cache_indexes: similar to cell_cache_indexes.
 // http://kaldi-asr.org/doc/nnet-combined-component_8h_source.html#l00255
 // More details are in docs/development/dynamic_lstm.md

@@ -50,7 +65,44 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
        prev_cell_dim_(Operation::GetOptionalArg<int>("prev_cell_dim", 0)),
        has_bias_a_(Operation::GetOptionalArg<int>("bias_a", 1)),
        has_bias_b_(Operation::GetOptionalArg<int>("bias_b", 1)),
-        scale_(Operation::GetOptionalArg<float>("scale", 1.0f)) {}
+        scale_(Operation::GetOptionalArg<float>("scale", 1.0f)),
+        subsample_factor_(
+            Operation::GetOptionalArg<int>("subsample_factor", 1)),
+        forward_indexes_(
+            Operation::GetRepeatedArgs<index_t>("forward_indexes")),
+        cell_cache_indexes_(
+            Operation::GetRepeatedArgs<index_t>("cell_cache_indexes")),
+        out_cache_indexes_(
+            Operation::GetRepeatedArgs<index_t>("out_cache_indexes")) {}
+
+  inline void Validate() {
+    const Tensor *input = this->Input(0);
+    const unsigned int rank = static_cast<unsigned int>(input->dim_size());
+    MACE_CHECK(rank >= 2, "DynamicLSTM's input should have at least 2 dims.");
+    const index_t input_chunk = input->dim(rank - 2);
+    for (size_t i = 0; i < forward_indexes_.size(); ++i) {
+      MACE_CHECK(forward_indexes_[i] < input_chunk && forward_indexes_[i] >= 0,
+                 "index is over range.");
+    }
+
+    MACE_CHECK(this->InputSize() >= 6,
+               "DynamicLSTM should have at least six inputs.",
+               "But has only ", this->InputSize(), " inputs.");
+    MACE_CHECK(prev_cell_delay_ < 0 && prev_out_delay_ < 0,
+               "prev_cell_delay(", prev_cell_delay_,
+               ") and prev_out_delay(", prev_out_delay_,
+               ") should be less than zero.");
+    MACE_CHECK(prev_cell_delay_ % subsample_factor_ == 0 &&
+               prev_out_delay_ % subsample_factor_ == 0,
+               "prev_cell_delay(", prev_cell_delay_,
+               ") and prev_out_delay(", prev_out_delay_,
+               ") should be multiples of subsample_factor(",
+               subsample_factor_, ").");
+    MACE_CHECK(prev_out_dim_ > 0 && prev_cell_dim_ > 0,
+               "prev_out_dim(", prev_out_dim_,
+               ") and prev_cell_dim(", prev_cell_dim_,
+               ") should be greater than zero.");
+  }

  void UpdateCell(float *cell_data,
                  const index_t cell_dim,
@@ -65,7 +117,7 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
      in_vec = vmulq_f32(in_vec, scale_vec);
      vst1q_f32(cell_data + i, in_vec);
 #else
-      for (int j = 0; j < 4; ++j) {
+      for (index_t j = 0; j < 4; ++j) {
        cell_data[i + j] *= scale;
      }
 #endif
@@ -92,7 +144,7 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
      in_vec = vmulq_f32(in_vec, scale_vec);
      vst1q_f32(cell_data + i, in_vec);
 #else
-      for (int j = 0; j < 4; ++j) {
+      for (index_t j = 0; j < 4; ++j) {
        cell_data[i + j] = src_data[i + j] * scale;
      }
 #endif
@@ -104,32 +156,26 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {

  MaceStatus Run(OpContext *context) override {
    MACE_UNUSED(context);
-    int max_input_num = 4;
-    MACE_CHECK(this->InputSize() >= max_input_num,
-               "DynamicLSTM has at least four inputs.");
-    MACE_CHECK(prev_cell_delay_ < 0 && prev_out_delay_ < 0);
-    MACE_CHECK(prev_out_dim_ > 0 && prev_cell_dim_ > 0);
+    Validate();
    const Tensor *input = this->Input(INPUT);
+    const Tensor *prev_out = this->Input(PREV_OUT);
+    const Tensor *prev_cell = this->Input(PREV_CELL);
    const Tensor *weights_a = this->Input(WEIGHTS_A);
    const Tensor *lstm_params = this->Input(PARAMS);
    const Tensor *weights_b = this->Input(WEIGHTS_B);
-    if (has_bias_a_) {
-      max_input_num++;
-      MACE_CHECK(this->InputSize() >= max_input_num,
-                 "The first affine needs a bias input.");
-    }
+    int max_input_num = 6;
+    max_input_num = has_bias_a_ ? max_input_num + 1 : max_input_num;
+    MACE_CHECK(this->InputSize() >= max_input_num,
+               "The first affine needs a bias input.");
    const Tensor *bias_a = has_bias_a_ ?
                           this->Input(max_input_num - 1) :
                           nullptr;
-    if (has_bias_b_) {
-      max_input_num++;
-      MACE_CHECK(this->InputSize() >= max_input_num,
-                 "The second affine needs a bias input.");
-    }
+    max_input_num = has_bias_b_ ? max_input_num + 1 : max_input_num;
+    MACE_CHECK(this->InputSize() >= max_input_num,
+               "The second affine needs a bias input.");
    const Tensor *bias_b = has_bias_b_ ?
                           this->Input(max_input_num - 1) :
                           nullptr;
-
    const index_t input_rank = input->dim_size();
    MACE_CHECK(input_rank >= 2,
               "Dynamic LSTM Cell's input dim size should be >= 2.");
@@ -150,12 +196,15 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
    const index_t lstm_input_dim = affine_a_out_dim + prev_cell_dim_;
    const index_t lstm_cell_dim = lstm_input_dim / 5;
    const index_t params_stride = lstm_params->dim(1);
-    MACE_CHECK(lstm_input_dim == (lstm_cell_dim * 5));
+    MACE_CHECK(lstm_input_dim == (lstm_cell_dim * 5),
+               "lstm_input_dim(", lstm_input_dim,
+               ") should be 5 times of lstm_cell_dim(",
+               lstm_cell_dim, ").");
    MACE_CHECK(lstm_params->dim(0) == 3 &&
        params_stride == lstm_cell_dim && lstm_cell_dim == prev_cell_dim_)
-      << "lstm params rows:" << lstm_params->dim(0)
-      << "params_stride:" << params_stride
-      << "!=" << "cell_dim:" << lstm_cell_dim << std::endl;
+      << " lstm params rows: " << lstm_params->dim(0)
+      << " params_stride: " << params_stride
+      << " != " << " cell_dim: " << lstm_cell_dim << std::endl;
    const index_t affine_b_out_dim = weights_b->dim(0);
    const index_t affine_b_depth = weights_b->dim(1);
    const index_t affine_b_in_dim = lstm_cell_dim;
@@ -164,7 +213,10 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
      << "!=" << "affine_b's weights' depth:" << affine_b_depth << std::endl;

    const index_t output_dim = affine_b_out_dim;
-    MACE_CHECK(prev_out_offset_ + prev_out_dim_ <= output_dim);
+    MACE_CHECK(prev_out_offset_ + prev_out_dim_ <= output_dim)
+        << " prev_out_offset: " << prev_out_offset_
+        << " prev_out_dim: " << prev_out_dim_
+        << " output_dim: " << output_dim;

    const index_t affine_a_in_size =
        PadAlignSize(affine_a_in_dim * sizeof(float));
@@ -175,8 +227,8 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
    const index_t affine_b_out_size =
        PadAlignSize(affine_b_out_dim * sizeof(float));

-    const int out_buf_chunk = abs(prev_out_delay_);
-    const int cell_buf_chunk = abs(prev_cell_delay_);
+    const int out_buf_chunk = abs(prev_out_delay_ / subsample_factor_);
+    const int cell_buf_chunk = abs(prev_cell_delay_ / subsample_factor_);
    const index_t out_buf_size =
        PadAlignSize(out_buf_chunk * prev_out_dim_ * sizeof(float));
    const index_t cell_buf_size =
@@ -187,13 +239,13 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
                          + affine_b_in_size + affine_b_out_size
                          + out_buf_size + cell_buf_size);

-    Tensor prev_out(scratch->Scratch(out_buf_size), DT_FLOAT);
-    prev_out.Reshape({out_buf_chunk, prev_out_dim_});
-    float *prev_out_data = prev_out.mutable_data<float>();
+    Tensor prev_out_buf(scratch->Scratch(out_buf_size), DT_FLOAT);
+    prev_out_buf.Reshape({out_buf_chunk, prev_out_dim_});
+    float *prev_out_buf_data = prev_out_buf.mutable_data<float>();

-    Tensor prev_cell(scratch->Scratch(cell_buf_size), DT_FLOAT);
-    prev_cell.Reshape({cell_buf_chunk, prev_cell_dim_});
-    float *prev_cell_data = prev_cell.mutable_data<float>();
+    Tensor prev_cell_buf(scratch->Scratch(cell_buf_size), DT_FLOAT);
+    prev_cell_buf.Reshape({cell_buf_chunk, prev_cell_dim_});
+    float *prev_cell_buf_data = prev_cell_buf.mutable_data<float>();

    Tensor affine_a_in(scratch->Scratch(affine_a_in_size), DT_FLOAT);
    affine_a_in.Reshape({1, affine_a_in_dim});
@@ -212,38 +264,57 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
    float *affine_b_out_data = affine_b_out.mutable_data<float>();

    Tensor *output = this->Output(OUTPUT);
+    Tensor *out_cache = this->Output(OUT_CACHE);
+    Tensor *cell_cache = this->Output(CELL_CACHE);

    std::vector<index_t> output_shape = input->shape();
+    const index_t out_chunk = forward_indexes_.size();
    output_shape[input_rank - 1] = output_dim;
+    std::vector<index_t> prev_out_shape = input->shape();
+    prev_out_shape[input_rank - 1] = prev_out_dim_;
+    prev_out_shape[input_rank - 2] = out_buf_chunk;
+    std::vector<index_t> prev_cell_shape = input->shape();
+    prev_cell_shape[input_rank - 1] = prev_cell_dim_;
+    prev_cell_shape[input_rank - 2] = cell_buf_chunk;

    MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+    MACE_RETURN_IF_ERROR(out_cache->Resize(prev_out_shape));
+    MACE_RETURN_IF_ERROR(cell_cache->Resize(prev_cell_shape));

    Tensor::MappingGuard input_guard(input);
+    Tensor::MappingGuard prev_out_guard(prev_out);
+    Tensor::MappingGuard prev_cell_guard(prev_cell);
    Tensor::MappingGuard lstm_params_guard(lstm_params);
+
    Tensor::MappingGuard output_guard(output);
+    Tensor::MappingGuard out_cache_guard(out_cache);
+    Tensor::MappingGuard cell_cache_guard(cell_cache);
+
    const float *input_data = input->data<float>();
+    const float *prev_out_data = prev_out->data<float>();
+    const float *prev_cell_data = prev_cell->data<float>();
    const float *lstm_params_data = lstm_params->data<float>();
    float *output_data = output->mutable_data<float>();
+    float *out_cache_data = out_cache->mutable_data<float>();
+    float *cell_cache_data = cell_cache->mutable_data<float>();

    for (int b = 0; b < batch; ++b) {
-      int prev_out_idx = prev_out_delay_;
-      int prev_cell_idx = prev_cell_delay_;
-      prev_cell.Clear();
-      prev_out.Clear();
-      affine_a_in.Clear();
-      affine_a_out.Clear();
-      affine_b_in.Clear();
-      affine_b_out.Clear();
-      for (int i = 0; i < chunk; ++i) {
-        const float *input_ptr = input_data + (b * chunk + i) * input_dim;
-        float *output_ptr = output_data + (b * chunk + i) * output_dim;
+      memcpy(prev_out_buf_data,
+             prev_out_data + b * out_buf_chunk * prev_out_dim_,
+             sizeof(float) * out_buf_chunk * prev_out_dim_);
+      memcpy(prev_cell_buf_data,
+             prev_cell_data + b * cell_buf_chunk * prev_cell_dim_,
+             sizeof(float) * cell_buf_chunk * prev_cell_dim_);
+
+      for (index_t i = 0; i < out_chunk; ++i) {
+        const float *input_ptr =
+            input_data + (b * chunk + forward_indexes_[i]) * input_dim;
+        float *output_ptr = output_data + (b * out_chunk + i) * output_dim;
        // Append
        memcpy(affine_a_in_data, input_ptr, input_dim * sizeof(float));
-        if (prev_out_idx >= 0) {
-          memcpy(affine_a_in_data + input_dim,
-                 prev_out_data + prev_out_idx % out_buf_chunk * prev_out_dim_,
-                 prev_out_dim_ * sizeof(float));
-        }
+        memcpy(affine_a_in_data + input_dim,
+               prev_out_buf_data + i % out_buf_chunk * prev_out_dim_,
+               prev_out_dim_ * sizeof(float));
        // Affine
        gemv_.Compute(context,
                      weights_a,
@@ -256,15 +327,13 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
                      false,
                      &affine_a_out);
        // Prepare LSTMNonlinear input and output pointer
-        float *prev_cell_ptr =
-            prev_cell_idx < 0 ? nullptr :
-            prev_cell_data + prev_cell_idx % cell_buf_chunk * prev_cell_dim_;
-        float *curr_cell_ptr =
-            prev_cell_data + i % cell_buf_chunk * prev_cell_dim_;
+        float *lstm_cell_ptr =
+            prev_cell_buf_data + i % cell_buf_chunk * prev_cell_dim_;
+        float *curr_cell_ptr = lstm_cell_ptr;
        // LSTMNonlinear
        LSTMNonlinearKernel(context,
                            affine_a_out_data,
-                            prev_cell_ptr,
+                            lstm_cell_ptr,
                            nullptr,
                            lstm_params_data,
                            false,
@@ -289,16 +358,36 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
               affine_b_out_data,
               output_dim * sizeof(float));
        // Update
-        float *curr_out_ptr = prev_out_data + i % out_buf_chunk * prev_out_dim_;
+        float *curr_out_ptr =
+            prev_out_buf_data + i % out_buf_chunk * prev_out_dim_;
        CopyAndUpdateCell(affine_b_out_data + prev_out_offset_,
                          prev_out_dim_,
                          scale_,
                          curr_out_ptr);
-        prev_out_idx++;
-        prev_cell_idx++;
+
+        for (size_t k = 0; k < out_cache_indexes_.size(); ++k) {
+          if (i == out_cache_indexes_[k]) {
+            const index_t idx = b * out_buf_chunk + k;
+            float *out_cache_ptr =
+                out_cache_data + idx * prev_out_dim_;
+            memcpy(out_cache_ptr,
+                   curr_out_ptr,
+                   sizeof(float) * prev_out_dim_);
+          }
+        }
+
+        for (size_t k = 0; k < cell_cache_indexes_.size(); ++k) {
+          if (i == cell_cache_indexes_[k]) {
+            const index_t idx = b * cell_buf_chunk + k;
+            float *cell_cache_ptr =
+                cell_cache_data + idx * prev_cell_dim_;
+            memcpy(cell_cache_ptr,
+                   curr_cell_ptr,
+                   sizeof(float) * prev_cell_dim_);
+          }
+        }
      }
    }
-
    return MaceStatus::MACE_SUCCESS;
  }

@@ -311,6 +400,10 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
  int has_bias_a_;
  int has_bias_b_;
  float scale_;
+  int subsample_factor_;
+  std::vector<index_t> forward_indexes_;
+  std::vector<index_t> cell_cache_indexes_;
+  std::vector<index_t> out_cache_indexes_;

 #ifdef MACE_ENABLE_NEON
  arm::fp32::Gemv gemv_;
@@ -318,8 +411,8 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
  ref::Gemv<float> gemv_;
 #endif  // MACE_ENABLE_NEON

-  MACE_OP_INPUT_TAGS(INPUT, WEIGHTS_A, PARAMS, WEIGHTS_B);
-  MACE_OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT, PREV_OUT, PREV_CELL, WEIGHTS_A, PARAMS, WEIGHTS_B);
+  MACE_OP_OUTPUT_TAGS(OUTPUT, OUT_CACHE, CELL_CACHE);
 };

 void RegisterDynamicLSTM(OpRegistryBase *op_registry) {

--- a/mace/ops/extract_pooling.cc
+++ b/mace/ops/extract_pooling.cc
@@ -12,17 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-// This Op is for fused StatisticsExtraction, StatisticsPooling and
-// Round Components in Kaldi.
+// This Op is for fused StatisticsExtraction and StatisticsPooling
+// Components in Kaldi.
 // This op is used to extract moving-average mean and standard-deviation
 // statistics of input data.
-// 'input_indexes' indicates which frames will be used for extract statistics.
-// 'output_indexes' indicates which frames  of outputs will be used to
+// 'forward_indexes' indicates which frames of input will be used for
+// extraction.
 // save statistics results.
-// 'modulus' will be used for extent results to all frames.
-// 'start_index' and 'end_index' indicate time indexes of output frames.
 // 'forward_indexes' and 'count' were from precomputed index in kaldi.
-// Reference to
+// Reference to tools/extract_pooling.py and
 // http://kaldi-asr.org/doc/nnet-general-component_8h_source.html#l00158

 #include <functional>
@@ -42,7 +40,6 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
 public:
  explicit ExtractPoolingOp(OpConstructContext *context)
      : Operation(context),
-        modulus_(Operation::GetOptionalArg<int>("modulus", 1)),
        include_variance_(
            static_cast<bool>(
                Operation::GetOptionalArg<int>("include_variance", 0))),
@@ -50,39 +47,36 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
            Operation::GetOptionalArg<int>("num_log_count", 0)),
        variance_floor_(
            Operation::GetOptionalArg<float>("variance_floor", 1.0e-10)),
-        input_indexes_(Operation::GetRepeatedArgs<int>("input_indexes")),
-        output_indexes_(Operation::GetRepeatedArgs<int>("output_indexes")),
        forward_indexes_(Operation::GetRepeatedArgs<int>("forward_indexes")),
-        counts_(Operation::GetRepeatedArgs<float>("counts")),
-        input_time_range_(Operation::GetRepeatedArgs<int>("input_time_range")),
-        output_time_range_(
-            Operation::GetRepeatedArgs<int>("output_time_range")) {}
+        counts_(Operation::GetRepeatedArgs<float>("counts")) {}
+
+  inline void Validate() {
+    const Tensor *input = this->Input(0);
+    const unsigned int rank = static_cast<unsigned int>(input->dim_size());
+    MACE_CHECK(rank >= 2,
+               "ExtractPooling only supports input dim size >= 2");
+    MACE_CHECK(counts_.size() * 2 == forward_indexes_.size(),
+               "counts length(", counts_.size(),
+               ") should be 2 times of forward_indexes length(",
+               forward_indexes_.size(), ").");
+    for (size_t i = 0; i < counts_.size(); ++i) {
+      MACE_CHECK(static_cast<index_t>(counts_[i]) ==
+                     forward_indexes_[2 * i + 1] - forward_indexes_[2 * i],
+                 "invalid forward indexes and counts values");
+    }
+  }

  MaceStatus Run(OpContext *context) override {
    MACE_UNUSED(context);
    const Tensor *input = this->Input(0);
    Tensor *output = this->Output(0);
-
+    Validate();
    const std::vector<index_t> &input_shape = input->shape();
-    const index_t dim_size = input_shape.size();
-    MACE_CHECK(dim_size >= 2,
-               "ExtractPooling only supports input dim size >= 2");
-    MACE_CHECK(modulus_ >= 1,
-               "ExtractPooling's pooling size should be greater than zero.");
-    MACE_CHECK(input_time_range_.size() == 2 && output_time_range_.size() == 2
-                   && counts_.size() * 2 == forward_indexes_.size()
-                   && counts_.size() == output_indexes_.size());
-    int in_start_index = input_time_range_[0];
-    int out_start_index = output_time_range_[0];
-    int out_end_index = output_time_range_[1];
-    MACE_CHECK(out_end_index >= out_start_index
-                   && input_time_range_[1] >= input_time_range_[0],
-               "end index should be greater than start index.");
-    const index_t output_chunk = out_end_index - out_start_index + 1;
+    const unsigned int dim_size = static_cast<unsigned int>(input->dim_size());
+
    const index_t input_dim = input_shape[dim_size - 1];
    const index_t chunk = input_shape[dim_size - 2];
-    MACE_CHECK(chunk == input_time_range_[1] - input_time_range_[0] + 1,
-               "input chunk should be equal to end - start + 1.");
+    const index_t output_chunk = counts_.size();
    const index_t batch =
        std::accumulate(input_shape.begin(), input_shape.end() - 2, 1,
                        std::multiplies<index_t>());
@@ -94,10 +88,6 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
    output_shape[dim_size - 2] = output_chunk;
    MACE_RETURN_IF_ERROR(output->Resize(output_shape));

-    const index_t num_input_indexes = input_indexes_.size();
-    const index_t num_output_indexes = output_indexes_.size();
-    MACE_CHECK(num_input_indexes > 0 && num_output_indexes > 0,
-               "ExtractPooling's input_indexes or output_indexes is empty.");
    const index_t extract_out_size = PadAlignSize(output_dim * sizeof(float));
    ScratchBuffer *scratch = context->device()->scratch_buffer();
    scratch->Rewind();
@@ -117,7 +107,7 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
        &thread_pool = context->device()->cpu_runtime()->thread_pool();

    for (index_t b = 0; b < batch; ++b) {
-      for (index_t i = 0; i < num_output_indexes; ++i) {
+      for (index_t i = 0; i < output_chunk; ++i) {
        int start = forward_indexes_[2 * i];
        int end = forward_indexes_[2 * i + 1];
        float count = counts_[i];
@@ -139,7 +129,7 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
              float variance = 0.f;
              for (int t = start; t < end; ++t) {
                index_t input_index =
-                    (b * chunk + input_indexes_[t] - in_start_index)
+                    (b * chunk + t)
                        * input_dim;
                float x = input_data[input_index + d];
                mean += x;
@@ -163,30 +153,15 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
              float mean = 0.f;
              for (int t = start; t < end; ++t) {
                index_t input_index =
-                    (b * chunk + input_indexes_[t] - in_start_index)
-                        * input_dim;
+                    (b * chunk + t) * input_dim;
                mean += input_data[input_index + d];
              }
              extract_out_data[d + num_log_count_] = mean * mean_scale;
            }
          }, 0, input_dim, 1);
        }
-
-        int output_start = output_indexes_[i] < out_start_index ?
-                           out_start_index : output_indexes_[i];
-        int output_end = output_indexes_[i] + modulus_;
-        output_end = output_end > out_end_index ?
-                     out_end_index + 1 :
-                     output_end;
-        thread_pool.Compute1D([=](index_t start0,
-                                  index_t end0,
-                                  index_t step0) {
-          for (index_t idx = start0; idx < end0; idx += step0) {
-            memcpy(output_data + (b * output_chunk + idx - out_start_index)
-                       * output_dim,
-                   extract_out_data, output_dim * sizeof(float));
-          }
-        }, output_start, output_end, 1);
+        memcpy(output_data + (b * output_chunk + i) * output_dim,
+               extract_out_data, output_dim * sizeof(float));
      }
    }

@@ -194,16 +169,11 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
  }

 private:
-  int modulus_;
  bool include_variance_;
  int num_log_count_;
  float variance_floor_;
-  std::vector<int> input_indexes_;
-  std::vector<int> output_indexes_;
  std::vector<int> forward_indexes_;
  std::vector<float> counts_;
-  std::vector<int> input_time_range_;
-  std::vector<int> output_time_range_;
 };

 void RegisterExtractPooling(OpRegistryBase *op_registry) {

--- a/mace/ops/ifdefined.cc
+++ b/mace/ops/ifdefined.cc
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This Op is for IfDefined descriptor in Kaldi.
+// It defines time offset.
+// If the time index <= offset, zeros are used as output.
+// forward_indexes: indicates which frames will be used for computation.
+//                  Because of the model's subsampling, this is pre-computed
+//                  in kaldi-onnx.
+// cache_forward_indexes: indicates which frames of cached previous output
+//                        will be used here. If there is only one input,
+//                        this parameter will be empty.
+
+#include <functional>
+#include <memory>
+
+#include "mace/core/operator.h"
+
+namespace mace {
+namespace ops {
+
+template <DeviceType D, typename T>
+class IfDefinedOp;
+
+template <typename T>
+class IfDefinedOp<DeviceType::CPU, T> : public Operation {
+ public:
+  explicit IfDefinedOp(OpConstructContext *context)
+      : Operation(context),
+        forward_indexes_(
+            Operation::GetRepeatedArgs<index_t>("forward_indexes")),
+        cache_forward_indexes_(
+            Operation::GetRepeatedArgs<index_t>("cache_forward_indexes")) {}
+
+  inline void Validate() {
+    MACE_CHECK(this->InputSize() <= 2,
+               "IfDefined Op should have at most 2 inputs.");
+    const Tensor *input = this->Input(INPUT);
+    const unsigned int rank = static_cast<unsigned int>(input->dim_size());
+    MACE_CHECK(rank >= 2, "IfDefined's input should have at least 2 dims.");
+    const index_t input_chunk = input->dim(rank - 2);
+    for (size_t i = 0; i < forward_indexes_.size(); ++i) {
+      MACE_CHECK(forward_indexes_[i] < input_chunk,
+                 "forward index is over range.");
+    }
+    for (size_t i = 0; i < cache_forward_indexes_.size(); ++i) {
+      MACE_CHECK(cache_forward_indexes_[i] < input_chunk &&
+                     cache_forward_indexes_[i] >= 0 ,
+                 "index is over range.");
+    }
+
+    if (this->InputSize() == 2) {
+      size_t cache_count = 0;
+      for (size_t i = 0; i < forward_indexes_.size(); ++i) {
+        if (forward_indexes_[i] < 0)
+          cache_count++;
+        else
+          break;
+      }
+      MACE_CHECK(cache_forward_indexes_.size() == cache_count,
+                 "IfDefined's cache forward index size:",
+                 cache_forward_indexes_.size(),
+                 " != forward indexes' negative part length:",
+                 cache_count);
+      for (size_t i = 0; i < cache_forward_indexes_.size(); ++i) {
+        MACE_CHECK(cache_forward_indexes_[i] < input_chunk &&
+          cache_forward_indexes_[i] >= 0,
+          "cache forward index is over range.");
+      }
+      const Tensor *cache_input = this->Input(CACHE_INPUT);
+      MACE_CHECK(cache_input->dim_size() == input->dim_size(),
+                 "two inputs should have the same rank");
+      for (unsigned int k = 0; k < rank; ++k) {
+        MACE_CHECK(input->dim(k) == cache_input->dim(k),
+                   "Two inputs should have the same shape");
+      }
+    }
+  }
+
+  void DelayCopy(OpContext *context,
+                 const T *input_data,
+                 const index_t batch,
+                 const index_t chunk,
+                 const index_t dim,
+                 const std::vector<index_t> &fwd_idxs,
+                 T *output_data) {
+    utils::ThreadPool
+        &thread_pool = context->device()->cpu_runtime()->thread_pool();
+    thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
+                              index_t start1, index_t end1, index_t step1) {
+      for (index_t i = start0; i < end0; i += step0) {
+        for (index_t j = start1; j < end1; j += step1) {
+          if (fwd_idxs[j] >= 0) {
+            memcpy(output_data + (i * chunk + j) * dim,
+                   input_data + (i * chunk + fwd_idxs[j]) * dim,
+                   dim * sizeof(T));
+          }
+        }
+      }
+    }, 0, batch, 1, 0, fwd_idxs.size(), 1);
+  }
+
+  MaceStatus Run(OpContext *context) override {
+    const Tensor *input = this->Input(INPUT);
+    Tensor *output = this->Output(OUTPUT);
+    Validate();
+    index_t rank = input->dim_size();
+    const std::vector<index_t> &input_shape = input->shape();
+    const index_t batch =
+        std::accumulate(input_shape.begin(), input_shape.end() - 2, 1,
+                        std::multiplies<index_t>());
+    const index_t chunk = input_shape[rank - 2];
+    const index_t dim = input_shape[rank - 1];
+    std::vector<index_t> output_shape(input->shape());
+    MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+    output->Clear();
+
+    Tensor::MappingGuard input_guard(input);
+    Tensor::MappingGuard output_guard(output);
+    const T *input_data = input->data<T>();
+    T *output_data = output->mutable_data<T>();
+    DelayCopy(context,
+              input_data,
+              batch,
+              chunk,
+              dim,
+              forward_indexes_,
+              output_data);
+
+    if (this->InputSize() == 2 && cache_forward_indexes_.size() > 0) {
+      const Tensor *cache_input = this->Input(CACHE_INPUT);
+      Tensor::MappingGuard cache_input_guard(cache_input);
+      const T *cache_input_data = cache_input->data<T>();
+      DelayCopy(context,
+                cache_input_data,
+                batch,
+                chunk,
+                dim,
+                cache_forward_indexes_,
+                output_data);
+    }
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+ private:
+  std::vector<index_t> forward_indexes_;
+  std::vector<index_t> cache_forward_indexes_;
+
+ private:
+  MACE_OP_INPUT_TAGS(INPUT, CACHE_INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
+};
+
+void RegisterIfDefined(OpRegistryBase *op_registry) {
+  MACE_REGISTER_OP(op_registry, "IfDefined", IfDefinedOp,
+                   DeviceType::CPU, float);
+}
+
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/registry/ops_registry.cc
+++ b/mace/ops/registry/ops_registry.cc
@@ -42,7 +42,7 @@ extern void RegisterFill(OpRegistryBase *op_registry);
 extern void RegisterFullyConnected(OpRegistryBase *op_registry);
 extern void RegisterGather(OpRegistryBase *op_registry);
 extern void RegisterIdentity(OpRegistryBase *op_registry);
-extern void RegisterDelay(OpRegistryBase *op_registry);
+extern void RegisterIfDefined(OpRegistryBase *op_registry);
 extern void RegisterInferConv2dShape(OpRegistryBase *op_registry);
 extern void RegisterKaldiBatchNorm(OpRegistryBase *op_registry);
 extern void RegisterLocalResponseNorm(OpRegistryBase *op_registry);
@@ -56,6 +56,7 @@ extern void RegisterPadContext(OpRegistryBase *op_registry);
 extern void RegisterPNorm(OpRegistryBase *op_registry);
 extern void RegisterPooling(OpRegistryBase *op_registry);
 extern void RegisterReduce(OpRegistryBase *op_registry);
+extern void RegisterReplaceIndex(OpRegistryBase *op_registry);
 extern void RegisterPriorBox(OpRegistryBase *op_registry);
 extern void RegisterReshape(OpRegistryBase *op_registry);
 extern void RegisterResizeBicubic(OpRegistryBase *op_registry);
@@ -74,6 +75,7 @@ extern void RegisterSqrDiffMean(OpRegistryBase *op_registry);
 extern void RegisterSqueeze(OpRegistryBase *op_registry);
 extern void RegisterStack(OpRegistryBase *op_registry);
 extern void RegisterStridedSlice(OpRegistryBase *op_registry);
+extern void RegisterSubsample(OpRegistryBase *op_registry);
 extern void RegisterSumGroup(OpRegistryBase *op_registry);
 extern void RegisterTargetRMSNorm(OpRegistryBase *op_registry);
 extern void RegisterTile(OpRegistryBase *op_registry);
@@ -119,7 +121,7 @@ OpRegistry::OpRegistry() : OpRegistryBase() {
  ops::RegisterFullyConnected(this);
  ops::RegisterGather(this);
  ops::RegisterIdentity(this);
-  ops::RegisterDelay(this);
+  ops::RegisterIfDefined(this);
  ops::RegisterInferConv2dShape(this);
  ops::RegisterKaldiBatchNorm(this);
  ops::RegisterLocalResponseNorm(this);
@@ -133,6 +135,7 @@ OpRegistry::OpRegistry() : OpRegistryBase() {
  ops::RegisterPNorm(this);
  ops::RegisterPooling(this);
  ops::RegisterReduce(this);
+  ops::RegisterReplaceIndex(this);
  ops::RegisterPriorBox(this);
  ops::RegisterReshape(this);
  ops::RegisterResizeBicubic(this);
@@ -151,6 +154,7 @@ OpRegistry::OpRegistry() : OpRegistryBase() {
  ops::RegisterStridedSlice(this);
  ops::RegisterSqrDiffMean(this);
  ops::RegisterSqueeze(this);
+  ops::RegisterSubsample(this);
  ops::RegisterSumGroup(this);
  ops::RegisterTargetRMSNorm(this);
  ops::RegisterTile(this);

--- a/mace/ops/delay.cc
+++ b/mace/ops/delay.cc
@@ -12,9 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-// This Op is for IfDefined descriptor in Kaldi.
-// It defines time offset.
-// If time index <= offset, using zeros as output.
+// This Op is for ReplaceIndex in Kaldi.
+// Usually used for ivector inputs.
+// It copies ivector to each frame of the output.
+// forward_indexes: the pre-computed input index for each output frame.

 #include <functional>
 #include <memory>
@@ -24,62 +25,77 @@
 namespace mace {
 namespace ops {

-template <DeviceType D, typename T>
-class DelayOp;
+template<DeviceType D, typename T>
+class ReplaceIndexOp;

-template <typename T>
-class DelayOp<DeviceType::CPU, T> : public Operation {
+template<typename T>
+class ReplaceIndexOp<DeviceType::CPU, T> : public Operation {
 public:
-  explicit DelayOp(OpConstructContext *context)
+  explicit ReplaceIndexOp(OpConstructContext *context)
      : Operation(context),
-        offset_(Operation::GetOptionalArg<int>("offset", 0)) {}
+        forward_indexes_(
+            Operation::GetRepeatedArgs<index_t>("forward_indexes")) {}
+
+  inline void Validate() {
+    const Tensor *input = this->Input(0);
+    const unsigned int rank = static_cast<unsigned int>(input->dim_size());
+    MACE_CHECK(rank >= 2, "ReplaceIndex's input should have at least 2 dims.");
+
+    const index_t input_chunk = input->dim(rank - 2);
+    for (size_t i = 0; i < forward_indexes_.size(); ++i) {
+      MACE_CHECK(forward_indexes_[i] < input_chunk && forward_indexes_[i] >= 0 ,
+                 "index is over range.");
+    }
+  }

  MaceStatus Run(OpContext *context) override {
    MACE_UNUSED(context);
    const Tensor *input = this->Input(0);
    Tensor *output = this->Output(0);
-    MACE_CHECK(offset_ < 0, "offset param should be negative.");
-
-    index_t rank = input->dim_size();
-    MACE_CHECK(rank >= 2, "input's rank should >= 2.");
+    Validate();
    const std::vector<index_t> &input_shape = input->shape();
    const index_t batch =
-        std::accumulate(input_shape.begin(), input_shape.end() - 2, 1,
+        std::accumulate(input->shape().begin(), input->shape().end() - 2, 1,
                        std::multiplies<index_t>());
-    const index_t chunk = input_shape[rank - 2];
+    const index_t rank = input->dim_size();
+    const index_t num_ivectors = input_shape[rank - 2];
    const index_t dim = input_shape[rank - 1];
-    MACE_RETURN_IF_ERROR(output->ResizeLike(input));
-    output->Clear();
+    const index_t input_stride = num_ivectors * dim;
+
+    const index_t out_chunk = forward_indexes_.size();
+    const index_t output_stride = out_chunk * dim;

-    if (chunk <= -offset_)
-      return MaceStatus::MACE_SUCCESS;
+    std::vector<index_t> output_shape = input->shape();
+    output_shape[rank - 2] = out_chunk;
+    MACE_RETURN_IF_ERROR(output->Resize(output_shape));

    Tensor::MappingGuard input_guard(input);
    Tensor::MappingGuard output_guard(output);
    const T *input_data = input->data<T>();
    T *output_data = output->mutable_data<T>();
+
    utils::ThreadPool
        &thread_pool = context->device()->cpu_runtime()->thread_pool();
    thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
                              index_t start1, index_t end1, index_t step1) {
-      for (index_t i = start0; i < end0; i += step0) {
-        for (index_t j = start1; j < end1; j += step1) {
-          memcpy(output_data + (i * chunk + j - offset_) * dim,
-                 input_data + (i * chunk + j) * dim,
+      for (index_t b = start0; b < end0; b += step0) {
+        for (index_t i = start1; i < end1; i += step1) {
+          memcpy(output_data + b * output_stride + i * dim,
+                 input_data + b * input_stride + forward_indexes_[i] * dim,
                 dim * sizeof(T));
        }
      }
-    }, 0, batch, 1, 0, chunk + offset_, 1);
+    }, 0, batch, 1, 0, out_chunk, 1);

    return MaceStatus::MACE_SUCCESS;
  }

 private:
-  int offset_;
+  std::vector<index_t> forward_indexes_;
 };

-void RegisterDelay(OpRegistryBase *op_registry) {
-  MACE_REGISTER_OP(op_registry, "Delay", DelayOp,
+void RegisterReplaceIndex(OpRegistryBase *op_registry) {
+  MACE_REGISTER_OP(op_registry, "ReplaceIndex", ReplaceIndexOp,
                   DeviceType::CPU, float);
 }


--- a/mace/ops/splice.cc
+++ b/mace/ops/splice.cc
@@ -22,6 +22,9 @@
 // if const_component_dim_ != 0, const_dim_ will be used to determine which
 // row of "in" we copy the last part of each row of "out" from (this part is
 // not subject to splicing, it's assumed constant for each frame of "input".
+// forward_indexes and forward_const_indexes indicate which frames will
+// be used for computation; they are precomputed in the kaldi-onnx converter
+// to support subsampling.

 #include <functional>
 #include <memory>
@@ -40,21 +43,45 @@ class SpliceOp<DeviceType::CPU, T> : public Operation {
 public:
  explicit SpliceOp(OpConstructContext *context)
      : Operation(context),
-        context_(Operation::GetRepeatedArgs<int>("context")),
+        context_(Operation::GetRepeatedArgs<index_t>("context")),
        const_dim_(
-            Operation::GetOptionalArg<int>("const_component_dim", 0)) {}
+            Operation::GetOptionalArg<int>("const_component_dim", 0)),
+        forward_indexes_(
+            Operation::GetRepeatedArgs<index_t>("forward_indexes")),
+        forward_const_indexes_(
+            Operation::GetRepeatedArgs<index_t>("forward_const_indexes")) {}
+
+  inline void Validate() {
+    MACE_CHECK(context_.size() > 0)
+        << "The context param should not be empty in Splice Op.";
+    MACE_CHECK(forward_indexes_.size() % context_.size() == 0,
+               "Splice's forward indexes should be multiply of num splice.");
+    const Tensor *input = this->Input(0);
+    const unsigned int rank = static_cast<unsigned int>(input->dim_size());
+    MACE_CHECK(rank >= 2, "Splice's input should have at least 2 dims.");
+    MACE_CHECK(input->dim(rank - 1) > const_dim_,
+               "input dim:", input->dim(rank - 1),
+               "should be greater than const dim:", const_dim_);
+
+    const index_t input_chunk = input->dim(rank - 2);
+    for (size_t i = 0; i < forward_indexes_.size(); ++i) {
+      MACE_CHECK(forward_indexes_[i] < input_chunk && forward_indexes_[i] >= 0)
+          << " forward index:" << forward_indexes_[i] << " input shape:"
+          << input->dim(0) << "," << input->dim(1) << "," << input->dim(2);
+    }
+    for (size_t i = 0; i < forward_const_indexes_.size(); ++i) {
+      MACE_CHECK(forward_const_indexes_[i] < input_chunk &&
+                     forward_const_indexes_[i] >= 0 ,
+                 "index is over range.");
+    }
+  }

  MaceStatus Run(OpContext *context) override {
    MACE_UNUSED(context);
    const Tensor *input = this->Input(0);
-    MACE_CHECK(context_.size() > 0)
-      << "The context param should not be empty in Splice Op.";
-    MACE_CHECK(input->dim_size() >= 2)
-      << "Splice's input's rank should be greater than 2.";
-
    Tensor *output = this->Output(0);
+    Validate();
    const std::vector<index_t> &input_shape = input->shape();
-
    const index_t batch =
        std::accumulate(input->shape().begin(), input->shape().end() - 2, 1,
                        std::multiplies<index_t>());
@@ -65,14 +92,10 @@ class SpliceOp<DeviceType::CPU, T> : public Operation {

    const index_t num_splice = static_cast<index_t>(context_.size());
    const index_t dim = input_dim - const_dim_;
-    const index_t left_context = context_[0];
-    const index_t right_context = context_[num_splice -1];
-
-    const index_t out_chunk = chunk - (right_context - left_context);
+    utils::ThreadPool
+        &thread_pool = context->device()->cpu_runtime()->thread_pool();

-    MACE_CHECK(input_dim > const_dim_,
-               "input dim:", input_dim,
-               "should be greater than const dim:", const_dim_);
+    const index_t out_chunk = forward_indexes_.size() / num_splice;
    const index_t output_dim = dim * num_splice + const_dim_;
    const index_t output_stride = out_chunk * output_dim;

@@ -86,38 +109,48 @@ class SpliceOp<DeviceType::CPU, T> : public Operation {
    const T *input_data = input->data<T>();
    T *output_data = output->mutable_data<T>();

-    for (int b = 0; b < batch; ++b) {
-      for (index_t i = 0; i < out_chunk; ++i) {
-        for (index_t c = 0; c < num_splice; ++c) {
-          const index_t offset = i + context_[c] - left_context;
-          T *output_base =
-              output_data + b * output_stride + i * output_dim + c * dim;
-          const T *input_base =
-              input_data + b * input_stride + offset * input_dim;
-          memcpy(output_base, input_base, dim * sizeof(T));
+    thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0,
+                              index_t start1, index_t end1, index_t step1,
+                              index_t start2, index_t end2, index_t step2) {
+      for (index_t b = start0; b < end0; b += step0) {
+        for (index_t i = start1; i < end1; i += step1) {
+          for (index_t c = start2; c < end2; c += step2) {
+            const index_t pos = forward_indexes_[i * num_splice + c];
+            T *output_base =
+                output_data + b * output_stride + i * output_dim + c * dim;
+            const T *input_base =
+                input_data + b * input_stride + pos * input_dim;
+            memcpy(output_base, input_base, dim * sizeof(T));
+          }
        }
      }
-    }
+    }, 0, batch, 1, 0, out_chunk, 1, 0, num_splice, 1);

    if (const_dim_ > 0) {
      const index_t output_offset = output_dim - const_dim_;
-      const index_t input_offset = dim;
-      for (int b = 0; b < batch; ++b) {
-        for (index_t i = 0; i < out_chunk; ++i) {
-          T *output_base = output_data + b * output_stride + i * output_dim;
-          const T *input_base = input_data + b * input_stride + i * input_dim;
-          memcpy(output_base + output_offset,
-                 input_base + input_offset,
-                 const_dim_ * sizeof(T));
+      thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
+                                index_t start1, index_t end1, index_t step1) {
+        for (index_t b = start0; b < end0; b += step0) {
+          for (index_t i = start1; i < end1; i += step1) {
+            T *output_base = output_data + b * output_stride +
+                i * output_dim + output_offset;
+            const T *input_base =
+                input_data + b * input_stride +
+                forward_const_indexes_[i] * input_dim + dim;
+            memcpy(output_base, input_base,
+                   const_dim_ * sizeof(T));
+          }
        }
-      }
+      }, 0, batch, 1, 0, out_chunk, 1);
    }
    return MaceStatus::MACE_SUCCESS;
  }

 private:
-  std::vector<int> context_;
+  std::vector<index_t> context_;
  int const_dim_;
+  std::vector<index_t> forward_indexes_;
+  std::vector<index_t> forward_const_indexes_;
 };

 void RegisterSplice(OpRegistryBase *op_registry) {

--- a/mace/ops/subsample.cc
+++ b/mace/ops/subsample.cc
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This Op subsamples frames for Kaldi model inference.
+// forward_indexes: indicates which frames will be selected as output.
+
+#include <functional>
+#include <memory>
+
+#include "mace/core/operator.h"
+#include "mace/utils/math.h"
+
+namespace mace {
+namespace ops {
+
+// Generic declaration; only the CPU specialization below is defined here.
+template<DeviceType D, typename T>
+class SubsampleOp;
+
+// CPU implementation of Subsample.
+//
+// The input is treated as [..., chunk, dim] and the output is resized to
+// [..., out_chunk, dim], where out_chunk == forward_indexes.size() and output
+// frame i is a copy of input frame forward_indexes[i]. All leading dims are
+// flattened into a single batch dimension for the copy.
+template<typename T>
+class SubsampleOp<DeviceType::CPU, T> : public Operation {
+ public:
+  explicit SubsampleOp(OpConstructContext *context)
+      : Operation(context),
+        forward_indexes_(
+            Operation::GetRepeatedArgs<index_t>("forward_indexes")) {}
+
+  // Fails a MACE_CHECK unless the input has at least 2 dims and every entry
+  // of forward_indexes_ addresses an existing frame (dim rank - 2) of it.
+  inline void Validate() {
+    const Tensor *input = this->Input(0);
+    const unsigned int rank = static_cast<unsigned int>(input->dim_size());
+    MACE_CHECK(rank >= 2, "Subsample's input should have at least 2 dims.");
+
+    const index_t input_chunk = input->dim(rank - 2);
+    for (const index_t frame : forward_indexes_) {
+      MACE_CHECK(frame >= 0 && frame < input_chunk, "index is over range.");
+    }
+  }
+
+  // Resizes the output and copies the selected frames into it.
+  // Returns MACE_SUCCESS, or the error reported by Tensor::Resize.
+  MaceStatus Run(OpContext *context) override {
+    const Tensor *input = this->Input(0);
+    Tensor *output = this->Output(0);
+    Validate();
+
+    const std::vector<index_t> &input_shape = input->shape();
+    const index_t rank = input->dim_size();
+    // The init value fixes std::accumulate's accumulator type, so it has to
+    // be an index_t; a plain `1` would carry the whole product in int.
+    const index_t batch =
+        std::accumulate(input_shape.begin(), input_shape.end() - 2,
+                        static_cast<index_t>(1), std::multiplies<index_t>());
+    const index_t chunk = input_shape[rank - 2];
+    const index_t dim = input_shape[rank - 1];
+    const index_t input_stride = chunk * dim;
+    const index_t out_chunk = static_cast<index_t>(forward_indexes_.size());
+    const index_t output_stride = out_chunk * dim;
+
+    std::vector<index_t> output_shape = input_shape;
+    output_shape[rank - 2] = out_chunk;
+    MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+
+    Tensor::MappingGuard input_guard(input);
+    Tensor::MappingGuard output_guard(output);
+    const T *input_data = input->data<T>();
+    T *output_data = output->mutable_data<T>();
+    // Plain pointer captured by value, so the lambda does not rely on an
+    // implicit capture of `this`.
+    const index_t *frames = forward_indexes_.data();
+
+    utils::ThreadPool
+        &thread_pool = context->device()->cpu_runtime()->thread_pool();
+    // Dimension 0 walks the batches, dimension 1 walks the output frames.
+    thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
+                              index_t start1, index_t end1, index_t step1) {
+      for (index_t b = start0; b < end0; b += step0) {
+        for (index_t i = start1; i < end1; i += step1) {
+          memcpy(output_data + b * output_stride + i * dim,
+                 input_data + b * input_stride + frames[i] * dim,
+                 dim * sizeof(T));
+        }
+      }
+    }, 0, batch, 1, 0, out_chunk, 1);
+
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+ private:
+  // For each output frame, the index of the input frame to copy.
+  std::vector<index_t> forward_indexes_;
+};
+
+// Registers SubsampleOp for DeviceType::CPU with float data under the op
+// type name "Subsample" in the given registry.
+void RegisterSubsample(OpRegistryBase *op_registry) {
+  MACE_REGISTER_OP(op_registry, "Subsample", SubsampleOp,
+                   DeviceType::CPU, float);
+}
+
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/target_rms_norm.cc
+++ b/mace/ops/target_rms_norm.cc
@@ -71,7 +71,6 @@ class TargetRMSNormOp<DeviceType::CPU, T> : public Operation {
    return result;
  }

-
  void NormalizePerRow(const float *data,
                       const index_t data_len,
                       float d_scale,
@@ -105,9 +104,9 @@ class TargetRMSNormOp<DeviceType::CPU, T> : public Operation {
                        std::multiplies<index_t>());
    if (block_dim_ == 0) block_dim_ = static_cast<int>(input_dim);
    MACE_CHECK(input_dim % block_dim_ == 0, "block_dim must divide input_dim!");
-    const index_t output_dim = add_log_stddev_ ?
+    const index_t output_dim = add_log_stddev_ > 0 ?
                               input_dim + (input_dim / block_dim_) : input_dim;
-    std::vector<index_t> output_shape = input->shape();
+    std::vector<index_t> output_shape(input_shape);
    output_shape[dim_size - 1] = output_dim;
    MACE_RETURN_IF_ERROR(output->Resize(output_shape));

@@ -140,7 +139,6 @@ class TargetRMSNormOp<DeviceType::CPU, T> : public Operation {
      }
    }, 0, num_rows, 1);

-
    return MaceStatus::MACE_SUCCESS;
  }


--- a/test/ccunit/mace/ops/extract_pooling_test.cc
+++ b/test/ccunit/mace/ops/extract_pooling_test.cc
@@ -28,12 +28,8 @@ void TestExtractPooling(const std::vector<index_t> &input_shape,
                        const int modulus,
                        const int num_log_count,
                        const int include_variance,
-                        const std::vector<int> &input_time_range,
-                        const std::vector<int> &input_indexes,
                        const std::vector<int> &forward_indexes,
                        const std::vector<float> &counts,
-                        const std::vector<int> &output_indexes,
-                        const std::vector<int> &output_time_range,
                        const std::vector<index_t> &output_shape,
                        const std::vector<float> &output_value) {
  // Construct graph
@@ -44,12 +40,8 @@ void TestExtractPooling(const std::vector<index_t> &input_shape,
      .AddIntArg("modulus", modulus)
      .AddIntArg("include_variance", include_variance)
      .AddIntArg("num_log_count", num_log_count)
-      .AddIntsArg("input_indexes", input_indexes)
-      .AddIntsArg("output_indexes", output_indexes)
      .AddIntsArg("forward_indexes", forward_indexes)
      .AddFloatsArg("counts", counts)
-      .AddIntsArg("input_time_range", input_time_range)
-      .AddIntsArg("output_time_range", output_time_range)
      .Output("Output")
      .Finalize(net.NewOperatorDef());
  // Run
@@ -63,123 +55,52 @@ void TestExtractPooling(const std::vector<index_t> &input_shape,
 TEST_F(ExtractPoolingTest, SimpleCPU) {
  TestExtractPooling<DeviceType::CPU, float>(
    {3, 20, 3},
-    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
     31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
     46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
-     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
-     31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
-     46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
-     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
-     31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
-     46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60},
+     61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
+     76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+     91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
+     106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
+     121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135,
+     136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150,
+     151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165,
+     166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179},
    9, 0, 0,
-    {-2, 17},
-    {0, 3, 6, 9, 12, 15},
    {0, 6, 2, 6},
    {6, 4},
-    {0, 9},
-    {0, 17},
-    {3, 18, 3},
-    {29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5,
-     29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5,
-     29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5,
-     38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5,
-     38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5,
-     38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5,
-     29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5,
-     29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5,
-     29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5,
-     38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5,
-     38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5,
-     38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5,
-     29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5,
-     29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5,
-     29.5, 30.5, 31.5, 29.5, 30.5, 31.5, 29.5, 30.5, 31.5,
-     38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5,
-     38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5,
-     38.5, 39.5, 40.5, 38.5, 39.5, 40.5, 38.5, 39.5, 40.5});
+    {3, 2, 3},
+    {7.5, 8.5, 9.5, 10.5, 11.5, 12.5,
+     67.5, 68.5, 69.5, 70.5, 71.5, 72.5,
+     127.5, 128.5, 129.5, 130.5, 131.5, 132.5});
 }

 TEST_F(ExtractPoolingTest, SimpleCPUWithVariance) {
 TestExtractPooling<DeviceType::CPU, float>(
    {3, 20, 3},
-    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
     31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
     46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
-     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
-     31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
-     46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
-     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
-     31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
-     46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60},
+     61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
+     76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+     91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
+     106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
+     121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135,
+     136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150,
+     151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165,
+     166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179},
     9, 1, 1,
-    {-2, 17},
-    {0, 3, 6, 9, 12, 15},
    {0, 6, 2, 6},
    {6, 4},
-    {0, 9},
-    {0, 17},
-    {3, 18, 7},
-    {1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.79176, 29.5, 30.5, 31.5, 15.3704, 15.3704, 15.3704,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623,
-     1.386294, 38.5, 39.5, 40.5, 10.0623, 10.0623, 10.0623});
+    {3, 2, 7},
+    {1.7917595, 7.5, 8.5, 9.5, 5.1234756, 5.1234756, 5.1234756,
+     1.3862944, 10.5, 11.5, 12.5, 3.354102, 3.354102, 3.354102,
+     1.7917595, 67.5, 68.5, 69.5, 5.1234756, 5.1234756, 5.1234756,
+     1.3862944, 70.5, 71.5, 72.5, 3.354102, 3.354102, 3.354102,
+     1.7917595, 127.5, 128.5, 129.5, 5.1234756, 5.1234756, 5.1234756,
+     1.3862944, 130.5, 131.5, 132.5, 3.354102, 3.354102, 3.354102});
 }

 }  // namespace test

--- a/test/ccunit/mace/ops/splice_test.cc
+++ b/test/ccunit/mace/ops/splice_test.cc
@@ -26,6 +26,8 @@ void TestSplice(const std::vector<index_t> &input_shape,
                const std::vector<T> &input,
                const std::vector<int> &context,
                const int const_dim,
+                const std::vector<int> &forward_indexes,
+                const std::vector<int> &forward_const_indexes,
                const std::vector<index_t> &output_shape,
                const std::vector<T> &output) {
  OpsTestNet net;
@@ -38,6 +40,8 @@ void TestSplice(const std::vector<index_t> &input_shape,
      .Output("Output")
      .AddIntsArg("context", context)
      .AddIntArg("const_component_dim", const_dim)
+      .AddIntsArg("forward_indexes", forward_indexes)
+      .AddIntsArg("forward_const_indexes", forward_const_indexes)
      .Finalize(net.NewOperatorDef());

  net.RunOp();
@@ -53,6 +57,8 @@ TEST_F(SpliceOpTest, WithoutConstDim) {
    {1, 7, 2},
    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
    {-2, -1, 0, 1, 2}, 0,
+    {0, 1, 2, 3, 4, 1, 2, 3, 4, 5, 2, 3, 4, 5, 6},
+    {},
    {1, 3, 10},
    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
     3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
@@ -68,8 +74,10 @@ TEST_F(SpliceOpTest, WithConstDim) {
     4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
     5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
    {-2, -1, 0, 1, 2}, 7,
+    {0, 1, 2, 3, 4},
+    {2},
    {1, 1, 22},
-    {1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10});
+    {1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 6, 7, 8, 9, 10, 11, 12});
 }
 }  // namespace test
 }  // namespace ops

--- a/tools/extract_pooling.py
+++ b/tools/extract_pooling.py
+# Copyright 2019 The MACE Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import math
+
+# Reference computation for extract pooling: every output frame pools a
+# window of input frames into a per-dimension mean, optionally followed by
+# the per-dimension standard deviation and preceded by log-count columns.
+# The result is printed so it can be used as expected test output.
+
+# Lower bound applied to the variance before taking its square root.
+variance_floor = 1.0e-10
+
+input_data = np.arange(180).reshape(3, 20, 3).astype(np.float32)
+print("input data:", input_data)
+num_log_count = 0
+include_var = 0
+# One (start, end) pair per output frame; end is exclusive.
+forward_indexes = [0, 6, 2, 6]
+# Divisor used for the mean/variance of each output frame.
+counts = [6, 4]
+
+input_dim = input_data.shape[-1]
+input_chunk = input_data.shape[-2]
+
+out_chunk = len(counts)
+# Floor division keeps batch an int, so it is valid as a shape entry and as
+# a range() bound; true division would yield a float on Python 3.
+batch = input_data.size // (input_dim * input_chunk)
+# reshape() returns a new array, so the result has to be re-bound.
+input_data = input_data.reshape(batch, input_chunk, input_dim)
+
+# Output row layout: [log_count] * num_log_count, then input_dim means, then
+# (only when include_var > 0) input_dim standard deviations.
+output_dim = input_dim
+
+if include_var > 0:
+    output_dim += input_dim
+if num_log_count > 0:
+    output_dim += num_log_count
+
+output_data = np.zeros((batch, out_chunk, output_dim), dtype=np.float32)
+
+for b in range(batch):
+    for i in range(out_chunk):
+        start = forward_indexes[2 * i]
+        end = forward_indexes[2 * i + 1]
+        count = counts[i]
+        mean_scale = 1.0 / count
+        log_count = math.log(count)
+        if num_log_count > 0:
+            for n in range(num_log_count):
+                output_data[b, i, n] = log_count
+        for d in range(input_dim):
+            mean = 0.0
+            variance = 0.0
+            # Accumulate sum and sum of squares over the pooled window.
+            for t in range(start, end):
+                x = input_data[b, t, d]
+                mean += x
+                variance += x * x
+            mean = mean * mean_scale
+            output_data[b, i, d + num_log_count] = mean
+            if include_var > 0:
+                # E[x^2] - E[x]^2, floored before the square root.
+                variance = variance * mean_scale - mean * mean
+                idx = d + input_dim + num_log_count
+                if variance < variance_floor:
+                    output_data[b, i, idx] = math.sqrt(variance_floor)
+                else:
+                    output_data[b, i, idx] = math.sqrt(variance)
+print("output data:", output_data)
+print("output data shape:", output_data.shape)
--- a/tools/python/transform/base_converter.py
+++ b/tools/python/transform/base_converter.py
@@ -100,7 +100,6 @@ MaceSupportedOps = [
    'Conv2D',
    'Crop',
    'Deconv2D',
-    'Delay',
    'DepthToSpace',
    'DepthwiseConv2d',
    'DepthwiseDeconv2d',
@@ -112,6 +111,7 @@ MaceSupportedOps = [
    'FullyConnected',
    'Gather',
    'Identity',
+    'IfDefined',
    'InferConv2dShape',
    'KaldiBatchNorm',
    'LocalResponseNorm',
@@ -128,6 +128,7 @@ MaceSupportedOps = [
    'Proposal',
    'Quantize',
    'Reduce',
+    'ReplaceIndex',
    'Reshape',
    'ResizeBicubic',
    'ResizeBilinear',
@@ -147,6 +148,7 @@ MaceSupportedOps = [
    'SpaceToBatchND',
    'SpaceToDepth',
    'SqrDiffMean',
+    'Subsample',
    'SumGroup',
    'TargetRMSNorm',
    'Transpose',
@@ -269,6 +271,8 @@ class MaceKeyword(object):
    mace_reverse_str = 'reverse'
    mace_const_data_num_arg_str = 'const_data_num'
    mace_coeff_str = 'coeff'
+    mace_input_indexes_str = 'input_indexes'
+    mace_output_indexes_str = 'output_indexes'
    mace_p_str = 'p'
    mace_nor_var_str = 'normalize_variance'
    mace_across_ch_str = 'across_channels'

--- a/tools/python/transform/onnx_converter.py
+++ b/tools/python/transform/onnx_converter.py
@@ -152,7 +152,9 @@ OnnxSupportedOps = [
    # 'ReduceSum',
    # 'ReduceSumSquare',
    'Relu',
+    'ReplaceIndex',
    'Reshape',
+    'Round',
    'Scale',
    # 'Scan',
    # 'Selu',
@@ -171,6 +173,7 @@ OnnxSupportedOps = [
    'Sqrt',
    'Squeeze',
    'Sub',
+    'Subsample',
    'Sum',
    'SumGroup',
    # 'Tan',
@@ -363,7 +366,7 @@ class OnnxConverter(base_converter.ConverterInterface):
            OnnxOpType.Mul.name: self.convert_eltwise,
            OnnxOpType.Neg.name: self.convert_eltwise,
            OnnxOpType.Normalize: self.convert_normalize,
-            OnnxOpType.Offset.name: self.convert_identity,
+            OnnxOpType.Offset.name: self.convert_subsample,
            OnnxOpType.Pad.name: self.convert_pad,
            OnnxOpType.PadContext.name: self.convert_pad_context,
            OnnxOpType.PNorm.name: self.convert_pnorm,
@@ -376,6 +379,8 @@ class OnnxConverter(base_converter.ConverterInterface):
            OnnxOpType.ReduceMean.name: self.convert_reduce,
            OnnxOpType.ReduceMin.name: self.convert_reduce,
            OnnxOpType.ReduceProd.name: self.convert_reduce,
+            OnnxOpType.ReplaceIndex.name: self.convert_replaceindex,
+            OnnxOpType.Round.name: self.convert_replaceindex,
            OnnxOpType.Scale.name: self.convert_eltwise,
            OnnxOpType.Shape.name: self.convert_shape,
            OnnxOpType.Sigmoid.name: self.convert_activation,
@@ -387,6 +392,7 @@ class OnnxConverter(base_converter.ConverterInterface):
            OnnxOpType.Sqrt.name: self.convert_eltwise,
            OnnxOpType.Squeeze.name: self.convert_squeeze,
            OnnxOpType.Sub.name: self.convert_eltwise,
+            OnnxOpType.Subsample.name: self.convert_subsample,
            OnnxOpType.Sum.name: self.convert_eltwise,
            OnnxOpType.SumGroup.name: self.convert_sum_group,
            OnnxOpType.Tanh.name: self.convert_activation,
@@ -839,56 +845,30 @@ class OnnxConverter(base_converter.ConverterInterface):
        op = self.convert_general_op(node)
        op.type = MaceOp.DynamicLSTM.name

-        if 'prev_out_delay' in node.attrs:
-            prev_out_delay = node.attrs['prev_out_delay']
-            mace_check(prev_out_delay < 0,
-                       "dynamic's prev_out_delay should <= 0.")
-            prev_out_delay_arg = op.arg.add()
-            prev_out_delay_arg.name = 'prev_out_delay'
-            prev_out_delay_arg.i = prev_out_delay
-        if 'prev_cell_delay' in node.attrs:
-            prev_cell_delay = node.attrs['prev_cell_delay']
-            mace_check(prev_cell_delay < 0,
-                       "dynamic's prev_cell_delay should < 0.")
-            prev_cell_delay_arg = op.arg.add()
-            prev_cell_delay_arg.name = 'prev_cell_delay'
-            prev_cell_delay_arg.i = prev_cell_delay
-        if 'prev_out_offset' in node.attrs:
-            prev_out_offset = node.attrs['prev_out_offset']
-            mace_check(prev_out_offset >= 0,
-                       "dynamic's prev_out_offset should >= 0.")
-            prev_out_offset_arg = op.arg.add()
-            prev_out_offset_arg.name = 'prev_out_offset'
-            prev_out_offset_arg.i = prev_out_offset
-        if 'prev_out_dim' in node.attrs:
-            prev_out_dim = node.attrs['prev_out_dim']
-            mace_check(prev_out_dim > 0,
-                       "dynamic's prev_out_dim should > 0.")
-            prev_out_dim_arg = op.arg.add()
-            prev_out_dim_arg.name = 'prev_out_dim'
-            prev_out_dim_arg.i = prev_out_dim
-        if 'prev_cell_dim' in node.attrs:
-            prev_cell_dim = node.attrs['prev_cell_dim']
-            mace_check(prev_cell_dim > 0,
-                       "dynamic's prev_cell_dim should > 0.")
-            prev_cell_dim_arg = op.arg.add()
-            prev_cell_dim_arg.name = 'prev_cell_dim'
-            prev_cell_dim_arg.i = prev_cell_dim
-        if 'bias_a' in node.attrs:
-            bias_a = node.attrs['bias_a']
-            bias_a_arg = op.arg.add()
-            bias_a_arg.name = 'bias_a'
-            bias_a_arg.i = bias_a
-        if 'bias_b' in node.attrs:
-            bias_b = node.attrs['bias_b']
-            bias_b_arg = op.arg.add()
-            bias_b_arg.name = 'bias_b'
-            bias_b_arg.i = bias_b
-        if 'scale' in node.attrs:
-            scale = node.attrs['scale']
-            scale_arg = op.arg.add()
-            scale_arg.name = 'scale'
-            scale_arg.f = scale
+        self.copy_node_attr(op, node, 'prev_out_delay',
+                            AttributeType.INT)
+        self.copy_node_attr(op, node, 'prev_cell_delay',
+                            AttributeType.INT)
+        self.copy_node_attr(op, node, 'prev_out_offset',
+                            AttributeType.INT)
+        self.copy_node_attr(op, node, 'prev_out_dim',
+                            AttributeType.INT)
+        self.copy_node_attr(op, node, 'prev_cell_dim',
+                            AttributeType.INT)
+        self.copy_node_attr(op, node, 'bias_a',
+                            AttributeType.INT)
+        self.copy_node_attr(op, node, 'bias_b',
+                            AttributeType.INT)
+        self.copy_node_attr(op, node, 'scale',
+                            AttributeType.FLOAT)
+        self.copy_node_attr(op, node, 'subsample_factor',
+                            AttributeType.INT, default=1)
+        self.copy_node_attr(op, node, 'cell_cache_indexes',
+                            AttributeType.INTS, default=[])
+        self.copy_node_attr(op, node, 'out_cache_indexes',
+                            AttributeType.INTS, default=[])
+        self.copy_node_attr(op, node, 'forward_indexes',
+                            AttributeType.INTS)

    def convert_clip(self, node):
        #  If clip's min value is zero,
@@ -1019,73 +999,8 @@ class OnnxConverter(base_converter.ConverterInterface):
        self.copy_node_attr(op, node, 'include_variance', AttributeType.INT)
        self.copy_node_attr(op, node, 'num_log_count', AttributeType.INT)
        self.copy_node_attr(op, node, 'variance_floor', AttributeType.FLOAT)
-        self.copy_node_attr(op, node, 'input_time_range', AttributeType.INTS)
-        self.copy_node_attr(op, node, 'input_indexes', AttributeType.INTS)
-
-        if 'output_time_range' in node.attrs:
-            output_time_range = node.attrs['output_time_range']
-            mace_check(len(output_time_range) == 2,
-                       "output time range should have two values.")
-            out_start_index = output_time_range[0]
-            out_end_index = output_time_range[1]
-        else:
-            mace_check('start_index' in node.attrs and
-                       'end_index' in node.attrs,
-                       "'start_index' and 'end_index'"
-                       " are required in ExtractPooling.")
-            out_start_index = node.attrs['start_index']
-            out_end_index = node.attrs['end_index']
-            output_time_range = [out_start_index, out_end_index]
-
-        output_time_range_arg = op.arg.add()
-        output_time_range_arg.name = 'output_time_range'
-        output_time_range_arg.ints.extend(output_time_range)
-
-        mace_check('modulus' in node.attrs,
-                   "'modulus' is required in ExtractPooling.")
-        mace_check('output_indexes' in node.attrs,
-                   "'output_indexes' is required in ExtractPooling.")
-        mace_check('counts' in node.attrs,
-                   "'counts' is required in ExtractPooling.")
-        mace_check('forward_indexes' in node.attrs,
-                   "'forward_indexes' is required in ExtractPooling.")
-        modulus = node.attrs['modulus']
-        output_indexes = node.attrs['output_indexes']
-        counts = node.attrs['counts']
-        forward_indexes = node.attrs['forward_indexes']
-
-        mace_check(len(counts) == len(output_indexes) and
-                   len(forward_indexes) == 2 * len(output_indexes),
-                   "output_indexes length:%s "
-                   "counts length:%s "
-                   "forward_indexes length:%s"
-                   % (len(output_indexes), len(counts), len(forward_indexes)))
-
-        new_output_indexes = []
-        new_forward_indexes = []
-        new_counts = []
-        for i in range(len(output_indexes)):
-            if output_indexes[i] + modulus > out_start_index and\
-                    output_indexes[i] <= out_end_index:
-                new_output_indexes.append(output_indexes[i])
-                new_counts.append(counts[i])
-                new_forward_indexes.append(forward_indexes[2 * i])
-                new_forward_indexes.append(forward_indexes[2 * i + 1])
-        modulus_arg = op.arg.add()
-        modulus_arg.name = 'modulus'
-        modulus_arg.i = modulus
-
-        counts_arg = op.arg.add()
-        counts_arg.name = 'counts'
-        counts_arg.floats.extend(new_counts)
-
-        forward_indexes_arg = op.arg.add()
-        forward_indexes_arg.name = 'forward_indexes'
-        forward_indexes_arg.ints.extend(new_forward_indexes)
-
-        output_indexes_arg = op.arg.add()
-        output_indexes_arg.name = 'output_indexes'
-        output_indexes_arg.ints.extend(new_output_indexes)
+        self.copy_node_attr(op, node, 'counts', AttributeType.FLOATS)
+        self.copy_node_attr(op, node, 'forward_indexes', AttributeType.INTS)

    def convert_flatten(self, node):
        op = self.convert_general_op(node)
@@ -1104,19 +1019,14 @@ class OnnxConverter(base_converter.ConverterInterface):
    def convert_kaldi_batchnorm(self, node):
        op = self.convert_general_op(node)
        op.type = MaceOp.KaldiBatchNorm.name
-        dim = self.copy_node_attr(op, node,
-                                  'dim', AttributeType.INT, -1)
-        block_dim = self.copy_node_attr(op, node,
-                                        'block_dim',
+        dim = self.copy_node_attr(op, node, 'dim', AttributeType.INT, -1)
+        block_dim = self.copy_node_attr(op, node, 'block_dim',
                                        AttributeType.INT, -1)
-        epsilon = self.copy_node_attr(op, node,
-                                      'epsilon',
+        epsilon = self.copy_node_attr(op, node, 'epsilon',
                                      AttributeType.FLOAT, 1e-3)
-        target_rms = self.copy_node_attr(op, node,
-                                         'target_rms',
+        target_rms = self.copy_node_attr(op, node, 'target_rms',
                                         AttributeType.FLOAT, 1.0)
-        test_mode = self.copy_node_attr(op, node,
-                                        'test_mode',
+        test_mode = self.copy_node_attr(op, node, 'test_mode',
                                        AttributeType.INT, 0)
        mace_check(block_dim > 0 and
                   dim % block_dim == 0 and
@@ -1165,8 +1075,7 @@ class OnnxConverter(base_converter.ConverterInterface):

        scale_name = node.name + 'scale'
        offset_name = node.name + 'offset'
-        scale_value = (
-                (1.0 / np.sqrt(
+        scale_value = ((1.0 / np.sqrt(
                    var_value + epsilon_value)) * gamma_value)
        offset_value = (-mean_value * scale_value) + beta_value
        self.add_tensor(scale_name, scale_value.shape, mace_pb2.DT_FLOAT,
@@ -1267,10 +1176,11 @@ class OnnxConverter(base_converter.ConverterInterface):
        if offset == 0:
            op.type = MaceOp.Identity.name
        else:
-            op.type = MaceOp.Delay.name
-        offset_arg = op.arg.add()
-        offset_arg.name = 'offset'
-        offset_arg.i = node.attrs['offset']
+            op.type = MaceOp.IfDefined.name
+            self.copy_node_attr(op, node, 'forward_indexes',
+                                AttributeType.INTS)
+            self.copy_node_attr(op, node, 'cache_forward_indexes',
+                                AttributeType.INTS)

    def convert_imagescaler(self, node):
        op = self.convert_general_op(node)
@@ -1282,10 +1192,10 @@ class OnnxConverter(base_converter.ConverterInterface):

        scale_name = node.name + "_scale"
        bias_name = node.name + "_bias"
-        self.add_tensor(scale_name, scale_value.shape, mace_pb2.DT_FLOAT,
-                        scale_value)
-        self.add_tensor(bias_name, bias_value.shape, mace_pb2.DT_FLOAT,
-                        bias_value)
+        self.add_tensor(scale_name, scale_value.shape,
+                        mace_pb2.DT_FLOAT, scale_value)
+        self.add_tensor(bias_name, bias_value.shape,
+                        mace_pb2.DT_FLOAT, bias_value)
        op.input.extend([scale_name, bias_name])

    def convert_lstm(self, node):
@@ -1399,6 +1309,12 @@ class OnnxConverter(base_converter.ConverterInterface):
        keep_dims_arg.name = MaceKeyword.mace_keepdims_str
        keep_dims_arg.i = keep_dims

+    def convert_replaceindex(self, node):  # handles ReplaceIndex and Round
+        op = self.convert_general_op(node)
+        op.type = MaceOp.ReplaceIndex.name  # emitted as MACE ReplaceIndex
+        self.copy_node_attr(op, node, 'forward_indexes',
+                            AttributeType.INTS)
+
    def convert_reshape(self, node):
        op = self.convert_general_op(node)
        op.type = MaceOp.Reshape.name
@@ -1460,11 +1376,17 @@ class OnnxConverter(base_converter.ConverterInterface):
        context_arg.ints.extend(context)
        if 'const_component_dim' in node.attrs:
            const_dim = node.attrs['const_component_dim']
-        else:
-            const_dim = 0
-        const_dim_arg = op.arg.add()
-        const_dim_arg.name = 'const_component_dim'
-        const_dim_arg.i = const_dim
+            const_dim_arg = op.arg.add()
+            const_dim_arg.name = 'const_component_dim'
+            const_dim_arg.i = const_dim
+            self.copy_node_attr(op, node,
+                                'forward_const_indexes',
+                                AttributeType.INTS)
+
+        self.copy_node_attr(op, node, 'subsample_factor',
+                            AttributeType.INT, default=1)
+        self.copy_node_attr(op, node, 'forward_indexes',
+                            AttributeType.INTS)

    def convert_split(self, node):
        op = self.convert_general_op(node)
@@ -1516,6 +1438,12 @@ class OnnxConverter(base_converter.ConverterInterface):
            axis_arg.name = MaceKeyword.mace_axis_str
            axis_arg.ints.extend(axis_value)

+    def convert_subsample(self, node):  # handles Offset and Subsample
+        op = self.convert_general_op(node)
+        op.type = MaceOp.Subsample.name  # emitted as MACE Subsample
+        self.copy_node_attr(op, node, 'forward_indexes',
+                            AttributeType.INTS)
+
    def convert_sum_group(self, node):
        op = self.convert_general_op(node)
        op.type = MaceOp.SumGroup.name
@@ -1524,11 +1452,12 @@ class OnnxConverter(base_converter.ConverterInterface):
        op = self.convert_general_op(node)
        op.type = MaceOp.TargetRMSNorm.name

-        self.copy_node_attr(op, node, 'target_rms', AttributeType.FLOAT)
-        self.copy_node_attr(op, node, 'add_log_stddev', AttributeType.INT,
-                            default=0)
-        self.copy_node_attr(op, node, 'block_dim', AttributeType.INT,
-                            default=0)
+        self.copy_node_attr(op, node, 'target_rms',
+                            AttributeType.FLOAT)
+        self.copy_node_attr(op, node, 'add_log_stddev',
+                            AttributeType.INT, default=0)
+        self.copy_node_attr(op, node, 'block_dim',
+                            AttributeType.INT, default=0)

    def convert_transpose(self, node):
        op = self.convert_general_op(node)