提交 6c330850 编写于 作者: L liutuo

add dynamic lstm cell

support kaldi tdnn-lstm

support logsoftmax

add ifdefined op

add dynamic lstm docs
上级 c35775c7
Dynamic LSTM
==================
The DynamicLSTM op in MACE is implemented for Kaldi's time-delay RNN (TDNN-LSTM) models.
The following pictures show how the separate components are fused into a DynamicLSTMCell.
Before fusing:
<div align="left">
<img src="imgs/FuseLSTM.png" width = "320" height = "960" alt="how to fuse lstm" />
</div>
After fusing:
<div align="left">
<img src="imgs/DynamicLSTM.png" width = "358" height = "391" alt="DynamicLSTM" />
</div>
For more details about LSTMNonlinear in Kaldi,
please refer to [LstmNonlinearComponent](http://kaldi-asr.org/doc/nnet-simple-component_8h_source.html#l02164).
\ No newline at end of file
...@@ -131,6 +131,9 @@ int64_t StatMACs(const std::string &op_type, ...@@ -131,6 +131,9 @@ int64_t StatMACs(const std::string &op_type,
output_shape.end(), output_shape.end(),
1, 1,
std::multiplies<int64_t>()); std::multiplies<int64_t>());
} else if (op_type == "DynamicLSTM") {
macs = output_shape[0] * (filter_shape[0] * filter_shape[1]
+ output_shape[1] * filter_shape[0] / 4);
} }
return macs; return macs;
} }
......
...@@ -48,8 +48,8 @@ MaceStatus Gemv::Compute(const OpContext *context, ...@@ -48,8 +48,8 @@ MaceStatus Gemv::Compute(const OpContext *context,
Tensor *output) { Tensor *output) {
MACE_UNUSED(context); MACE_UNUSED(context);
MACE_CHECK(output->size() == batch * lhs_height, MACE_CHECK(output->size() >= batch * lhs_height,
"Need resize output tensor before call gemv."); "Output buffer is not large enough for computing gemv.");
Tensor::MappingGuard lhs_guard(lhs); Tensor::MappingGuard lhs_guard(lhs);
Tensor::MappingGuard rhs_guard(rhs); Tensor::MappingGuard rhs_guard(rhs);
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Details are in
// http://kaldi-asr.org/doc/nnet-simple-component_8h_source.html#l02164
#include "mace/ops/common/lstm.h"
#include "mace/utils/math.h"
namespace mace {
namespace ops {
// Evaluates the LSTM nonlinearity for a single row of `cell_dim` cells.
//
// input_data   holds four consecutive segments of `cell_dim` floats: the
//              input-gate, forget-gate, cell-candidate and output-gate
//              pre-activations.
// prev_data    previous cell state (`cell_dim` floats); when nullptr the
//              forget-gate term and the peephole terms on the previous cell
//              are left out.
// scale_data   three scales (input, forget, output); only read when
//              `embed_scales` is true and the pointer is non-null, otherwise
//              every scale is 1.
// params_data  three peephole rows (input, forget, output), each row
//              `params_stride` floats apart.
// output_cell  receives the new cell state c_t.
// output_data  receives m_t = o_t * o_scale * tanh(c_t).
void LSTMNonlinearKernel(const float *input_data,
                         const float *prev_data,
                         const float *scale_data,
                         const float *params_data,
                         bool embed_scales,
                         index_t params_stride,
                         index_t cell_dim,
                         float *output_cell,
                         float *output_data) {
  const bool has_scales = embed_scales && scale_data;
  const float in_scale = has_scales ? scale_data[0] : 1.0f;
  const float forget_scale = has_scales ? scale_data[1] : 1.0f;
  const float out_scale = has_scales ? scale_data[2] : 1.0f;

  // Base pointers of the gate segments and of the peephole weight rows.
  const float *in_gate_pre = input_data;
  const float *forget_gate_pre = input_data + cell_dim;
  const float *candidate_pre = input_data + 2 * cell_dim;
  const float *out_gate_pre = input_data + 3 * cell_dim;
  const float *peephole_in = params_data;
  const float *peephole_forget = params_data + params_stride;
  const float *peephole_out = params_data + params_stride * 2;

  if (prev_data != nullptr) {
    // Regular step: a previous cell state is available.
#pragma omp parallel for schedule(runtime)
    for (int k = 0; k < cell_dim; ++k) {
      // Everything is read before anything is written, so `output_cell` may
      // refer to the same memory as `prev_data`.
      const float cell_before = prev_data[k];
      const float in_gate =
          ScalarSigmoid(in_gate_pre[k] + peephole_in[k] * cell_before);
      const float forget_gate =
          ScalarSigmoid(forget_gate_pre[k] + peephole_forget[k] * cell_before);
      const float cell_now =
          forget_gate * forget_scale * cell_before
              + in_gate * in_scale * std::tanh(candidate_pre[k]);
      const float out_gate =
          ScalarSigmoid(out_gate_pre[k] + peephole_out[k] * cell_now);
      const float hidden = out_gate * out_scale * std::tanh(cell_now);
      output_cell[k] = cell_now;
      output_data[k] = hidden;
    }
    return;
  }

  // First steps of a sequence: no previous cell state yet.
#pragma omp parallel for schedule(runtime)
  for (int k = 0; k < cell_dim; ++k) {
    const float in_gate = ScalarSigmoid(in_gate_pre[k]);
    const float cell_now = in_gate * in_scale * std::tanh(candidate_pre[k]);
    const float out_gate =
        ScalarSigmoid(out_gate_pre[k] + peephole_out[k] * cell_now);
    const float hidden = out_gate * out_scale * std::tanh(cell_now);
    output_cell[k] = cell_now;
    output_data[k] = hidden;
  }
}
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_COMMON_LSTM_H_
#define MACE_OPS_COMMON_LSTM_H_
#include "mace/core/types.h"
namespace mace {
namespace ops {
// Computes the LSTM nonlinearity for one row of `cell_dim` cells.
//   input_data:    gate pre-activations; the kernel reads four consecutive
//                  segments of `cell_dim` floats (input gate, forget gate,
//                  cell candidate, output gate).
//   prev_data:     previous cell state, or nullptr when there is none yet.
//   scale_data:    three scales (input, forget, output); only read when
//                  `embed_scales` is true and the pointer is non-null.
//   params_data:   three peephole weight rows spaced `params_stride` apart.
//   embed_scales:  whether `scale_data` should be applied.
//   params_stride: distance in floats between consecutive rows of params.
//   cell_dim:      number of cells processed.
//   output_cell:   receives the new cell state (`cell_dim` floats).
//   output_data:   receives the cell output (`cell_dim` floats).
void LSTMNonlinearKernel(const float *input_data,
const float *prev_data,
const float *scale_data,
const float *params_data,
bool embed_scales,
index_t params_stride,
index_t cell_dim,
float *output_cell,
float *output_data);
} // namespace ops
} // namespace mace
#endif // MACE_OPS_COMMON_LSTM_H_
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This Op is for Fused-LstmNonlinearComponent
// with prev cell states as inputs in Kaldi.
// http://kaldi-asr.org/doc/nnet-simple-component_8h_source.html#l02164
// More details are in docs/development/dynamic_lstm.md
#include <functional>
#include <memory>
#include "mace/core/operator.h"
#include "mace/ops/common/lstm.h"
#ifdef MACE_ENABLE_NEON
#include <arm_neon.h>
#include "mace/ops/arm/fp32/gemv.h"
#else
#include "mace/ops/ref/gemv.h"
#endif // MACE_ENABLE_NEON
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class DynamicLSTMOp;
// CPU implementation of the fused Kaldi LSTM cell:
//   [input, delayed output] -> affine A -> LSTMNonlinear -> affine B -> output
// The delayed cell state and the delayed (sliced) affine-B output are kept in
// small ring buffers and fed back as recurrent inputs.
//
// Inputs: INPUT, WEIGHTS_A, PARAMS, WEIGHTS_B, followed by the bias of
// affine A (when `bias_a` != 0) and the bias of affine B (when `bias_b` != 0).
//
// Arguments:
//   prev_out_delay / prev_cell_delay: delays of the recurrent output / cell
//       state; both must be negative.
//   prev_out_offset / prev_out_dim: slice of the affine-B output fed back.
//   prev_cell_dim: size of the cell state; must equal the LSTM cell dim.
//   bias_a / bias_b: whether the corresponding affine has a bias input.
//   scale: factor applied to the stored cell state and stored output slice.
template <typename T>
class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
 public:
  explicit DynamicLSTMOp(OpConstructContext *context)
      : Operation(context),
        prev_out_delay_(
            Operation::GetOptionalArg<int>("prev_out_delay", 0)),
        prev_cell_delay_(
            Operation::GetOptionalArg<int>("prev_cell_delay", 0)),
        prev_out_offset_(Operation::GetOptionalArg<int>("prev_out_offset", 0)),
        prev_out_dim_(Operation::GetOptionalArg<int>("prev_out_dim", 0)),
        prev_cell_dim_(Operation::GetOptionalArg<int>("prev_cell_dim", 0)),
        has_bias_a_(Operation::GetOptionalArg<int>("bias_a", 1)),
        has_bias_b_(Operation::GetOptionalArg<int>("bias_b", 1)),
        scale_(Operation::GetOptionalArg<float>("scale", 1.0f)) {}

  // Multiplies `cell_dim` floats at `cell_data` by `scale` in place.
  // Does nothing when `scale` is (numerically) 1.
  void UpdateCell(float *cell_data,
                  const index_t cell_dim,
                  const float scale) {
    if (std::abs(scale - 1.f) < 1e-6)
      return;
    const index_t rounds = cell_dim / 4;
    // Main part: blocks of four floats.
#pragma omp parallel for schedule(runtime)
    for (index_t i = 0; i < rounds * 4; i += 4) {
#ifdef MACE_ENABLE_NEON
      float32x4_t in_vec = vld1q_f32(cell_data + i);
      float32x4_t scale_vec = vdupq_n_f32(scale);
      in_vec = vmulq_f32(in_vec, scale_vec);
      vst1q_f32(cell_data + i, in_vec);
#else
      for (int j = 0; j < 4; ++j) {
        cell_data[i + j] *= scale;
      }
#endif
    }
    // Tail: the remaining (cell_dim % 4) floats.
    for (index_t i = rounds * 4; i < cell_dim; ++i) {
      cell_data[i] *= scale;
    }
  }

  // Writes `src_data[i] * scale` to `cell_data[i]` for `cell_dim` floats.
  // Degenerates to a plain copy when `scale` is (numerically) 1.
  void CopyAndUpdateCell(const float *src_data,
                         const index_t cell_dim,
                         const float scale,
                         float *cell_data) {
    if (std::abs(scale - 1.f) < 1e-6) {
      memcpy(cell_data, src_data, cell_dim * sizeof(float));
      return;
    }

    const index_t rounds = cell_dim / 4;
    // Main part: blocks of four floats.
#pragma omp parallel for schedule(runtime)
    for (index_t i = 0; i < rounds * 4; i += 4) {
#ifdef MACE_ENABLE_NEON
      float32x4_t in_vec = vld1q_f32(src_data + i);
      float32x4_t scale_vec = vdupq_n_f32(scale);
      in_vec = vmulq_f32(in_vec, scale_vec);
      vst1q_f32(cell_data + i, in_vec);
#else
      for (int j = 0; j < 4; ++j) {
        cell_data[i + j] = src_data[i + j] * scale;
      }
#endif
    }
    // Tail: the remaining (cell_dim % 4) floats.
    for (index_t i = rounds * 4; i < cell_dim; ++i) {
      cell_data[i] = src_data[i] * scale;
    }
  }

  // Runs the fused cell over every chunk (time step) of every batch entry.
  // The input is treated as [..., chunk, input_dim]; the output has the same
  // shape with the last dimension replaced by the affine-B output dim.
  MaceStatus Run(OpContext *context) override {
    int max_input_num = 4;
    MACE_CHECK(this->InputSize() >= max_input_num,
               "DynamicLSTM has at least four inputs.");
    MACE_CHECK(prev_cell_delay_ < 0 && prev_out_delay_ < 0);
    MACE_CHECK(prev_out_dim_ > 0 && prev_cell_dim_ > 0);
    const Tensor *input = this->Input(INPUT);
    const Tensor *weights_a = this->Input(WEIGHTS_A);
    const Tensor *lstm_params = this->Input(PARAMS);
    const Tensor *weights_b = this->Input(WEIGHTS_B);

    // The optional biases follow the four mandatory inputs, A before B.
    if (has_bias_a_) {
      max_input_num++;
      MACE_CHECK(this->InputSize() >= max_input_num,
                 "The first affine needs a bias input.");
    }
    const Tensor *bias_a = has_bias_a_ ?
                           this->Input(max_input_num - 1) :
                           nullptr;
    if (has_bias_b_) {
      max_input_num++;
      MACE_CHECK(this->InputSize() >= max_input_num,
                 "The second affine needs a bias input.");
    }
    const Tensor *bias_b = has_bias_b_ ?
                           this->Input(max_input_num - 1) :
                           nullptr;

    const index_t input_rank = input->dim_size();
    MACE_CHECK(input_rank >= 2,
               "Dynamic LSTM Cell's input dim size should be >= 2.");
    const std::vector<index_t> &input_shape = input->shape();
    // The init value is index_t so that the product is accumulated in
    // index_t rather than int.
    const index_t batch =
        std::accumulate(input_shape.begin(), input_shape.end() - 2,
                        static_cast<index_t>(1),
                        std::multiplies<index_t>());
    const index_t chunk = input_shape[input_rank - 2];
    const index_t input_dim = input_shape[input_rank - 1];

    // Affine A consumes the current input appended with the delayed output.
    const index_t affine_a_in_dim = input_dim + prev_out_dim_;
    const index_t affine_a_out_dim = weights_a->dim(0);
    const index_t affine_a_depth = weights_a->dim(1);
    MACE_CHECK(affine_a_in_dim == affine_a_depth)
      << "affine_a's input_dim:" << affine_a_in_dim
      << "!=" << "affine_a's weights' depth:" << affine_a_depth << std::endl;

    // Affine A output (4 * cell_dim) plus the previous cell (cell_dim) must
    // add up to exactly 5 * cell_dim.
    const index_t lstm_input_dim = affine_a_out_dim + prev_cell_dim_;
    const index_t lstm_cell_dim = lstm_input_dim / 5;
    const index_t params_stride = lstm_params->dim(1);
    MACE_CHECK(lstm_input_dim == (lstm_cell_dim * 5));
    MACE_CHECK(lstm_params->dim(0) == 3 &&
        params_stride == lstm_cell_dim && lstm_cell_dim == prev_cell_dim_)
      << "lstm params rows:" << lstm_params->dim(0)
      << "params_stride:"<< params_stride
      << "!=" << "cell_dim:"<< lstm_cell_dim << std::endl;

    const index_t affine_b_out_dim = weights_b->dim(0);
    const index_t affine_b_depth = weights_b->dim(1);
    const index_t affine_b_in_dim = lstm_cell_dim;
    MACE_CHECK(affine_b_in_dim == affine_b_depth)
      << "affine_b's input_dim:" << affine_b_in_dim
      << "!=" << "affine_b's weights' depth:" << affine_b_depth << std::endl;

    const index_t output_dim = affine_b_out_dim;
    // The fed-back slice must lie inside the affine-B output.
    MACE_CHECK(prev_out_offset_ >= 0 &&
        prev_out_offset_ + prev_out_dim_ <= output_dim);

    // Scratch layout: two ring buffers plus one row for each affine in/out.
    const index_t affine_a_in_size =
        PadAlignSize(affine_a_in_dim * sizeof(float));
    const index_t affine_a_out_size =
        PadAlignSize(affine_a_out_dim * sizeof(float));
    const index_t affine_b_in_size =
        PadAlignSize(affine_b_in_dim * sizeof(float));
    const index_t affine_b_out_size =
        PadAlignSize(affine_b_out_dim * sizeof(float));

    const int out_buf_chunk = abs(prev_out_delay_);
    const int cell_buf_chunk = abs(prev_cell_delay_);
    const index_t out_buf_size =
        PadAlignSize(out_buf_chunk * prev_out_dim_ * sizeof(float));
    const index_t cell_buf_size =
        PadAlignSize(cell_buf_chunk * prev_cell_dim_ * sizeof(float));
    ScratchBuffer *scratch = context->device()->scratch_buffer();
    scratch->Rewind();
    scratch->GrowSize(affine_a_in_size + affine_a_out_size
                          + affine_b_in_size + affine_b_out_size
                          + out_buf_size + cell_buf_size);

    Tensor prev_out(scratch->Scratch(out_buf_size), DT_FLOAT);
    prev_out.Reshape({out_buf_chunk, prev_out_dim_});
    float *prev_out_data = prev_out.mutable_data<float>();

    Tensor prev_cell(scratch->Scratch(cell_buf_size), DT_FLOAT);
    prev_cell.Reshape({cell_buf_chunk, prev_cell_dim_});
    float *prev_cell_data = prev_cell.mutable_data<float>();

    Tensor affine_a_in(scratch->Scratch(affine_a_in_size), DT_FLOAT);
    affine_a_in.Reshape({1, affine_a_in_dim});
    float *affine_a_in_data = affine_a_in.mutable_data<float>();

    Tensor affine_a_out(scratch->Scratch(affine_a_out_size), DT_FLOAT);
    affine_a_out.Reshape({1, affine_a_out_dim});
    float *affine_a_out_data = affine_a_out.mutable_data<float>();

    Tensor affine_b_in(scratch->Scratch(affine_b_in_size), DT_FLOAT);
    affine_b_in.Reshape({1, affine_b_in_dim});
    float *affine_b_in_data = affine_b_in.mutable_data<float>();

    Tensor affine_b_out(scratch->Scratch(affine_b_out_size), DT_FLOAT);
    affine_b_out.Reshape({1, affine_b_out_dim});
    float *affine_b_out_data = affine_b_out.mutable_data<float>();

    Tensor *output = this->Output(OUTPUT);
    std::vector<index_t> output_shape = input->shape();
    // Only the feature (last) dimension changes. Indexing with a fixed 1
    // would overwrite the chunk dimension for inputs of rank > 2.
    output_shape[input_rank - 1] = output_dim;
    MACE_RETURN_IF_ERROR(output->Resize(output_shape));

    Tensor::MappingGuard input_guard(input);
    Tensor::MappingGuard lstm_params_guard(lstm_params);
    Tensor::MappingGuard output_guard(output);
    const float *input_data = input->data<float>();
    const float *lstm_params_data = lstm_params->data<float>();
    float *output_data = output->mutable_data<float>();

    for (int b = 0; b < batch; ++b) {
      // Indices of the delayed frames; negative while the history is
      // still empty.
      int prev_out_idx = prev_out_delay_;
      int prev_cell_idx = prev_cell_delay_;
      // Every batch entry starts from a zeroed state.
      prev_cell.Clear();
      prev_out.Clear();
      affine_a_in.Clear();
      affine_a_out.Clear();
      affine_b_in.Clear();
      affine_b_out.Clear();
      for (int i = 0; i < chunk; ++i) {
        // Append: [current input, delayed output]. While no delayed output
        // exists the tail keeps the zeros written by Clear().
        memcpy(affine_a_in_data, input_data, input_dim * sizeof(float));
        if (prev_out_idx >= 0) {
          memcpy(affine_a_in_data + input_dim,
                 prev_out_data + prev_out_idx % out_buf_chunk * prev_out_dim_,
                 prev_out_dim_ * sizeof(float));
        }
        // Affine A
        MACE_RETURN_IF_ERROR(gemv_.Compute(context,
                                           weights_a,
                                           &affine_a_in,
                                           bias_a,
                                           1,
                                           affine_a_out_dim,
                                           affine_a_depth,
                                           false,
                                           false,
                                           &affine_a_out));
        // Prepare LSTMNonlinear input and output pointers. Once the history
        // is filled, the delayed slot is the same ring-buffer slot that this
        // step overwrites; the kernel reads each previous value before it
        // stores the new one.
        float *prev_cell_ptr =
            prev_cell_idx < 0 ? nullptr :
            prev_cell_data + prev_cell_idx % cell_buf_chunk * prev_cell_dim_;
        float *curr_cell_ptr =
            prev_cell_data + i % cell_buf_chunk * prev_cell_dim_;
        // LSTMNonlinear
        LSTMNonlinearKernel(affine_a_out_data,
                            prev_cell_ptr,
                            nullptr,
                            lstm_params_data,
                            false,
                            params_stride,
                            lstm_cell_dim,
                            curr_cell_ptr,
                            affine_b_in_data);
        // The stored cell state is scaled before it is reused.
        UpdateCell(curr_cell_ptr, prev_cell_dim_, scale_);
        // Affine B
        MACE_RETURN_IF_ERROR(gemv_.Compute(context,
                                           weights_b,
                                           &affine_b_in,
                                           bias_b,
                                           1,
                                           affine_b_out_dim,
                                           affine_b_depth,
                                           false,
                                           false,
                                           &affine_b_out));
        // Output
        memcpy(output_data,
               affine_b_out_data,
               output_dim * sizeof(float));
        // Update: store the scaled output slice for later steps.
        float *curr_out_ptr = prev_out_data + i % out_buf_chunk * prev_out_dim_;
        CopyAndUpdateCell(affine_b_out_data + prev_out_offset_,
                          prev_out_dim_,
                          scale_,
                          curr_out_ptr);
        input_data += input_dim;
        output_data += output_dim;
        prev_out_idx++;
        prev_cell_idx++;
      }
    }
    return MaceStatus::MACE_SUCCESS;
  }

 private:
  int prev_out_delay_;
  int prev_cell_delay_;
  int prev_out_offset_;
  int prev_out_dim_;
  int prev_cell_dim_;
  int has_bias_a_;
  int has_bias_b_;
  float scale_;

#ifdef MACE_ENABLE_NEON
  arm::fp32::Gemv gemv_;
#else
  ref::Gemv<float> gemv_;
#endif  // MACE_ENABLE_NEON

  MACE_OP_INPUT_TAGS(INPUT, WEIGHTS_A, PARAMS, WEIGHTS_B);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
// Registers DynamicLSTMOp in `op_registry` under the op type "DynamicLSTM"
// for the CPU device with float data.
void RegisterDynamicLSTM(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DynamicLSTM", DynamicLSTMOp,
DeviceType::CPU, float);
}
} // namespace ops
} // namespace mace
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/lstmcell_test_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
namespace {
// Builds a single DynamicLSTM op with random inputs and times `iters` runs of
// it on device D (only CPU is implemented).
//   chunk:        number of rows of the input.
//   input_dim:    number of columns of the input.
//   output_dim:   rows of the second affine's weights.
//   cell_dim:     LSTM cell dimension.
//   prev_out_dim: size of the fed-back output slice; must be <= output_dim.
//   delay:        positive delay; passed to the op negated.
template <DeviceType D, typename T>
void DynamicLSTM(int iters,
                 int chunk,
                 int input_dim,
                 int output_dim,
                 int cell_dim,
                 int prev_out_dim,
                 int delay) {
  mace::testing::StopTiming();

  OpsTestNet net;
  MACE_CHECK(prev_out_dim <= output_dim);

  // First affine maps [input, prev_out] to the four gate pre-activations.
  const int affine_a_rows = 4 * cell_dim;
  const int affine_a_cols = input_dim + prev_out_dim;
  // Second affine maps the cell output to the op output.
  const int affine_b_rows = output_dim;
  const int affine_b_cols = cell_dim;

  // Add input data
  net.AddRandomInput<D, float>("Input", {chunk, input_dim});
  net.AddRandomInput<D, float>("Weight_A",
                               {affine_a_rows, affine_a_cols},
                               true);
  net.AddRandomInput<D, float>("Params", {3, cell_dim}, true);
  net.AddRandomInput<D, float>("Weight_B",
                               {affine_b_rows, affine_b_cols},
                               true);
  net.AddRandomInput<D, float>("Bias_A", {affine_a_rows}, true);
  net.AddRandomInput<D, float>("Bias_B", {affine_b_rows}, true);

  if (D != DeviceType::CPU) {
    MACE_NOT_IMPLEMENTED;
  } else {
    OpDefBuilder("DynamicLSTM", "DynamicLSTMTest")
        .Input("Input")
        .Input("Weight_A")
        .Input("Params")
        .Input("Weight_B")
        .Input("Bias_A")
        .Input("Bias_B")
        .Output("Output")
        .AddIntArg("prev_out_delay", -delay)
        .AddIntArg("prev_cell_delay", -delay)
        .AddIntArg("prev_out_dim", prev_out_dim)
        .AddIntArg("prev_cell_dim", cell_dim)
        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
  }

  // Warm-up: five untimed runs.
  int warmup_left = 5;
  while (warmup_left-- > 0) {
    net.RunOp(D);
  }
  net.Sync();

  // Timed section.
  mace::testing::StartTiming();
  while (iters--) {
    net.RunOp(D);
  }
  net.Sync();
}
} // namespace
// Defines one benchmark function for DynamicLSTM and hands it to
// MACE_BENCHMARK.
//   N:     chunk (number of input rows)      ID:  input dim
//   OD:    output dim                        CD:  cell dim
//   POD:   prev-out dim                      DELAY: positive delay
//   TYPE / DEVICE: template arguments forwarded to DynamicLSTM<DEVICE, TYPE>.
// The reported element count per iteration is the sum of the input/output,
// delayed-state, weight and bias sizes computed below; it is multiplied by
// sizeof(TYPE) for the byte count. MACs come from StatMACs("DynamicLSTM").
#define MACE_BM_DYNAMIC_LSTM_MACRO( \
N, ID, OD, CD, POD, DELAY, TYPE, DEVICE) \
static void \
MACE_BM_DYNAMIC_LSTM_##N##_##ID##_##OD##_##CD##_##POD##_##DELAY##_##TYPE\
##_##DEVICE( \
int iters) { \
int64_t wa_size = 4 * CD * (ID + POD); \
int64_t wb_size = OD * CD; \
int64_t prev_size = DELAY * (POD + CD); \
int64_t in_out_size = N * (ID + OD); \
int64_t bias_size = 4 * CD + OD; \
const int64_t macs = static_cast<int64_t>(iters) * \
mace::benchmark::StatMACs("DynamicLSTM", {4 * CD, ID + POD}, {N, OD});\
const int64_t tot = static_cast<int64_t>(iters) * (in_out_size + prev_size\
+ wa_size + wb_size + bias_size); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot * (sizeof(TYPE))); \
DynamicLSTM<DEVICE, TYPE>(iters, N, ID, OD, CD, POD, DELAY); \
} \
MACE_BENCHMARK( \
MACE_BM_DYNAMIC_LSTM_##N##_##ID##_##OD##_##CD##_##POD##_##DELAY \
##_##TYPE##_##DEVICE)
// Convenience wrapper: float data on CPU.
#define MACE_BM_DYNAMIC_LSTM(N, ID, OD, CD, POD, DELAY) \
MACE_BM_DYNAMIC_LSTM_MACRO(N, ID, OD, CD, POD, DELAY, float, CPU);
// Benchmarked configurations: (chunk, input, output, cell, prev-out, delay).
MACE_BM_DYNAMIC_LSTM(50, 184, 128, 184, 64, 3);
MACE_BM_DYNAMIC_LSTM(50, 64, 256, 64, 128, 3);
MACE_BM_DYNAMIC_LSTM(80, 64, 256, 128, 64, 3);
} // namespace test
} // namespace ops
} // namespace mace
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This Op is for LstmNonlinearComponent in Kaldi.
// http://kaldi-asr.org/doc/nnet-simple-component_8h_source.html#l02164
#include <functional>
#include <memory>
#include "mace/core/operator.h"
#include "mace/ops/common/lstm.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class LSTMNonlinearOp;
// CPU implementation of Kaldi's LstmNonlinearComponent.
//
// Inputs:
//   INPUT:  rank >= 2; each row holds 5 * cell_dim values (four gate
//           pre-activations followed by the previous cell state), optionally
//           followed by 3 embedded scales.
//   PARAMS: rank 2 with shape [3, cell_dim] (peephole weights).
// Output:
//   Same shape as INPUT except that the last dimension is 2 * cell_dim:
//   the new cell state followed by the cell output.
template <typename T>
class LSTMNonlinearOp<DeviceType::CPU, T> : public Operation {
 public:
  explicit LSTMNonlinearOp(OpConstructContext *context)
      : Operation(context) {}

  // Validates the number and rank of the inputs, then delegates to Compute().
  MaceStatus Run(OpContext *context) override {
    MACE_UNUSED(context);
    // Check the input count before touching any input.
    MACE_CHECK(this->InputSize() >= 2,
               "LSTMNonlinear should have at least 2 inputs.");
    const Tensor *input = this->Input(INPUT);
    const Tensor *params = this->Input(PARAMS);
    Tensor *output = this->Output(OUTPUT);
    MACE_CHECK(input->dim_size() >= 2)
      << "The input dim size should >= 2";
    MACE_CHECK(params->dim_size() == 2)
      << "The params dim size should be 2";
    return Compute(input, params, output);
  }

  // Resizes `output` and applies LSTMNonlinearKernel to every row of `input`.
  MaceStatus Compute(const Tensor *input,
                     const Tensor *params,
                     Tensor *output) {
    const std::vector<index_t> &input_shape = input->shape();
    const std::vector<index_t> &params_shape = params->shape();

    // All leading dimensions are flattened into rows. The init value is
    // index_t so that the product is accumulated in index_t rather than int.
    const index_t num_rows =
        std::accumulate(input_shape.begin(), input_shape.end() - 1,
                        static_cast<index_t>(1),
                        std::multiplies<index_t>());
    index_t rank = input->dim_size();
    const index_t input_cols = input_shape[rank - 1];
    const index_t cell_dim = input_cols / 5;
    // Three extra trailing values per row mean the scales are embedded.
    bool embed_scales = input_cols == cell_dim * 5 + 3;
    const index_t params_stride = params_shape[1];

    MACE_CHECK(input_cols == (cell_dim * 5) || embed_scales);
    MACE_CHECK(params_shape[0] == 3 && params_shape[1] == cell_dim);

    const index_t output_dim = cell_dim * 2;
    std::vector<index_t> output_shape = input->shape();
    output_shape[rank - 1] = output_dim;
    MACE_RETURN_IF_ERROR(output->Resize(output_shape));

    Tensor::MappingGuard input_guard(input);
    Tensor::MappingGuard params_guard(params);
    Tensor::MappingGuard output_guard(output);
    const float *input_data = input->data<T>();
    const float *params_data = params->data<T>();
    float *output_data = output->mutable_data<T>();

#pragma omp parallel for schedule(runtime)
    for (index_t r = 0; r < num_rows; ++r) {
      const float *input_row = input_data + r * input_cols;
      // The previous cell state follows the four gate segments.
      const float *prev_row = input_row + 4 * cell_dim;
      const float *scale_data =
          embed_scales ? prev_row + cell_dim : nullptr;
      // Output row layout: [new cell state, cell output].
      float *output_cell = output_data + r * output_dim;
      float *output_row = output_cell + cell_dim;
      LSTMNonlinearKernel(input_row,
                          prev_row,
                          scale_data,
                          params_data,
                          embed_scales,
                          params_stride,
                          cell_dim,
                          output_cell,
                          output_row);
    }

    return MaceStatus::MACE_SUCCESS;
  }

 protected:
  MACE_OP_INPUT_TAGS(INPUT, PARAMS);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
// Registers LSTMNonlinearOp in `op_registry` for the CPU device with float
// data. The op type is "LSTMNonlinear" (no "Op" suffix): that is the type
// name used when an op definition for this operator is built, and it follows
// the naming of the sibling "DynamicLSTM" registration.
void RegisterLSTMNonlinear(OpRegistryBase *op_registry) {
  MACE_REGISTER_OP(op_registry, "LSTMNonlinear", LSTMNonlinearOp,
                   DeviceType::CPU, float);
}
} // namespace ops
} // namespace mace
...@@ -12,10 +12,8 @@ ...@@ -12,10 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <string>
#include <vector>
#include "mace/core/testing/test_benchmark.h" #include "mace/core/testing/test_benchmark.h"
#include "mace/ops/lstmcell_test_util.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
namespace mace { namespace mace {
...@@ -23,22 +21,31 @@ namespace ops { ...@@ -23,22 +21,31 @@ namespace ops {
namespace test { namespace test {
namespace { namespace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
void TimeOffsetBenchmark(int iters, void LSTMNonlinear(int iters,
std::vector<index_t> shape, int batch,
int offset) { int input_dim) {
mace::testing::StopTiming(); mace::testing::StopTiming();
OpsTestNet net; OpsTestNet net;
// Add input data int cell_dim = input_dim / 5;
net.AddRandomInput<D, float>("Input", shape);
OpDefBuilder("TimeOffset", "TimeOffsetBM") // Add input data
.Input("Input") net.AddRandomInput<D, float>("Input", {batch, input_dim});
.Output("Output") net.AddRandomInput<D, float>("Params",
.AddIntArg("offset", offset) {3, cell_dim},
.Finalize(net.NewOperatorDef()); true);
if (D == DeviceType::CPU) {
OpDefBuilder("LSTMNonlinear", "LSTMNonlinearTest")
.Input("Input")
.Input("Params")
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
...@@ -54,24 +61,24 @@ void TimeOffsetBenchmark(int iters, ...@@ -54,24 +61,24 @@ void TimeOffsetBenchmark(int iters,
} }
} // namespace } // namespace
#define MACE_BM_TIMEOFFSET2D_MACRO(H, W, TYPE, DEVICE) \ #define MACE_BM_LSTM_NONLIN_MACRO(N, IN_DIM, TYPE, DEVICE) \
static void MACE_BM_TIMEOFFSET2D_##H##_##W##_##TYPE##_##DEVICE(\ static void \
int iters) { \ MACE_BM_LSTM_NONLIN_##N##_##IN_DIM##_##TYPE##_##DEVICE(\
const int64_t tot = static_cast<int64_t>(iters) * H * W; \ int iters) { \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ const int64_t tot = \
TimeOffsetBenchmark<DEVICE, TYPE>(iters, {H, W}, 1); \ static_cast<int64_t>(iters) * (N * IN_DIM + 3 * (IN_DIM / 5));\
} \ mace::testing::BytesProcessed(tot * (sizeof(TYPE))); \
MACE_BENCHMARK(MACE_BM_TIMEOFFSET2D_##H##_##W##_##TYPE##_##DEVICE) \ LSTMNonlinear<DEVICE, TYPE>(iters, N, IN_DIM); \
} \
#define MACE_BM_TIMEOFFSET2D(H, W) \ MACE_BENCHMARK( \
MACE_BM_TIMEOFFSET2D_MACRO(H, W, float, CPU); MACE_BM_LSTM_NONLIN_##N##_##IN_DIM##_##TYPE##_##DEVICE)
#define MACE_BM_LSTM_NONLIN(N, IN_DIM) \
MACE_BM_LSTM_NONLIN_MACRO(N, IN_DIM, float, CPU);
MACE_BM_TIMEOFFSET2D(20, 128); MACE_BM_LSTM_NONLIN(50, 200);
MACE_BM_TIMEOFFSET2D(40, 512); MACE_BM_LSTM_NONLIN(50, 920);
MACE_BM_TIMEOFFSET2D(1, 1024); MACE_BM_LSTM_NONLIN(80, 640);
MACE_BM_TIMEOFFSET2D(20, 2048);
MACE_BM_TIMEOFFSET2D(20, 512);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -32,12 +32,16 @@ namespace buffer { ...@@ -32,12 +32,16 @@ namespace buffer {
template <typename T> template <typename T>
class SoftmaxKernel : public OpenCLSoftmaxKernel { class SoftmaxKernel : public OpenCLSoftmaxKernel {
public: public:
explicit SoftmaxKernel(bool use_log)
: use_log_(use_log) {}
MaceStatus Compute( MaceStatus Compute(
OpContext *context, OpContext *context,
const Tensor *logits, const Tensor *logits,
Tensor *output) override; Tensor *output) override;
private: private:
bool use_log_;
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
...@@ -88,6 +92,7 @@ MaceStatus SoftmaxKernel<T>::Compute( ...@@ -88,6 +92,7 @@ MaceStatus SoftmaxKernel<T>::Compute(
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype())); built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
if (use_log_) built_options.emplace("-DUSE_LOG");
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name,
built_options, &kernel_)); built_options, &kernel_));
......
...@@ -73,13 +73,25 @@ __kernel void softmax(OUT_OF_RANGE_PARAMS ...@@ -73,13 +73,25 @@ __kernel void softmax(OUT_OF_RANGE_PARAMS
switch(exceeded) { switch(exceeded) {
case 1: case 1:
data.z = native_exp(data.z) / sum; data.z = native_exp(data.z) / sum;
#ifdef USE_LOG
data.z = native_log(data.z);
#endif
case 2: case 2:
data.y = native_exp(data.y) / sum; data.y = native_exp(data.y) / sum;
#ifdef USE_LOG
data.y = native_log(data.y);
#endif
case 3: case 3:
data.x = native_exp(data.x) / sum; data.x = native_exp(data.x) / sum;
#ifdef USE_LOG
data.x = native_log(data.x);
#endif
break; break;
default: default:
data = native_exp(data) / sum; data = native_exp(data) / sum;
#ifdef USE_LOG
data = native_log(data);
#endif
} }
WRITE_IMAGET(output, (int2)(pos, hb_idx), data); WRITE_IMAGET(output, (int2)(pos, hb_idx), data);
......
...@@ -75,14 +75,26 @@ __kernel void softmax(BUFFER_OUT_OF_RANGE_PARAMS ...@@ -75,14 +75,26 @@ __kernel void softmax(BUFFER_OUT_OF_RANGE_PARAMS
switch(remain_chan) { switch(remain_chan) {
case 3: case 3:
output[offset + 2] = native_exp(CONVERT(input[offset + 2]) - max_value) / sum; output[offset + 2] = native_exp(CONVERT(input[offset + 2]) - max_value) / sum;
#ifdef USE_LOG
output[offset + 2] = native_log(output[offset + 2]);
#endif
case 2: case 2:
output[offset + 1] = native_exp(CONVERT(input[offset + 1]) - max_value) / sum; output[offset + 1] = native_exp(CONVERT(input[offset + 1]) - max_value) / sum;
#ifdef USE_LOG
output[offset + 1] = native_log(output[offset + 1]);
#endif
case 1: case 1:
output[offset] = native_exp(CONVERT(input[offset]) - max_value) / sum; output[offset] = native_exp(CONVERT(input[offset]) - max_value) / sum;
#ifdef USE_LOG
output[offset] = native_log(output[offset]);
#endif
} }
} else { } else {
data = CONVERT4(vload4(0, input + offset)); data = CONVERT4(vload4(0, input + offset));
data = native_exp(data - max_value) / sum; data = native_exp(data - max_value) / sum;
#ifdef USE_LOG
// Log-softmax: take the log of the normalized probabilities.
data = native_log(data);
#endif
VSTORE4(CONVERT_TO(data, OUT_DATA_TYPE4), output, offset); VSTORE4(CONVERT_TO(data, OUT_DATA_TYPE4), output, offset);
} }
} }
...@@ -59,12 +59,16 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, ...@@ -59,12 +59,16 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
template <typename T> template <typename T>
class SoftmaxKernel : public OpenCLSoftmaxKernel { class SoftmaxKernel : public OpenCLSoftmaxKernel {
public: public:
explicit SoftmaxKernel(bool use_log)
: use_log_(use_log) {}
MaceStatus Compute( MaceStatus Compute(
OpContext *context, OpContext *context,
const Tensor *logits, const Tensor *logits,
Tensor *output) override; Tensor *output) override;
private: private:
bool use_log_;
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
...@@ -114,6 +118,8 @@ MaceStatus SoftmaxKernel<T>::Compute( ...@@ -114,6 +118,8 @@ MaceStatus SoftmaxKernel<T>::Compute(
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (use_log_)
built_options.emplace("-DUSE_LOG");
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name,
built_options, &kernel_)); built_options, &kernel_));
......
...@@ -34,6 +34,7 @@ extern void RegisterDeconv2D(OpRegistryBase *op_registry); ...@@ -34,6 +34,7 @@ extern void RegisterDeconv2D(OpRegistryBase *op_registry);
extern void RegisterDepthToSpace(OpRegistryBase *op_registry); extern void RegisterDepthToSpace(OpRegistryBase *op_registry);
extern void RegisterDepthwiseConv2d(OpRegistryBase *op_registry); extern void RegisterDepthwiseConv2d(OpRegistryBase *op_registry);
extern void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry); extern void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry);
extern void RegisterDynamicLSTM(OpRegistryBase *op_registry);
extern void RegisterEltwise(OpRegistryBase *op_registry); extern void RegisterEltwise(OpRegistryBase *op_registry);
extern void RegisterExpandDims(OpRegistryBase *op_registry); extern void RegisterExpandDims(OpRegistryBase *op_registry);
extern void RegisterFill(OpRegistryBase *op_registry); extern void RegisterFill(OpRegistryBase *op_registry);
...@@ -42,9 +43,11 @@ extern void RegisterGather(OpRegistryBase *op_registry); ...@@ -42,9 +43,11 @@ extern void RegisterGather(OpRegistryBase *op_registry);
extern void RegisterIdentity(OpRegistryBase *op_registry); extern void RegisterIdentity(OpRegistryBase *op_registry);
extern void RegisterInferConv2dShape(OpRegistryBase *op_registry); extern void RegisterInferConv2dShape(OpRegistryBase *op_registry);
extern void RegisterLocalResponseNorm(OpRegistryBase *op_registry); extern void RegisterLocalResponseNorm(OpRegistryBase *op_registry);
extern void RegisterLSTMNonlinear(OpRegistryBase *op_registry);
extern void RegisterMatMul(OpRegistryBase *op_registry); extern void RegisterMatMul(OpRegistryBase *op_registry);
extern void RegisterOneHot(OpRegistryBase *op_registry); extern void RegisterOneHot(OpRegistryBase *op_registry);
extern void RegisterPad(OpRegistryBase *op_registry); extern void RegisterPad(OpRegistryBase *op_registry);
extern void RegisterPadContext(OpRegistryBase *op_registry);
extern void RegisterPNorm(OpRegistryBase *op_registry); extern void RegisterPNorm(OpRegistryBase *op_registry);
extern void RegisterPooling(OpRegistryBase *op_registry); extern void RegisterPooling(OpRegistryBase *op_registry);
extern void RegisterReduce(OpRegistryBase *op_registry); extern void RegisterReduce(OpRegistryBase *op_registry);
...@@ -68,7 +71,6 @@ extern void RegisterStack(OpRegistryBase *op_registry); ...@@ -68,7 +71,6 @@ extern void RegisterStack(OpRegistryBase *op_registry);
extern void RegisterStridedSlice(OpRegistryBase *op_registry); extern void RegisterStridedSlice(OpRegistryBase *op_registry);
extern void RegisterSumGroup(OpRegistryBase *op_registry); extern void RegisterSumGroup(OpRegistryBase *op_registry);
extern void RegisterTargetRMSNorm(OpRegistryBase *op_registry); extern void RegisterTargetRMSNorm(OpRegistryBase *op_registry);
extern void RegisterTimeOffset(OpRegistryBase *op_registry);
extern void RegisterTranspose(OpRegistryBase *op_registry); extern void RegisterTranspose(OpRegistryBase *op_registry);
extern void RegisterUnstack(OpRegistryBase *op_registry); extern void RegisterUnstack(OpRegistryBase *op_registry);
...@@ -102,6 +104,7 @@ OpRegistry::OpRegistry() : OpRegistryBase() { ...@@ -102,6 +104,7 @@ OpRegistry::OpRegistry() : OpRegistryBase() {
ops::RegisterDepthToSpace(this); ops::RegisterDepthToSpace(this);
ops::RegisterDepthwiseConv2d(this); ops::RegisterDepthwiseConv2d(this);
ops::RegisterDepthwiseDeconv2d(this); ops::RegisterDepthwiseDeconv2d(this);
ops::RegisterDynamicLSTM(this);
ops::RegisterEltwise(this); ops::RegisterEltwise(this);
ops::RegisterExpandDims(this); ops::RegisterExpandDims(this);
ops::RegisterFill(this); ops::RegisterFill(this);
...@@ -110,9 +113,11 @@ OpRegistry::OpRegistry() : OpRegistryBase() { ...@@ -110,9 +113,11 @@ OpRegistry::OpRegistry() : OpRegistryBase() {
ops::RegisterIdentity(this); ops::RegisterIdentity(this);
ops::RegisterInferConv2dShape(this); ops::RegisterInferConv2dShape(this);
ops::RegisterLocalResponseNorm(this); ops::RegisterLocalResponseNorm(this);
ops::RegisterLSTMNonlinear(this);
ops::RegisterMatMul(this); ops::RegisterMatMul(this);
ops::RegisterOneHot(this); ops::RegisterOneHot(this);
ops::RegisterPad(this); ops::RegisterPad(this);
ops::RegisterPadContext(this);
ops::RegisterPNorm(this); ops::RegisterPNorm(this);
ops::RegisterPooling(this); ops::RegisterPooling(this);
ops::RegisterReduce(this); ops::RegisterReduce(this);
...@@ -136,7 +141,6 @@ OpRegistry::OpRegistry() : OpRegistryBase() { ...@@ -136,7 +141,6 @@ OpRegistry::OpRegistry() : OpRegistryBase() {
ops::RegisterSqueeze(this); ops::RegisterSqueeze(this);
ops::RegisterSumGroup(this); ops::RegisterSumGroup(this);
ops::RegisterTargetRMSNorm(this); ops::RegisterTargetRMSNorm(this);
ops::RegisterTimeOffset(this);
ops::RegisterTranspose(this); ops::RegisterTranspose(this);
ops::RegisterUnstack(this); ops::RegisterUnstack(this);
......
...@@ -25,14 +25,15 @@ namespace mace { ...@@ -25,14 +25,15 @@ namespace mace {
namespace ops { namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class TimeOffsetOp; class PadContextOp;
template <typename T> template <typename T>
class TimeOffsetOp<DeviceType::CPU, T> : public Operation { class PadContextOp<DeviceType::CPU, T> : public Operation {
public: public:
explicit TimeOffsetOp(OpConstructContext *context) explicit PadContextOp(OpConstructContext *context)
: Operation(context), : Operation(context),
offset_(Operation::GetOptionalArg<int>("offset", 0)) {} left_context_(Operation::GetOptionalArg<int>("left_context", 0)),
right_context_(Operation::GetOptionalArg<int>("right_context", 0)) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context); MACE_UNUSED(context);
...@@ -41,27 +42,38 @@ class TimeOffsetOp<DeviceType::CPU, T> : public Operation { ...@@ -41,27 +42,38 @@ class TimeOffsetOp<DeviceType::CPU, T> : public Operation {
index_t rank = input->dim_size(); index_t rank = input->dim_size();
MACE_CHECK(rank >= 2, "input's rank should >= 2."); MACE_CHECK(rank >= 2, "input's rank should >= 2.");
MACE_CHECK(left_context_ > 0 && right_context_ > 0,
"left context and right context should be greater than zero");
const std::vector<index_t> &input_shape = input->shape(); const std::vector<index_t> &input_shape = input->shape();
const index_t batch = const index_t batch =
std::accumulate(input_shape.begin(), input_shape.end() - 2, 1, std::accumulate(input_shape.begin(), input_shape.end() - 2, 1,
std::multiplies<index_t>()); std::multiplies<index_t>());
const index_t frames = input_shape[rank - 2]; const index_t chunk = input_shape[rank - 2];
const index_t input_dim = input_shape[rank - 1]; const index_t dim = input_shape[rank - 1];
MACE_RETURN_IF_ERROR(output->ResizeLike(input)); const index_t output_chunk = chunk + left_context_ + right_context_;
std::vector<index_t> output_shape = input->shape();
output_shape[rank - 2] = output_chunk;
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
Tensor::MappingGuard input_guard(input); Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output); Tensor::MappingGuard output_guard(output);
const T *input_data = input->data<T>(); const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>(); T *output_data = output->mutable_data<T>();
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t i = 0; i < batch; ++i) { for (index_t i = 0; i < batch; ++i) {
for (index_t j = 0; j < frames; ++j) { T *out_base = output_data + i * output_chunk * dim;
index_t time_index = offset_ + j; const T *in_base = input_data + i * chunk * dim;
index_t index = Clamp<index_t>(time_index, 0, frames - 1); #pragma omp parallel for schedule(runtime)
T *output_base = output_data + (i * frames + j) * input_dim; for (index_t j = 0; j < left_context_; ++j) {
const T *input_base = input_data + (i * frames + index) * input_dim; memcpy(out_base + j * dim, in_base, dim * sizeof(T));
memcpy(output_base, input_base, input_dim * sizeof(T)); }
out_base = out_base + left_context_ * dim;
memcpy(out_base, in_base, chunk * dim * sizeof(T));
out_base = out_base + chunk * dim;
in_base = in_base + (chunk -1) * dim;
#pragma omp parallel for schedule(runtime)
for (index_t j = 0; j < right_context_; ++j) {
memcpy(out_base + j * dim, in_base, dim * sizeof(T));
} }
} }
...@@ -69,11 +81,12 @@ class TimeOffsetOp<DeviceType::CPU, T> : public Operation { ...@@ -69,11 +81,12 @@ class TimeOffsetOp<DeviceType::CPU, T> : public Operation {
} }
private: private:
int offset_; int left_context_;
int right_context_;
}; };
void RegisterTimeOffset(OpRegistryBase *op_registry) { void RegisterPadContext(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "TimeOffset", TimeOffsetOp, MACE_REGISTER_OP(op_registry, "PadContext", PadContextOp,
DeviceType::CPU, float); DeviceType::CPU, float);
} }
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
namespace {
template<DeviceType D, typename T>
void PadContextBM(int iters,
const std::vector<index_t> &input_shape,
const int left_context,
const int right_context) {
mace::testing::StopTiming();
// Construct graph
OpsTestNet net;
net.AddRandomInput<D, float>("Input", input_shape);
OpDefBuilder("PadContext", "PadContextBM")
.Input("Input")
.Output("Output")
.AddIntArg("left_context", left_context)
.AddIntArg("right_context", right_context)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
net.Sync();
}
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
net.Sync();
}
}
} // namespace
// Defines and registers one PadContext benchmark.
// The generated static function is named after all macro parameters, reports
// iters * N * H * W * sizeof(TYPE) processed bytes (input elements only; the
// padded output rows are not counted), and then runs
// PadContextBM<DEVICE, TYPE> on an {N, H, W} input with L / R passed as the
// left / right context. MACE_BENCHMARK registers that function.
#define MACE_BM_PAD_CONTEXT_MACRO(N, H, W, L, R, TYPE, DEVICE) \
static void \
MACE_BM_PAD_CONTEXT_##N##_##H##_##W##_##L##_##R##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W; \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
PadContextBM<DEVICE, TYPE>(iters, {N, H, W}, L, R); \
} \
MACE_BENCHMARK( \
MACE_BM_PAD_CONTEXT_##N##_##H##_##W##_##L##_##R##_##TYPE##_##DEVICE)
// Convenience wrapper: instantiates the benchmark for float on CPU only.
#define MACE_BM_PAD_CONTEXT(N, H, W, L, R) \
MACE_BM_PAD_CONTEXT_MACRO(N, H, W, L, R, float, CPU);
// Benchmarked configurations: (N, H, W, left_context, right_context).
MACE_BM_PAD_CONTEXT(1, 32, 32, 5, 5);
MACE_BM_PAD_CONTEXT(2, 32, 32, 7, 7);
MACE_BM_PAD_CONTEXT(1, 32, 32, 3, 3);
MACE_BM_PAD_CONTEXT(1, 128, 128, 9, 9);
MACE_BM_PAD_CONTEXT(3, 128, 128, 7, 7);
} // namespace test
} // namespace ops
} // namespace mace
...@@ -18,106 +18,68 @@ namespace mace { ...@@ -18,106 +18,68 @@ namespace mace {
namespace ops { namespace ops {
namespace test { namespace test {
class TimeOffsetOpTest : public OpsTestBase {}; class PadContextOpTest : public OpsTestBase {};
namespace { namespace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
void TestTimeOffset(const std::vector<index_t> &input_shape, void TestPadContext(const std::vector<index_t> &input_shape,
const std::vector<T> &input, const std::vector<T> &input,
const int offset, const int left_context,
const std::vector<T> &output) { const int right_context,
const std::vector<index_t> &output_shape,
const std::vector<T> &output) {
OpsTestNet net; OpsTestNet net;
net.AddInputFromArray<CPU, T>(MakeString("Input"), net.AddInputFromArray<CPU, T>(MakeString("Input"),
input_shape, input_shape,
input); input);
OpDefBuilder("TimeOffset", "TimeOffsetTest") OpDefBuilder("PadContext", "PadContextTest")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddIntArg("offset", offset) .AddIntArg("left_context", left_context)
.AddIntArg("right_context", right_context)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(); net.RunOp();
net.AddInputFromArray<CPU, T>("ExpectedOutput", input_shape, output); auto expected = net.CreateTensor<T>(output_shape, output);
ExpectTensorNear<T>(*net.GetOutput("ExpectedOutput"), ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 1e-5);
*net.GetOutput("Output"));
} }
} // namespace } // namespace
TEST_F(TimeOffsetOpTest, Simple2Dim) { TEST_F(PadContextOpTest, Simple2Dim) {
TestTimeOffset<DeviceType::CPU, float>( TestPadContext<DeviceType::CPU, float>(
{3, 5}, {3, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
-2, 2, 3, {8, 5},
{1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5}); {1, 2, 3, 4, 5,
1, 2, 3, 4, 5,
TestTimeOffset<DeviceType::CPU, float>( 1, 2, 3, 4, 5,
{3, 5}, 6, 7, 8, 9, 10,
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, 11, 12, 13, 14, 15,
-1, 11, 12, 13, 14, 15,
{1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); 11, 12, 13, 14, 15,
11, 12, 13, 14, 15});
TestTimeOffset<DeviceType::CPU, float>(
{3, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
0,
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
TestTimeOffset<DeviceType::CPU, float>(
{3, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
1,
{6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15});
TestTimeOffset<DeviceType::CPU, float>(
{3, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
2,
{11, 12, 13, 14, 15, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15});
} }
TEST_F(PadContextOpTest, Simple3Dim) {
TEST_F(TimeOffsetOpTest, Simple3Dim) { TestPadContext<DeviceType::CPU, float>(
TestTimeOffset<DeviceType::CPU, float>(
{2, 3, 5}, {2, 3, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
-2, 1, 2, {2, 6, 5},
{1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, {1, 2, 3, 4, 5,
1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5}); 1, 2, 3, 4, 5,
6, 7, 8, 9, 10,
TestTimeOffset<DeviceType::CPU, float>( 11, 12, 13, 14, 15,
{2, 3, 5}, 11, 12, 13, 14, 15,
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, 1, 2, 3, 4, 5,
-1, 1, 2, 3, 4, 5,
{1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 6, 7, 8, 9, 10,
1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); 11, 12, 13, 14, 15,
11, 12, 13, 14, 15,
TestTimeOffset<DeviceType::CPU, float>( 11, 12, 13, 14, 15});
{2, 3, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
0,
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
TestTimeOffset<DeviceType::CPU, float>(
{2, 3, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
1,
{6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15});
TestTimeOffset<DeviceType::CPU, float>(
{2, 3, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
2,
{11, 12, 13, 14, 15, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15,
11, 12, 13, 14, 15, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15});
} }
} // namespace test } // namespace test
......
...@@ -48,8 +48,6 @@ class PNormOp<DeviceType::CPU, T> : public Operation { ...@@ -48,8 +48,6 @@ class PNormOp<DeviceType::CPU, T> : public Operation {
const Tensor *input = this->Input(0); const Tensor *input = this->Input(0);
Tensor *output = this->Output(0); Tensor *output = this->Output(0);
const std::vector<index_t> &input_shape = input->shape(); const std::vector<index_t> &input_shape = input->shape();
const index_t dim_size = input_shape.size(); const index_t dim_size = input_shape.size();
MACE_CHECK(dim_size >= 1, "PNorm only supports input dim size >= 1"); MACE_CHECK(dim_size >= 1, "PNorm only supports input dim size >= 1");
......
...@@ -57,7 +57,7 @@ void PNormBenchmark(int iters, int n, int h, int w, int p, int ow) { ...@@ -57,7 +57,7 @@ void PNormBenchmark(int iters, int n, int h, int w, int p, int ow) {
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * H * W; \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
PNormBenchmark<DEVICE, TYPE>(iters, N, H, W, P, OW); \ PNormBenchmark<DEVICE, TYPE>(iters, N, H, W, P, OW); \
} \ } \
MACE_BENCHMARK( \ MACE_BENCHMARK( \
MACE_BM_PNORM_##N##_##H##_##W##_##P##_##OW##_##TYPE##_##DEVICE) MACE_BM_PNORM_##N##_##H##_##W##_##P##_##OW##_##TYPE##_##DEVICE)
......
...@@ -40,19 +40,18 @@ class SliceOp<DeviceType::CPU, T> : public Operation { ...@@ -40,19 +40,18 @@ class SliceOp<DeviceType::CPU, T> : public Operation {
const index_t rank = input->dim_size(); const index_t rank = input->dim_size();
MACE_CHECK(rank >= 1) MACE_CHECK(rank >= 1)
<< "The input dim size should >= 1"; << "The input dim size should >= 1";
const index_t input_dim = input->dim(rank - 1);
MACE_CHECK(starts_.size() == 1 && ends_.size() == 1 && axes_.size() == 1, MACE_CHECK(starts_.size() == 1 && ends_.size() == 1 && axes_.size() == 1,
"only support slicing at one axis."); "only support slicing at one axis.");
MACE_CHECK(axes_[0] == -1 || axes_[0] == rank - 1, MACE_CHECK(axes_[0] == -1 || axes_[0] == rank - 1,
"only support slicing at the last axis."); "only support slicing at the last axis.");
const index_t input_dim = input->dim(rank - 1); MACE_CHECK(starts_[0] < input_dim && starts_[0] >= 0
&& ends_[0] >= 0
&& ends_[0] <= input_dim)
<< "The starts and ends caused over range error.";
const index_t offset = starts_[0]; const index_t offset = starts_[0];
const index_t output_dim = ends_[0] - starts_[0]; const index_t output_dim = ends_[0] - starts_[0];
MACE_CHECK(output_dim >= 0, "output_dim should >= 0"); MACE_CHECK(output_dim >= 0, "output_dim should >= 0");
MACE_CHECK(starts_[0] < input_dim
&& output_dim <= input_dim
&& ends_[0] <= input_dim)
<< "The starts and ends caused over range error.";
const index_t frames = const index_t frames =
std::accumulate(input->shape().begin(), input->shape().end() - 1, 1, std::accumulate(input->shape().begin(), input->shape().end() - 1, 1,
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
namespace {
template<DeviceType D, typename T>
void BMSliceHelper(int iters,
const std::vector<index_t> &input_shape,
const int offset,
const int output_dim) {
mace::testing::StopTiming();
// Construct graph
OpsTestNet net;
net.AddRandomInput<D, float>("Input", input_shape);
OpDefBuilder("Slice", "SliceBM")
.Input("Input")
.Output("Output")
.AddIntArg("offset", offset)
.AddIntArg("output_dim", output_dim)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
net.Sync();
}
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
net.Sync();
}
}
} // namespace
// Defines and registers one Slice benchmark.
// The generated static function is named after all macro parameters, reports
// iters * N * H * W * sizeof(TYPE) processed bytes (based on the input
// element count), and then runs BMSliceHelper<DEVICE, TYPE> on an {N, H, W}
// input with S / D passed as the helper's offset / output_dim.
// MACE_BENCHMARK registers that function.
#define MACE_BM_SLICE_MACRO(N, H, W, S, D, TYPE, DEVICE) \
static void \
MACE_BM_SLICE_##N##_##H##_##W##_##S##_##D##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W; \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMSliceHelper<DEVICE, TYPE>(iters, {N, H, W}, S, D); \
} \
MACE_BENCHMARK( \
MACE_BM_SLICE_##N##_##H##_##W##_##S##_##D##_##TYPE##_##DEVICE)
// Convenience wrapper: instantiates the benchmark for float on CPU only.
#define MACE_BM_SLICE(N, H, W, S, D) \
MACE_BM_SLICE_MACRO(N, H, W, S, D, float, CPU);
// Benchmarked configurations: (N, H, W, offset, output_dim).
MACE_BM_SLICE(1, 32, 32, 5, 5);
MACE_BM_SLICE(1, 32, 32, 7, 5);
MACE_BM_SLICE(1, 32, 32, 3, 20);
MACE_BM_SLICE(1, 128, 128, 9, 100);
MACE_BM_SLICE(1, 128, 128, 7, 100);
} // namespace test
} // namespace ops
} // namespace mace
...@@ -42,7 +42,8 @@ template <> ...@@ -42,7 +42,8 @@ template <>
class SoftmaxOp<DeviceType::CPU, float> : public Operation { class SoftmaxOp<DeviceType::CPU, float> : public Operation {
public: public:
explicit SoftmaxOp(OpConstructContext *context) explicit SoftmaxOp(OpConstructContext *context)
: Operation(context) {} : Operation(context),
use_log_(Operation::GetOptionalArg<bool>("use_log", false)) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context); MACE_UNUSED(context);
...@@ -88,9 +89,18 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation { ...@@ -88,9 +89,18 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
sum = std::max(sum, std::numeric_limits<float>::min()); sum = std::max(sum, std::numeric_limits<float>::min());
channel_offset = 0; channel_offset = 0;
for (index_t c = 0; c < class_count; ++c) { if (use_log_) {
output_ptr[channel_offset] /= sum; for (index_t c = 0; c < class_count; ++c) {
channel_offset += class_size; output_ptr[channel_offset] /= sum;
output_ptr[channel_offset] =
std::log(output_ptr[channel_offset]);
channel_offset += class_size;
}
} else {
for (index_t c = 0; c < class_count; ++c) {
output_ptr[channel_offset] /= sum;
channel_offset += class_size;
}
} }
} // k } // k
} // b } // b
...@@ -123,8 +133,15 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation { ...@@ -123,8 +133,15 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
} }
sum = std::max(sum, std::numeric_limits<float>::min()); sum = std::max(sum, std::numeric_limits<float>::min());
for (index_t c = 0; c < class_count; ++c) { if (use_log_) {
output_ptr[c] /= sum; for (index_t c = 0; c < class_count; ++c) {
output_ptr[c] /= sum;
output_ptr[c] = std::log(output_ptr[c]);
}
} else {
for (index_t c = 0; c < class_count; ++c) {
output_ptr[c] /= sum;
}
} }
} }
} else { } else {
...@@ -132,6 +149,9 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation { ...@@ -132,6 +149,9 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
} }
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
protected:
bool use_log_;
}; };
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
...@@ -142,10 +162,12 @@ template <> ...@@ -142,10 +162,12 @@ template <>
class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation { class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
public: public:
explicit SoftmaxOp(OpConstructContext *context) explicit SoftmaxOp(OpConstructContext *context)
: Operation(context) {} : Operation(context),
use_log_(Operation::GetOptionalArg<bool>("use_log", false)) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context); MACE_UNUSED(context);
// The quantized path only implements plain softmax; reject use_log early.
MACE_CHECK(!use_log_, "MACE does not support quantized logsoftmax yet.");
const Tensor *input = this->Input(0); const Tensor *input = this->Input(0);
Tensor *output = this->Output(0); Tensor *output = this->Output(0);
MACE_RETURN_IF_ERROR(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
...@@ -366,6 +388,9 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation { ...@@ -366,6 +388,9 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
} }
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
protected:
bool use_log_;
}; };
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
...@@ -375,11 +400,13 @@ class SoftmaxOp<DeviceType::GPU, T> : public Operation { ...@@ -375,11 +400,13 @@ class SoftmaxOp<DeviceType::GPU, T> : public Operation {
public: public:
explicit SoftmaxOp(OpConstructContext *context) explicit SoftmaxOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
bool use_log = (
Operation::GetOptionalArg<bool>("use_log", false));
if (context->device()->gpu_runtime()->UseImageMemory()) { if (context->device()->gpu_runtime()->UseImageMemory()) {
kernel_ = make_unique<opencl::image::SoftmaxKernel<T>>(); kernel_ = make_unique<opencl::image::SoftmaxKernel<T>>(use_log);
} else { } else {
context->set_output_mem_type(MemoryType::GPU_BUFFER); context->set_output_mem_type(MemoryType::GPU_BUFFER);
kernel_ = make_unique<opencl::buffer::SoftmaxKernel<T>>(); kernel_ = make_unique<opencl::buffer::SoftmaxKernel<T>>(use_log);
} }
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
......
...@@ -12,6 +12,13 @@ ...@@ -12,6 +12,13 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// python implementation
// import numpy as np
// x = np.asarray([1., 1., 1., 1.], 'f')
// exp_x = np.exp(x)
// softmax_x = exp_x / np.sum(exp_x)
// log_softmax_x = np.log(softmax_x)
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
namespace mace { namespace mace {
...@@ -19,18 +26,27 @@ namespace ops { ...@@ -19,18 +26,27 @@ namespace ops {
namespace test { namespace test {
class SoftmaxOpTest : public OpsTestBase {}; class SoftmaxOpTest : public OpsTestBase {};
class LogSoftmaxOpTest : public OpsTestBase {};
namespace { namespace {
template <DeviceType D> template <DeviceType D>
void Simple() { void Simple(bool use_log = false) {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4}, net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4},
{1, 1, 1, 1, 1, 2, 3, 4}); {1, 1, 1, 1, 1, 2, 3, 4});
std::vector<float_t> expected_data(8);
if (use_log) {
expected_data = {-1.3862944, -1.3862944, -1.3862944, -1.3862944,
-3.4401896 , -2.4401896 , -1.4401897 , -0.44018975};
} else {
expected_data = {0.25, 0.25, 0.25, 0.25,
0.0320586, 0.08714432, 0.23688282, 0.6439142};
}
auto expected = net.CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 1, 2, 4}, {1, 1, 2, 4}, expected_data);
{0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
// test 4d softmax // test 4d softmax
...@@ -38,6 +54,7 @@ void Simple() { ...@@ -38,6 +54,7 @@ void Simple() {
OpDefBuilder("Softmax", "SoftmaxTest") OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntArg("use_log", static_cast<int>(use_log))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
...@@ -52,6 +69,7 @@ void Simple() { ...@@ -52,6 +69,7 @@ void Simple() {
OpDefBuilder("Softmax", "SoftmaxTest") OpDefBuilder("Softmax", "SoftmaxTest")
.Input("Input2d") .Input("Input2d")
.Output("Output") .Output("Output")
.AddIntArg("use_log", static_cast<int>(use_log))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
...@@ -62,6 +80,7 @@ void Simple() { ...@@ -62,6 +80,7 @@ void Simple() {
OpDefBuilder("Softmax", "SoftmaxTest") OpDefBuilder("Softmax", "SoftmaxTest")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddIntArg("use_log", static_cast<int>(use_log))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
...@@ -77,9 +96,13 @@ void Simple() { ...@@ -77,9 +96,13 @@ void Simple() {
TEST_F(SoftmaxOpTest, CPUSimple) { Simple<DeviceType::CPU>(); } TEST_F(SoftmaxOpTest, CPUSimple) { Simple<DeviceType::CPU>(); }
TEST_F(SoftmaxOpTest, OPENCLSimple) { Simple<DeviceType::GPU>(); } TEST_F(SoftmaxOpTest, OPENCLSimple) { Simple<DeviceType::GPU>(); }
TEST_F(LogSoftmaxOpTest, CPUSimple) { Simple<DeviceType::CPU>(true); }
TEST_F(LogSoftmaxOpTest, OPENCLSimple) { Simple<DeviceType::GPU>(true); }
namespace { namespace {
template <DeviceType D> template <DeviceType D>
void Complex(const std::vector<index_t> &logits_shape) { void Complex(const std::vector<index_t> &logits_shape,
bool use_log = false) {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
...@@ -91,11 +114,13 @@ void Complex(const std::vector<index_t> &logits_shape) { ...@@ -91,11 +114,13 @@ void Complex(const std::vector<index_t> &logits_shape) {
OpDefBuilder("Softmax", "SoftmaxTest") OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntArg("use_log", static_cast<int>(use_log))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else { } else {
OpDefBuilder("Softmax", "SoftmaxTest") OpDefBuilder("Softmax", "SoftmaxTest")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddIntArg("use_log", static_cast<int>(use_log))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} }
// Run on cpu // Run on cpu
...@@ -111,6 +136,7 @@ void Complex(const std::vector<index_t> &logits_shape) { ...@@ -111,6 +136,7 @@ void Complex(const std::vector<index_t> &logits_shape) {
OpDefBuilder("Softmax", "SoftmaxTest") OpDefBuilder("Softmax", "SoftmaxTest")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddIntArg("use_log", static_cast<int>(use_log))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on gpu // Run on gpu
...@@ -140,6 +166,26 @@ TEST_F(SoftmaxOpTest, OPENCLAlignedRank2) { ...@@ -140,6 +166,26 @@ TEST_F(SoftmaxOpTest, OPENCLAlignedRank2) {
Complex<DeviceType::GPU>({3, 1001}); Complex<DeviceType::GPU>({3, 1001});
} }
TEST_F(LogSoftmaxOpTest, OPENCLAligned) {
Complex<DeviceType::GPU>({1, 256, 256, 3}, true);
Complex<DeviceType::GPU>({1, 128, 128, 16}, true);
}
TEST_F(LogSoftmaxOpTest, OPENCLMulBatchAligned) {
Complex<DeviceType::GPU>({5, 64, 64, 3}, true);
Complex<DeviceType::GPU>({8, 128, 128, 8}, true);
}
TEST_F(LogSoftmaxOpTest, OPENCLUnAligned) {
Complex<DeviceType::GPU>({1, 113, 107, 13}, true);
Complex<DeviceType::GPU>({5, 211, 107, 1}, true);
}
TEST_F(LogSoftmaxOpTest, OPENCLAlignedRank2) {
Complex<DeviceType::GPU>({1, 1001}, true);
Complex<DeviceType::GPU>({3, 1001}, true);
}
namespace { namespace {
void TestQuantizedSoftmax(const std::vector<index_t> &input_shape) { void TestQuantizedSoftmax(const std::vector<index_t> &input_shape) {
......
...@@ -14,16 +14,11 @@ ...@@ -14,16 +14,11 @@
// This Op is for SpliceComponent in Kaldi. // This Op is for SpliceComponent in Kaldi.
// It splices a context window of frames together [over time] // It splices a context window of frames together [over time]
// (copy and append the frame whose time-index in in context_) // (copy and append the frame whose time-index is in context_)
// The context_ values indicate which frame (over time) to splice. // The context_ values indicate which frame (over time) to splice.
// if context value is less than the first time-index, // It will reduce frames because of left context and right context.
// copy and append the first frame's data, // e.g., given input data with shape {20, 40}, and contexts:{-2, -1, 0, 1, 2},
// when context value is larger than frame's count, // the output shape should be {16, 200}
// copy and append the last frame's data.
// i.e., give input data: [[1, 2, 3], [4, 5, 6]],
// with input-dim = 3, frame count = 2, context = [-1, 0, 1]
// Then, the output should be:
// [1, 2, 3, 1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6, 4, 5, 6]
// if const_component_dim_ != 0, const_dim_ will be used to determine which // if const_component_dim_ != 0, const_dim_ will be used to determine which
// row of "in" we copy the last part of each row of "out" from (this part is // row of "in" we copy the last part of each row of "out" from (this part is
// not subject to splicing, it's assumed constant for each frame of "input". // not subject to splicing, it's assumed constant for each frame of "input".
...@@ -54,24 +49,34 @@ class SpliceOp<DeviceType::CPU, T> : public Operation { ...@@ -54,24 +49,34 @@ class SpliceOp<DeviceType::CPU, T> : public Operation {
const Tensor *input = this->Input(0); const Tensor *input = this->Input(0);
MACE_CHECK(context_.size() > 0) MACE_CHECK(context_.size() > 0)
<< "The context param should not be empty in Splice Op."; << "The context param should not be empty in Splice Op.";
MACE_CHECK(input->dim_size() >= 2)
<< "Splice's input's rank should be greater than 2.";
Tensor *output = this->Output(0); Tensor *output = this->Output(0);
const std::vector<index_t> &input_shape = input->shape(); const std::vector<index_t> &input_shape = input->shape();
const index_t frames = const index_t batch =
std::accumulate(input->shape().begin(), input->shape().end() - 1, 1, std::accumulate(input->shape().begin(), input->shape().end() - 2, 1,
std::multiplies<index_t>()); std::multiplies<index_t>());
const index_t rank = input->dim_size(); const index_t rank = input->dim_size();
const index_t chunk = input_shape[rank - 2];
const index_t input_dim = input_shape[rank - 1]; const index_t input_dim = input_shape[rank - 1];
const index_t input_stride = chunk * input_dim;
const index_t num_splice = static_cast<index_t>(context_.size()); const index_t num_splice = static_cast<index_t>(context_.size());
const index_t dim = input_dim - const_dim_; const index_t dim = input_dim - const_dim_;
const index_t left_context = context_[0];
const index_t right_context = context_[num_splice -1];
const index_t out_chunk = chunk - (right_context - left_context);
MACE_CHECK(input_dim > const_dim_, MACE_CHECK(input_dim > const_dim_,
"input dim should be greater than const dim."); "input dim should be greater than const dim.");
const index_t output_dim = dim * num_splice + const_dim_; const index_t output_dim = dim * num_splice + const_dim_;
const index_t output_stride = out_chunk * output_dim;
std::vector<index_t> output_shape = input->shape(); std::vector<index_t> output_shape = input->shape();
output_shape[rank - 2] = out_chunk;
output_shape[rank - 1] = output_dim; output_shape[rank - 1] = output_dim;
MACE_RETURN_IF_ERROR(output->Resize(output_shape)); MACE_RETURN_IF_ERROR(output->Resize(output_shape));
...@@ -80,28 +85,32 @@ class SpliceOp<DeviceType::CPU, T> : public Operation { ...@@ -80,28 +85,32 @@ class SpliceOp<DeviceType::CPU, T> : public Operation {
const T *input_data = input->data<T>(); const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>(); T *output_data = output->mutable_data<T>();
#pragma omp parallel for collapse(2) schedule(runtime) #pragma omp parallel for collapse(3) schedule(runtime)
for (index_t i = 0; i < frames; ++i) { for (int b = 0; b < batch; ++b) {
for (index_t i = 0; i < out_chunk; ++i) {
for (index_t c = 0; c < num_splice; ++c) { for (index_t c = 0; c < num_splice; ++c) {
const index_t offset = const index_t offset = i + context_[c] - left_context;
Clamp<index_t>(context_[c] + i, 0, frames - 1); T *output_base =
T *output_base = output_data + i * output_dim + c * dim; output_data + b * output_stride + i * output_dim + c * dim;
const T *input_base = input_data + offset * input_dim; const T *input_base =
input_data + b * input_stride + offset * input_dim;
memcpy(output_base, input_base, dim * sizeof(T)); memcpy(output_base, input_base, dim * sizeof(T));
} }
} }
}
if (const_dim_ > 0) { if (const_dim_ > 0) {
const index_t output_offset = output_dim - const_dim_; const index_t output_offset = output_dim - const_dim_;
const index_t input_offset = dim; const index_t input_offset = dim;
#pragma omp parallel for schedule(runtime) #pragma omp parallel for collapse(2) schedule(runtime)
for (index_t i = 0; i < frames; ++i) { for (int b = 0; b < batch; ++b) {
index_t offset = i + context_[0] >= 0 ? i + context_[0] : 0; for (index_t i = 0; i < out_chunk; ++i) {
T *output_base = output_data + i * output_dim; T *output_base = output_data + + b * output_stride + i * output_dim;
const T *input_base = input_data + offset * input_dim; const T *input_base = input_data + b * input_stride + i * input_dim;
memcpy(output_base + output_offset, memcpy(output_base + output_offset,
input_base + input_offset, input_base + input_offset,
const_dim_ * sizeof(T)); const_dim_ * sizeof(T));
}
} }
} }
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
......
...@@ -23,15 +23,15 @@ namespace { ...@@ -23,15 +23,15 @@ namespace {
template<DeviceType D, typename T> template<DeviceType D, typename T>
void BMSpliceHelper(int iters, void BMSpliceHelper(int iters,
const std::vector<index_t> &input_shape, const std::vector<index_t> &input_shape,
const index_t left_context, const int left_context,
const index_t right_context, const int right_context,
const int const_component_dim) { const int const_component_dim) {
mace::testing::StopTiming(); mace::testing::StopTiming();
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
const int num_splice = left_context + right_context + 1; const index_t num_splice = left_context + right_context + 1;
std::vector<int> contexts(num_splice); std::vector<int> contexts(num_splice);
for (int i = 0; i < num_splice; ++i) { for (int i = 0; i < num_splice; ++i) {
contexts[i] = left_context + i; contexts[i] = left_context + i;
...@@ -44,7 +44,7 @@ void BMSpliceHelper(int iters, ...@@ -44,7 +44,7 @@ void BMSpliceHelper(int iters,
GenerateRandomRealTypeData(input_shape, &input_data); GenerateRandomRealTypeData(input_shape, &input_data);
net.AddInputFromArray<D, float>("Input", input_shape, input_data); net.AddInputFromArray<D, float>("Input", input_shape, input_data);
OpDefBuilder("Splice", "SpliceTest") OpDefBuilder("Splice", "SpliceBM")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddIntsArg("context", contexts) .AddIntsArg("context", contexts)
...@@ -71,7 +71,6 @@ void BMSpliceHelper(int iters, ...@@ -71,7 +71,6 @@ void BMSpliceHelper(int iters,
MACE_BM_SPLICE_##N##_##H##_##W##_##L##_##R##_##C##_##TYPE##_##DEVICE( \ MACE_BM_SPLICE_##N##_##H##_##W##_##L##_##R##_##C##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * H * W; \
mace::testing::MacsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMSpliceHelper<DEVICE, TYPE>(iters, {N, H, W}, L, R, C); \ BMSpliceHelper<DEVICE, TYPE>(iters, {N, H, W}, L, R, C); \
} \ } \
......
...@@ -53,14 +53,10 @@ TEST_F(SpliceOpTest, WithoutConstDim) { ...@@ -53,14 +53,10 @@ TEST_F(SpliceOpTest, WithoutConstDim) {
{1, 7, 2}, {1, 7, 2},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
{-2, -1, 0, 1, 2}, 0, {-2, -1, 0, 1, 2}, 0,
{1, 7, 10}, {1, 3, 10},
{1, 2, 1, 2, 1, 2, 3, 4, 5, 6, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
1, 2, 1, 2, 3, 4, 5, 6, 7, 8,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
7, 8, 9, 10, 11, 12, 13, 14, 13, 14,
9, 10, 11, 12, 13, 14, 13, 14, 13, 14});
} }
TEST_F(SpliceOpTest, WithConstDim) { TEST_F(SpliceOpTest, WithConstDim) {
...@@ -72,12 +68,8 @@ TEST_F(SpliceOpTest, WithConstDim) { ...@@ -72,12 +68,8 @@ TEST_F(SpliceOpTest, WithConstDim) {
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
{-2, -1, 0, 1, 2}, 7, {-2, -1, 0, 1, 2}, 7,
{1, 5, 22}, {1, 1, 22},
{1, 2, 3, 1, 2, 3, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 7, 8, 9, 10, {1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10});
1, 2, 3, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 4, 5, 6, 7, 8, 9, 10,
1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10,
2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 5, 6, 7, 5, 6, 7, 8, 9, 10, 11,
3, 4, 5, 4, 5, 6, 5, 6, 7, 5, 6, 7, 5, 6, 7, 6, 7, 8, 9, 10, 11, 12});
} }
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -102,7 +102,6 @@ class FrameworkType(Enum): ...@@ -102,7 +102,6 @@ class FrameworkType(Enum):
MaceSupportedOps = [ MaceSupportedOps = [
'Activation', 'Activation',
'AddN', 'AddN',
'Affine',
'ArgMax', 'ArgMax',
'BatchNorm', 'BatchNorm',
'BatchToSpaceND', 'BatchToSpaceND',
...@@ -126,10 +125,12 @@ MaceSupportedOps = [ ...@@ -126,10 +125,12 @@ MaceSupportedOps = [
'InferConv2dShape', 'InferConv2dShape',
'LocalResponseNorm', 'LocalResponseNorm',
'LSTMCell', 'LSTMCell',
# 'LstmNonlinear', 'LstmNonlinear',
'DynamicLSTM',
'MatMul', 'MatMul',
'OneHot', 'OneHot',
'Pad', 'Pad',
'PadContext',
'PNorm', 'PNorm',
'Pooling', 'Pooling',
'PriorBox', 'PriorBox',
...@@ -156,7 +157,6 @@ MaceSupportedOps = [ ...@@ -156,7 +157,6 @@ MaceSupportedOps = [
'SqrDiffMean', 'SqrDiffMean',
'SumGroup', 'SumGroup',
'TargetRMSNorm', 'TargetRMSNorm',
'TimeOffset',
'Transpose', 'Transpose',
'WinogradInverseTransform', 'WinogradInverseTransform',
'WinogradTransform', 'WinogradTransform',
......
...@@ -72,6 +72,7 @@ OnnxSupportedOps = [ ...@@ -72,6 +72,7 @@ OnnxSupportedOps = [
'DimRange', 'DimRange',
'Div', 'Div',
'Dropout', 'Dropout',
'DynamicLstmCell',
'Elu', 'Elu',
'Equal', 'Equal',
# 'Exp', # 'Exp',
...@@ -90,16 +91,16 @@ OnnxSupportedOps = [ ...@@ -90,16 +91,16 @@ OnnxSupportedOps = [
# 'Hardmax', # 'Hardmax',
'Identity', 'Identity',
# 'If', # 'If',
'IfDefined', # 'IfDefined',
'ImageScaler', 'ImageScaler',
# 'InstanceNormalization', # 'InstanceNormalization',
# 'LRN', # 'LRN',
'LSTM', 'LSTM',
# 'LstmNonlinear', 'LstmNonlinear',
'LeakyRelu', 'LeakyRelu',
# 'Less', # 'Less',
# 'Log', # 'Log',
# 'LogSoftmax', 'LogSoftmax',
# 'Loop', # 'Loop',
# 'LpNormalization', # 'LpNormalization',
# 'LpPool', # 'LpPool',
...@@ -120,6 +121,7 @@ OnnxSupportedOps = [ ...@@ -120,6 +121,7 @@ OnnxSupportedOps = [
# 'Or', # 'Or',
'PRelu', 'PRelu',
# 'Pad', # 'Pad',
'PadContext',
'Padding', 'Padding',
'PNorm', 'PNorm',
'Pow', 'Pow',
...@@ -331,12 +333,11 @@ class OnnxConverter(base_converter.ConverterInterface): ...@@ -331,12 +333,11 @@ class OnnxConverter(base_converter.ConverterInterface):
OnnxOpType.GlobalAveragePool.name: self.convert_reduce, OnnxOpType.GlobalAveragePool.name: self.convert_reduce,
OnnxOpType.GlobalMaxPool.name: self.convert_reduce, OnnxOpType.GlobalMaxPool.name: self.convert_reduce,
OnnxOpType.Identity.name: self.convert_identity, OnnxOpType.Identity.name: self.convert_identity,
OnnxOpType.IfDefined.name: self.convert_identity,
OnnxOpType.ImageScaler.name: self.convert_imagescaler, OnnxOpType.ImageScaler.name: self.convert_imagescaler,
OnnxOpType.LeakyRelu.name: self.convert_activation, OnnxOpType.LeakyRelu.name: self.convert_activation,
# OnnxOpType.LogSoftmax.name: self.convert_softmax, OnnxOpType.LogSoftmax.name: self.convert_softmax,
OnnxOpType.LSTM.name: self.convert_lstm, OnnxOpType.LstmNonlinear.name: self.convert_lstm_nonlinear,
# OnnxOpType.LstmNonlinear.name: self.convert_lstm_nonlinear, OnnxOpType.DynamicLstmCell.name: self.convert_dynamic_lstm,
OnnxOpType.Max.name: self.convert_eltwise, OnnxOpType.Max.name: self.convert_eltwise,
OnnxOpType.MaxPool.name: self.convert_pooling, OnnxOpType.MaxPool.name: self.convert_pooling,
OnnxOpType.MatMul.name: self.convert_matmul, OnnxOpType.MatMul.name: self.convert_matmul,
...@@ -344,7 +345,8 @@ class OnnxConverter(base_converter.ConverterInterface): ...@@ -344,7 +345,8 @@ class OnnxConverter(base_converter.ConverterInterface):
OnnxOpType.Mul.name: self.convert_eltwise, OnnxOpType.Mul.name: self.convert_eltwise,
OnnxOpType.Neg.name: self.convert_eltwise, OnnxOpType.Neg.name: self.convert_eltwise,
OnnxOpType.Normalize: self.convert_normalize, OnnxOpType.Normalize: self.convert_normalize,
OnnxOpType.Offset.name: self.convert_timeoffset, OnnxOpType.Offset.name: self.convert_identity,
OnnxOpType.PadContext.name: self.convert_pad_context,
OnnxOpType.Padding.name: self.convert_identity, OnnxOpType.Padding.name: self.convert_identity,
OnnxOpType.PNorm.name: self.convert_pnorm, OnnxOpType.PNorm.name: self.convert_pnorm,
OnnxOpType.Pow.name: self.convert_eltwise, OnnxOpType.Pow.name: self.convert_eltwise,
...@@ -642,7 +644,7 @@ class OnnxConverter(base_converter.ConverterInterface): ...@@ -642,7 +644,7 @@ class OnnxConverter(base_converter.ConverterInterface):
mace_check(axis_value == 1 or axis_value == -3, mace_check(axis_value == 1 or axis_value == -3,
"only support concat at channel dimension") "only support concat at channel dimension")
elif node.op_type == OnnxOpType.Append.name: elif node.op_type == OnnxOpType.Append.name:
axis_value = 2 axis_value = 1
axis_arg = op.arg.add() axis_arg = op.arg.add()
axis_arg.name = MaceKeyword.mace_axis_str axis_arg.name = MaceKeyword.mace_axis_str
axis_arg.i = 4 + axis_value if axis_value < 0 else axis_value axis_arg.i = 4 + axis_value if axis_value < 0 else axis_value
...@@ -758,14 +760,69 @@ class OnnxConverter(base_converter.ConverterInterface): ...@@ -758,14 +760,69 @@ class OnnxConverter(base_converter.ConverterInterface):
offset = node.attrs['offset'] offset = node.attrs['offset']
starts_arg = op.arg.add() starts_arg = op.arg.add()
starts_arg.name = 'starts' starts_arg.name = 'starts'
starts_arg.ints.append(offset) starts_arg.ints.extend([offset])
output_dim = node.attrs['output_dim'] output_dim = node.attrs['output_dim']
ends_arg = op.arg.add() ends_arg = op.arg.add()
ends_arg.name = 'output_dim' ends_arg.name = 'ends'
ends_arg.ints.append(output_dim) ends_arg.ints.extend([output_dim + offset])
axes_arg = op.arg.add() axes_arg = op.arg.add()
axes_arg.name = 'axes' axes_arg.name = 'axes'
axes_arg.ints.append(-1) axes_arg.ints.extend([-1])
def convert_dynamic_lstm(self, node):
    """Convert a DynamicLstmCell node into a MACE DynamicLSTM op.

    The op is first built by ``convert_general_op``; every attribute that is
    present on ``node`` is then copied onto the op as an argument. Attributes
    that are absent are skipped, so no argument is emitted for them.

    Attribute -> argument mapping:
        delay_a         -> prev_out_delay   (int field, must be < 0)
        delay_b         -> prev_cell_delay  (int field, must be < 0)
        prev_out_offset -> prev_out_offset  (int field, must be >= 0)
        prev_a_dim      -> prev_out_dim     (int field, must be > 0)
        prev_b_dim      -> prev_cell_dim    (int field, must be > 0)
        bias_a          -> bias_a           (int field)
        bias_b          -> bias_b           (int field)
        scale           -> scale            (float field)

    Range violations are reported through ``mace_check``.
    """
    op = self.convert_general_op(node)
    op.type = MaceOp.DynamicLSTM.name

    if 'delay_a' in node.attrs:
        prev_out_delay = node.attrs['delay_a']
        # The message now matches the strict check (it used to say "<= 0").
        mace_check(prev_out_delay < 0,
                   "dynamic's prev_out_delay should < 0.")
        prev_out_delay_arg = op.arg.add()
        prev_out_delay_arg.name = 'prev_out_delay'
        prev_out_delay_arg.i = prev_out_delay
    if 'delay_b' in node.attrs:
        prev_cell_delay = node.attrs['delay_b']
        mace_check(prev_cell_delay < 0,
                   "dynamic's prev_cell_delay should < 0.")
        prev_cell_delay_arg = op.arg.add()
        prev_cell_delay_arg.name = 'prev_cell_delay'
        prev_cell_delay_arg.i = prev_cell_delay
    if 'prev_out_offset' in node.attrs:
        prev_out_offset = node.attrs['prev_out_offset']
        # Fixed: the check used the undefined name `pre_out_offset`, which
        # raised NameError whenever this attribute was present.
        mace_check(prev_out_offset >= 0,
                   "dynamic's prev_out_offset should >= 0.")
        prev_out_offset_arg = op.arg.add()
        prev_out_offset_arg.name = 'prev_out_offset'
        prev_out_offset_arg.i = prev_out_offset
    if 'prev_a_dim' in node.attrs:
        prev_out_dim = node.attrs['prev_a_dim']
        mace_check(prev_out_dim > 0,
                   "dynamic's prev_out_dim should > 0.")
        prev_out_dim_arg = op.arg.add()
        prev_out_dim_arg.name = 'prev_out_dim'
        prev_out_dim_arg.i = prev_out_dim
    if 'prev_b_dim' in node.attrs:
        prev_cell_dim = node.attrs['prev_b_dim']
        mace_check(prev_cell_dim > 0,
                   "dynamic's prev_cell_dim should > 0.")
        prev_cell_dim_arg = op.arg.add()
        prev_cell_dim_arg.name = 'prev_cell_dim'
        prev_cell_dim_arg.i = prev_cell_dim
    if 'bias_a' in node.attrs:
        bias_a = node.attrs['bias_a']
        bias_a_arg = op.arg.add()
        bias_a_arg.name = 'bias_a'
        bias_a_arg.i = bias_a
    if 'bias_b' in node.attrs:
        bias_b = node.attrs['bias_b']
        bias_b_arg = op.arg.add()
        bias_b_arg.name = 'bias_b'
        bias_b_arg.i = bias_b
    if 'scale' in node.attrs:
        scale = node.attrs['scale']
        scale_arg = op.arg.add()
        scale_arg.name = 'scale'
        scale_arg.f = scale
def convert_eltwise(self, node): def convert_eltwise(self, node):
op = self.convert_general_op(node) op = self.convert_general_op(node)
...@@ -925,6 +982,18 @@ class OnnxConverter(base_converter.ConverterInterface): ...@@ -925,6 +982,18 @@ class OnnxConverter(base_converter.ConverterInterface):
op = self.convert_general_op(node) op = self.convert_general_op(node)
op.type = MaceOp.BatchNorm.name op.type = MaceOp.BatchNorm.name
def convert_pad_context(self, node):
    """Convert a PadContext node into a MACE PadContext op.

    The 'left_context' and 'right_context' attributes, when present on
    ``node``, are copied to same-named integer arguments of the op (left
    first, then right). Missing attributes produce no argument.
    """
    op = self.convert_general_op(node)
    op.type = MaceOp.PadContext.name
    for key in ('left_context', 'right_context'):
        if key not in node.attrs:
            continue
        context_arg = op.arg.add()
        context_arg.name = key
        context_arg.i = node.attrs[key]
def convert_pnorm(self, node): def convert_pnorm(self, node):
op = self.convert_general_op(node) op = self.convert_general_op(node)
op.type = MaceOp.PNorm.name op.type = MaceOp.PNorm.name
...@@ -1010,10 +1079,10 @@ class OnnxConverter(base_converter.ConverterInterface): ...@@ -1010,10 +1079,10 @@ class OnnxConverter(base_converter.ConverterInterface):
op = self.convert_general_op(node) op = self.convert_general_op(node)
op.type = MaceOp.Softmax.name op.type = MaceOp.Softmax.name
# TODO: add logsoftmax in softmax op # TODO: add logsoftmax in softmax op
# if node.op_type == OnnxOpType.LogSoftmax.name: if node.op_type == OnnxOpType.LogSoftmax.name:
# use_log_arg = op.arg.add() use_log_arg = op.arg.add()
# use_log_arg.name = 'use_log' use_log_arg.name = 'use_log'
# use_log_arg.i = 1 use_log_arg.i = 1
def convert_splice(self, node): def convert_splice(self, node):
op = self.convert_general_op(node) op = self.convert_general_op(node)
...@@ -1104,6 +1173,11 @@ class OnnxConverter(base_converter.ConverterInterface): ...@@ -1104,6 +1173,11 @@ class OnnxConverter(base_converter.ConverterInterface):
else: else:
op.type = MaceOp.TimeOffset.name op.type = MaceOp.TimeOffset.name
chunk_size = node.attrs['chunk_size']
chunk_size_arg = op.arg.add()
chunk_size_arg.name = 'chunk_size'
chunk_size_arg.i = chunk_size
offset_arg = op.arg.add() offset_arg = op.arg.add()
offset_arg.name = 'offset' offset_arg.name = 'offset'
offset_arg.i = offset offset_arg.i = offset
...@@ -1143,6 +1143,7 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1143,6 +1143,7 @@ class Transformer(base_converter.ConverterInterface):
filter.float_data[:] = filter_data.flat filter.float_data[:] = filter_data.flat
filter.dims[:] = filter_data.shape filter.dims[:] = filter_data.shape
arg.i = 1 arg.i = 1
six.print_('transpose matmul weight')
def transpose_filters(self): def transpose_filters(self):
net = self._model net = self._model
...@@ -1192,7 +1193,6 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1192,7 +1193,6 @@ class Transformer(base_converter.ConverterInterface):
mace_check(filter_format == DataFormat.HWIO, mace_check(filter_format == DataFormat.HWIO,
"HEXAGON only support HWIO/HWIM filter format.") "HEXAGON only support HWIO/HWIM filter format.")
else: else:
print("Transpose filters to OIHW/MIHW")
# transpose filter to OIHW/MIHW for tensorflow (HWIO/HWIM) # transpose filter to OIHW/MIHW for tensorflow (HWIO/HWIM)
if filter_format == DataFormat.HWIO: if filter_format == DataFormat.HWIO:
for op in net.op: for op in net.op:
...@@ -1201,6 +1201,7 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1201,6 +1201,7 @@ class Transformer(base_converter.ConverterInterface):
or op.type == MaceOp.DepthwiseConv2d.name) \ or op.type == MaceOp.DepthwiseConv2d.name) \
and op.input[1] in self._consts \ and op.input[1] in self._consts \
and op.input[1] not in transposed_filter: and op.input[1] not in transposed_filter:
print("Transpose Conv2D/Deconv2D filters to OIHW/MIHW")
filter = self._consts[op.input[1]] filter = self._consts[op.input[1]]
filter_data = np.array(filter.float_data).reshape( filter_data = np.array(filter.float_data).reshape(
filter.dims) filter.dims)
...@@ -1208,9 +1209,13 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1208,9 +1209,13 @@ class Transformer(base_converter.ConverterInterface):
filter.float_data[:] = filter_data.flat filter.float_data[:] = filter_data.flat
filter.dims[:] = filter_data.shape filter.dims[:] = filter_data.shape
transposed_filter.add(op.input[1]) transposed_filter.add(op.input[1])
if (op.type == MaceOp.MatMul.name if (op.type == MaceOp.MatMul.name and
and (ConverterUtil.get_arg(op, MaceKeyword.mace_winograd_filter_transformed) is not None) # noqa (ConverterUtil.get_arg(
op,
MaceKeyword.mace_winograd_filter_transformed)
is not None) # noqa
and op.input[1] not in transposed_filter): and op.input[1] not in transposed_filter):
print("Transpose Winograd filters to OIHW/MIHW")
filter = self._consts[op.input[0]] filter = self._consts[op.input[0]]
filter_data = np.array(filter.float_data).reshape( filter_data = np.array(filter.float_data).reshape(
filter.dims) filter.dims)
...@@ -1222,6 +1227,8 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1222,6 +1227,8 @@ class Transformer(base_converter.ConverterInterface):
and op.input[1] not in transposed_filter: and op.input[1] not in transposed_filter:
weight = self._consts[op.input[1]] weight = self._consts[op.input[1]]
if len(weight.dims) == 4: if len(weight.dims) == 4:
print("Transpose FullyConnected filters to"
" OIHW/MIHW")
weight_data = np.array(weight.float_data).reshape( weight_data = np.array(weight.float_data).reshape(
weight.dims) weight.dims)
weight_data = weight_data.transpose(3, 2, 0, 1) weight_data = weight_data.transpose(3, 2, 0, 1)
......
...@@ -60,25 +60,22 @@ inline Integer Clamp(Integer in, Integer low, Integer high) { ...@@ -60,25 +60,22 @@ inline Integer Clamp(Integer in, Integer low, Integer high) {
return std::max<Integer>(low, std::min<Integer>(in, high)); return std::max<Integer>(low, std::min<Integer>(in, high));
} }
// Logistic sigmoid of a single float: 1 / (1 + e^-in).
//
// Both branches only evaluate std::exp on a non-positive argument, so the
// exponential itself cannot overflow for inputs of large magnitude.
// This is the post-change (non-template, float-only) definition; the removed
// template <typename T> version is no longer interleaved with it.
inline float ScalarSigmoid(float in) {
  if (in > 0) {
    return 1.f / (1.f + std::exp(-in));
  } else {
    const float x = std::exp(in);
    return x / (x + 1.f);
  }
}
// Hyperbolic tangent of a single float.
//
// With x = e^-|in|, tanh(|in|) = -1 + 2 / (1 + x^2); the non-positive branch
// returns the mirrored value 1 - 2 / (1 + x^2). std::exp is only evaluated on
// a non-positive argument, so it cannot overflow for inputs of large magnitude.
// This is the post-change (non-template, float-only) definition. The removed
// template version returned x / (x + 1) on the non-positive branch, which is
// a sigmoid rather than tanh.
inline float ScalarTanh(float in) {
  if (in > 0) {
    const float x = std::exp(-in);
    return -1.f + 2.f / (1.f + x * x);
  } else {
    const float x = std::exp(in);
    return 1.f - 2.f / (1.f + x * x);
  }
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册