Enable the concat op to support int8_t tensors as inputs and outputs.

......@@ -27,7 +27,11 @@ bool ConcatKernel<CPU, float>::Init(ConcatParam<CPU> *param) {
template <>
void ConcatKernel<CPU, float>::Compute(const ConcatParam<CPU> &param) {
if (param.Inputs()[0]->type() == typeid(int8_t)) {
} else {
......@@ -57,8 +57,8 @@ template <typename P>
void ConcatCompute(const ConcatParam<CPU> &param) {
auto inputs = param.Inputs();
auto *out = param.Out();
int64_t axis = param.Axis();
int axis = param.Axis();
/// Sometimes direct copies are faster; this may need deeper analysis.
if (axis == 0 && inputs.size() < 10) {
......@@ -66,12 +66,12 @@ void ConcatCompute(const ConcatParam<CPU> &param) {
for (auto *in : inputs) {
auto in_stride = framework::stride_numel(in->dims());
auto out_stride = framework::stride_numel(out->dims());
auto dst = out->data<float>() + output_offset;
auto src = in->data<float>();
auto dst = out->data<P>() + output_offset;
auto src = in->data<P>();
in_stride.size() == out_stride.size(),
"src and dst tensor should have the same dims size.");
memory::Copy(dst, src, sizeof(float) * in_stride[0]);
memory::Copy(dst, src, sizeof(P) * in_stride[0]);
output_offset += in_stride[0];
} else {
......@@ -79,8 +79,8 @@ void ConcatCompute(const ConcatParam<CPU> &param) {
for (int j = 0; j < inputs.size(); ++j) {
inputs_concat[j] = *inputs[j];
ConcatFunctor<float> concat_functor;
concat_functor(inputs_concat, static_cast<int>(axis), out);
ConcatFunctor<P> concat_functor;
concat_functor(inputs_concat, axis, out);
......@@ -12,76 +12,125 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstring>
#include <iostream>
#include <vector>
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/concat_op.h"
namespace paddle_mobile {
using framework::AttributeMap;
using framework::DDim;
using framework::LoDTensor;
using framework::Scope;
using framework::make_ddim;
// Reference concatenation: copies the contents of every tensor in `input`
// into `output` with memcpy, treating each tensor as a 2-D (rows x cols)
// buffer split at `axis`. It is called further down to build the expected
// result that the operator output is compared against.
//
// NOTE(review): in this excerpt the closing braces of the loops and of the
// function itself are not visible — confirm the intended nesting against the
// original file before relying on the structure shown here.
//
// @tparam T      element type used for data<T>() and for the memcpy size.
// @param input   tensors to concatenate; input[0] supplies the leading dims.
// @param output  destination tensor.
//                NOTE(review): nothing visible here resizes or allocates it —
//                presumably the caller must do so; verify.
// @param axis    number of leading dimensions folded into `rows`.
template <typename T>
void concat(const std::vector<LoDTensor> &input, LoDTensor &output, int axis) {
int num = input.size();
// rows = product of the first input's dimensions that precede `axis`.
int rows = 1;
auto dim_0 = input[0].dims();
for (int i = 0; i < axis; ++i) {
rows *= dim_0[i];
int out_rows = rows, out_cols = 0;
// Per-input column count (numel / rows); out_cols is the running total.
std::vector<int> input_cols(input.size());
for (int i = 0; i < num; ++i) {
int t_cols = input[i].numel() / rows;
out_cols += t_cols;
input_cols[i] = t_cols;
// Copy phase: for input j, each row k of col_len elements is written into
// output row k starting at column offset col_idx.
auto output_data = output.data<T>();
int col_idx = 0;
for (int j = 0; j < num; ++j) {
int col_len = input_cols[j];
auto input_data = input[j].data<T>();
for (int k = 0; k < out_rows; ++k) {
memcpy(output_data + k * out_cols + col_idx, input_data + k * col_len,
sizeof(T) * col_len);
// Advance the column offset by the width of the input just handled.
col_idx += col_len;
// Test driver for the concat operator with element type T: creates four input
// variables in a fresh Scope, constructs a ConcatOp, computes a reference
// result with concat<T>() and compares the two element by element.
// Returns 0.
//
// NOTE(review): several steps one would expect are not visible in this
// excerpt — nothing fills `input_tensors`, stores `axis_v` into `attrs`,
// runs `op`, or sizes `output_cmp`. Confirm against the original file.
template <typename T>
int TestConcatOP() {
// The four inputs differ only in dimension 0 (10 + 20 + 30 + 40 = 100), which
// matches output_shape when joined along axis_v = 0.
DDim inputA_shape = make_ddim({10, 4, 2, 2});
DDim inputB_shape = make_ddim({20, 4, 2, 2});
DDim inputC_shape = make_ddim({30, 4, 2, 2});
DDim inputD_shape = make_ddim({40, 4, 2, 2});
DDim output_shape = make_ddim({100, 4, 2, 2});
int axis_v = 0;
VariableNameMap inputs;
VariableNameMap outputs;
std::vector<LoDTensor> input_tensors;
auto scope = std::make_shared<Scope>();
// Bind the operator's "X" and "Out" slots to variable names in the scope.
inputs["X"] =
std::vector<std::string>({"inputA", "inputB", "inputC", "inputD"});
outputs["Out"] = std::vector<std::string>({"output"});
// Create each input variable and initialise its tensor.
// presumably SetupTensor fills the tensor with values in [-127, 127] — the
// helper is not visible here; verify.
auto inputA_var = scope.get()->Var("inputA");
auto inputA = inputA_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(inputA, inputA_shape, -127, 127);
auto inputB_var = scope.get()->Var("inputB");
auto inputB = inputB_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(inputB, inputB_shape, -127, 127);
auto inputC_var = scope.get()->Var("inputC");
auto inputC = inputC_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(inputC, inputC_shape, -127, 127);
auto inputD_var = scope.get()->Var("inputD");
auto inputD = inputD_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(inputD, inputD_shape, -127, 127);
auto output_var = scope.get()->Var("output");
AttributeMap attrs;
// NOTE(review): the operator is instantiated with `float` regardless of T —
// confirm this is intended for the int8_t case.
// Raw-owned: released by the `delete op` near the end of the function.
auto *op = new operators::ConcatOp<CPU, float>("concat", inputs, outputs,
attrs, scope);
auto output = output_var->template Get<framework::LoDTensor>();
const T *output_data = output->data<T>();
// Reference result computed by the local concat<T>() helper.
LoDTensor output_cmp;
concat<T>(input_tensors, output_cmp, axis_v);
const T *output_cmp_data = output_cmp.data<T>();
// Compare operator output against the reference, element by element.
// NOTE(review): `eq` / `neq` are never modified in the visible code, so the
// printed counts would both be 0 as shown — likely lines lost in extraction.
int eq = 0;
int neq = 0;
for (int i = 0; i < output->numel(); ++i) {
PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i],
"The execution of test_concat_op is failed!");
if (output_data[i] == output_cmp_data[i]) {
} else {
std::cout << "eq = " << eq << ", neq = " << neq << std::endl;
delete op;
return 0;
} // namespace paddle_mobile
// Test entry point: loads the googlenet program, builds four float input
// tensors, runs a prediction for the "concat" op through `executor`, and logs
// one input element next to the output element it should map to.
//
// NOTE(review): this excerpt looks like interleaved diff lines — the
// `executor` declaration begins mid-template (stray `>>`), the input_names
// initializer list is never closed, and nothing visibly fills
// `input_tensors` or `out_ddims`. Confirm against the original file.
int main() {
paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(g_googlenet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
paddle_mobile::operators::ConcatOp<paddle_mobile::CPU, float>>
executor(program, "concat");
// 1. input_tensors: four tensors that differ only in dimension 1
//    (10 / 20 / 30 / 40).
// presumably CreateInput fills each tensor with values in [0, 1] and returns
// a pointer to its data — the helper is not visible here; verify.
vector<Tensor> input_tensors;
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {4, 10, 2, 2}, 0, 1);
Tensor input2;
auto input2_data = CreateInput<float>(&input2, {4, 20, 2, 2}, 0, 1);
Tensor input3;
auto input3_data = CreateInput<float>(&input3, {4, 30, 2, 2}, 0, 1);
Tensor input4;
auto input4_data = CreateInput<float>(&input4, {4, 40, 2, 2}, 0, 1);
// 2. input_names
vector<string> input_names({
// 3. output_names
vector<string> output_names({"concat_0.tmp_0"});
// 4. out_dims;
// NOTE(review): the shape built here is {3, 100, 2, 2}, while the inputs and
// the comment block below use a leading dimension of 4 — confirm.
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({3, 100, 2, 2});
auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output0_data = output[0]->data<float>();
// 5. test one example: element [n=1, c=2, h=0, w=1] of input3.
int input_n = 1;
int input_c = 2;
int input_h = 0;
int input_w = 1;
// Element counts per step along dims 0, 1 and 2 of input3, derived from its
// numel() and dims().
int stride0 = input3.numel() / input3.dims()[0];
int stride1 = input3.numel() / input3.dims()[0] / input3.dims()[1];
int stride2 = input3.dims()[3];
/// inputx1 (4,10,2,2),
/// inputx2 (4,20,2,2),
/// inputx3 (4,30,2,2),
/// inputx4 (4,40,2,2),
/// axis = 1
/// output (4,100,2,2)
// Flat offset of the chosen element inside input3.
int input_index =
input_n * stride0 + input_c * stride1 + input_h * stride2 + input_w;
// Flat offset of the same element in the output: the channel index is shifted
// by the channel counts of input1 and input2 (2 + 10 + 20 = 32), using the
// hard-coded output extents 100 x 2 x 2.
int output_index = input_n * 100 * 2 * 2 +
(input_c + input1.dims()[1] + input2.dims()[1]) * 2 * 2 +
input_h * 2 + input_w;
DLOG << " input3 [1, 2,0,1] = " << input3_data[input_index];
DLOG << " output [1,32,0,1] = " << output0_data[output_index];
// NOTE(review): this object is declared but not used in the visible code.
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
return 0;
......@@ -17,8 +17,10 @@ limitations under the License. */
#include "../test_helper.h"
#include "../test_include.h"
#include "framework/operator.h"
#include "operators/fusion_fc_int8_op.h"
#include "operators/fusion_fc_op.h"
#include "operators/fusion_fc_int8_op.h"
#define a(i, j) a[(i)*lda + (j)]
#define b(i, j) b[(i)*ldb + (j)]
