Commit 8202e25d authored by nhzlx

Merge branch 'incubate/lite' of http://10.87.145.36/inference/paddlelite into xzl/incubate/lite

......@@ -114,6 +114,32 @@ build:mobile_armlinux:
- $MOBILE_LITE_CACHE1
- ~/.ccache
build:mobile_model_mobilenetv1:
tags:
- lite
stage: build_mobile
image: $MOBILE_LITE_DOCKER_IMAGE
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
script:
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_mobilenetv1
- ./paddle/fluid/lite/tools/build.sh build_test_arm_model_mobilenetv1
dependencies:
- build:server
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_model_mobilenetv1
build:mobile_model_mobilenetv2:
tags:
- lite
......@@ -126,8 +152,34 @@ build:mobile_model_mobilenetv2:
- $MOBILE_LITE_CACHE1
- ~/.ccache
script:
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model1
- ./paddle/fluid/lite/tools/build.sh build_test_arm_model1
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_mobilenetv2
- ./paddle/fluid/lite/tools/build.sh build_test_arm_model_mobilenetv2
dependencies:
- build:server
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_model_mobilenetv2
build:mobile_model_resnet50:
tags:
- lite
stage: build_mobile
image: $MOBILE_LITE_DOCKER_IMAGE
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
script:
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_resnet50
- ./paddle/fluid/lite/tools/build.sh build_test_arm_model_resnet50
dependencies:
- build:server
......@@ -138,4 +190,30 @@ build:mobile_model_mobilenetv2:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_model1
- $CI_PROJECT_DIR/build_mobile_model_resnet50
#build:mobile_model_inceptionv4:
# tags:
# - lite
# stage: build_mobile
# image: $MOBILE_LITE_DOCKER_IMAGE
# cache:
# key: mobile_thirdparty
# paths:
# - $MOBILE_LITE_CACHE0
# - $MOBILE_LITE_CACHE1
# - ~/.ccache
# script:
# - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_inceptionv4
# - ./paddle/fluid/lite/tools/build.sh build_test_arm_model_inceptionv4
#
# dependencies:
# - build:server
#
# cache:
# key: mobile_thirdparty
# paths:
# - $MOBILE_LITE_CACHE0
# - $MOBILE_LITE_CACHE1
# - ~/.ccache
# - $CI_PROJECT_DIR/build_mobile_model_inceptionv4
......@@ -56,6 +56,16 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
include(cross_compiling/host)
include(cross_compiling/armlinux)
include(cross_compiling/android)
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release" CACHE STRING
"Default use Release in android" FORCE)
endif()
if(NOT THIRD_PARTY_BUILD_TYPE)
set(THIRD_PARTY_BUILD_TYPE "MinSizeRel" CACHE STRING
"Default use MinSizeRel in android" FORCE)
endif()
endif()
project(paddle CXX C)
......@@ -133,15 +143,6 @@ if(ANDROID OR IOS OR ARMLINUX)
"Disable RDMA when cross-compiling for Android and iOS" FORCE)
set(WITH_MKL OFF CACHE STRING
"Disable MKL when cross-compiling for Android and iOS" FORCE)
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release" CACHE STRING
"Default use Release in android" FORCE)
endif()
if(NOT THIRD_PARTY_BUILD_TYPE)
set(THIRD_PARTY_BUILD_TYPE "MinSizeRel" CACHE STRING
"Default use MinSizeRel in android" FORCE)
endif()
endif()
# for lite, both server and mobile framework.
......
......@@ -190,6 +190,9 @@ add_subdirectory(gen_code)
if (WITH_TESTING)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "lite_naive_model.tar.gz")
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4.tar.gz")
endif()
endif()
......@@ -33,24 +33,37 @@ include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
"A path setting inference demo download directories.")
if(WITH_TESTING)
set(eval_model_dir "")
set(test_cxx_api_deps cxx_api_lite mir_passes ${ops_lite} ${host_kernels} ${x86_kernels})
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
set(eval_model_dir ${LITE_MODEL_DIR}/mobilenet_v2_relu)
set(test_cxx_api_deps ${test_cxx_api_deps} ${arm_kernels})
endif()
if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
lite_cc_test(test_cxx_api_lite SRCS cxx_api_test.cc
DEPS ${test_cxx_api_deps}
DEPS cxx_api_lite mir_passes
${ops_lite} ${host_kernels} ${x86_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt
--eval_model_dir=eval_model_dir SERIAL)
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
add_dependencies(test_cxx_api_lite extern_lite_download_lite_naive_model_tar_gz)
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
add_dependencies(test_cxx_api_lite extern_lite_download_mobilenet_v2_relu_tar_gz)
endif()
endif()
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
set(lite_model_test_DEPS cxx_api_lite mir_passes ${ops_lite} ${host_kernels} ${arm_kernels})
lite_cc_test(test_mobilenetv1_lite SRCS mobilenetv1_test.cc
DEPS ${lite_model_test_DEPS}
ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 SERIAL)
add_dependencies(test_mobilenetv1_lite extern_lite_download_mobilenet_v1_tar_gz)
lite_cc_test(test_mobilenetv2_lite SRCS mobilenetv2_test.cc
DEPS ${lite_model_test_DEPS}
ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v2 SERIAL)
add_dependencies(test_mobilenetv2_lite extern_lite_download_mobilenet_v2_tar_gz)
lite_cc_test(test_resnet50_lite SRCS resnet50_test.cc
DEPS ${lite_model_test_DEPS}
ARGS --model_dir=${LITE_MODEL_DIR}/resnet50 SERIAL)
add_dependencies(test_resnet50_lite extern_lite_download_resnet50_tar_gz)
lite_cc_test(test_inceptionv4_lite SRCS inceptionv4_test.cc
DEPS ${lite_model_test_DEPS}
ARGS --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
add_dependencies(test_inceptionv4_lite extern_lite_download_inception_v4_tar_gz)
endif()
# These tests needs CLI arguments, and is not supported in ARM CI.
......
......@@ -27,9 +27,6 @@
DEFINE_string(startup_program_path, "", "");
DEFINE_string(main_program_path, "", "");
// for eval
DEFINE_string(eval_model_dir, "", "");
namespace paddle {
namespace lite {
......@@ -88,37 +85,5 @@ TEST(CXXApi, save_model) {
}*/
#endif // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
#ifdef LITE_WITH_ARM
TEST(CXXApi, eval) {
DeviceInfo::Init();
lite::ExecutorLite predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
predictor.Build(FLAGS_eval_model_dir, Place{TARGET(kARM), PRECISION(kFloat)},
valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
auto* data = input_tensor->mutable_data<float>();
for (int i = 0; i < input_tensor->dims().production(); i++) {
data[i] = 1;
}
predictor.Run();
auto* out = predictor.GetOutput(0);
std::vector<float> results({0.00097802, 0.00099822, 0.00103093, 0.00100121,
0.00098268, 0.00104065, 0.00099962, 0.00095181,
0.00099694, 0.00099406});
for (int i = 0; i < results.size(); ++i) {
EXPECT_NEAR(out->data<float>()[i], results[i], 1e-5);
}
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000);
}
#endif
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"
// for eval
DEFINE_string(model_dir, "", "");
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
TEST(InceptionV4, test) {
DeviceInfo::Init();
lite::ExecutorLite predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
predictor.Build(FLAGS_model_dir, Place{TARGET(kARM), PRECISION(kFloat)},
valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
auto* data = input_tensor->mutable_data<float>();
for (int i = 0; i < input_tensor->dims().production(); i++) {
data[i] = 1;
}
predictor.Run();
auto* out = predictor.GetOutput(0);
std::vector<float> results({0.00078033, 0.00083865, 0.00060029, 0.00057083,
0.00070094, 0.00080584, 0.00044525, 0.00074907,
0.00059774, 0.00063654});
for (int i = 0; i < results.size(); ++i) {
// TODO(sangoly): fix assert
// EXPECT_NEAR(out->data<float>()[i], results[i], 1e-5);
LOG(INFO) << "out -> " << out->data<float>()[i];
}
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000);
}
#endif
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"
// for eval
DEFINE_string(model_dir, "", "");
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
TEST(MobileNetV1, test) {
DeviceInfo::Init();
lite::ExecutorLite predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
predictor.Build(FLAGS_model_dir, Place{TARGET(kARM), PRECISION(kFloat)},
valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
auto* data = input_tensor->mutable_data<float>();
for (int i = 0; i < input_tensor->dims().production(); i++) {
data[i] = 1;
}
predictor.Run();
auto* out = predictor.GetOutput(0);
std::vector<float> results({1.91308980e-04, 5.92055148e-04, 1.12303176e-04,
6.27335685e-05, 1.27507330e-04, 1.32147351e-03,
3.13812525e-05, 6.52209565e-05, 4.78087313e-05,
2.58822285e-04});
for (int i = 0; i < results.size(); ++i) {
EXPECT_NEAR(out->data<float>()[i], results[i], 1e-5);
}
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000);
}
#endif
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"
// for eval
DEFINE_string(model_dir, "", "");
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
TEST(MobileNetV2, test) {
DeviceInfo::Init();
lite::ExecutorLite predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
predictor.Build(FLAGS_model_dir, Place{TARGET(kARM), PRECISION(kFloat)},
valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
auto* data = input_tensor->mutable_data<float>();
for (int i = 0; i < input_tensor->dims().production(); i++) {
data[i] = 1;
}
predictor.Run();
auto* out = predictor.GetOutput(0);
std::vector<float> results({0.00097802, 0.00099822, 0.00103093, 0.00100121,
0.00098268, 0.00104065, 0.00099962, 0.00095181,
0.00099694, 0.00099406});
for (int i = 0; i < results.size(); ++i) {
EXPECT_NEAR(out->data<float>()[i], results[i], 1e-5);
}
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000);
}
#endif
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"
// for eval
DEFINE_string(model_dir, "", "");
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
TEST(ResNet50, test) {
DeviceInfo::Init();
lite::ExecutorLite predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
predictor.Build(FLAGS_model_dir, Place{TARGET(kARM), PRECISION(kFloat)},
valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
auto* data = input_tensor->mutable_data<float>();
for (int i = 0; i < input_tensor->dims().production(); i++) {
data[i] = 1;
}
predictor.Run();
auto* out = predictor.GetOutput(0);
std::vector<float> results({2.41399175e-04, 4.13724629e-04, 2.64324830e-04,
9.68795503e-05, 2.01968738e-04, 8.14945495e-04,
7.45922662e-05, 1.76479152e-04, 7.47223166e-05,
6.06825110e-04});
for (int i = 0; i < results.size(); ++i) {
EXPECT_NEAR(out->data<float>()[i], results[i], 1e-5);
}
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000);
}
#endif
} // namespace lite
} // namespace paddle
......@@ -16,7 +16,7 @@ cc_library(math_arm SRCS
elementwise.cc
concat.cc
sgemv.cc
type_trans.cpp
type_trans.cc
conv_impl.cc
conv_direct_3x3s1.cc
conv_direct_3x3s2.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/arm/math/type_trans.h"
#include <arm_neon.h>
#include <string.h>
#include "paddle/fluid/lite/arm/math/saturate.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
template <typename dtype>
void int32_to_dtype(const int* din, dtype* dout, const float* scale,
int axis_size, int64_t outer_size, int64_t inner_size);
void fp32_to_int8(const float* din, signed char* dout, const float* scale,
int axis_size, int64_t outer_size, int64_t inner_size) {
int cnt = inner_size / 16;
int remain = inner_size & 15;
int64_t loop_size = outer_size * axis_size;
#pragma omp parallel for
for (int j = 0; j < loop_size; ++j) {
float inv_scale = 1.f / scale[j % axis_size];
float32x4_t vzero = vdupq_n_f32(0.f);
float32x4_t vscale = vdupq_n_f32(inv_scale);
float32x4_t vpoff = vdupq_n_f32(0.5f);
float32x4_t vnoff = vdupq_n_f32(-0.5f);
const float* din_c = din + j * inner_size;
signed char* dout_c = dout + j * inner_size;
if (cnt > 0) {
int cnt_loop = cnt;
const float* din_ptr = din_c;
signed char* dout_ptr = dout_c;
#ifdef __aarch64__
asm volatile(
"ldp q0, q1, [%[in]], #32 \n"
"ldp q2, q3, [%[in]], #32 \n"
"0: \n" /* main loop */
"fmul v4.4s, v0.4s, %[scale].4s \n"
"fmul v5.4s, v1.4s, %[scale].4s \n"
"fmul v6.4s, v2.4s, %[scale].4s \n"
"fmul v7.4s, v3.4s, %[scale].4s \n"
"ldp q0, q1, [%[in]], #32 \n"
"subs %[cnt], %[cnt], #1 \n"
"FCVTAS v8.4s, v4.4s \n"
"FCVTAS v9.4s, v5.4s \n"
"FCVTAS v10.4s, v6.4s \n"
"FCVTAS v11.4s, v7.4s \n"
"ldp q2, q3, [%[in]], #32 \n"
"sqxtn v4.4h, v8.4s \n"
"sqxtn2 v4.8h, v9.4s \n"
"sqxtn v5.4h, v10.4s \n"
"sqxtn2 v5.8h, v11.4s \n"
"sqxtn v8.8b, v4.8h \n"
"sqxtn2 v8.16b, v5.8h \n"
"str q8, [%[out]], #16 \n"
"bne 0b \n"
: [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
: [scale] "w"(vscale)
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11");
#else
asm volatile(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
"vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n"
"0: @ main loop\n"
"vand.i32 q4, %q[vpoff], %q[vpoff] @ set offset, 0.5\n"
"vand.i32 q5, q4, q4 @ set offset, 0.5\n"
"vand.i32 q6, q4, q4 @ set offset, 0.5\n"
"vand.i32 q7, q4, q4 @ set offset, 0.5\n"
"vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n"
"vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n"
"vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2\n"
"vcgt.f32 q11, q3, %q[vzero] @ get mask > 0, in3\n"
"vbif.f32 q4, %q[vnoff], q8 @ get right offset\n"
"vbif.f32 q5, %q[vnoff], q9 @ get right offset\n"
"vbif.f32 q6, %q[vnoff], q10 @ get right offset\n"
"vbif.f32 q7, %q[vnoff], q11 @ get right offset\n"
"vmla.f32 q4, q0, %q[vscale] @ mul scale\n"
"vmla.f32 q5, q1, %q[vscale] @ mul scale\n"
"vmla.f32 q6, q2, %q[vscale] @ mul scale\n"
"vmla.f32 q7, q3, %q[vscale] @ mul scale\n"
"vcvt.s32.f32 q0, q4 @ cvt to int32\n"
"vcvt.s32.f32 q1, q5 @ cvt to int32\n"
"vcvt.s32.f32 q2, q6 @ cvt to int32\n"
"vcvt.s32.f32 q3, q7 @ cvt to int32\n"
"vqmovn.s32 d8, q0 @ cnt to int16\n"
"vqmovn.s32 d9, q1 @ cnt to int16\n"
"vqmovn.s32 d10, q2 @ cnt to int16\n"
"vqmovn.s32 d11, q3 @ cnt to int16\n"
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
"vqmovn.s16 d12, q4 @ cnt to int8\n"
"vqmovn.s16 d13, q5 @ cnt to int8\n"
"vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n"
"vst1.32 {d12-d13}, [%[dout]]! @ write to output\n"
"subs %[cnt], #1 @ loop count -1\n"
"bne 0b @ to main loop\n"
: [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop)
: [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff),
[vzero] "w"(vzero)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
"q11");
#endif
}
const float* din_r = din_c + 16 * cnt;
signed char* dout_r = dout_c + 16 * cnt;
for (int i = 0; i < remain; ++i) {
dout_r[i] = saturate_cast<int8_t>(roundf(inv_scale * din_r[i]));
}
}
}
void fp32_to_int16(const float* din, int16_t* dout, const float* scale,
int axis_size, int64_t outer_size, int64_t inner_size) {
int cnt = inner_size / 8;
int remain = inner_size & 7;
int64_t loop_size = outer_size * axis_size;
#pragma omp parallel for
for (int j = 0; j < loop_size; ++j) {
float inv_scale = 1.f / scale[j % axis_size];
float32x4_t vzero = vdupq_n_f32(0.f);
float32x4_t vscale = vdupq_n_f32(inv_scale);
float32x4_t vpoff = vdupq_n_f32(0.5f);
float32x4_t vnoff = vdupq_n_f32(-0.5f);
const float* din_c = din + j * inner_size;
int16_t* dout_c = dout + j * inner_size;
if (cnt > 0) {
int cnt_loop = cnt;
const float* din_ptr = din_c;
int16_t* dout_ptr = dout_c;
#ifdef __aarch64__
asm volatile(
"ldp q0, q1, [%[in]], #32 \n"
"0: \n" /* main loop */
"fmul v4.4s, v0.4s, %[scale].4s \n"
"fmul v5.4s, v1.4s, %[scale].4s \n"
"ldp q0, q1, [%[in]], #32 \n"
"subs %[cnt], %[cnt], #1 \n"
"FCVTAS v8.4s, v4.4s \n"
"FCVTAS v9.4s, v5.4s \n"
"sqxtn v4.4h, v8.4s \n"
"sqxtn2 v4.8h, v9.4s \n"
"str q4, [%[out]], #16 \n"
"bne 0b \n"
: [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
: [scale] "w"(vscale)
: "v0", "v1", "v4", "v5", "v8", "v9");
#else
asm volatile(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
"0: @ main loop\n"
"vand.i32 q4, %q[vpoff], %q[vpoff] @ set offset, 0.5\n"
"vand.i32 q5, q4, q4 @ set offset, 0.5\n"
"vand.i32 q6, q4, q4 @ set offset, 0.5\n"
"vand.i32 q7, q4, q4 @ set offset, 0.5\n"
"vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n"
"vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n"
"vbif.f32 q4, %q[vnoff], q8 @ get right offset\n"
"vbif.f32 q5, %q[vnoff], q9 @ get right offset\n"
"vmla.f32 q4, q0, %q[vscale] @ mul scale\n"
"vmla.f32 q5, q1, %q[vscale] @ mul scale\n"
"vcvt.s32.f32 q0, q4 @ cvt to int32\n"
"vcvt.s32.f32 q1, q5 @ cvt to int32\n"
"vqmovn.s32 d8, q0 @ cnt to int16\n"
"vqmovn.s32 d9, q1 @ cnt to int16\n"
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
"vst1.32 {d8-d9}, [%[dout]]! @ write to output\n"
"subs %[cnt], #1 @ loop count -1\n"
"bne 0b @ to main loop\n"
: [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop)
: [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff),
[vzero] "w"(vzero)
: "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9");
#endif
}
const float* din_r = din_c + 8 * cnt;
int16_t* dout_r = dout_c + 8 * cnt;
for (int i = 0; i < remain; ++i) {
dout_r[i] = saturate_cast<int16_t>(roundf(inv_scale * din_r[i]));
}
}
}
void int8_to_fp32(const signed char* in, float* out, const float* scale,
int axis_size, int64_t outer_size, int64_t inner_size) {
int cnt = inner_size / 16;
int remain = inner_size & 15;
int64_t loop_size = axis_size * outer_size;
#pragma omp parallel for
for (int64_t n = 0; n < loop_size; ++n) {
float in_scale = scale[n % axis_size];
const signed char* din_c = in + n * inner_size;
float* dout_c = out + n * inner_size;
float32x4_t vscale = vdupq_n_f32(in_scale);
if (cnt > 0) {
int loop = cnt;
const signed char* din_ptr = din_c;
float* dout_ptr = dout_c;
#ifdef __aarch64__
asm volatile(
"ldp d0, d1, [%[in]], #16 \n" /* load 16 int8*/
"0: \n" /* main loop */
"sshll v2.8h, v0.8b, #0 \n" /* trans to int16*/
"sshll v3.8h, v1.8b, #0 \n" /* trans to int16*/
"sshll v4.4s, v2.4h, #0 \n" /* trans to int32*/
"sshll2 v5.4s, v2.8h, #0 \n" /* trans to int32*/
"sshll v6.4s, v3.4h, #0 \n" /* trans to int32*/
"sshll2 v7.4s, v3.8h, #0 \n" /* trans to int32*/
"ldp d0, d1, [%[in]], #16 \n" /* load 16 int8*/
"scvtf v8.4s, v4.4s \n" /* trans to fp32*/
"scvtf v9.4s, v5.4s \n" /* trans to fp32*/
"scvtf v10.4s, v6.4s \n" /* trans to fp32*/
"scvtf v11.4s, v7.4s \n" /* trans to fp32*/
"subs %[loop], %[loop], #1 \n"
"fmul v4.4s, v8.4s, %[scale].4s \n" /* mul with scale*/
"fmul v5.4s, v9.4s, %[scale].4s \n" /* mul with scale*/
"fmul v6.4s, v10.4s, %[scale].4s \n" /* mul with scale*/
"fmul v7.4s, v11.4s, %[scale].4s \n" /* mul with scale*/
"stp q4, q5, [%[out]], #32 \n" /* write to memory*/
"stp q6, q7, [%[out]], #32 \n" /* write to memory*/
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11");
#else
asm volatile(
"vld1.32 {d0-d1}, [%[in]]! @ load 16 int8\n"
"0: @ main loop\n"
"vmovl.s8 q2, d0 @ trans to int16\n"
"vmovl.s8 q3, d1 @ trans to int16\n"
"vmovl.s16 q4, d4 @ trans to int32\n"
"vmovl.s16 q5, d5 @ trans to int32\n"
"vmovl.s16 q6, d6 @ trans to int32\n"
"vmovl.s16 q7, d7 @ trans to int32\n"
"vcvt.f32.s32 q0, q4 @ trans to fp32\n"
"vcvt.f32.s32 q1, q5 @ trans to fp32\n"
"vcvt.f32.s32 q2, q6 @ trans to fp32\n"
"vcvt.f32.s32 q3, q7 @ trans to fp32\n"
"vmul.f32 q4, q0, %q[scale] @ mul with scale\n"
"vmul.f32 q5, q1, %q[scale] @ mul with scale\n"
"vmul.f32 q6, q2, %q[scale] @ mul with scale\n"
"vmul.f32 q7, q3, %q[scale] @ mul with scale\n"
"vld1.32 {d0-d1}, [%[in]]! @ load 16 int8\n"
"subs %[loop], #1 \n"
"vst1.f32 {d8-d11}, [%[out]]! @ write to memory\n"
"vst1.f32 {d12-d15}, [%[out]]! @ write to memory\n"
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif // __aarch64__
}
const signed char* din_r = din_c + 16 * cnt;
float* dout_r = dout_c + 16 * cnt;
for (int i = 0; i < remain; ++i) {
dout_r[i] = in_scale * din_r[i];
}
}
}
void int16_to_fp32(const int16_t* in, float* out, const float* scale,
int axis_size, int64_t outer_size, int64_t inner_size) {
int cnt = inner_size / 16;
int remain = inner_size & 15;
int64_t loop_size = axis_size * outer_size;
#pragma omp parallel for
for (int64_t n = 0; n < loop_size; ++n) {
float in_scale = scale[n % axis_size];
const int16_t* din_c = in + n * inner_size;
float* dout_c = out + n * inner_size;
float32x4_t vscale = vdupq_n_f32(in_scale);
if (cnt > 0) {
int loop = cnt;
const int16_t* din_ptr = din_c;
float* dout_ptr = dout_c;
#ifdef __aarch64__
asm volatile(
"ldp q0, q1, [%[in]], #32 \n" /* load 16 int16*/
"0: \n" /* main loop */
"sshll v4.4s, v0.4h, #0 \n" /* trans to int32*/
"sshll2 v5.4s, v0.8h, #0 \n" /* trans to int32*/
"sshll v6.4s, v1.4h, #0 \n" /* trans to int32*/
"sshll2 v7.4s, v1.8h, #0 \n" /* trans to int32*/
"ldp q0, q1, [%[in]], #32 \n" /* load 16 int16*/
"scvtf v8.4s, v4.4s \n" /* trans to fp32*/
"scvtf v9.4s, v5.4s \n" /* trans to fp32*/
"scvtf v10.4s, v6.4s \n" /* trans to fp32*/
"scvtf v11.4s, v7.4s \n" /* trans to fp32*/
"subs %[loop], %[loop], #1 \n"
"fmul v4.4s, v8.4s, %[scale].4s \n" /* mul with scale*/
"fmul v5.4s, v9.4s, %[scale].4s \n" /* mul with scale*/
"fmul v6.4s, v10.4s, %[scale].4s \n" /* mul with scale*/
"fmul v7.4s, v11.4s, %[scale].4s \n" /* mul with scale*/
"stp q4, q5, [%[out]], #32 \n" /* write to memory*/
"stp q6, q7, [%[out]], #32 \n" /* write to memory*/
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
#else
asm volatile(
"vld1.32 {d0-d3}, [%[in]]! @ load 16 int16\n"
"0: @ main loop\n"
"vmovl.s16 q4, d0 @ trans to int32\n"
"vmovl.s16 q5, d1 @ trans to int32\n"
"vmovl.s16 q6, d2 @ trans to int32\n"
"vmovl.s16 q7, d3 @ trans to int32\n"
"vcvt.f32.s32 q0, q4 @ trans to fp32\n"
"vcvt.f32.s32 q1, q5 @ trans to fp32\n"
"vcvt.f32.s32 q2, q6 @ trans to fp32\n"
"vcvt.f32.s32 q3, q7 @ trans to fp32\n"
"vmul.f32 q4, q0, %q[scale] @ mul with scale\n"
"vmul.f32 q5, q1, %q[scale] @ mul with scale\n"
"vmul.f32 q6, q2, %q[scale] @ mul with scale\n"
"vmul.f32 q7, q3, %q[scale] @ mul with scale\n"
"vld1.32 {d0-d3}, [%[in]]! @ load 16 int8\n"
"subs %[loop], #1 \n"
"vst1.f32 {d8-d11}, [%[out]]! @ write to memory\n"
"vst1.f32 {d12-d15}, [%[out]]! @ write to memory\n"
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif // __aarch64__
}
const int16_t* din_r = din_c + 16 * cnt;
float* dout_r = dout_c + 16 * cnt;
for (int i = 0; i < remain; ++i) {
dout_r[i] = in_scale * din_r[i];
}
}
}
void int32_to_fp32(const int* din, float* dout, const float* scale,
int axis_size, int64_t outer_size, int64_t inner_size) {
int cnt = inner_size / 16;
int remain = inner_size & 15;
int64_t loop_size = axis_size * outer_size;
#pragma omp parallel for
for (int64_t n = 0; n < loop_size; ++n) {
float in_scale = scale[n % axis_size];
const int* din_c = din + n * inner_size;
float* dout_c = dout + n * inner_size;
float32x4_t vscale = vdupq_n_f32(in_scale);
if (cnt > 0) {
int loop = cnt;
const int* din_ptr = din_c;
float* dout_ptr = dout_c;
#ifdef __aarch64__
asm volatile(
"ldp q0, q1, [%[in]], #32 \n"
"ldp q2, q3, [%[in]], #32 \n"
"0: \n"
"scvtf v4.4s, v0.4s \n"
"scvtf v5.4s, v1.4s \n"
"scvtf v6.4s, v2.4s \n"
"scvtf v7.4s, v3.4s \n"
"ldp q0, q1, [%[in]], #32 \n"
"fmul v8.4s, v4.4s, %[scale].4s \n"
"fmul v9.4s, v5.4s, %[scale].4s \n"
"fmul v10.4s, v6.4s, %[scale].4s \n"
"fmul v11.4s, v7.4s, %[scale].4s \n"
"ldp q2, q3, [%[in]], #32 \n"
"stp q8, q9, [%[out]], #32 \n"
"stp q10, q11, [%[out]], #32 \n"
"subs %[loop], %[loop], #1 \n"
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11");
#else
asm volatile(
"vld1.s32 {d0-d3}, [%[in]]! \n"
"vld1.s32 {d4-d7}, [%[in]]! \n"
"0: \n"
"vcvt.f32.s32 q4, q0 \n"
"vcvt.f32.s32 q5, q1 \n"
"vcvt.f32.s32 q6, q2 \n"
"vcvt.f32.s32 q7, q3 \n"
"vld1.s32 {d0-d3}, [%[in]]! \n"
"vmul.f32 q8, q4, %q[scale] \n"
"vmul.f32 q9, q5, %q[scale] \n"
"vmul.f32 q10, q6, %q[scale] \n"
"vmul.f32 q11, q7, %q[scale] \n"
"vld1.s32 {d4-d7}, [%[in]]! \n"
"subs %[loop], #1 \n"
"vst1.f32 {d16-d19}, [%[out]]! \n"
"vst1.f32 {d20-d23}, [%[out]]! \n"
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
"q11");
#endif // __aarch64__
}
const int* din_r = din_c + 16 * cnt;
float* dout_r = dout_c + 16 * cnt;
for (int i = 0; i < remain; ++i) {
dout_r[i] = in_scale * din_r[i];
}
}
}
void int32_to_int8(const int* din, signed char* dout, const float* scale,
int axis_size, int64_t outer_size, int64_t inner_size) {
int cnt = inner_size / 16;
int remain = inner_size & 15;
int64_t loop_size = outer_size * axis_size;
#pragma omp parallel for
for (int64_t n = 0; n < loop_size; ++n) {
float in_scale = scale[n % axis_size];
const int* din_c = din + n * inner_size;
signed char* dout_c = dout + n * inner_size;
float32x4_t vscale = vdupq_n_f32(in_scale);
float32x4_t vzero = vdupq_n_f32(0.f);
float32x4_t vpoff = vdupq_n_f32(0.5f);
float32x4_t vnoff = vdupq_n_f32(-0.5f);
if (cnt > 0) {
int loop = cnt;
const int* din_ptr = din_c;
signed char* dout_ptr = dout_c;
#ifdef __aarch64__
asm volatile(
"0: \n"
"ld1 {v0.4s, v1.4s}, [%[in]], #32 \n"
"ld1 {v2.4s, v3.4s}, [%[in]], #32 \n"
"scvtf v4.4s, v0.4s \n"
"scvtf v5.4s, v1.4s \n"
"scvtf v6.4s, v2.4s \n"
"scvtf v7.4s, v3.4s \n"
"fmul v0.4s, v4.4s, %[scale].4s \n"
"fmul v1.4s, v5.4s, %[scale].4s \n"
"fmul v2.4s, v6.4s, %[scale].4s \n"
"fmul v3.4s, v7.4s, %[scale].4s \n"
"fcvtas v4.4s, v0.4s \n"
"fcvtas v5.4s, v1.4s \n"
"fcvtas v6.4s, v2.4s \n"
"fcvtas v7.4s, v3.4s \n"
"sqxtn v0.4h, v4.4s \n"
"sqxtn2 v0.8h, v5.4s \n"
"sqxtn v1.4h, v6.4s \n"
"sqxtn2 v1.8h, v7.4s \n"
"sqxtn v2.8b, v0.8h \n"
"sqxtn2 v2.16b, v1.8h \n"
"st1 {v2.16b}, [%[out]], #16 \n"
"subs %[loop], %[loop], #1 \n"
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else
asm volatile(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
"vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n"
"0: @ main loop\n"
"vcvt.f32.s32 q4, q0 @ cvt to float\n"
"vcvt.f32.s32 q5, q1 @ cvt to float\n"
"vcvt.f32.s32 q6, q2 @ cvt to float\n"
"vcvt.f32.s32 q7, q3 @ cvt to float\n"
"vand.i32 q0, %q[vpoff], %q[vpoff] @ set offset, 0.5\n"
"vand.i32 q1, q0, q0 @ set offset, 0.5\n"
"vand.i32 q2, q0, q0 @ set offset, 0.5\n"
"vand.i32 q3, q0, q0 @ set offset, 0.5\n"
"vcgt.f32 q8, q4, %q[vzero] @ get mask > 0, in0\n"
"vcgt.f32 q9, q5, %q[vzero] @ get mask > 0, in1\n"
"vcgt.f32 q10, q6, %q[vzero] @ get mask > 0, in2\n"
"vcgt.f32 q11, q7, %q[vzero] @ get mask > 0, in3\n"
"vbif.f32 q0, %q[vnoff], q8 @ get right offset\n"
"vbif.f32 q1, %q[vnoff], q9 @ get right offset\n"
"vbif.f32 q2, %q[vnoff], q10 @ get right offset\n"
"vbif.f32 q3, %q[vnoff], q11 @ get right offset\n"
"vmla.f32 q0, q4, %q[vscale] @ mul scale\n"
"vmla.f32 q1, q5, %q[vscale] @ mul scale\n"
"vmla.f32 q2, q6, %q[vscale] @ mul scale\n"
"vmla.f32 q3, q7, %q[vscale] @ mul scale\n"
"vcvt.s32.f32 q4, q0 @ cvt to int32\n"
"vcvt.s32.f32 q5, q1 @ cvt to int32\n"
"vcvt.s32.f32 q6, q2 @ cvt to int32\n"
"vcvt.s32.f32 q7, q3 @ cvt to int32\n"
"vqmovn.s32 d16, q4 @ cnt to int16\n"
"vqmovn.s32 d17, q5 @ cnt to int16\n"
"vqmovn.s32 d18, q6 @ cnt to int16\n"
"vqmovn.s32 d19, q7 @ cnt to int16\n"
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
"vqmovn.s16 d8, q8 @ cnt to int8\n"
"vqmovn.s16 d9, q9 @ cnt to int8\n"
"vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n"
"vst1.32 {d8-d9}, [%[dout]]! @ write to output\n"
"subs %[loop], #1 @ loop count -1\n"
"bne 0b @ to main loop\n"
: [loop] "+r"(loop), [din] "+r"(din_ptr), [dout] "+r"(dout_ptr)
: [vscale] "w"(vscale), [vzero] "w"(vzero), [vnoff] "w"(vnoff),
[vpoff] "w"(vpoff)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
"q11");
#endif // __aarch64__
}
const int* din_r = din_c + 16 * cnt;
int8_t* dout_r = dout_c + 16 * cnt;
for (int i = 0; i < remain; ++i) {
dout_r[i] = saturate_cast<int8_t>(roundf(in_scale * din_r[i]));
}
}
}
void int32_to_int32(const int* din, int* dout, const float* scale,
int axis_size, int64_t outer_size, int64_t inner_size) {
int size_all = outer_size * axis_size * inner_size;
memmove(dout, din, size_all * sizeof(int));
}
template <>
void int32_to_dtype(const int* din, float* dout, const float* scale,
int axis_size, int64_t outer_size, int64_t inner_size) {
return int32_to_fp32(din, dout, scale, axis_size, outer_size, inner_size);
}
template <>
void int32_to_dtype(const int* din, signed char* dout, const float* scale,
int axis_size, int64_t outer_size, int64_t inner_size) {
return int32_to_int8(din, dout, scale, axis_size, outer_size, inner_size);
}
template <>
void int32_to_dtype(const int* din, int* dout, const float* scale,
int axis_size, int64_t outer_size, int64_t inner_size) {
return int32_to_int32(din, dout, scale, axis_size, outer_size, inner_size);
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
......@@ -52,8 +52,11 @@ cc_library(mir_passes
# X86_DEPS mul_compute_x86
# )
lite_cc_library(pattern_matcher_lite SRCS pattern_matcher.cc DEPS mir_node mir_ssa_graph op_lite)
set(pattern_deps mir_node mir_ssa_graph op_lite)
if (WITH_TESTING)
list(APPEND pattern_deps gtest)
endif()
lite_cc_library(pattern_matcher_lite SRCS pattern_matcher.cc DEPS ${pattern_deps})
lite_cc_test(test_pattern_matcher_lite SRCS pattern_matcher_test.cc DEPS pattern_matcher_lite)
lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc DEPS pattern_matcher_lite)
......
......@@ -16,6 +16,7 @@ cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_a
cc_library(split_compute_arm SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(concat_compute_arm SRCS concat_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(dropout_compute_arm SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(calib_compute_arm SRCS calib_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(transpose_compute_arm SRCS transpose_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm)
......@@ -30,6 +31,7 @@ lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm)
lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm)
lite_cc_test(test_concat_compute_arm SRCS concat_compute_test.cc DEPS concat_compute_arm)
lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm)
lite_cc_test(test_calib_compute_arm SRCS calib_compute_test.cc DEPS calib_compute_arm)
lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS transpose_compute_arm)
set(arm_kernels
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/arm/calib_compute.h"
#include <vector>
#include "paddle/fluid/lite/arm/math/type_trans.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
void CalibCompute::Run() {
auto& param = this->Param<operators::CalibParam>();
std::vector<float> scale = {param.in_scale};
if (param.in_dtype == PRECISION(kFloat) &&
param.out_dtype == PRECISION(kInt8)) {
const auto* din = param.input->data<float>();
auto* dout = param.output->mutable_data<signed char>();
lite::arm::math::fp32_to_int8(din, dout, scale.data(), 1, 1,
param.input->numel());
return;
}
if (param.in_dtype == PRECISION(kInt8) &&
param.out_dtype == PRECISION(kFloat)) {
const auto* din = param.input->data<signed char>();
auto* dout = param.output->mutable_data<float>();
lite::arm::math::int8_to_fp32(din, dout, scale.data(), 1, 1,
param.input->numel());
return;
}
LOG(FATAL) << "Unsupport Dtype.";
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(calib, kARM, kInt8, kNCHW,
paddle::lite::kernels::arm::CalibCompute, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/operators/calib_op.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class CalibCompute : public KernelLite<TARGET(kARM), PRECISION(kInt8)> {
public:
using param_t = operators::CalibParam;
void Run() override;
~CalibCompute() override{};
private:
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/arm/calib_compute.h"
#include <gtest/gtest.h>
#include <algorithm>
#include <iostream>
#include <memory>
#include <random>
#include <utility>
#include <vector>
#include "paddle/fluid/lite/arm/math/funcs.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
static int get_rand(int start, int end) {
int i = rand(); // NOLINT
i = (i % (end - start)) + start;
return i;
}
static void int8_to_fp32_basic(const int8_t* din, float* dout,
const float* scale, int axis_size,
int64_t outer_size, int64_t inner_size) {
int loop_size = axis_size * outer_size;
for (int i = 0; i < loop_size; ++i) {
float scale_in = scale[i % axis_size];
for (int j = 0; j < inner_size; ++j) {
dout[j] = din[j] * scale_in;
}
dout += inner_size;
din += inner_size;
}
}
static void fp32_to_int8_basic(const float* din, int8_t* dout,
const float* scale, int axis_size,
int64_t outer_size, int64_t inner_size) {
int loop_size = axis_size * outer_size;
for (int i = 0; i < loop_size; ++i) {
float inv_scale = 1.f / scale[i % axis_size];
for (int j = 0; j < inner_size; ++j) {
dout[j] = static_cast<int8_t>(roundf(din[j] * inv_scale));
}
dout += inner_size;
din += inner_size;
}
}
void calib_ref(const operators::CalibParam& param) {
std::vector<float> scale = {param.in_scale};
if (param.in_dtype == PRECISION(kFloat) &&
param.out_dtype == PRECISION(kInt8)) {
const auto* din = param.input->data<float>();
auto* dout = param.output->mutable_data<signed char>();
fp32_to_int8_basic(din, dout, scale.data(), 1, 1, param.input->numel());
return;
}
if (param.in_dtype == PRECISION(kInt8) &&
param.out_dtype == PRECISION(kFloat)) {
const auto* din = param.input->data<signed char>();
auto* dout = param.output->mutable_data<float>();
int8_to_fp32_basic(din, dout, scale.data(), 1, 1, param.input->numel());
return;
}
LOG(FATAL) << "Unsupport Dtype.";
}
TEST(calib_arm, retrive_op) {
auto calib =
KernelRegistry::Global()
.Create<TARGET(kARM), PRECISION(kInt8), DATALAYOUT(kNCHW)>("calib");
ASSERT_FALSE(calib.empty());
ASSERT_TRUE(calib.front());
}
TEST(calib_arm, init) {
CalibCompute calib;
ASSERT_EQ(calib.precision(), PRECISION(kInt8));
ASSERT_EQ(calib.target(), TARGET(kARM));
}
TEST(calib_arm, int8_to_fp32) {
DeviceInfo::Init();
for (auto n : {1, 2}) {
for (auto c : {6, 32 /*, 128*/}) {
for (auto h : {9, 18 /*, 56 , 112, 224, 512*/}) {
for (auto w : {9, 18 /*, 56, 112, 224, 512*/}) {
Tensor x;
Tensor output;
Tensor output_ref;
// set the dims of input, output, ref output tensors
x.Resize({n, c, h, w});
output.Resize({n, c, h, w});
output_ref.Resize({n, c, h, w});
// initialize the data of input tensors
auto* x_data = x.mutable_data<char>();
auto* output_data = output.mutable_data<float>();
for (int i = 0; i < x.dims().production(); i++) {
float sign = i % 3 == 0 ? -1.0f : 1.0f;
x_data[i] = sign * static_cast<float>(i % 128) * 0.013f;
}
// prepare kernel params and run
CalibCompute calib;
std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<ARMContext>();
calib.SetContext(std::move(ctx));
operators::CalibParam param;
param.in_scale = get_rand(0, 100) * 0.1f;
param.in_dtype = PRECISION(kInt8);
param.out_dtype = PRECISION(kFloat);
param.input = &x;
param.output = &output;
calib.SetParam(param);
calib.Launch();
// invoking ref implementation and compare results
param.output = &output_ref;
calib_ref(param);
auto* output_ref_data = output_ref.mutable_data<float>();
for (int i = 0; i < output.dims().production(); i++) {
EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
}
}
}
}
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, def);
......@@ -44,4 +44,5 @@ REGISTER_LITE_KERNEL(dropout, kARM, kFloat, kNCHW,
.BindInput("dropout_prob", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("dropout_implementation", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Mask", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
......@@ -47,6 +47,8 @@ USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def);
#endif
#ifdef LITE_WITH_CUDA
......
......@@ -31,13 +31,13 @@ namespace x86 {
template <typename T>
class ReluCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ReluParam;
using param_t = operators::ActivationParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
auto n = param.input->dims().production();
const float* input = param.input->data<float>();
float* output = param.output->mutable_data<float>();
auto n = param.X->dims().production();
const float* input = param.X->data<float>();
float* output = param.Out->mutable_data<float>();
for (int i = 0; i < n; i++) {
output[i] = std::max(0.f, input[i]);
}
......
......@@ -53,10 +53,10 @@ TEST(relu_x86, run_test) {
}
// ReluCompute relu;
ReluCompute<float> relu;
operators::ReluParam param;
operators::ActivationParam param;
param.input = &x;
param.output = &out;
param.X = &x;
param.Out = &out;
relu.SetParam(param);
relu.Run();
......
......@@ -21,6 +21,7 @@ cc_library(fill_constant_op_lite SRCS fill_constant_op.cc DEPS ${op_DEPS})
cc_library(op_params_lite SRCS op_params.cc DEPS ${tensor_lite} any_lite framework_proto_lite)
cc_library(dropout_op_lite SRCS dropout_op.cc DEPS ${op_DEPS})
cc_library(concat_op_lite SRCS concat_op.cc DEPS ${op_DEPS})
cc_library(calib_op_lite SRCS calib_op.cc DEPS ${op_DEPS})
cc_library(split_op_lite SRCS split_op.cc DEPS ${op_DEPS})
cc_library(transpose_op_lite SRCS transpose_op.cc DEPS ${op_DEPS})
cc_library(fake_quant SRCS fake_quantize_moving_avg_max_abs.cc DEPS ${op_DEPS})
......@@ -46,6 +47,7 @@ set(ops_lite
activation_ops_lite
dropout_op_lite
concat_op_lite
calib_op_lite
split_op_lite
transpose_op_lite
fake_quant
......@@ -64,6 +66,7 @@ lite_cc_test(test_softmax_op_lite SRCS softmax_op_test.cc DEPS softmax_op_lite m
lite_cc_test(test_reshape_op_lite SRCS reshape_op_test.cc DEPS reshape_op_lite memory_lite)
lite_cc_test(test_batch_norm_op_lite SRCS batch_norm_op_test.cc DEPS batch_norm_op_lite memory_lite)
lite_cc_test(test_concat_op_lite SRCS concat_op_test.cc DEPS concat_op_lite memory_lite)
lite_cc_test(test_calib_op_lite SRCS calib_op_test.cc DEPS calib_op_lite memory_lite ARM_DEPS calib_compute_arm)
lite_cc_test(test_fusion_elementwise_activation_ops_lite
SRCS fusion_elementwise_activation_ops_test.cc
DEPS fusion_elementwise_activation_ops_lite memory_lite)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/operators/calib_op.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool CalibOpLite::CheckShape() const {
CHECK_OR_FALSE(param_.input);
CHECK_OR_FALSE(param_.output);
return true;
}
bool CalibOpLite::InferShape() const {
param_.output->Resize(param_.input->dims());
return true;
}
bool CalibOpLite::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
auto x_var = scope->FindVar(opdesc.Input("Input").front());
auto output_var = scope->FindVar(opdesc.Output("Out").front());
CHECK(x_var);
CHECK(output_var);
param_.input = const_cast<lite::Tensor *>(&(x_var->Get<lite::Tensor>()));
param_.output = output_var->GetMutable<lite::Tensor>();
std::vector<std::string> input_arg_names = opdesc.InputArgumentNames();
param_.in_dtype =
static_cast<lite::PrecisionType>(opdesc.GetAttr<int>("in_dtype"));
param_.out_dtype =
static_cast<lite::PrecisionType>(opdesc.GetAttr<int>("out_dtype"));
if (opdesc.HasAttr("in_scale")) {
param_.in_scale = opdesc.GetAttr<float>("in_scale");
}
CHECK(param_.input) << "Input(X) of CalibOp should not be null.";
CHECK(param_.output) << "Output(Out) of CalibOp should not be null.";
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(calib, paddle::lite::operators::CalibOpLite);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_lite.h"
#include "paddle/fluid/lite/core/scope.h"
#include "paddle/fluid/lite/operators/op_params.h"
#include "paddle/fluid/lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
/*
* The data types used by the two adjacent layers in the model should
* be the same. When the two operators accept different data types,
* we may need to implicitly add a data type conversion operator.
* Currently, this operator only supports mutual conversion of int8
* and float32 types.
*/
class CalibOpLite : public OpLite {
public:
CalibOpLite() {}
explicit CalibOpLite(const std::string &type) : OpLite(type) {}
bool CheckShape() const override;
bool InferShape() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope);
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "calib"; }
private:
mutable CalibParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
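A minimal, self-contained sketch of the fp32 <-> int8 conversion rule that the calib operator described above performs, assuming a single per-tensor scale as in CalibParam::in_scale. This is illustrative only: it mirrors the scalar reference paths (fp32_to_int8_basic / int8_to_fp32_basic in the test below), not the committed NEON kernels.

// Illustrative sketch only -- mirrors the scalar reference implementations,
// not part of the committed sources.
#include <cmath>
#include <cstdint>
#include <vector>

int main() {
  const float in_scale = 0.1f;  // per-tensor scale (CalibParam::in_scale)
  std::vector<float> fp32_in = {0.13f, -0.47f, 0.99f};
  std::vector<int8_t> int8_out(fp32_in.size());
  std::vector<float> fp32_back(fp32_in.size());

  // fp32 -> int8: divide by the scale and round to nearest.
  for (size_t i = 0; i < fp32_in.size(); ++i) {
    int8_out[i] = static_cast<int8_t>(std::roundf(fp32_in[i] / in_scale));
  }
  // int8 -> fp32: multiply back by the scale.
  for (size_t i = 0; i < int8_out.size(); ++i) {
    fp32_back[i] = int8_out[i] * in_scale;
  }
  return 0;
}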
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/operators/calib_op.h"
#include <gtest/gtest.h>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
#ifdef LITE_WITH_ARM
TEST(calib_op_lite, TestARM) {
// prepare variables
Scope scope;
auto* x = scope.Var("Input")->GetMutable<Tensor>();
auto* output = scope.Var("output")->GetMutable<Tensor>();
x->Resize(DDim(std::vector<int64_t>({1, 10, 20})));
output->Resize(DDim(std::vector<int64_t>{1, 10, 20}));
// set data
for (int i = 0; i < 10 * 20; i++) {
x->mutable_data<float>()[i] = i;
}
for (int i = 0; i < 10 * 20; i++) {
output->mutable_data<float>()[i] = 0.;
}
// prepare op desc
cpp::OpDesc desc;
desc.SetType("calib");
desc.SetInput("Input", {"Input"});
desc.SetOutput("Out", {"output"});
desc.SetAttr("in_dtype", static_cast<int>(PRECISION(kInt8)));
desc.SetAttr("out_dtype", static_cast<int>(PRECISION(kFloat)));
desc.SetAttr("in_scale", 10.0f);
CalibOpLite calib("calib");
calib.SetValidPlaces({Place{TARGET(kARM), PRECISION(kInt8)}});
calib.Attach(desc, &scope);
auto kernels = calib.CreateKernels({Place{TARGET(kARM), PRECISION(kInt8)}});
ASSERT_FALSE(kernels.empty());
}
#endif
} // namespace operators
} // namespace lite
} // namespace paddle
#ifdef LITE_WITH_ARM
USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, def);
#endif
......@@ -52,13 +52,16 @@ class DropoutOpLite : public OpLite {
param_.mask = GetMutableVar<lite::Tensor>(scope, Mask);
param_.dropout_prob = op_desc.GetAttr<float>("dropout_prob");
if (op_desc.HasAttr("is_test")) {
param_.is_test = op_desc.GetAttr<bool>("is_test");
}
param_.is_test = true;
// TODO(sangoly): `is_test` has different attr type in x86 and arm, set
// `true` now.
// if (op_desc.HasAttr("is_test")) {
// param_.is_test = op_desc.GetAttr<bool>("is_test");
// }
param_.fix_seed = op_desc.GetAttr<bool>("fix_seed");
param_.seed = op_desc.GetAttr<int>("seed");
param_.dropout_implementation =
op_desc.GetAttr<int>("dropout_implementation");
op_desc.GetAttr<std::string>("dropout_implementation");
return true;
}
......
......@@ -32,6 +32,7 @@ class ElementwiseOp : public OpLite {
bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override;
void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "elementwise_op"; }
private:
......
......@@ -20,9 +20,29 @@ namespace paddle {
namespace lite {
namespace operators {
bool FusionElementwiseActivationOp::CheckShape() const {
CHECK_OR_FALSE(param_.X);
CHECK_OR_FALSE(param_.Y);
CHECK_OR_FALSE(param_.Out);
return true;
}
bool FusionElementwiseActivationOp::InferShape() const {
CHECK_OR_FALSE(param_.X->dims().size() >= param_.Y->dims().size());
param_.Out->Resize(param_.X->dims());
return true;
}
bool FusionElementwiseActivationOp::AttachImpl(const cpp::OpDesc& opdesc,
lite::Scope* scope) {
ElementwiseOp::AttachImpl(opdesc, scope);
auto X_name = opdesc.Input("X").front();
auto Y_name = opdesc.Input("Y").front();
auto Out_name = opdesc.Output("Out").front();
param_.X = GetVar<lite::Tensor>(scope, X_name);
param_.Y = GetVar<lite::Tensor>(scope, Y_name);
param_.Out = GetMutableVar<lite::Tensor>(scope, Out_name);
param_.axis = opdesc.GetAttr<int>("axis");
param_.act_type = opdesc.GetAttr<std::string>("act_type");
// TODO(sangoly): support more activation types.
CHECK(param_.act_type == "relu") << "Only relu activation be supported now";
......@@ -31,9 +51,31 @@ bool FusionElementwiseActivationOp::AttachImpl(const cpp::OpDesc& opdesc,
}
#ifdef LITE_WITH_X86
bool FusionElementwiseActivationGradExplicitOp::CheckShape() const {
CHECK_OR_FALSE(param_.Y);
CHECK_OR_FALSE(param_.X_grad);
CHECK_OR_FALSE(param_.Y_grad);
CHECK_OR_FALSE(param_.Out_grad);
return true;
}
bool FusionElementwiseActivationGradExplicitOp::InferShape() const {
param_.X_grad->Resize(param_.Out_grad->dims());
param_.Y_grad->Resize(param_.Y->dims());
return true;
}
bool FusionElementwiseActivationGradExplicitOp::AttachImpl(
const cpp::OpDesc& opdesc, lite::Scope* scope) {
ElementwiseGradExplicitOp::AttachImpl(opdesc, scope);
CHECK_EQ(opdesc.InputArgumentNames().size(), 1UL);
auto Out_name = opdesc.Input(framework::GradVarName("Out")).front();
auto X_name = opdesc.Output(framework::GradVarName("X")).front();
auto Y_name = opdesc.Output(framework::GradVarName("Y")).front();
param_.Out_grad = GetVar<lite::Tensor>(scope, Out_name);
param_.X_grad = GetMutableVar<lite::Tensor>(scope, X_name);
param_.Y_grad = GetMutableVar<Tensor>(scope, Y_name);
param_.axis = opdesc.GetAttr<int>("axis");
param_.act_type = opdesc.GetAttr<std::string>("act_type");
// TODO(sangoly): support more activation types.
CHECK(param_.act_type == "relu") << "Only relu activation be supported now";
......
......@@ -22,13 +22,19 @@ namespace paddle {
namespace lite {
namespace operators {
class FusionElementwiseActivationOp : public ElementwiseOp {
class FusionElementwiseActivationOp : public OpLite {
public:
explicit FusionElementwiseActivationOp(const std::string& type)
: ElementwiseOp(type) {}
: OpLite(type) {}
bool CheckShape() const override;
bool InferShape() const override;
bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override;
void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override {
return "fusion_elementwise_activation_op";
}
......@@ -38,14 +44,19 @@ class FusionElementwiseActivationOp : public ElementwiseOp {
};
#ifdef LITE_WITH_X86
class FusionElementwiseActivationGradExplicitOp
: public ElementwiseGradExplicitOp {
class FusionElementwiseActivationGradExplicitOp : public OpLite {
public:
explicit FusionElementwiseActivationGradExplicitOp(const std::string& type)
: ElementwiseGradExplicitOp(type) {}
: OpLite(type) {}
bool CheckShape() const override;
bool InferShape() const override;
bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override;
void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override {
return "fusion_elementwise_activation_grad_explicit_op";
}
......
......@@ -48,6 +48,14 @@ struct IoCopyParam {
lite::Tensor* y{};
};
struct CalibParam {
const lite::Tensor* input{};
lite::Tensor* output{};
float in_scale;
PrecisionType in_dtype;
PrecisionType out_dtype;
};
/// -------------------------- NN operators ------------------------------------
struct FcParam {
......@@ -60,11 +68,6 @@ struct FcParam {
bool weight_transposed{false};
};
struct ReluParam {
lite::Tensor* input{};
lite::Tensor* output{};
};
// For Mul Op
struct MulParam {
const lite::Tensor* x{};
......
......@@ -21,22 +21,22 @@ namespace operators {
bool ReluOp::CheckShape() const { return true; }
bool ReluOp::InferShape() const {
CHECK_OR_FALSE(param_.input);
CHECK_OR_FALSE(param_.output);
CHECK_OR_FALSE(param_.X);
CHECK_OR_FALSE(param_.Out);
// TODO(Superjomn) Enable data sharing.
param_.output->Resize(param_.input->dims());
param_.Out->Resize(param_.X->dims());
// share lod
// param_.output->set_lod(param_.input->lod());
// param_.output->set_lod(param_.X->lod());
return true;
}
bool ReluOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
param_.input = const_cast<lite::Tensor *>(
param_.X = const_cast<lite::Tensor *>(
&scope->FindVar(opdesc.Input("X").front())->Get<lite::Tensor>());
param_.output =
param_.Out =
scope->FindVar(opdesc.Output("Out").front())->GetMutable<lite::Tensor>();
CHECK(param_.input);
CHECK(param_.output);
CHECK(param_.X);
CHECK(param_.Out);
return true;
}
......
......@@ -38,7 +38,7 @@ class ReluOp : public OpLite {
std::string DebugString() const override { return "relu"; }
private:
mutable ReluParam param_;
mutable ActivationParam param_;
};
} // namespace operators
......
......@@ -34,3 +34,4 @@ USE_LITE_OP(conv2d)
USE_LITE_OP(depthwise_conv2d)
USE_LITE_OP(pool2d)
USE_LITE_OP(batch_norm)
USE_LITE_OP(fusion_elementwise_sub_activation)
......@@ -99,7 +99,7 @@ function test_arm_android {
echo "test name: ${test_name}"
adb_work_dir="/data/local/tmp"
skip_list=("test_model_parser_lite" "test_cxx_api_lite")
skip_list=("test_model_parser_lite" "test_mobilenetv1_lite" "test_mobilenetv2_lite" "test_resnet50_lite" "test_inceptionv4_lite")
for skip_name in ${skip_list[@]} ; do
[[ $skip_name =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && echo "skip $test_name" && return
done
......@@ -136,7 +136,7 @@ function test_arm_model {
adb -s emulator-${port} push ${testpath} ${adb_work_dir}
adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}"
local adb_model_path="${adb_work_dir}/`basename ${model_dir}`"
adb -s emulator-${port} shell "${adb_work_dir}/${test_name} --eval_model_dir=$adb_model_path"
adb -s emulator-${port} shell "${adb_work_dir}/${test_name} --model_dir=$adb_model_path"
}
......@@ -305,8 +305,8 @@ function build_test_arm_subtask_armlinux {
echo "Done"
}
# sub-task3
function build_test_arm_subtask3_mobilenet_v2 {
# sub-task-model
function build_test_arm_subtask_model {
local port_armv8=5554
local port_armv7=5556
# We just test following single one environment to limit the CI time.
......@@ -314,17 +314,20 @@ function build_test_arm_subtask3_mobilenet_v2 {
local abi=armv8
local lang=gcc
local test_name=$1
local model_name=$2
cur_dir=$(pwd)
build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}
mkdir -p $build_dir
cd $build_dir
cmake_arm $os $abi $lang
make test_cxx_api_lite -j$NUM_CORES_FOR_COMPILE
make $test_name -j$NUM_CORES_FOR_COMPILE
prepare_emulator $port_armv8 $port_armv7
# just test the model on armv8
test_arm_model "test_cxx_api_lite" $port_armv8 "./third_party/install/mobilenet_v2_relu"
test_arm_model $test_name $port_armv8 "./third_party/install/$model_name"
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
echo "Done"
......@@ -441,8 +444,20 @@ function main {
build_test_arm_subtask_armlinux
shift
;;
build_test_arm_model1)
build_test_arm_subtask3_mobilenet_v2
build_test_arm_model_mobilenetv1)
build_test_arm_subtask_model test_mobilenetv1_lite mobilenet_v1
shift
;;
build_test_arm_model_mobilenetv2)
build_test_arm_subtask_model test_mobilenetv2_lite mobilenet_v2
shift
;;
build_test_arm_model_resnet50)
build_test_arm_subtask_model test_resnet50_lite resnet50
shift
;;
build_test_arm_model_inceptionv4)
build_test_arm_subtask_model test_inceptionv4_lite inception_v4
shift
;;
check_style)
......