欢迎来到 Paddle-Mobile GitHub 项目。
## 简单搜索线上效果
## Demo目录
欢迎来到 Paddle-Mobile GitHub 项目。Paddle-Mobile 是 PaddlePaddle 组织下的项目,是一个致力于嵌入式平台的深度学习框架。
## Features
- **ARM CPU**
- **Mali GPU**
- **苹果设备的GPU Metal实现**
- **FPGA**
目前已经支持 ZCU102 开发板。
- **灵活性**
* paddle-mobile cpu版不依赖任何第三方库, 可进行快速集成。
* 使用泛型特化进行平台切换, 可灵活切换 cpu、gpu 和其他协处理器。
* 可针对特定的常见网络, 只编译所需的 op, 降低编译时间, 减小包大小。
* 使用 docker 编译, 提供统一的编译环境。
* 高可拓展性, 方便拓展其他协处理器, 提供高性能 arm 算子实现, 方便其他协处理器开发者集成开发。
* 直接兼容 paddle-fluid 模型, 不需要额外的转换操作。
- **体积**
- 高性能支持ARM CPU
- 支持Mali GPU
- 支持Adreno GPU
- 支持苹果设备的GPU Metal实现
- 支持ZU5、ZU9等FPGA开发板
- 支持树莓派等arm-linux开发板
## Demo目录
## 文档
## 文档
### 1. 直接使用Paddle Fluid训练
### 2. caffe转为Paddle Fluid模型
### 3. ONNX
ONNX全称为“Open Neural Network Exchange”,即“开放的神经网络交换”。该项目的目的是让不同的神经网络开发框架做到互通互用。
除直接使用PaddlePaddle训练fluid版本的模型外,还可以通过onnx转换得到个别Paddle fluid模型。
### 4. 部分测试模型和测试图片下载
## 问题解决
......@@ -97,5 +74,3 @@ Paddle-Mobile 提供相对宽松的Apache-2.0开源协议 [Apache-2.0 license](L
## 旧版 Mobile-Deep-Learning
原MDL(Mobile-Deep-Learning)工程被迁移到了这里 [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning)
......@@ -44,6 +44,7 @@ const char *G_OP_TYPE_RESHAPE2 = "reshape2";
// Operator type name constants: each G_OP_TYPE_* pointer refers to the string
// name of one operator type (e.g. "transpose2").
const char *G_OP_TYPE_SIGMOID = "sigmoid";
const char *G_OP_TYPE_SOFTMAX = "softmax";
const char *G_OP_TYPE_TRANSPOSE = "transpose";
const char *G_OP_TYPE_TRANSPOSE2 = "transpose2";
const char *G_OP_TYPE_SPLIT = "split";
const char *G_OP_TYPE_FEED = "feed";
const char *G_OP_TYPE_FETCH = "fetch";
......@@ -91,6 +92,7 @@ std::unordered_map<
{G_OP_TYPE_FEED, {{"X"}, {"Out"}}},
{G_OP_TYPE_FETCH, {{"X"}, {"Out"}}},
{G_OP_TYPE_TRANSPOSE, {{"X"}, {"Out"}}},
{G_OP_TYPE_TRANSPOSE2, {{"X"}, {"Out", "XShape"}}},
{{"PriorBox", "PriorBoxVar", "TargetBox"}, {"OutputBox"}}},
{G_OP_TYPE_FUSION_CONV_ADD_BN_RELU, {{"Input"}, {"Out"}}},
......@@ -115,6 +115,9 @@ LOAD_OP2(reshape2, CPU, MALI_GPU);
LOAD_OP1(transpose, CPU);
LOAD_OP1(transpose2, CPU);
LOAD_OP1(prior_box, CPU);
......@@ -35,7 +35,7 @@ template <>
void Im2SequenceKernel<CPU, float>::Compute(
const Im2SequenceParam<CPU> &param) const {
const Tensor *in_x = param.Input();
Tensor *out = param.Output();
framework::LoDTensor *out = param.Output();
std::vector<int> kernels = param.Kernels();
......@@ -52,22 +52,31 @@ void Im2SequenceKernel<CPU, float>::Compute(
paddings[2], strides[0]);
int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
paddings[3], strides[1]);
const std::vector<int> dilations({1, 1});
out->mutable_data<float>({batch_size * output_height * output_width,
img_channels * kernels[0] * kernels[1]});
const std::vector<int> dilations({1, 1});
// TODO: verify
auto out_dims = out->dims();
out->Resize({batch_size, out->numel() / batch_size});
for (int i = 0; i < batch_size; i++) {
const Tensor src =
in_x->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
Tensor dst = out->Slice(i, i + 1).Resize(
{output_height, output_width, img_channels, kernels[0], kernels[1]});
math::Im2ColFunctor<math::ColFormat::kOCF, CPU, float> f;
f(src, dilations, strides, paddings, &dst);
framework::LoD lod(1);
lod[0].reserve(batch_size + 1);
int offset = 0;
for (int i = 0; i < batch_size; ++i) {
offset += output_height * output_width;
template class Im2SequenceKernel<CPU, float>;
#include "operators/kernel/transpose2_kernel.h"
#include "operators/kernel/central-arm-func/transpose2_arm_func.h"
namespace paddle_mobile {
namespace operators {
// CPU/float specialization of Init: performs no setup in the visible body and
// always returns true. `param` is not used here.
template <>
bool Transpose2Kernel<CPU, float>::Init(Transpose2Param<CPU> *param) {
return true;
template <>
void Transpose2Kernel<CPU, float>::Compute(
const Transpose2Param<CPU> &param) const {
} // namespace operators
} // namespace paddle_mobile
......@@ -29,10 +29,9 @@ void FusionFcCompute(const FusionFcParam<CPU> &param) {
auto *input_z_data = input_z->data<float>();
int axis = param.Axis();
Tensor *out = param.Out();
auto *out_data = out->mutable_data<float>();
// int m = out->dims()[0];
// int n = out->dims()[1];
auto *out_data = out->mutable_data<float>();
const Tensor x_matrix =
input_x->dims().size() > 2
? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
......@@ -83,6 +83,7 @@ void PoolCompute(const PoolParam<CPU> &param) {
#if __aarch64__
PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
/// todo: fix bug in Pool2x2
if (pooling_type == "max") {
math::Pool2x2Maxs2p0(strides, paddings, in_x, out);
} else if (pooling_type == "avg") {
......@@ -24,6 +24,7 @@ void SoftmaxCompute(const SoftmaxParam<CPU> &param) {
Tensor *out = param.Out();
auto x_dims = in_x->dims();
math::SoftmaxFuntor<CPU, float>()(in_x, out);
} // namespace operators
#pragma once
#include <vector>
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
// Copies the float elements of param.InputX() into param.Out(), writing the
// output sequentially while reading the input in the dimension order given by
// param.Axis().
// NOTE(review): the template parameter P is not referenced in the visible body.
// NOTE(review): this view appears to be missing lines (closing braces, and
// possibly statements inside the loops) - confirm against the full file.
template <typename P>
void Transpose2Compute(const Transpose2Param<CPU>& param) {
const auto* input_x = param.InputX();
const auto input_x_dims = input_x->dims();
auto* out = param.Out();
const auto axis = param.Axis();
const auto* input_x_data = input_x->data<float>();
auto* out_data = out->mutable_data<float>();
// Number of dimensions is taken from the axis list, not from the input dims.
size_t ndim = axis.size();
// Per-dimension tables stored in reverse order: slot j = ndim - 1 - i holds
// the data for axis[i], so slot 0 describes the last entry of `axis`.
std::vector<int> xdim(ndim);
std::vector<int> xstride(ndim);
std::vector<int> xout(ndim);
// NOTE(review): `i`/`k` are int while `ndim` is size_t (signed/unsigned compare).
for (int i = 0; i < ndim; i++) {
int j = ndim - 1 - i;
// Extent of the input dimension selected by axis[i].
xdim[j] = input_x_dims[axis[i]];
// Product of the input extents that follow axis[i]; presumably the element
// stride of that dimension for a densely packed input - TODO confirm layout.
xstride[j] = 1;
for (int k = axis[i] + 1; k < ndim; k++) {
xstride[j] *= input_x_dims[k];
// Total input offset covered by stepping through this dimension completely;
// subtracted again below when the counter for this slot wraps.
xout[j] = xstride[j] * xdim[j];
auto numel = input_x->numel();
// pind is the current read offset into the input; ind is a per-slot counter.
size_t pind = 0;
std::vector<int> ind(ndim);
for (int i = 0; i < numel; i++) {
out_data[i] = input_x_data[pind];
// Advance along slot 0 (the last entry of `axis`).
// NOTE(review): no increment of ind[0] is visible in this view - verify.
pind += xstride[0];
// Carry propagation: when a slot's counter reaches its extent, reset it,
// bump the next slot, and move pind from the end of this dimension to the
// next position of the following one.
for (int j = 0; j < ndim - 1; j++) {
if (ind[j] == xdim[j]) {
ind[j + 1]++;
ind[j] = 0;
pind += xstride[j + 1];
pind -= xout[j];
} else {
} // namespace operators
} // namespace paddle_mobile
#pragma once
#include <vector>
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
// Kernel class for the transpose2 operator, templated on the device type and
// an element type T. Derives from framework::OpKernelBase bound to
// Transpose2Param<DeviceType>.
template <typename DeviceType, typename T>
class Transpose2Kernel
: public framework::OpKernelBase<DeviceType, Transpose2Param<DeviceType>> {
// Executes the operator for the given parameters; const, so it does not
// modify the kernel object itself.
void Compute(const Transpose2Param<DeviceType>& param) const;
// Setup hook taking a mutable parameter pointer; returns a bool status.
bool Init(Transpose2Param<DeviceType>* param);
} // namespace operators
} // namespace paddle_mobile
......@@ -257,8 +257,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
const int h = static_cast<int>(input->dims()[2]);
const int w = static_cast<int>(input->dims()[3]);
const int l = h;
// const int l = h;
const int batch_size = static_cast<int>(input->dims()[0]);
const int c = static_cast<int>(input->dims()[1]);
const int hxw = h * w;
......@@ -271,7 +270,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
vbias = vdupq_n_f32(bias_data[j]);
int l_mid = l - 2; // l=1->l_mid=-1,l=2->l_mid=0
int w_mid = w - 2; // l=1->l_mid=-1,l=2->l_mid=0
float w00 = filter_data_tmp[0];
float w01 = filter_data_tmp[1];
float w02 = filter_data_tmp[2];
......@@ -283,39 +282,38 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
float w22 = filter_data_tmp[8];
output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
w21 * input_data[l] + w22 * input_data[l + 1];
output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] +
w20 * input_data[2 * l - 2] +
w21 * input_data[2 * l - 1];
output_data[(l - 1) * l] =
w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] +
w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1];
output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] +
w01 * input_data[(l - 2) * (l + 1) + 1] +
w10 * input_data[l * l - 2] +
w11 * input_data[l * l - 1];
w21 * input_data[w] + w22 * input_data[w + 1];
output_data[w - 1] = w10 * input_data[w - 2] + w11 * input_data[w - 1] +
w20 * input_data[2 * w - 2] +
w21 * input_data[2 * w - 1];
output_data[(h - 1) * w] =
w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w + 1] +
w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1];
output_data[h * w - 1] =
w00 * input_data[h * w - w - 2] + w01 * input_data[h * w - w - 1] +
w10 * input_data[h * w - 2] + w11 * input_data[h * w - 1];
if (if_bias) {
output_data[0] += bias_data[j];
output_data[l - 1] += bias_data[j];
output_data[(l - 1) * l] += bias_data[j];
output_data[l * l - 1] += bias_data[j];
for (int i = 1; i < l - 1; ++i) {
output_data[i * l] =
w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] +
w11 * input_data[i * l] + w12 * input_data[i * l + 1] +
w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1];
output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] +
w01 * input_data[i * l + l - 1 - l] +
w10 * input_data[i * l + l - 1 - 1] +
w11 * input_data[i * l + l - 1] +
w20 * input_data[i * l + l - 1 + l - 1] +
w21 * input_data[i * l + l - 1 + l];
output_data[w - 1] += bias_data[j];
output_data[(h - 1) * w] += bias_data[j];
output_data[h * w - 1] += bias_data[j];
for (int i = 1; i < h - 1; ++i) {
output_data[i * w] =
w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] +
w11 * input_data[i * w] + w12 * input_data[i * w + 1] +
w21 * input_data[i * w + w] + w22 * input_data[i * w + w + 1];
output_data[i * w + w - 1] = w00 * input_data[i * w + w - 1 - w - 1] +
w01 * input_data[i * w + w - 1 - w] +
w10 * input_data[i * w + w - 1 - 1] +
w11 * input_data[i * w + w - 1] +
w20 * input_data[i * w + w - 1 + w - 1] +
w21 * input_data[i * w + w - 1 + w];
if (if_bias) {
output_data[i * l] += bias_data[j];
output_data[i * l + l - 1] += bias_data[j];
output_data[i * w] += bias_data[j];
output_data[i * w + w - 1] += bias_data[j];
......@@ -325,15 +323,15 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2,
tmp3, tmp4, tmp5, out0;
in0 = vld1q_f32(input_tmp);
in2 = vld1q_f32(input_tmp + l);
const float *input_tmp_end = input_tmp + (l - 2) * l;
in2 = vld1q_f32(input_tmp + w);
const float *input_tmp_end = input_tmp + (h - 2) * w;
in4 = vld1q_f32(input_tmp_end);
in6 = vld1q_f32(input_tmp_end + l);
int c_mid = l_mid;
in6 = vld1q_f32(input_tmp_end + w);
int c_mid = w_mid;
auto output_ptr = output_data + 1;
for (; c_mid > 3; c_mid -= 4) {
in1 = vld1q_f32(input_tmp + 4);
in3 = vld1q_f32(input_tmp + l + 4);
in3 = vld1q_f32(input_tmp + w + 4);
tmp0 = vextq_f32(in0, in1, 1);
tmp1 = vextq_f32(in0, in1, 2);
......@@ -352,7 +350,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
vst1q_f32(output_ptr, out0);
in5 = vld1q_f32(input_tmp_end + 4);
in7 = vld1q_f32(input_tmp_end + l + 4);
in7 = vld1q_f32(input_tmp_end + w + 4);
tmp0 = vextq_f32(in4, in5, 1);
tmp1 = vextq_f32(in4, in5, 2);
......@@ -367,7 +365,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
out0 = vmlaq_n_f32(out0, tmp3, w12);
out0 = vaddq_f32(out0, vbias);
vst1q_f32(output_ptr + (l - 1) * l, out0);
vst1q_f32(output_ptr + (h - 1) * w, out0);
// can optimize to each 8 stride.
input_tmp += 4;
......@@ -380,8 +378,8 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
// top right pad
float32x4_t pad0 = vdupq_n_f32(input_data[l - 1]);
float32x4_t pad1 = vdupq_n_f32(input_data[2 * l - 1]);
float32x4_t pad0 = vdupq_n_f32(input_data[w - 1]);
float32x4_t pad1 = vdupq_n_f32(input_data[2 * w - 1]);
tmp0 = vextq_f32(in0, pad0, 1);
tmp1 = vextq_f32(in0, pad0, 2);
......@@ -409,8 +407,8 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
// bottom right pad
float32x4_t pad2 = vdupq_n_f32(input_data[l * l - 1 - l]);
float32x4_t pad3 = vdupq_n_f32(input_data[l * l - 1]);
float32x4_t pad2 = vdupq_n_f32(input_data[h * w - 1 - w]);
float32x4_t pad3 = vdupq_n_f32(input_data[h * w - 1]);
tmp0 = vextq_f32(in4, pad2, 1);
tmp1 = vextq_f32(in4, pad2, 2);
......@@ -427,28 +425,28 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
for (int i = 0; i < c_mid; ++i) {
if (i == 0) {
vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 0);
vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 0);
if (i == 1) {
vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 1);
vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 1);
if (i == 2) {
vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 2);
vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 2);
// mid
for (int i = 0; i < l - 2; ++i) {
auto output_ptr = output_data + (i + 1) * l + 1;
input_tmp = input_data + i * l;
for (int i = 0; i < h - 2; ++i) {
auto output_ptr = output_data + (i + 1) * w + 1;
input_tmp = input_data + i * w;
auto in0_tmp = vld1q_f32(input_tmp);
auto in2_tmp = vld1q_f32(input_tmp + l);
auto in4_tmp = vld1q_f32(input_tmp + l + l);
c_mid = l_mid;
auto in2_tmp = vld1q_f32(input_tmp + w);
auto in4_tmp = vld1q_f32(input_tmp + w + w);
c_mid = w_mid;
for (; c_mid > 3; c_mid -= 4) {
auto in1_tmp = vld1q_f32(input_tmp + 4);
auto in3_tmp = vld1q_f32(input_tmp + l + 4);
auto in5_tmp = vld1q_f32(input_tmp + l + l + 4);
auto in3_tmp = vld1q_f32(input_tmp + w + 4);
auto in5_tmp = vld1q_f32(input_tmp + w + w + 4);
tmp0 = vextq_f32(in0_tmp, in1_tmp, 1);
tmp1 = vextq_f32(in0_tmp, in1_tmp, 2);
......@@ -477,9 +475,9 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
in4_tmp = in5_tmp;
float32x4_t pad0 = vdupq_n_f32(input_data[i * l + l - 1]);
float32x4_t pad1 = vdupq_n_f32(input_data[i * l + l - 1 + l]);
float32x4_t pad2 = vdupq_n_f32(input_data[i * l + l - 1 + l + l]);
float32x4_t pad0 = vdupq_n_f32(input_data[i * w + w - 1]);
float32x4_t pad1 = vdupq_n_f32(input_data[i * w + w - 1 + w]);
float32x4_t pad2 = vdupq_n_f32(input_data[i * w + w - 1 + w + w]);
tmp0 = vextq_f32(in0_tmp, pad0, 1);
tmp1 = vextq_f32(in0_tmp, pad0, 2);
......@@ -539,8 +537,9 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
const int hxw = input_height * input_width;
const int l = input_height;
// const int l = input_height;
const int h = input_height;
const int w = input_width;
float32x4_t vzero = vdupq_n_f32(0);
for (int b = 0; b < batch_size; b++) {
......@@ -626,54 +625,53 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
w21 * input_data[l] + w22 * input_data[l + 1];
output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] +
w20 * input_data[2 * l - 2] +
w21 * input_data[2 * l - 1];
output_data[(l - 1) * l] =
w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] +
w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1];
output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] +
w01 * input_data[(l - 2) * (l + 1) + 1] +
w10 * input_data[l * l - 2] +
w11 * input_data[l * l - 1];
w21 * input_data[w] + w22 * input_data[w + 1];
output_data[w - 1] = w10 * input_data[w - 2] + w11 * input_data[w - 1] +
w20 * input_data[2 * w - 2] +
w21 * input_data[2 * w - 1];
output_data[(h - 1) * w] =
w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w + 1] +
w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1];
output_data[h * w - 1] =
w00 * input_data[h * w - w - 2] + w01 * input_data[h * w - w - 1] +
w10 * input_data[h * w - 2] + w11 * input_data[h * w - 1];
output_data[0] = output_data[0] * newscale_data[c] + newbias_data[c];
output_data[l - 1] =
output_data[l - 1] * newscale_data[c] + newbias_data[c];
output_data[(l - 1) * l] =
output_data[(l - 1) * l] * newscale_data[c] + newbias_data[c];
output_data[l * l - 1] =
output_data[l * l - 1] * newscale_data[c] + newbias_data[c];
output_data[w - 1] =
output_data[w - 1] * newscale_data[c] + newbias_data[c];
output_data[(h - 1) * w] =
output_data[(h - 1) * w] * newscale_data[c] + newbias_data[c];
output_data[h * w - 1] =
output_data[h * w - 1] * newscale_data[c] + newbias_data[c];
if (if_relu) {
output_data[0] = output_data[0] < 0 ? 0 : output_data[0];
output_data[l - 1] = output_data[l - 1] < 0 ? 0 : output_data[l - 1];
output_data[(l - 1) * l] =
output_data[(l - 1) * l] < 0 ? 0 : output_data[(l - 1) * l];
output_data[l * l - 1] =
output_data[l * l - 1] < 0 ? 0 : output_data[l * l - 1];
for (int i = 1; i < l - 1; ++i) {
output_data[i * l] =
w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] +
w11 * input_data[i * l] + w12 * input_data[i * l + 1] +
w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1];
output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] +
w01 * input_data[i * l + l - 1 - l] +
w10 * input_data[i * l + l - 1 - 1] +
w11 * input_data[i * l + l - 1] +
w20 * input_data[i * l + l - 1 + l - 1] +
w21 * input_data[i * l + l - 1 + l];
output_data[i * l] =
output_data[i * l] * newscale_data[c] + newbias_data[c];
output_data[i * l + l - 1] =
output_data[i * l + l - 1] * newscale_data[c] + newbias_data[c];
output_data[w - 1] = output_data[w - 1] < 0 ? 0 : output_data[w - 1];
output_data[(h - 1) * w] =
output_data[(h - 1) * w] < 0 ? 0 : output_data[(h - 1) * w];
output_data[h * w - 1] =
output_data[h * w - 1] < 0 ? 0 : output_data[h * w - 1];
for (int i = 1; i < h - 1; ++i) {
output_data[i * w] =
w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] +
w11 * input_data[i * w] + w12 * input_data[i * w + 1] +
w21 * input_data[i * w + w] + w22 * input_data[i * w + w + 1];
output_data[i * w + w - 1] = w00 * input_data[i * w + w - 1 - w - 1] +
w01 * input_data[i * w + w - 1 - w] +
w10 * input_data[i * w + w - 1 - 1] +
w11 * input_data[i * w + w - 1] +
w20 * input_data[i * w + w - 1 + w - 1] +
w21 * input_data[i * w + w - 1 + w];
output_data[i * w] =
output_data[i * w] * newscale_data[c] + newbias_data[c];
output_data[i * w + w - 1] =
output_data[i * w + w - 1] * newscale_data[c] + newbias_data[c];
if (if_relu) {
output_data[i * l] = output_data[i * l] < 0 ? 0 : output_data[i * l];
output_data[i * l + l - 1] =
output_data[i * l + l - 1] < 0 ? 0 : output_data[i * l + l - 1];
output_data[i * w] = output_data[i * w] < 0 ? 0 : output_data[i * w];
output_data[i * w + w - 1] =
output_data[i * w + w - 1] < 0 ? 0 : output_data[i * w + w - 1];
......@@ -776,7 +774,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
const int h = static_cast<int>(input->dims()[2]);
const int w = static_cast<int>(input->dims()[3]);
const int l = h;
// const int l = h;
const int batch_size = static_cast<int>(input->dims()[0]);
const int c = static_cast<int>(input->dims()[1]);
......@@ -792,7 +790,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
vnewbias = vdupq_n_f32(newbias_data[j]);
vnewscale = vdupq_n_f32(newscale_data[j]);
int l_mid = l - 2; // l=1->l_mid=-1,l=2->l_mid=0
int w_mid = w - 2; // l=1->l_mid=-1,l=2->l_mid=0
float w00 = filter_data_tmp[0];
float w01 = filter_data_tmp[1];
float w02 = filter_data_tmp[2];
......@@ -804,49 +802,49 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
float w22 = filter_data_tmp[8];
output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
w21 * input_data[l] + w22 * input_data[l + 1];
output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l -
1] + w20 * input_data[2 * l - 2] + w21 * input_data[2 * l - 1];
output_data[(l - 1) * l] =
w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l +
1] + w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1];
output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] +
w01 * input_data[(l - 2) * (l + 1) + 1] +
w10 * input_data[l * l - 2] +
w11 * input_data[l * l - 1];
w21 * input_data[w] + w22 * input_data[w + 1];
output_data[w - 1] = w10 * input_data[w - 2] + w11 * input_data[w -
1] + w20 * input_data[2 * w - 2] + w21 * input_data[2 * w - 1];
output_data[(h - 1) * w] =
w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w +
1] + w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1];
output_data[h * w - 1] = w00 * input_data[h*w-w-2] +
w01 * input_data[h*w-w-1] +
w10 * input_data[h * w - 2] +
w11 * input_data[h * w - 1];
output_data[0] = output_data[0] * newscale_data[j] +
newbias_data[j]; output_data[l - 1] = output_data[l - 1] *
newscale_data[j] + newbias_data[j]; output_data[(l - 1) * l] =
output_data[(l - 1) * l] * newscale_data[j] + newbias_data[j];
output_data[l * l - 1] =
output_data[l * l - 1] * newscale_data[j] + newbias_data[j];
newbias_data[j]; output_data[w - 1] = output_data[w - 1] *
newscale_data[j] + newbias_data[j]; output_data[(h - 1) * w] =
output_data[(h - 1) * w] * newscale_data[j] + newbias_data[j];
output_data[h * w - 1] =
output_data[h * w - 1] * newscale_data[j] + newbias_data[j];
if (if_relu) {
output_data[0] = output_data[0] < 0 ? 0 : output_data[0];
output_data[l - 1] = output_data[l - 1] < 0 ? 0 : output_data[l -
1]; output_data[(l - 1) * l] = output_data[(l - 1) * l] < 0 ? 0 :
output_data[(l - 1) * l]; output_data[l * l - 1] = output_data[l * l - 1]
< 0 ? 0 : output_data[l * l - 1];
for (int i = 1; i < l - 1; ++i) {
output_data[i * l] =
w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1]
+ w11 * input_data[i * l] + w12 * input_data[i * l + 1] + w21 *
input_data[i * l + l] + w22 * input_data[i * l + l + 1]; output_data[i *
l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] + w01 * input_data[i
* l + l - 1 - l] + w10 * input_data[i * l + l - 1 - 1] + w11 *
input_data[i * l + l - 1] + w20 * input_data[i * l + l - 1 + l - 1] + w21
* input_data[i * l + l - 1 + l]; output_data[i * l] = output_data[i * l]
* newscale_data[j] + newbias_data[j]; output_data[i * l + l - 1] =
output_data[i * l + l - 1] * newscale_data[j] +
output_data[w - 1] = output_data[w - 1] < 0 ? 0 : output_data[w -
1]; output_data[(h - 1) * w] = output_data[(h - 1) * w] < 0 ? 0 :
output_data[(h - 1) * w]; output_data[h * w - 1] = output_data[h * w - 1]
< 0 ? 0 : output_data[h * w - 1];
for (int i = 1; i < h - 1; ++i) {
output_data[i * w] =
w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1]
+ w11 * input_data[i * w] + w12 * input_data[i * w + 1] + w21 *
input_data[i * w + w] + w22 * input_data[i * w + w + 1]; output_data[i *
w + w - 1] = w00 * input_data[i * w + w - 1 - w - 1] + w01 * input_data[i
* w + w - 1 - w] + w10 * input_data[i * w + w - 1 - 1] + w11 *
input_data[i * w + w - 1] + w20 * input_data[i * w + w - 1 + w - 1] + w21
* input_data[i * w + w - 1 + w]; output_data[i * w] = output_data[i * w]
* newscale_data[j] + newbias_data[j]; output_data[i * w + w - 1] =
output_data[i * w + w - 1] * newscale_data[j] +
if (if_relu) {
output_data[i * l] = output_data[i * l] < 0 ? 0 : output_data[i
* l]; output_data[i * l + l - 1] = output_data[i * l + l - 1] < 0 ? 0 :
output_data[i * l + l - 1];
output_data[i * w] = output_data[i * w] < 0 ? 0 : output_data[i
* w]; output_data[i * w + w - 1] = output_data[i * w + w - 1] < 0 ? 0 :
output_data[i * w + w - 1];
......@@ -855,11 +853,11 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1,
tmp2, tmp3, tmp4, tmp5, out0; in0 = vld1q_f32(input_tmp); in2 =
vld1q_f32(input_tmp + l); const float *input_tmp_end = input_tmp + (l -
2) * l; in4 = vld1q_f32(input_tmp_end); in6 = vld1q_f32(input_tmp_end +
l); int c_mid = l_mid; auto output_ptr = output_data + 1; for (; c_mid >
vld1q_f32(input_tmp + w); const float *input_tmp_end = input_tmp + (h -
2) * w; in4 = vld1q_f32(input_tmp_end); in6 = vld1q_f32(input_tmp_end +
w); int c_mid = w_mid; auto output_ptr = output_data + 1; for (; c_mid >
3; c_mid -= 4) { in1 = vld1q_f32(input_tmp + 4); in3 =
vld1q_f32(input_tmp + l + 4);
vld1q_f32(input_tmp + w + 4);
tmp0 = vextq_f32(in0, in1, 1);
tmp1 = vextq_f32(in0, in1, 2);
......@@ -880,7 +878,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
vst1q_f32(output_ptr, out0);
in5 = vld1q_f32(input_tmp_end + 4);
in7 = vld1q_f32(input_tmp_end + l + 4);
in7 = vld1q_f32(input_tmp_end + w + 4);
tmp0 = vextq_f32(in4, in5, 1);
tmp1 = vextq_f32(in4, in5, 2);
......@@ -897,7 +895,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
if (if_relu) {
out0 = vmaxq_f32(out0, vzero);
vst1q_f32(output_ptr + (l - 1) * l, out0);
vst1q_f32(output_ptr + (h - 1) * w, out0);
// can optimize to each 8 stride.
input_tmp += 4;
......@@ -910,8 +908,8 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
// top right pad
float32x4_t pad0 = vdupq_n_f32(input_data[l - 1]);
float32x4_t pad1 = vdupq_n_f32(input_data[2 * l - 1]);
float32x4_t pad0 = vdupq_n_f32(input_data[w - 1]);
float32x4_t pad1 = vdupq_n_f32(input_data[2 * w - 1]);
tmp0 = vextq_f32(in0, pad0, 1);
tmp1 = vextq_f32(in0, pad0, 2);
......@@ -941,8 +939,8 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
// bottom right pad
float32x4_t pad2 = vdupq_n_f32(input_data[l * l - 1 - l]);
float32x4_t pad3 = vdupq_n_f32(input_data[l * l - 1]);
float32x4_t pad2 = vdupq_n_f32(input_data[h * w - 1 - w]);
float32x4_t pad3 = vdupq_n_f32(input_data[h * w - 1]);
tmp0 = vextq_f32(in4, pad2, 1);
tmp1 = vextq_f32(in4, pad2, 2);
......@@ -961,29 +959,29 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
for (int i = 0; i < c_mid; ++i) {
if (i == 0) {
vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 0);
vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 0);
if (i == 1) {
vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 1);
vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 1);
if (i == 2) {
vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 2);
vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 2);
// mid
for (int i = 0; i < l - 2; ++i) {
auto output_ptr = output_data + (i + 1) * l + 1;
input_tmp = input_data + i * l;
for (int i = 0; i < h - 2; ++i) {
auto output_ptr = output_data + (i + 1) * w + 1;
input_tmp = input_data + i * w;
auto in0_tmp = vld1q_f32(input_tmp);
auto in2_tmp = vld1q_f32(input_tmp + l);
auto in4_tmp = vld1q_f32(input_tmp + l + l);
c_mid = l_mid;
auto in2_tmp = vld1q_f32(input_tmp + w);
auto in4_tmp = vld1q_f32(input_tmp + w + w);
c_mid = w_mid;
for (; c_mid > 3; c_mid -= 4) {
auto in1_tmp = vld1q_f32(input_tmp + 4);
auto in3_tmp = vld1q_f32(input_tmp + l + 4);
auto in5_tmp = vld1q_f32(input_tmp + l + l + 4);
auto in3_tmp = vld1q_f32(input_tmp + w + 4);
auto in5_tmp = vld1q_f32(input_tmp + w + w + 4);
tmp0 = vextq_f32(in0_tmp, in1_tmp, 1);
tmp1 = vextq_f32(in0_tmp, in1_tmp, 2);
......@@ -1014,9 +1012,9 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
in4_tmp = in5_tmp;
float32x4_t pad0 = vdupq_n_f32(input_data[i * l + l - 1]);
float32x4_t pad1 = vdupq_n_f32(input_data[i * l + l - 1 + l]);
float32x4_t pad2 = vdupq_n_f32(input_data[i * l + l - 1 + l + l]);
float32x4_t pad0 = vdupq_n_f32(input_data[i * w + w - 1]);
float32x4_t pad1 = vdupq_n_f32(input_data[i * w + w - 1 + w]);
float32x4_t pad2 = vdupq_n_f32(input_data[i * w + w - 1 + w + w]);
tmp0 = vextq_f32(in0_tmp, pad0, 1);
tmp1 = vextq_f32(in0_tmp, pad0, 2);
......@@ -1060,6 +1058,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
/// NOTE: the w != h case has not been fixed for this kernel yet.
void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu) {
......@@ -1275,7 +1274,8 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
const int in_l = in_h;
const int inhxw = in_h * in_w;
const int outhxw = out_h * out_w;
const int if_pad = in_l - 1 == (out_l - 1) * 2 ? 1 : 0;
/// todo : fix if_pad when w != h
const int if_pad = in_w - 1 == (out_w - 1) * 2 ? 1 : 0;
const int batch_size = static_cast<int>(input->dims()[0]);
const int c = static_cast<int>(input->dims()[1]);
const float *input_row_ptr;
......@@ -1381,9 +1381,9 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
if ((w4 != w_times)) {
vst1q_f32(output_row_ptr, res3);
} else {
if (out_l - 2 - w_times * 3 == 1) {
if (out_w - 2 - w_times * 3 == 1) {
vst1q_lane_f32(output_row_ptr, res3, 0);
} else if (out_l - 2 - w_times * 3 == 2) {
} else if (out_w - 2 - w_times * 3 == 2) {
vst1q_lane_f32(output_row_ptr, res3, 0);
vst1q_lane_f32(output_row_ptr + 1, res3, 1);
......@@ -1393,28 +1393,28 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 +
input_const[in_l] * w21 +
input_const[in_l + 1] * w22;
input_const[in_w] * w21 +
input_const[in_w + 1] * w22;
out2in_mid = (out_l - 1) * 2;
output_data_tmp[out_l - 1] =
out2in_mid = (out_w - 1) * 2;
output_data_tmp[out_w - 1] =
w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
w20 * input_const[out2in_mid + in_w - 1] +
w21 * input_const[out2in_mid + in_w] +
(1 - if_pad) * (w12 * input_const[out2in_mid + 1] +
w22 * input_const[out2in_mid + in_w + 1]);
out2in_mid = (out_l - 1) * 2 * in_w;
out2in_mid = (out_h - 1) * 2 * in_w;
output_data_tmp[out_l * (out_l - 1)] =
output_data_tmp[out_w * (out_h - 1)] =
w01 * input_const[out2in_mid - in_w] +
w02 * input_const[out2in_mid - in_w + 1] +
w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] +
(1 - if_pad) * (w21 * input_const[out2in_mid + in_w] +
w22 * input_const[out2in_mid + in_w + 1]);
out2in_mid = (out_l - 1) * 2 * in_w + (out_l - 1) * 2;
out2in_mid = (out_h - 1) * 2 * in_w + (out_w - 1) * 2;
output_data_tmp[out_l * out_l - 1] =
output_data_tmp[out_h * out_w - 1] =
w00 * input_const[out2in_mid - in_w - 1] +
w01 * input_const[out2in_mid - in_w] +
w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
......@@ -1425,21 +1425,21 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
w22 * input_const[out2in_mid + in_w + 1]);
if (if_bias) {
output_data_tmp[0] += bias_data[j];
output_data_tmp[out_l - 1] += bias_data[j];
output_data_tmp[out_l * (out_l - 1)] += bias_data[j];
output_data_tmp[out_l * out_l - 1] += bias_data[j];
output_data_tmp[out_w - 1] += bias_data[j];
output_data_tmp[out_w * (out_h - 1)] += bias_data[j];
output_data_tmp[out_h * out_w - 1] += bias_data[j];
for (int i = 1; i < out_h - 1; i++) {
out2in_mid = i * 2 * in_w;
output_data_tmp[i * out_l] = w01 * input_const[out2in_mid - in_w] +
output_data_tmp[i * out_w] = w01 * input_const[out2in_mid - in_w] +
w02 * input_const[out2in_mid - in_w + 1] +
w11 * input_const[out2in_mid] +
w12 * input_const[out2in_mid + 1] +
w21 * input_const[out2in_mid + in_w] +
w22 * input_const[out2in_mid + in_w + 1];
out2in_mid = i * 2 * in_w + (out_l - 1) * 2;
output_data_tmp[i * out_l + out_l - 1] =
out2in_mid = i * 2 * in_w + (out_w - 1) * 2;
output_data_tmp[i * out_w + out_w - 1] =
w00 * input_const[out2in_mid - in_w - 1] +
w01 * input_const[out2in_mid - in_w] +
w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
......@@ -1449,8 +1449,8 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
w12 * input_const[out2in_mid + 1] +
w22 * input_const[out2in_mid + in_w + 1]);
if (if_bias) {
output_data_tmp[i * out_l] += bias_data[j];
output_data_tmp[i * out_l + out_l - 1] += bias_data[j];
output_data_tmp[i * out_w] += bias_data[j];
output_data_tmp[i * out_w + out_w - 1] += bias_data[j];
filter_data_tmp += 9;
......@@ -1657,11 +1657,12 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
const int in_w = static_cast<int>(input->dims()[3]);
const int out_h = static_cast<int>(output->dims()[2]);
const int out_w = static_cast<int>(output->dims()[3]);
const int out_l = out_h;
const int in_l = in_h;
// const int out_l = out_h;
// const int in_l = in_h;
const int inhxw = in_h * in_w;
const int outhxw = out_h * out_w;
const int if_pad = in_l - 1 == (out_l - 1) * 2 ? 1 : 0;
/// todo : fix if_pad when w != h
const int if_pad = in_w - 1 == (out_w - 1) * 2 ? 1 : 0;
const int batch_size = static_cast<int>(input->dims()[0]);
const int c = static_cast<int>(input->dims()[1]);
const int w_times = (out_w - 2) / 3;
......@@ -1775,9 +1776,9 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
vst1q_lane_f32(output_row_ptr + 1, res3, 1);
vst1q_lane_f32(output_row_ptr + 2, res3, 2);
} else {
if (out_l - 2 - w_times * 3 == 1) {
if (out_w - 2 - w_times * 3 == 1) {
vst1q_lane_f32(output_row_ptr, res3, 0);
} else if (out_l - 2 - w_times * 3 == 2) {
} else if (out_w - 2 - w_times * 3 == 2) {
vst1q_lane_f32(output_row_ptr, res3, 0);
vst1q_lane_f32(output_row_ptr + 1, res3, 1);
......@@ -1787,28 +1788,28 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 +
input_const[in_l] * w21 +
input_const[in_l + 1] * w22;
input_const[in_w] * w21 +
input_const[in_w + 1] * w22;
out2in_mid = (out_l - 1) * 2;
output_data_tmp[out_l - 1] =
out2in_mid = (out_w - 1) * 2;
output_data_tmp[out_w - 1] =
w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
w20 * input_const[out2in_mid + in_w - 1] +
w21 * input_const[out2in_mid + in_w] +
(1 - if_pad) * (w12 * input_const[out2in_mid + 1] +
w22 * input_const[out2in_mid + in_w + 1]);
out2in_mid = (out_l - 1) * 2 * in_w;
out2in_mid = (out_h - 1) * 2 * in_w;
output_data_tmp[out_l * (out_l - 1)] =
output_data_tmp[out_w * (out_h - 1)] =
w01 * input_const[out2in_mid - in_w] +
w02 * input_const[out2in_mid - in_w + 1] +
w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] +
(1 - if_pad) * (w21 * input_const[out2in_mid + in_w] +
w22 * input_const[out2in_mid + in_w + 1]);
out2in_mid = (out_l - 1) * 2 * in_w + (out_l - 1) * 2;
out2in_mid = (out_h - 1) * 2 * in_w + (out_w - 1) * 2;
output_data_tmp[out_l * out_l - 1] =
output_data_tmp[out_h * out_w - 1] =
w00 * input_const[out2in_mid - in_w - 1] +
w01 * input_const[out2in_mid - in_w] +
w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
......@@ -1819,38 +1820,38 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
w22 * input_const[out2in_mid + in_w + 1]);
output_data_tmp[0] =
output_data_tmp[0] * newscale_data[j] + newbias_data[j];
output_data_tmp[out_l - 1] =
output_data_tmp[out_l - 1] * newscale_data[j] + newbias_data[j];
output_data_tmp[out_l * (out_l - 1)] =
output_data_tmp[out_l * (out_l - 1)] * newscale_data[j] +
output_data_tmp[out_w - 1] =
output_data_tmp[out_w - 1] * newscale_data[j] + newbias_data[j];
output_data_tmp[out_w * (out_h - 1)] =
output_data_tmp[out_w * (out_h - 1)] * newscale_data[j] +
output_data_tmp[out_l * out_l - 1] =
output_data_tmp[out_l * out_l - 1] * newscale_data[j] +
output_data_tmp[out_h * out_w - 1] =
output_data_tmp[out_h * out_w - 1] * newscale_data[j] +
if (if_relu) {
output_data_tmp[0] = output_data_tmp[0] < 0 ? 0 : output_data_tmp[0];
output_data_tmp[out_l - 1] =
output_data_tmp[out_l - 1] < 0 ? 0 : output_data_tmp[out_l - 1];
output_data_tmp[out_l * (out_l - 1)] =
output_data_tmp[out_l * (out_l - 1)] < 0
output_data_tmp[out_w - 1] =
output_data_tmp[out_w - 1] < 0 ? 0 : output_data_tmp[out_w - 1];
output_data_tmp[out_w * (out_h - 1)] =
output_data_tmp[out_w * (out_h - 1)] < 0
? 0
: output_data_tmp[out_l * (out_l - 1)];
output_data_tmp[out_l * out_l - 1] =
output_data_tmp[out_l * out_l - 1] < 0
: output_data_tmp[out_w * (out_h - 1)];
output_data_tmp[out_h * out_w - 1] =
output_data_tmp[out_h * out_w - 1] < 0
? 0
: output_data_tmp[out_l * out_l - 1];
: output_data_tmp[out_h * out_w - 1];
for (int i = 1; i < out_h - 1; i++) {
out2in_mid = i * 2 * in_w;
output_data_tmp[i * out_l] = w01 * input_const[out2in_mid - in_w] +
output_data_tmp[i * out_w] = w01 * input_const[out2in_mid - in_w] +
w02 * input_const[out2in_mid - in_w + 1] +
w11 * input_const[out2in_mid] +
w12 * input_const[out2in_mid + 1] +
w21 * input_const[out2in_mid + in_w] +
w22 * input_const[out2in_mid + in_w + 1];
out2in_mid = i * 2 * in_w + (out_l - 1) * 2;
output_data_tmp[i * out_l + out_l - 1] =
out2in_mid = i * 2 * in_w + (out_w - 1) * 2;
output_data_tmp[i * out_w + out_w - 1] =
w00 * input_const[out2in_mid - in_w - 1] +
w01 * input_const[out2in_mid - in_w] +
w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
......@@ -1859,18 +1860,18 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
(1 - if_pad) * (w02 * input_const[out2in_mid - in_w + 1] +
w12 * input_const[out2in_mid + 1] +
w22 * input_const[out2in_mid + in_w + 1]);
output_data_tmp[i * out_l] =
output_data_tmp[i * out_l] * newscale_data[j] + newbias_data[j];
output_data_tmp[i * out_l + out_l - 1] =
output_data_tmp[i * out_l + out_l - 1] * newscale_data[j] +
output_data_tmp[i * out_w] =
output_data_tmp[i * out_w] * newscale_data[j] + newbias_data[j];
output_data_tmp[i * out_w + out_w - 1] =
output_data_tmp[i * out_w + out_w - 1] * newscale_data[j] +
if (if_relu) {
output_data_tmp[i * out_l] =
output_data_tmp[i * out_l] < 0 ? 0 : output_data_tmp[i * out_l];
output_data_tmp[i * out_l + out_l - 1] =
output_data_tmp[i * out_l + out_l - 1] < 0
output_data_tmp[i * out_w] =
output_data_tmp[i * out_w] < 0 ? 0 : output_data_tmp[i * out_w];
output_data_tmp[i * out_w + out_w - 1] =
output_data_tmp[i * out_w + out_w - 1] < 0
? 0
: output_data_tmp[i * out_l + out_l - 1];
: output_data_tmp[i * out_w + out_w - 1];
......@@ -53,7 +53,7 @@ void Im2ColFunctor<ColFormat::kCFO, CPU, float>::operator()(
(((isize - 2 * padding[0] + filter_height) % stride[0] == 0) ? 1 : 0));
int fill = isize % 2;
if (stride[0] == 1 && filter_height == 3 && pad1 && pad2 &&
dilation[0] == 1 && im_height > 2) {
dilation[0] == 1 && im_height > 2 && im_height == im_width) {
for (int c = 0; c < im_channels; ++c) {
int oosize = osize * osize;
int nk4 = osize / 4;
......@@ -225,7 +225,7 @@ void Im2ColFunctor<ColFormat::kCFO, CPU, float>::operator()(
im_data += isize * isize;
} else if (stride[0] == 2 && filter_height == 3 && pad1 && dilation[0] == 1 &&
im_height > 2) {
im_height > 2 && im_height == im_width) {
for (int c = 0; c < im_channels; ++c) {
int oosize = osize * osize;
int nk4 = osize / 4;
......@@ -605,7 +605,6 @@ class Im2ColFunctor<ColFormat::kOCF, CPU, T> {
const T *im_data = im.data<T>();
T *col_data = col->data<T>();
for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
for (int channel = 0; channel < im_channels; ++channel) {
......@@ -617,7 +616,6 @@ class Im2ColFunctor<ColFormat::kOCF, CPU, T> {
++filter_col_idx) {
int im_col_offset =
col_col_idx * stride[1] + filter_col_idx - padding[1];
int col_offset =
((((col_row_idx)*col_width + col_col_idx) * im_channels +
channel) *
......@@ -625,7 +623,6 @@ class Im2ColFunctor<ColFormat::kOCF, CPU, T> {
filter_row_idx) *
filter_width +
int im_offset = (channel * im_height + im_row_offset) * im_width +
col_data[col_offset] =
......@@ -58,7 +58,7 @@ void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
const float *in_ptr1 = input_data + i * input_batch_stride +
c * input_channel_stride + ph * input_width;
const float *in_ptr2 = in_ptr1 + input_width;
if (ph + 1 >= input_height) {
if (ph != input_height && ph + 1 >= input_height) {
in_ptr2 = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * input_width));
memset(static_cast<void *>(const_cast<float *>(in_ptr2)), -FLT_MAX,
......@@ -122,19 +122,30 @@ void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
if (_w2 != 0) {
in_ptr1 += 16 * w1 + 4 * w2;
in_ptr2 += 16 * w1 + 4 * w2;
out_ptr += 8 * w1 + 2 * w2;
in_ptr1 = input_data + i * input_batch_stride +
c * input_channel_stride + ph * input_width + 16 * w1 +
4 * w2;
in_ptr2 = in_ptr1 + input_width;
out_ptr = output_data + i * output_batch_stride +
c * output_channel_stride + ph / 2 * output_width + 8 * w1 +
2 * w2;
if (_w2 == 1) {
*out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
} else if (_w2 == 2) {
float temp = (*in_ptr1++ > *in_ptr2++) ? *in_ptr1++ : *in_ptr2++;
float temp = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
*out_ptr = (temp > temp1) ? temp : temp1;
} else if (_w2 == 3) {
float temp = (*in_ptr1++ > *in_ptr2++) ? *in_ptr1++ : *in_ptr2++;
float temp1 = (*in_ptr1++ > *in_ptr2++) ? *in_ptr1++ : *in_ptr2++;
*out_ptr++ = (temp > temp1) ? temp : temp1;
float temp = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
*out_ptr = (temp > temp1) ? temp : temp1;
*out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
......@@ -173,7 +184,7 @@ void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
int w2 = _w1 / 4;
int _w2 = _w1 % 4;
float quarter = 1 / 4;
float quarter = 0.25;
for (int i = 0; i < batch_size; ++i) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < input_height; ph += 2) {
......@@ -250,25 +261,32 @@ void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
if (_w2 != 0) {
in_ptr1 += 16 * w1 + 4 * w2;
in_ptr2 += 16 * w1 + 4 * w2;
out_ptr += 8 * w1 + 2 * w2;
in_ptr1 = input_data + i * input_batch_stride +
c * input_channel_stride + ph * input_width + 16 * w1 +
4 * w2;
in_ptr2 = in_ptr1 + input_width;
out_ptr = output_data + i * output_batch_stride +
c * output_channel_stride + ph / 2 * output_width + 8 * w1 +
2 * w2;
if (_w2 == 1) {
*out_ptr = 0.5 * (*in_ptr1 + *in_ptr2);
} else if (_w2 == 2) {
float temp = 0;
temp += *in_ptr1++;
temp += *in_ptr2++;
temp += *in_ptr1;
temp += *in_ptr2;
*out_ptr = 0.5 * temp;
temp += *in_ptr1;
temp += *in_ptr2;
*out_ptr = 0.25 * temp;
} else if (_w2 == 3) {
float temp = 0;
temp += *in_ptr1++;
temp += *in_ptr2++;
temp += *in_ptr1++;
temp += *in_ptr2++;
*out_ptr++ = 0.5 * temp;
*out_ptr = 0.25 * temp;
*out_ptr = 0.5 * (*in_ptr1 + *in_ptr2);
......@@ -1132,6 +1132,37 @@ class TransposeParam : public OpParam {
template <typename Dtype>
class Transpose2Param : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
Transpose2Param(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_x_ = InputXFrom<GType>(inputs, scope);
out_ = OutFrom<GType>(outputs, scope);
output_xshape_ = OutputXShapeFrom<GType>(outputs, scope);
axis_ = GetAttr<vector<int>>("axis", attrs);
const RType *InputX() const { return input_x_; }
RType *Out() const { return out_; }
RType *OutputXShape() const { return output_xshape_; }
const vector<int> &Axis() const { return axis_; }
RType *input_x_;
RType *out_;
RType *output_xshape_;
vector<int> axis_;
#ifdef LOOKUP_OP
template <typename Dtype>
class LookupParam : public OpParam {
......@@ -2116,9 +2147,9 @@ class Im2SequenceParam : public OpParam {
paddings_ = GetAttr<vector<int>>("paddings", attrs);
const RType *Input() const { return input_x_; }
const GType *Input() const { return input_x_; }
RType *Output() const { return out_; }
GType *Output() const { return out_; }
const vector<int> &Kernels() const { return kernels_; }
......@@ -2127,8 +2158,8 @@ class Im2SequenceParam : public OpParam {
const vector<int> &Paddings() const { return paddings_; }
RType *input_x_;
RType *out_;
GType *input_x_;
GType *out_;
vector<int> kernels_;
vector<int> strides_;
vector<int> paddings_;
#include <vector>
#include "common/enforce.h"
#include "operators/transpose2_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void Transpose2Op<Dtype, T>::InferShape() const {
auto input_x_dims = this->param_.InputX()->dims();
auto axis = this->param_.Axis();
size_t x_dims_size = input_x_dims.size();
size_t axis_size = axis.size();
PADDLE_MOBILE_ENFORCE((x_dims_size == axis_size),
"input_dims must "
"be equal to the axis_size. ")
std::vector<int> count(axis_size, 0);
for (size_t i = 0; i < axis_size; i++) {
axis[i] < static_cast<int>(axis_size) && ++count[axis[i]] == 1,
"Each element of Attribute axis should be a unique value "
"range from 0 to (dims - 1), "
"where the dims is the axis's size");
framework::DDim out_dims(input_x_dims);
for (size_t i = 0; i < axis_size; i++) {
out_dims[i] = input_x_dims[axis[i]];
std::vector<int64_t> xshape_dims(input_x_dims.size() + 1, 0);
for (int i = 0; i < input_x_dims.size(); ++i) {
xshape_dims[i + 1] = input_x_dims[i];
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
REGISTER_OPERATOR_CPU(transpose2, ops::Transpose2Op);
#endif // TRANSPOSE_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/transpose2_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
class Transpose2Op : public framework::OperatorWithKernel<
DeviceType, Transpose2Param<DeviceType>,
operators::Transpose2Kernel<DeviceType, T>> {
Transpose2Op(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, Transpose2Param<DeviceType>,
operators::Transpose2Kernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, Transpose2Param<DeviceType>,
operators::Transpose2Kernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
} // namespace operators
} // namespace paddle_mobile
......@@ -184,6 +184,10 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-transpose-op operators/test_transpose_op.cpp test_helper.h test_include.h)
target_link_libraries(test-transpose-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-transpose2-op operators/test_transpose2_op.cpp test_helper.h test_include.h)
target_link_libraries(test-transpose2-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h)
target_link_libraries(test-multiclassnms-op paddle-mobile)
......@@ -343,6 +347,10 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-multi-process net/test_multi_inference_predict.cpp test_helper.h test_include.h)
target_link_libraries(test-multi-process paddle-mobile)
# gen test
ADD_EXECUTABLE(test-eng net/test_eng.cpp test_helper.h test_include.h)
target_link_libraries(test-eng paddle-mobile)
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif ()
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
// paddle_mobile.SetThreadNum(4);
auto time1 = time();
if (paddle_mobile.Load(std::string(g_eng) + "/model",
std::string(g_eng) + "/params", true, false, 1,
true)) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<int64_t> dims{1, 1, 48, 400};
LoDTensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 1, 48, 400}, static_cast<float>(0),
std::vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
// 预热十次
for (int i = 0; i < 1; ++i) {
auto time3 = time();
for (int i = 0; i < 1; ++i) {
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
#include "operators/batchnorm_op.h"
......@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../test_include.h"
#include "operators/box_coder_op.h"
......@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/elementwise_sub_op.h"
......@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../test_include.h"
#include "operators/fill_constant_op.h"
......@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <framework/program/program-optimize/program_optimize.h>
#include "../test_include.h"
#include "operators/fusion_fc_op.h"
......@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/im2sequence_op.h"
......@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../test_include.h"
#include "operators/multiclass_nms_op.h"
......@@ -31,14 +30,12 @@ class TestMultiClassNMSOp {
const std::vector<std::shared_ptr<BlockDesc>> blocks =
// DLOG << " **block size " << blocks.size();
for (auto block_desc : blocks) {
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
// DLOG << " ops " << ops.size();
for (auto op : ops) {
if (op->Type() == "multiclass_nms" &&
op->Input("BBoxes")[0] == "box_coder_0.tmp_0") {
DLOG << " mul attr size: " << op->GetAttrMap().size();
DLOG << " attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size();
DLOG << " BBoxes is : " << op->Input("BBoxes")[0];
......@@ -55,14 +52,6 @@ class TestMultiClassNMSOp {
<< op->GetAttrMap().at("nms_top_k").Get<int>();
DLOG << " score_threshold : "
<< op->GetAttrMap().at("score_threshold").Get<float>();
// DLOG << " variances : " <<
// op->GetAttrMap().at("variances").Get<std::vector<float>>();
// DLOG << " aspect_ratios : " <<
// op->GetAttrMap().at("aspect_ratios").Get<std::vector<float>>();
// DLOG << " min_sizes : " <<
// op->GetAttrMap().at("min_sizes").Get<std::vector<float>>();
// DLOG << " max_sizes : " <<
// op->GetAttrMap().at("max_sizes").Get<std::vector<float>>();
std::shared_ptr<operators::MultiClassNMSOp<Dtype, float>> priorbox =
std::make_shared<operators::MultiClassNMSOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
......@@ -88,16 +77,12 @@ class TestMultiClassNMSOp {
auto *output_tensor = output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>({1917, 6});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
predict(t1, t2, 0);
return out_tensor;
// return outvars_tensor;
......@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../test_include.h"
#include "operators/polygon_box_transform_op.h"
......@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../test_include.h"
#include "operators/prior_box_op.h"
......@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../test_include.h"
#include "operators/reshape2_op.h"
......@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/sum_op.h"
#include "../test_include.h"
#include "operators/transpose2_op.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype>
class TestTranspose2Op {
explicit TestTranspose2Op(const Program<Dtype> p) : program_(p) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
const std::vector<std::shared_ptr<BlockDesc>> blocks =
for (auto block_desc : blocks) {
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
for (auto op : ops) {
if (op->Type() == "transpose2") {
DLOG << " attr size: " << op->GetAttrMap().size();
std::unordered_map<std::string, Attribute> attrs = op->GetAttrMap();
for (std::unordered_map<std::string, Attribute>::iterator it =
it != attrs.end(); ++it) {
DLOG << " " << it->first << " " << it->second;
DLOG << " inputs size: " << op->GetInputs().size();
VariableNameMap inputs = op->GetInputs();
for (VariableNameMap::iterator it = inputs.begin();
it != inputs.end(); ++it) {
DLOG << " " << it->first << " " << it->second;
DLOG << " outputs size: " << op->GetOutputs().size();
VariableNameMap outputs = op->GetOutputs();
for (VariableNameMap::iterator it = outputs.begin();
it != outputs.end(); ++it) {
DLOG << " " << it->first << " " << it->second;
input_var_name = op->Input("X")[0];
output_var_name = op->Output("Out")[0];
std::shared_ptr<operators::Transpose2Op<Dtype, float>> op_ptr =
std::make_shared<operators::Transpose2Op<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
std::shared_ptr<Tensor> predict(const Tensor &t) {
auto scope = program_.scope;
Variable *input_feed_value = scope->Var(input_var_name);
auto tensor_input = input_feed_value->GetMutable<LoDTensor>();
Variable *output = scope->Var(output_var_name);
auto *output_tensor = output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>({1, 2, 8});
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
predict(t, 0);
return out_tensor;
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
bool use_optimize_ = false;
string input_var_name;
string output_var_name;
void predict(const Tensor &t, int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
template class TestTranspose2Op<CPU>;
} // namespace framework
} // namespace paddle_mobile
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run Transpose2 Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_ocr) + "/model",
std::string(g_ocr) + "/params");
paddle_mobile::framework::Tensor input;
SetupTensor<float>(&input, {1, 8, 2}, static_cast<float>(0),
auto *input_ptr = input.data<float>();
for (int i = 0; i < 16; ++i) {
*(input_ptr + i) = i;
DLOG << "input : ";
for (int i = 0; i < input.numel(); ++i) {
DLOG << " index " << i << " : " << input_ptr[i];
auto output = testTranspose2Op.predict(input);
auto *output_ptr = output->data<float>();
DLOG << "output : ";
for (int i = 0; i < output->numel(); ++i) {
DLOG << " index " << i << " : " << output_ptr[i];
return 0;
......@@ -205,6 +205,7 @@ if(NOT FOUND_MATCH)
......@@ -251,6 +252,7 @@ endif()
# option(SIGMOID_OP "" ON)
# option(SOFTMAX_OP "" ON)
# option(TRANSPOSE_OP "" ON)
# option(TRANSPOSE2_OP "" ON)
# endif ()
......@@ -328,6 +330,9 @@ endif()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
想要评论请 注册