提交 4d05a8c6 编写于 作者: H hjchen2

Transform kernel in op initialization

上级 3076c54f
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <map> #include <map>
#include <string> #include <string>
#include <utility>
#include <vector> #include <vector>
#include "common/enforce.h" #include "common/enforce.h"
...@@ -119,10 +120,6 @@ class OperatorWithKernel : public OperatorBase<Dtype> { ...@@ -119,10 +120,6 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
virtual void InferShape() const = 0; virtual void InferShape() const = 0;
void Init() { void Init() {
// for (auto i : this->inputs_) {
// DLOG << i.first;
// DLOG << i.second;
// }
PADDLE_MOBILE_ENFORCE(kernel_.Init(&param_), " %s kernel init failed", PADDLE_MOBILE_ENFORCE(kernel_.Init(&param_), " %s kernel init failed",
this->type_.c_str()); this->type_.c_str());
} }
......
...@@ -135,22 +135,6 @@ class Tensor { ...@@ -135,22 +135,6 @@ class Tensor {
return reinterpret_cast<T *>(mutable_data(typeid(T))); return reinterpret_cast<T *>(mutable_data(typeid(T)));
} }
#ifdef PADDLE_MOBILE_DEBUG
template <typename T>
inline void dump(std::string filename) const {
const T *dataptr = data<T>();
std::ofstream out(filename.c_str());
for (int i = 0; i < numel(); ++i) {
out << dataptr[i] << " ";
}
out << "形状:";
for (int j = 0; j < dims_.size(); ++j) {
out << dims_[j] << " ";
}
out.close();
}
#endif
inline void *mutable_data(std::type_index type) { inline void *mutable_data(std::type_index type) {
if (holder_ != nullptr) { if (holder_ != nullptr) {
holder_->set_type(type); holder_->set_type(type);
......
...@@ -17,17 +17,69 @@ limitations under the License. */ ...@@ -17,17 +17,69 @@ limitations under the License. */
#include "operators/kernel/conv_kernel.h" #include "operators/kernel/conv_kernel.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h" #include "operators/kernel/central-arm-func/conv_arm_func.h"
#include <iostream>
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
bool ConvKernel<CPU, float>::Init(ConvParam<CPU> *param) { bool ConvKernel<CPU, float>::Init(ConvParam<CPU> *param) {
if (param->Input()->type() == typeid(int8_t)) {
param->ExecMode() = ConvParam<CPU>::EXEC_GEMM_INT8;
} else {
if (param->Groups() == param->Input()->dims()[1] &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Filter()->dims()[2] == 3 && param->Strides()[0] == 1) {
param->ExecMode() = ConvParam<CPU>::EXEC_DEPTHWISE3x3S1P1_FLOAT;
} else if (param->Groups() == param->Input()->dims()[1] &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Filter()->dims()[2] == 3) {
param->ExecMode() = ConvParam<CPU>::EXEC_DEPTHWISE3x3_FLOAT;
} else if (param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Strides()[0] == param->Strides()[1] &&
param->Dilations()[0] == param->Dilations()[1] &&
param->Filter()->dims()[2] == 3 && param->Strides()[0] == 1 &&
param->Dilations()[0] == 1 && param->Output()->dims()[1] >= 16 &&
param->Input()->dims()[2] >= 16) {
param->ExecMode() = ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT;
// transform weight
framework::Tensor *transformed_weight = new framework::Tensor;
operators::math::winograd_transform_weight<8, 3>(*param->Filter(),
transformed_weight);
param->Filter() = transformed_weight;
} else {
param->ExecMode() = ConvParam<CPU>::EXEC_GEMM_FLOAT;
}
}
return true; return true;
} }
template <> template <>
void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) const { void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) const {
ConvCompute<float>(param); switch (param.ExecMode()) {
case ConvParam<CPU>::EXEC_GEMM_INT8:
GemmConv<int8_t, int32_t>(param);
break;
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1P1_FLOAT:
math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
nullptr, false);
break;
case ConvParam<CPU>::EXEC_DEPTHWISE3x3_FLOAT:
math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
param.Filter(), nullptr, param.Output(), false);
break;
case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
WinogradConv3x3<8, 3>(param);
break;
case ConvParam<CPU>::EXEC_GEMM_FLOAT:
GemmConv<float, float>(param);
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
}
} }
template class ConvKernel<CPU, float>; template class ConvKernel<CPU, float>;
......
...@@ -22,14 +22,14 @@ limitations under the License. */ ...@@ -22,14 +22,14 @@ limitations under the License. */
#include "operators/math/math_function.h" #include "operators/math/math_function.h"
#include "operators/math/pad.h" #include "operators/math/pad.h"
#include "operators/math/vol2col.h" #include "operators/math/vol2col.h"
#include "operators/math/winograd/winograd.h" #include "operators/math/winograd/winograd_transform.h"
#include "operators/op_param.h" #include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename Itype, typename Otype> template <typename Itype, typename Otype>
inline void ConvBasic(const ConvParam<CPU> &param) { inline void GemmConv(const ConvParam<CPU> &param) {
const Tensor *input = param.Input(); const Tensor *input = param.Input();
Tensor filter = *param.Filter(); Tensor filter = *param.Filter();
Tensor *output = param.Output(); Tensor *output = param.Output();
...@@ -117,9 +117,10 @@ inline void ConvBasic(const ConvParam<CPU> &param) { ...@@ -117,9 +117,10 @@ inline void ConvBasic(const ConvParam<CPU> &param) {
} }
} }
inline void BatchConv3x3Winograd(const ConvParam<CPU> &param) { template <int tile, int kernel>
inline void WinogradConv3x3(const ConvParam<CPU> &param) {
const Tensor *input = param.Input(); const Tensor *input = param.Input();
Tensor *filter = param.Filter(); const Tensor *filter = param.Filter();
Tensor *output = param.Output(); Tensor *output = param.Output();
output->mutable_data<float>(); output->mutable_data<float>();
int batch_size = input->dims()[0]; int batch_size = input->dims()[0];
...@@ -127,51 +128,40 @@ inline void BatchConv3x3Winograd(const ConvParam<CPU> &param) { ...@@ -127,51 +128,40 @@ inline void BatchConv3x3Winograd(const ConvParam<CPU> &param) {
const std::vector<int> &paddings = param.Paddings(); const std::vector<int> &paddings = param.Paddings();
math::PadFunctor<CPU, float> pad; math::PadFunctor<CPU, float> pad;
auto winograd_pad = [&](int width, int pad) {
int output_tile = tile - kernel + 1;
// int tiles = (width + pad - kernel) / output_tile + 1;
// return (tiles - 1) * output_tile + tile - width;
int pad_width = (width + 2 * pad - kernel) / output_tile * output_tile;
return pad_width + tile - width;
};
Tensor input_pad; Tensor input_pad;
framework::Tensor transformed_input;
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
Tensor in_batch = input->Slice(i, i + 1); Tensor in_batch = input->Slice(i, i + 1);
Tensor out_batch = output->Slice(i, i + 1); Tensor out_batch = output->Slice(i, i + 1);
if (paddings[0] == 0 && paddings[1] == 0) { int pad_bottom = winograd_pad(in_batch.dims()[2], paddings[0]);
input_pad = in_batch; int pad_right = winograd_pad(in_batch.dims()[3], paddings[1]);
} else { if (paddings[0] || paddings[1] || pad_bottom || pad_right) {
framework::DDim pad_shape = in_batch.dims(); framework::DDim pad_shape = in_batch.dims();
pad_shape[2] += 2 * paddings[0]; pad_shape[2] += paddings[0] + pad_bottom;
pad_shape[3] += 2 * paddings[1]; pad_shape[3] += paddings[1] + pad_right;
input_pad.mutable_data<float>(pad_shape); input_pad.mutable_data<float>(pad_shape);
pad(in_batch, paddings[0], paddings[0], paddings[1], paddings[1], pad(in_batch, paddings[0], pad_bottom, paddings[1], pad_right,
&input_pad); &input_pad);
}
math::winograd_f6k3(input_pad, *filter, &out_batch);
}
}
template <typename P>
void ConvCompute(const ConvParam<CPU> &param) {
if (param.Input()->type() == typeid(int8_t)) {
ConvBasic<int8_t, int32_t>(param);
} else {
if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
nullptr, false);
} else if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3) {
math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
param.Filter(), nullptr, param.Output(), false);
} else if (param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Strides()[0] == param.Strides()[1] &&
param.Dilations()[0] == param.Dilations()[1] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 &&
param.Dilations()[0] == 1 && param.Output()->dims()[1] >= 16 &&
param.Output()->dims()[2] >= 16) {
BatchConv3x3Winograd(param);
} else { } else {
ConvBasic<float, float>(param); input_pad = in_batch;
} }
#if __aarch64__
// TODO(hjchen2)
#else
// tile input and transform
math::winograd_transform_input<tile, kernel>(input_pad, &transformed_input);
// calculate output
math::winograd_transform_output<tile, kernel>(transformed_input, *filter,
output);
#endif
} }
} }
......
...@@ -44,7 +44,7 @@ void DepthwiseConvCompute(const ConvParam<CPU> &param) { ...@@ -44,7 +44,7 @@ void DepthwiseConvCompute(const ConvParam<CPU> &param) {
Bias, false); Bias, false);
} else { } else {
ConvBasic<float, float>(param); GemmConv<float, float>(param);
} }
} }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#include "operators/math/winograd/winograd.h"
#include "operators/math/winograd/winograd_transform.h"
namespace paddle_mobile {
namespace operators {
namespace math {
// F(2X2, 3X3)
// Placeholder entry point: the body is empty, so `input` and `weight` are
// ignored and `output` is left untouched.
void winograd_f2k3(const framework::Tensor &input,
const framework::Tensor &weight, framework::Tensor *output) {
}
// F(6X6, 3X3)
// Chains the three <8, 3> Winograd helpers: the weight and the input are each
// transformed into a scratch tensor, then both scratch tensors are combined
// into `output`. When __aarch64__ is set the body is still a TODO and nothing
// is written to `output`.
void winograd_f6k3(const framework::Tensor &input,
                   const framework::Tensor &weight, framework::Tensor *output) {
  // Scratch tensors filled by the transform helpers below.
  framework::Tensor input_tiles;
  framework::Tensor weight_tiles;
#if !__aarch64__
  // Step 1: transform the convolution weight.
  winograd_transform_weight<8, 3>(weight, &weight_tiles);
  // Step 2: tile the input and transform it.
  winograd_transform_input<8, 3>(input, &input_tiles);
  // Step 3: calculate the output from both transformed operands.
  winograd_transform_output<8, 3>(input_tiles, weight_tiles, output);
#else
  // TODO(hjchen2)
#endif
}
// F(4X4, 5X5)
// Placeholder entry point: the body is empty, so `input` and `weight` are
// ignored and `output` is left untouched.
void winograd_f4k5(const framework::Tensor &input,
const framework::Tensor &weight, framework::Tensor *output) {
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#pragma once
#include "framework/tensor.h"
namespace paddle_mobile {
namespace operators {
namespace math {
// F(2X2, 3X3)
// Declaration only. Takes a read-only input tensor, a read-only weight tensor
// and a pointer to the output tensor.
// NOTE(review): the F(2X2, 3X3) label presumably means 2x2 output tiles with a
// 3x3 kernel — confirm against the definition.
void winograd_f2k3(const framework::Tensor &input,
const framework::Tensor &weight, framework::Tensor *output);
// F(6X6, 3X3)
// Declaration only. Takes a read-only input tensor, a read-only weight tensor
// and a pointer to the output tensor.
// NOTE(review): the F(6X6, 3X3) label presumably means 6x6 output tiles with a
// 3x3 kernel — confirm against the definition.
void winograd_f6k3(const framework::Tensor &input,
const framework::Tensor &weight, framework::Tensor *output);
// F(4X4, 5X5)
// Declaration only. Takes a read-only input tensor, a read-only weight tensor
// and a pointer to the output tensor.
// NOTE(review): the F(4X4, 5X5) label presumably means 4x4 output tiles with a
// 5x5 kernel — confirm against the definition.
void winograd_f4k5(const framework::Tensor &input,
const framework::Tensor &weight, framework::Tensor *output);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -40,6 +40,7 @@ void winograd_transform_weight<8, 3>(const framework::Tensor &weight, ...@@ -40,6 +40,7 @@ void winograd_transform_weight<8, 3>(const framework::Tensor &weight,
* w7 = g2 * w7 = g2
*/ */
// weight shape is [out_channel, in_channel, kernel_h, kernel_w] // weight shape is [out_channel, in_channel, kernel_h, kernel_w]
// package weight into [roundup(out_channel/4), 64, in_channel, 4] tiles
int out_channel = weight.dims()[0]; int out_channel = weight.dims()[0];
int in_channel = weight.dims()[1]; int in_channel = weight.dims()[1];
// reshape and alloc transformed weight // reshape and alloc transformed weight
...@@ -322,12 +323,12 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input, ...@@ -322,12 +323,12 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
* x6 = (4 * d2 - 5 * d4 + d6) - (2 * d1 - 2.5 * d3 + 0.5 * d5) * x6 = (4 * d2 - 5 * d4 + d6) - (2 * d1 - 2.5 * d3 + 0.5 * d5)
* x7 = (d7 - d1) + (d3 - d5) * 5.25 * x7 = (d7 - d1) + (d3 - d5) * 5.25
*/ */
// pack input to [8 * roundup(h/6), 8 * roundup(w/6), channel] tiles // package input into [roundup(tiles/8), 64, channel, 8] tiles
int channel = input.dims()[1]; int channel = input.dims()[1];
int height = input.dims()[2]; int height = input.dims()[2];
int width = input.dims()[3]; int width = input.dims()[3];
int h_tiles = (height + 3) / 6; // (height + 5 - 2) / 6 int h_tiles = (height + 3) / 6; // (height - 8 + 5 + 6) / 6
int w_tiles = (width + 3) / 6; // (width + 5 - 2) / 6 int w_tiles = (width + 3) / 6; // (width - 8 + 5 + 6) / 6
int tiles = (h_tiles * w_tiles + 7) / 8; int tiles = (h_tiles * w_tiles + 7) / 8;
framework::DDim transformed_shape = framework::DDim transformed_shape =
framework::make_ddim(std::vector<int>{tiles, 64, channel, 8}); framework::make_ddim(std::vector<int>{tiles, 64, channel, 8});
...@@ -335,29 +336,11 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input, ...@@ -335,29 +336,11 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
memset(outptr, 0, output->numel() * sizeof(float)); memset(outptr, 0, output->numel() * sizeof(float));
const float *inptr = input.data<float>(); const float *inptr = input.data<float>();
int inter_h = (height - 2) / 6;
int inter_w = (width - 2) / 6;
int remain_h = height - (inter_h * 6);
int remain_w = width - (inter_w * 6);
framework::Tensor input_pad;
if (remain_h > 2 || remain_w > 2) {
inter_h += (remain_h > 2);
inter_w += (remain_w > 2);
height = (inter_h - 1) * 6 + 8;
width = (inter_w - 1) * 6 + 8;
framework::DDim input_shape =
framework::make_ddim(std::vector<int>{1, channel, height, width});
PadFunctor<CPU, float> pad;
inptr = input_pad.mutable_data<float>(input_shape);
pad(input, 0, height - input.dims()[2], 0, width - input.dims()[3],
&input_pad);
}
size_t image_size = height * width; size_t image_size = height * width;
const float transform_matrix[8] = {5.25f, -5.f, -4.25f, -2.5f, const float transform_matrix[8] = {5.25f, -5.f, -4.25f, -2.5f,
2.f, -1.25f, 0.5f, 0.25f}; 2.f, -1.25f, 0.5f, 0.25f};
int remain_c_start = channel & 0xFFFC; int remain_c_start = channel & 0xFFFC;
#if 0 #if 1
remain_c_start = 0; remain_c_start = 0;
#else #else
#pragma omp parallel for #pragma omp parallel for
...@@ -381,14 +364,14 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input, ...@@ -381,14 +364,14 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
"vld1.32 {d8-d11}, [%[in1]], %[steps] \n" "vld1.32 {d8-d11}, [%[in1]], %[steps] \n"
"vld1.32 {d12-d15}, [%[in2]], %[steps] \n" "vld1.32 {d12-d15}, [%[in2]], %[steps] \n"
"vld1.32 {d16-d19}, [%[in3]], %[steps] \n" "vld1.32 {d16-d19}, [%[in3]], %[steps] \n"
"vtrn.32 q2, q4 \n" // d0: q2, q2 "vtrn.32 q2, q4 \n" // d0: q2
"vtrn.32 q3, q5 \n" // d1: q4, q3 "vtrn.32 q3, q5 \n" // d1: q4
"vtrn.32 q6, q8 \n" // d2: q6, q4 "vtrn.32 q6, q8 \n" // d2: q6
"vtrn.32 q7, q9 \n" // d3: q8, q5 "vtrn.32 q7, q9 \n" // d3: q8
"vswp.32 d5, d12 \n" // d4: q3, q6 "vswp.32 d5, d12 \n" // d4: q3
"vswp.32 d9, d16 \n" // d5: q5, q7 "vswp.32 d9, d16 \n" // d5: q5
"vswp.32 d7, d14 \n" // d6: q7, q8 "vswp.32 d7, d14 \n" // d6: q7
"vswp.32 d11, d18 \n" // d7: q9, q9 "vswp.32 d11, d18 \n" // d7: q9
"vsub.f32 q10, q2, q7 \n" "vsub.f32 q10, q2, q7 \n"
"vsub.f32 q11, q3, q6 \n" "vsub.f32 q11, q3, q6 \n"
...@@ -680,14 +663,14 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input, ...@@ -680,14 +663,14 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
"vld1.32 {d8-d11}, [%[in1]], %[steps] \n" "vld1.32 {d8-d11}, [%[in1]], %[steps] \n"
"vld1.32 {d12-d15}, [%[in2]], %[steps] \n" "vld1.32 {d12-d15}, [%[in2]], %[steps] \n"
"vld1.32 {d16-d19}, [%[in3]], %[steps] \n" "vld1.32 {d16-d19}, [%[in3]], %[steps] \n"
"vtrn.32 q2, q4 \n" // d0: q2, q2 "vtrn.32 q2, q4 \n" // d0: q2
"vtrn.32 q3, q5 \n" // d1: q4, q3 "vtrn.32 q3, q5 \n" // d1: q4
"vtrn.32 q6, q8 \n" // d2: q6, q4 "vtrn.32 q6, q8 \n" // d2: q6
"vtrn.32 q7, q9 \n" // d3: q8, q5 "vtrn.32 q7, q9 \n" // d3: q8
"vswp.32 d5, d12 \n" // d4: q3, q6 "vswp.32 d5, d12 \n" // d4: q3
"vswp.32 d9, d16 \n" // d5: q5, q7 "vswp.32 d9, d16 \n" // d5: q5
"vswp.32 d7, d14 \n" // d6: q7, q8 "vswp.32 d7, d14 \n" // d6: q7
"vswp.32 d11, d18 \n" // d7: q9, q9 "vswp.32 d11, d18 \n" // d7: q9
"vsub.f32 q10, q2, q7 \n" "vsub.f32 q10, q2, q7 \n"
"vsub.f32 q11, q3, q6 \n" "vsub.f32 q11, q3, q6 \n"
...@@ -749,11 +732,12 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input, ...@@ -749,11 +732,12 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
float *ptr0 = d_bt; float *ptr0 = d_bt;
float *ptr1 = ptr0 + 32; float *ptr1 = ptr0 + 32;
int tile_id = h * w_tiles + w; int tile_indics = h * w_tiles + w;
int block_id = tile_id >> 3; int tile_block = tile_indics >> 3;
int pack_id = tile_id & 0x7; int block_indics = tile_indics & 0x7;
// (tiles / 8, 64, channel, 8) // (tiles / 8, 64, channel, 8)
float *out0 = outptr + (block_id * 64 * channel + c) * 8 + pack_id; float *out0 =
outptr + (tile_block * 64 * channel + c) * 8 + block_indics;
float *out1 = out0 + channel * 8; float *out1 = out0 + channel * 8;
float *out2 = out1 + channel * 8; float *out2 = out1 + channel * 8;
float *out3 = out2 + channel * 8; float *out3 = out2 + channel * 8;
...@@ -771,7 +755,6 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input, ...@@ -771,7 +755,6 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
"vld1.32 {d8-d11}, [%[ptr0]]! \n" // q4: d2, q5: d3 "vld1.32 {d8-d11}, [%[ptr0]]! \n" // q4: d2, q5: d3
"vld1.32 {d12-d15}, [%[ptr1]]! \n" // q6: d4, q7: d5 "vld1.32 {d12-d15}, [%[ptr1]]! \n" // q6: d4, q7: d5
"vld1.32 {d16-d19}, [%[ptr1]]! \n" // q8: d6, q9: d7 "vld1.32 {d16-d19}, [%[ptr1]]! \n" // q8: d6, q9: d7
"vtrn.32 q2, q3 \n" "vtrn.32 q2, q3 \n"
"vtrn.32 q4, q5 \n" "vtrn.32 q4, q5 \n"
"vtrn.32 q6, q7 \n" "vtrn.32 q6, q7 \n"
...@@ -918,7 +901,7 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, ...@@ -918,7 +901,7 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
"cmp %[inter_channel], #0 \n" "cmp %[inter_channel], #0 \n"
"ble cmp_remain_%= \n" "ble cmp_remain_%= \n"
"loop_4c_%=: \n" "loop_2c_%=: \n"
"vld1.32 {d0-d3}, [%[w_ptr]]! \n" "vld1.32 {d0-d3}, [%[w_ptr]]! \n"
"vld1.32 {d4-d7}, [%[in_ptr]]! \n" "vld1.32 {d4-d7}, [%[in_ptr]]! \n"
"vmla.f32 q8, q2, d0[0] \n" "vmla.f32 q8, q2, d0[0] \n"
...@@ -941,7 +924,7 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, ...@@ -941,7 +924,7 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
"vmla.f32 q15, q5, d3[1] \n" "vmla.f32 q15, q5, d3[1] \n"
"subs %[inter_channel], #1 \n" "subs %[inter_channel], #1 \n"
"bne loop_4c_%= \n" "bne loop_2c_%= \n"
// cmp remain channel > 0 // cmp remain channel > 0
"cmp_remain_%=: \n" "cmp_remain_%=: \n"
......
...@@ -379,9 +379,9 @@ class ConvParam : public OpParam { ...@@ -379,9 +379,9 @@ class ConvParam : public OpParam {
const RType *Input() const { return input_; } const RType *Input() const { return input_; }
RType *Filter() const { return filter_; } RType *&Filter() const { return filter_; }
RType *Output() const { return output_; } RType *&Output() const { return output_; }
const vector<int> &Strides() const { return strides_; } const vector<int> &Strides() const { return strides_; }
...@@ -389,15 +389,28 @@ class ConvParam : public OpParam { ...@@ -389,15 +389,28 @@ class ConvParam : public OpParam {
const vector<int> &Dilations() const { return dilations_; } const vector<int> &Dilations() const { return dilations_; }
enum ExecMode {
EXEC_INVALID = 0,
EXEC_GEMM_FLOAT,
EXEC_DEPTHWISE3x3S1P1_FLOAT,
EXEC_DEPTHWISE3x3_FLOAT,
EXEC_WINOGRAD3X3_FLOAT,
EXEC_WINOGRAD5X5_FLOAT,
EXEC_GEMM_INT8,
};
ExecMode &ExecMode() const { return exec_mode_; }
const int &Groups() const { return groups; } const int &Groups() const { return groups; }
private: private:
RType *input_; RType *input_;
RType *output_; mutable RType *output_;
RType *filter_; mutable RType *filter_;
vector<int> strides_; vector<int> strides_;
vector<int> paddings_; vector<int> paddings_;
vector<int> dilations_; vector<int> dilations_;
mutable enum ExecMode exec_mode_;
int groups; int groups;
}; };
template <typename Dtype> template <typename Dtype>
......
...@@ -141,7 +141,7 @@ if (NOT FOUND_MATCH) ...@@ -141,7 +141,7 @@ if (NOT FOUND_MATCH)
target_link_libraries(test-googlenet-quali paddle-mobile) target_link_libraries(test-googlenet-quali paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-conv-op operators/test_cov_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-conv-op operators/test_conv_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-op paddle-mobile) target_link_libraries(test-conv-op paddle-mobile)
# gen test # gen test
...@@ -220,10 +220,6 @@ if (NOT FOUND_MATCH) ...@@ -220,10 +220,6 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-dequantize-op operators/test_dequantize_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-dequantize-op operators/test_dequantize_op.cpp test_helper.h test_include.h)
target_link_libraries(test-dequantize-op paddle-mobile) target_link_libraries(test-dequantize-op paddle-mobile)
# test int8 conv op
ADD_EXECUTABLE(test-int8-conv-op operators/test_int8_conv_op.cpp test_helper.h test_include.h)
target_link_libraries(test-int8-conv-op paddle-mobile)
# gen test log # gen test log
ADD_EXECUTABLE(test-log common/test_log.cpp) ADD_EXECUTABLE(test-log common/test_log.cpp)
target_link_libraries(test-log paddle-mobile) target_link_libraries(test-log paddle-mobile)
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
// Reference convolution for checking results: // Reference convolution from Caffe for checking results.
// accumulate through explicit loops over input, output, and filters. // accumulate through explicit loops over input, output, and filters.
template <typename Itype, typename Otype> template <typename Itype, typename Otype>
void conv2d(const framework::Tensor *input, const framework::Tensor *filter, void conv2d(const framework::Tensor *input, const framework::Tensor *filter,
...@@ -129,7 +129,7 @@ void conv2d(const framework::Tensor *input, const framework::Tensor *filter, ...@@ -129,7 +129,7 @@ void conv2d(const framework::Tensor *input, const framework::Tensor *filter,
} }
template <typename Itype, typename Otype, int Kernel, int Pad, int Stride> template <typename Itype, typename Otype, int Kernel, int Pad, int Stride>
int TestConvOp() { int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
int kernel_h = Kernel; int kernel_h = Kernel;
int kernel_w = Kernel; int kernel_w = Kernel;
int pad_h = Pad; int pad_h = Pad;
...@@ -140,10 +140,10 @@ int TestConvOp() { ...@@ -140,10 +140,10 @@ int TestConvOp() {
int dilation_w = 1; int dilation_w = 1;
int batch_size = 1; int batch_size = 1;
int input_c = 3; int input_c = in_channels;
int input_h = 100; int input_h = in_height;
int input_w = 100; int input_w = in_width;
int output_c = 10; int output_c = out_channels;
framework::DDim input_shape = framework::DDim input_shape =
framework::make_ddim({batch_size, input_c, input_h, input_w}); framework::make_ddim({batch_size, input_c, input_h, input_w});
framework::DDim filter_shape = framework::DDim filter_shape =
...@@ -158,7 +158,7 @@ int TestConvOp() { ...@@ -158,7 +158,7 @@ int TestConvOp() {
auto input_var = scope.get()->Var("input"); auto input_var = scope.get()->Var("input");
auto input = input_var->template GetMutable<framework::LoDTensor>(); auto input = input_var->template GetMutable<framework::LoDTensor>();
SetupTensor<Itype>(input, input_shape, -20, 20); SetupTensor<Itype>(input, input_shape, -20.0, 20.0);
auto filter_var = scope.get()->Var("filter"); auto filter_var = scope.get()->Var("filter");
auto filter = filter_var->template GetMutable<framework::LoDTensor>(); auto filter = filter_var->template GetMutable<framework::LoDTensor>();
...@@ -174,8 +174,9 @@ int TestConvOp() { ...@@ -174,8 +174,9 @@ int TestConvOp() {
auto *op = new operators::ConvOp<CPU, float>("conv2d", inputs, outputs, attrs, auto *op = new operators::ConvOp<CPU, float>("conv2d", inputs, outputs, attrs,
scope); scope);
// struct timespec ts_begin, ts_end;
op->InferShape(); op->InferShape();
op->Init();
// struct timespec ts_begin, ts_end;
// warmup // warmup
// op->Run(); // op->Run();
// clock_gettime(CLOCK_MONOTONIC, &ts_begin); // clock_gettime(CLOCK_MONOTONIC, &ts_begin);
...@@ -202,7 +203,8 @@ int TestConvOp() { ...@@ -202,7 +203,8 @@ int TestConvOp() {
const Otype *output_data = output->data<Otype>(); const Otype *output_data = output->data<Otype>();
Otype *output_cmp_data = output_cmp.data<Otype>(); Otype *output_cmp_data = output_cmp.data<Otype>();
for (int i = 0; i < output->numel(); ++i) { for (int i = 0; i < output->numel(); ++i) {
PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i], float gap = output_data[i] - output_cmp_data[i];
PADDLE_MOBILE_ENFORCE(std::abs(gap / output_data[i]) < 1e-3,
"output[%d] = %d, output_cmp[%d] = %d", i, "output[%d] = %d, output_cmp[%d] = %d", i,
output_data[i], i, output_cmp_data[i]); output_data[i], i, output_cmp_data[i]);
} }
...@@ -212,68 +214,104 @@ int TestConvOp() { ...@@ -212,68 +214,104 @@ int TestConvOp() {
} // namespace paddle_mobile } // namespace paddle_mobile
int main() { int main(int argc, char *argv[]) {
if (argc < 5) {
LOG(paddle_mobile::kLOG_INFO)
<< "Usage:\n"
<< " ./test-int8-conv-op in_channels in_height in_width out_channels\n"
<< " params:\n"
<< " -in_channels: int, input image's channels\n"
<< " -in_height: int, input image's height\n"
<< " -in_width: int, input image's width\n"
<< " -out_channels: int, conv output channels\n";
return 1;
}
int in_channels = atoi(argv[1]);
int in_height = atoi(argv[2]);
int in_width = atoi(argv[3]);
int out_channels = atoi(argv[4]);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=1";
paddle_mobile::TestConvOp<float, float, 3, 1, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 0, stride = 2 // kernel = 7, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=2"; LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 2>(); paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 2>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 1, stride = 2 // kernel = 7, pad = 1, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=2"; LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 2>(); paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 2>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 3, stride = 2 // kernel = 7, pad = 3, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=2"; LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 2>(); paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 2>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 0, stride = 1 // kernel = 7, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=1"; LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 1>(); paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 1, stride = 1 // kernel = 7, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=1"; LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 1>(); paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 3, stride = 1 // kernel = 7, pad = 3, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=1"; LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 1>(); paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 5, stride = 3 // kernel = 7, pad = 5, stride = 3
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=5, stride=3"; LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=5, stride=3";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 5, 3>(); paddle_mobile::TestConvOp<int8_t, int32_t, 7, 5, 3>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 3, stride = 4 // kernel = 7, pad = 3, stride = 4
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=4"; LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=4";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 4>(); paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 4>(in_channels, in_height,
LOG(paddle_mobile::kLOG_INFO) << "\n"; in_width, out_channels);
// kernel = 3, pad = 0, stride = 1 // kernel = 3, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=0, stride=1"; LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 0, 1>(); paddle_mobile::TestConvOp<int8_t, int32_t, 3, 0, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 3, pad = 0, stride = 1 // kernel = 3, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=0, stride=1"; LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=0, stride=1";
paddle_mobile::TestConvOp<float, float, 3, 0, 1>(); paddle_mobile::TestConvOp<float, float, 3, 0, 1>(in_channels, in_height,
LOG(paddle_mobile::kLOG_INFO) << "\n"; in_width, out_channels);
// kernel = 3, pad = 1, stride = 1 // kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=1, stride=1"; LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=1, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 1, 1>(); paddle_mobile::TestConvOp<int8_t, int32_t, 3, 1, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 3, pad = 1, stride = 1 // kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=1"; LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=1";
paddle_mobile::TestConvOp<float, float, 3, 1, 1>(); paddle_mobile::TestConvOp<float, float, 3, 1, 1>(in_channels, in_height,
LOG(paddle_mobile::kLOG_INFO) << "\n"; in_width, out_channels);
// kernel = 5, pad = 0, stride = 1 // kernel = 5, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=0, stride=1"; LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 5, 0, 1>(); paddle_mobile::TestConvOp<int8_t, int32_t, 5, 0, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 5, pad = 0, stride = 1 // kernel = 5, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=0, stride=1"; LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=0, stride=1";
paddle_mobile::TestConvOp<float, float, 5, 0, 1>(); paddle_mobile::TestConvOp<float, float, 5, 0, 1>(in_channels, in_height,
LOG(paddle_mobile::kLOG_INFO) << "\n"; in_width, out_channels);
// kernel = 5, pad = 2, stride = 1 // kernel = 5, pad = 2, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=2, stride=1"; LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=2, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 5, 2, 1>(); paddle_mobile::TestConvOp<int8_t, int32_t, 5, 2, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 5, pad = 2, stride = 1 // kernel = 5, pad = 2, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=2, stride=1"; LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=2, stride=1";
paddle_mobile::TestConvOp<float, float, 5, 2, 1>(); paddle_mobile::TestConvOp<float, float, 5, 2, 1>(in_channels, in_height,
in_width, out_channels);
} }
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/conv_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::GPU_MALI> loader;
// ../models/image_classification_resnet.inference.model
auto program = loader.Load(g_googlenet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::GPU_MALI, paddle_mobile::operators::ConvOp<
paddle_mobile::GPU_MALI, float>>
executor(program, "conv2d");
paddle_mobile::framework::Tensor input;
GetInput<float>(g_test_image_1x3x224x224, &input, {1, 3, 224, 224});
// // use SetupTensor if not has local input image .
// SetupTensor<float>(&input, {1, 3, 224, 224}, static_cast<float>(0),
// static_cast<float>(1));
auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 112, 112});
auto output = executor.Predict(input, "data", "conv2d_0.tmp_0", out_ddim);
auto output_ptr = output->data<float>();
for (int j = 0; j < 20; ++j) {
DLOG << " value of output: " << output_ptr[j];
}
return 0;
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册