提交 4f91d196 编写于 作者: xiebaiyuan's avatar xiebaiyuan

Merge remote-tracking branch 'upstream/develop' into develop

......@@ -2,9 +2,9 @@ cmake_minimum_required(VERSION 3.0)
project(paddle-mobile)
# select the platform to build
option(CPU "armv7 with neon support" OFF)
option(CPU "armv7 with neon support" ON)
option(MALI_GPU "mali gpu support" OFF)
option(FPGA "fpga support" ON)
option(FPGA "fpga support" OFF)
option(USE_OPENMP "openmp support" OFF)
option(DEBUGING "enable debug mode" ON)
......
# FPGA开发文档
FPGA平台的代码在Xilinx ZCU102 revision 1.0开发板测试Resnet50成功,预测结果正确。
## 准备硬件
___
1. 购买Xilinx ZCU102 revision1.0 开发板
2. 另外下载Xilinx ZCU102 Ubuntu[镜像文件](https://www.xilinx.com/member/forms/download/xef.html?filename=Ubuntu_Desktop_Release_2018_1.zip),并烧录进SD卡。
* Windowns系统可使用Win32DiskImager
* Linux系统使用dd命令:dd if=name.img of=/dev/sdb
2. 将SD卡插入电脑,替换分区1中已有的BOOT.BIN、image.ub为[BOOT.BIN、image.ub](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)
3. 将SD卡插入ZCU102开发板,设置板拨码开关为SD卡启动,上电启动Linux系统.
3. 装载驱动:sudo insmod [fpgadrv.ko](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)
## 编译工程
___
1. 将最新的paddle mobile 代码复制到ZCU102开发板中。
2. 进入paddle-mobile根目录, CMakeLists.txt 设置平台为 option(FPGA "fpga support" ON)。CPU和MALI\_GPU选项设置为OFF。
2. 执行以下命令,可在./test/build下生成test-resnet50可执行程序。
* mkdir build
* cd build
* cmake ..
* make
## 准备模型和数据
___
1. 模型文件放在./test/models/resnet50中。将[\_\_model\_\_](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)文件复制到此文件夹下。
2. 另外下载模型[权重文件](http://paddle-imagenet-models.bj.bcebos.com/resnet_50_model.tar),解压后也放在./test/models/resnet50 中。
3. 将数据文件[image_src_float](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)复制到/test/images下。此数据文件对应着标准数据集中的ILSVRC2012_val_00000885.JPEG,分类标签为80, 对应着"black grouse".
## 运行程序
___
1. 进入./test/build目录。
2. sudo ./test-resnet50
3. 如果于DEBUG选项是否打开,屏幕会输出很多中间打印信息。最终打印出预测分类结果为80。
......@@ -22,6 +22,7 @@ const char *G_OP_TYPE_BATCHNORM = "batch_norm";
const char *G_OP_TYPE_BOX_CODER = "box_coder";
const char *G_OP_TYPE_CONCAT = "concat";
const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
const char *G_OP_TYPE_FILL_CONSTANT = "fill_constant";
const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU = "fusion_conv_add_prelu";
const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU = "fusion_conv_add_add_prelu";
......@@ -99,6 +100,7 @@ std::unordered_map<
{G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
{G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}},
......
......@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdlib>
#include <cstring>
#include <string>
#include "common/enforce.h"
#include "common/log.h"
#pragma once
namespace paddle_mobile {
template <int ID, typename Type>
struct IDToType {
typedef Type type_t;
......
......@@ -27,9 +27,6 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT);
if (num_per_div_before_alignment == num_per_div_after_alignment) {
return;
}
int num_element =
2 * div_num * num_per_div_after_alignment; // including bias & scale
float *ptr_aligned =
......
......@@ -156,7 +156,7 @@ class AttrReader {
template <typename T>
inline T Get(const string &name) const {
PADDLE_MOBILE_ENFORCE(attrs_.count(name) != 0,
"%s should be in AttributeMap", name);
"%s should be in AttributeMap", name.c_str());
return ((Attribute)attrs_.at(name)).Get<T>();
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/data_type.h"
#include <stdint.h>
#include <string>
#include <unordered_map>
namespace paddle_mobile {
namespace framework {
struct DataTypeMap {
std::unordered_map<std::type_index,
_PaddleMobile__Framework__Proto__VarType__Type>
cpp_to_proto_;
std::unordered_map<int, std::type_index> proto_to_cpp_;
std::unordered_map<int, std::string> proto_to_str_;
std::unordered_map<std::type_index, size_t> cpp_to_size_;
};
static DataTypeMap* InitDataTypeMap();
// C++11 removes the need for manual locking. Concurrent execution shall wait if
// a static local variable is already being initialized.
// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex
static DataTypeMap& gDataTypeMap() {
static DataTypeMap* g_data_type_map_ = InitDataTypeMap();
return *g_data_type_map_;
}
template <typename T>
static inline void RegisterType(
DataTypeMap* map, _PaddleMobile__Framework__Proto__VarType__Type proto_type,
const std::string& name) {
map->proto_to_cpp_.emplace(static_cast<int>(proto_type), typeid(T));
map->cpp_to_proto_.emplace(typeid(T), proto_type);
map->proto_to_str_.emplace(static_cast<int>(proto_type), name);
map->cpp_to_size_.emplace(typeid(T), sizeof(T));
}
static DataTypeMap* InitDataTypeMap() {
auto retv = new DataTypeMap();
#define RegType(cc_type, proto_type) \
RegisterType<cc_type>(retv, proto_type, #cc_type)
// NOTE: Add your customize type here.
// RegType(float16, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16);
RegType(float, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32);
RegType(double, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64);
RegType(int, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32);
RegType(int64_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64);
RegType(bool, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL);
RegType(size_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SIZE_T);
RegType(int16_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16);
RegType(uint8_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__UINT8);
RegType(int8_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT8);
#undef RegType
return retv;
}
_PaddleMobile__Framework__Proto__VarType__Type ToDataType(
std::type_index type) {
auto it = gDataTypeMap().cpp_to_proto_.find(type);
if (it != gDataTypeMap().cpp_to_proto_.end()) {
return it->second;
}
PADDLE_MOBILE_THROW_EXCEPTION("Not support %s as tensor type", type.name());
}
std::type_index ToTypeIndex(
_PaddleMobile__Framework__Proto__VarType__Type type) {
auto it = gDataTypeMap().proto_to_cpp_.find(static_cast<int>(type));
if (it != gDataTypeMap().proto_to_cpp_.end()) {
return it->second;
}
PADDLE_MOBILE_THROW_EXCEPTION(
"Not support _PaddleMobile__Framework__Proto__VarType__Type(%d) as "
"tensor type",
static_cast<int>(type));
}
std::string DataTypeToString(
const _PaddleMobile__Framework__Proto__VarType__Type type) {
auto it = gDataTypeMap().proto_to_str_.find(static_cast<int>(type));
if (it != gDataTypeMap().proto_to_str_.end()) {
return it->second;
}
PADDLE_MOBILE_THROW_EXCEPTION(
"Not support _PaddleMobile__Framework__Proto__VarType__Type(%d) as "
"tensor type",
static_cast<int>(type));
}
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <typeindex>
#include "common/enforce.h"
#include "framework/framework.pb-c.h"
namespace paddle_mobile {
namespace framework {
extern _PaddleMobile__Framework__Proto__VarType__Type ToDataType(
std::type_index type);
extern std::type_index ToTypeIndex(
_PaddleMobile__Framework__Proto__VarType__Type type);
template <typename Visitor>
inline void VisitDataType(_PaddleMobile__Framework__Proto__VarType__Type type,
Visitor visitor) {
switch (type) {
// case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16:
// visitor.template apply<float16>();
// break;
case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32:
visitor.template apply<float>();
break;
case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64:
visitor.template apply<double>();
break;
case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32:
visitor.template apply<int>();
break;
case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64:
visitor.template apply<int64_t>();
break;
case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL:
visitor.template apply<bool>();
break;
case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__UINT8:
visitor.template apply<uint8_t>();
break;
case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16:
visitor.template apply<int16_t>();
break;
case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT8:
visitor.template apply<int8_t>();
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Not supported %d", type);
}
}
extern std::string DataTypeToString(
const _PaddleMobile__Framework__Proto__VarType__Type type);
inline std::ostream& operator<<(
std::ostream& out,
const _PaddleMobile__Framework__Proto__VarType__Type& type) {
out << DataTypeToString(type);
return out;
}
} // namespace framework
} // namespace paddle_mobile
......@@ -64,6 +64,9 @@ limitations under the License. */
// load requared ops
LOAD_OP(feed)
LOAD_OP(fetch)
#ifdef FILL_CONSTANT_OP
LOAD_OP(fill_constant)
#endif
#ifdef BATCHNORM_OP
LOAD_OP2(batch_norm, CPU, MALI_GPU);
#endif
......
......@@ -18,9 +18,9 @@ limitations under the License. */
#include <vector>
#include "framework/lod_tensor.h"
#include "framework/mixed_vector.h"
#include "framework/tensor.h"
#include "memory/t_malloc.h"
#include "mixed_vector.h"
namespace paddle_mobile {
namespace framework {
......
......@@ -343,7 +343,9 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) {
} else if (tensor.type() == typeid(int64_t)) {
printer << tensor.data<int64_t>()[i] << " ";
} else if (tensor.type() == typeid(int8_t)) {
printer << static_cast<int32_t>(tensor.data<int8_t>()[i]) << " ";
printer << static_cast<int>(tensor.data<int8_t>()[i]) << " ";
} else if (tensor.type() == typeid(int32_t)) {
printer << tensor.data<int32_t>()[i] << " ";
}
}
#endif
......
......@@ -80,12 +80,13 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
}
template <typename Dtype>
void LoadMemInternal(void **data, framework::LoDTensor *tensor) {
static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
bool quant_uint8 = false) {
char **data_buf = reinterpret_cast<char **>(data);
int64_t size = tensor->numel();
Dtype *tensor_data = tensor->mutable_data<Dtype>();
if (0) {
// TODO(hjchen2) should be moved into operator init function
if (quant_uint8) {
// should be moved into operator init function
float min_value;
float max_value;
memcpy(&min_value, data_buf, sizeof(float));
......@@ -141,7 +142,8 @@ void Executor<Dtype, P>::LoadMemory(
// parse tensor from stream
switch (tensor_desc.DataType()) {
case framework::VARTYPE_TYPE_FP32:
LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor);
LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
program_.quantification);
break;
case framework::VARTYPE_TYPE_INT8:
LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
......
......@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef DEQUANT_OP
#include "operators/dequantize_op.h"
namespace paddle_mobile {
......@@ -30,3 +32,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(dequantize, ops::DequantizeOp);
#endif
#endif
......@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef DEQUANT_OP
#pragma once
#include <string>
......@@ -41,3 +43,5 @@ class DequantizeOp
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -14,7 +14,7 @@ limitations under the License. */
#ifdef ELEMENTWISEMUL_OP
#include "elementwise_mul_op.h"
#include "operators/elementwise_mul_op.h"
namespace paddle_mobile {
namespace operators {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FILL_CONSTANT_OP
#include "operators/fill_constant_op.h"
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fill_constant, ops::FillConstantOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(fill_constant, ops::FillConstantOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fill_constant, ops::FillConstantOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FILL_CONSTANT_OP
#pragma once
#include <string>
#include "framework/data_type.h"
#include "framework/operator.h"
#include "framework/selected_rows.h"
#include "operators/math/math_function.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using std::string;
template <typename DeviceType, typename T>
class FillConstantOp : public framework::OperatorBase<DeviceType> {
public:
FillConstantOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
scope),
param_(inputs, outputs, attrs, *scope) {}
void RunImpl() const {
auto data_type =
static_cast<_PaddleMobile__Framework__Proto__VarType__Type>(
param_.DataDtype());
framework::Tensor *tensor = nullptr;
auto value = param_.Value();
auto *outvar = param_.OutVar();
if (outvar->template IsType<framework::LoDTensor>()) {
tensor = outvar->template GetMutable<framework::LoDTensor>();
} else if (outvar->template IsType<framework::SelectedRows>()) {
tensor = outvar->template GetMutable<framework::SelectedRows>()
->mutable_value();
} else {
PADDLE_MOBILE_THROW_EXCEPTION(
"fill constant op's output only"
"supports SelectedRows and LoDTensor");
}
tensor->Resize(framework::make_ddim(param_.Shape()));
tensor->mutable_data(framework::ToTypeIndex(data_type));
math::set_constant(tensor, value);
}
void Init() {}
void InferShape() const {
PADDLE_MOBILE_ENFORCE(
param_.Out() != nullptr,
"Output (Out) of fill_constant op should not be null.");
framework::DDim ddim = framework::make_ddim(param_.Shape());
param_.Out()->Resize(ddim);
}
protected:
FillConstantParam<DeviceType> param_;
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_CPU
#ifdef DEQUANT_OP
#include "operators/kernel/dequantize_kernel.h"
......@@ -38,7 +38,8 @@ void DequantizeKernel<CPU, float>::Compute(
const int32_t *x = input->data<const int32_t>();
float *y = output->mutable_data<float>();
size_t size = output->numel();
float scale = 1.f / (activation_scale * weight_scale);
// float scale = 1.f / (activation_scale * weight_scale);
float scale = activation_scale / weight_scale;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = size >> 4;
size_t remain = size & 0xF;
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_CPU
#ifdef QUANT_OP
#include "operators/kernel/quantize_kernel.h"
#include <cmath>
......@@ -225,7 +225,7 @@ static void quantize_round_to_nearest(const Tensor *input, const float scale,
const float *x = input->data<const float>();
int8_t *y = output->mutable_data<int8_t>();
size_t size = input->numel();
#ifdef defined(__ARM_NEON__) || defined(__ARM_NEON)
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = size >> 4;
size_t remain = size & 0xF;
for (size_t i = 0; i < loop; ++i) {
......@@ -280,17 +280,18 @@ void QuantizeKernel<CPU, float>::Compute(
}
max_abs = std::max(max_abs, 1e-6f);
// only support int8 currently
float online_scale = 127 / max_abs;
param.online_scale_->mutable_data<float>()[0] = online_scale;
float scale = 127 / max_abs;
param.online_scale_->mutable_data<float>()[0] = max_abs;
switch (param.round_type_) {
case ROUND_NEAREST_TO_EVEN:
quantize_round_to_even(input, online_scale, output);
quantize_round_to_even(input, scale, output);
break;
case ROUND_NEAREST_TOWARDS_ZERO:
quantize_round_to_zero(input, online_scale, output);
quantize_round_to_zero(input, scale, output);
break;
case ROUND_NEAREST_AWAY_ZERO:
quantize_round_to_nearest(input, online_scale, output);
quantize_round_to_nearest(input, scale, output);
break;
default:
LOG(kLOG_ERROR) << "round type is not supported.";
break;
......
......@@ -16,24 +16,27 @@ limitations under the License. */
#pragma once
#include <vector>
#include "operators/math/conv_arm_int8.h"
#include "operators/math/conv_func.h"
#include "operators/math/depthwise_conv_3x3.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/pad.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype>
inline void ConvBasic(const ConvParam<CPU> &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const std::vector<int> strides = param.Strides();
const std::vector<int> paddings = param.Paddings();
const std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]);
......@@ -57,7 +60,7 @@ inline void ConvBasic(const ConvParam<CPU> &param) {
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col.mutable_data<Dtype>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
......@@ -76,8 +79,8 @@ inline void ConvBasic(const ConvParam<CPU> &param) {
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
math::Vol2ColFunctor<CPU, Dtype> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, Dtype> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
......@@ -96,6 +99,7 @@ inline void ConvBasic(const ConvParam<CPU> &param) {
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
......@@ -104,29 +108,85 @@ inline void ConvBasic(const ConvParam<CPU> &param) {
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
math::matmul<Dtype>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0));
}
}
}
inline void ConvCompute_int8(const ConvParam<CPU> &param) {
typedef void (*ConvFunc)(const Tensor &input, const Tensor &kernel,
Tensor *output);
static ConvFunc conv_funcs_table[7][5] = {
{0, 0, 0, 0, 0}, // k = 1
{0, 0, 0, 0, 0}, {conv3x3s1_int8, 0, 0, 0, 0}, // k = 3
{0, 0, 0, 0, 0}, {conv5x5s1_int8, 0, 0, 0, 0}, // k = 5
{0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, // k = 7
};
const Tensor *input = param.Input();
Tensor *filter = param.Filter();
Tensor *output = param.Output();
int groups = param.Groups();
const std::vector<int> &strides = param.Strides();
const std::vector<int> &paddings = param.Paddings();
const std::vector<int> &dilations = param.Dilations();
int kernel_h = filter->dims()[2];
int kernel_w = filter->dims()[3];
output->mutable_data<int32_t>();
ConvFunc conv_func = 0;
if (strides[1] == strides[0] && strides[1] < 6 && kernel_h == kernel_w &&
kernel_h < 8 && groups == 1 && dilations[0] == dilations[1] &&
dilations[1] == 1) {
conv_func = conv_funcs_table[kernel_h - 1][strides[0] - 1];
}
if (conv_func) {
int batch_size = input->dims()[0];
math::PadFunctor<CPU, int8_t> pad;
Tensor input_pad;
for (int i = 0; i < batch_size; ++i) {
Tensor in_batch = input->Slice(i, i + 1);
Tensor out_batch = output->Slice(i, i + 1);
if (paddings[0] == 0 && paddings[1] == 0) {
input_pad = in_batch;
} else {
framework::DDim pad_shape = in_batch.dims();
pad_shape[2] += 2 * paddings[0];
pad_shape[3] += 2 * paddings[1];
input_pad.mutable_data<int8_t>(pad_shape);
pad(in_batch, paddings[0], paddings[1], &input_pad);
}
conv_func(input_pad, *filter, &out_batch);
}
} else {
ConvBasic<int8_t>(param);
}
}
template <typename P>
void ConvCompute(const ConvParam<CPU> &param) {
if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
nullptr, false);
} else if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3) {
math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
param.Filter(), nullptr, param.Output(), false);
if (param.Input()->type() == typeid(int8_t)) {
ConvCompute_int8(param);
} else {
ConvBasic(param);
param.Output()->mutable_data<float>();
if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
nullptr, false);
} else if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3) {
math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
param.Filter(), nullptr, param.Output(), false);
} else {
ConvBasic<float>(param);
}
}
}
......
......@@ -44,7 +44,7 @@ void DepthwiseConvCompute(const ConvParam<CPU> &param) {
Bias, false);
} else {
ConvBasic(param);
ConvBasic<float>(param);
}
}
......
......@@ -15,8 +15,12 @@ limitations under the License. */
#ifdef ELEMENTWISEADD_OP
#pragma once
#include "operators/math/elementwise_op_function.h"
#include "operators/op_param.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
......@@ -33,8 +37,61 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
Tensor *Out = param.Out();
Out->mutable_data<float>();
int axis = param.Axis();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
const auto &x_dims = input_x->dims();
const auto &y_dims = input_y->dims();
/// axis = -1 represent the last dimensions.
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
size_t batch = 1;
size_t channels = 1;
size_t elementwise_num = 1;
for (int i = 0; i < axis; ++i) {
batch *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
channels *= y_dims[i];
}
for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
elementwise_num *= x_dims[i];
}
const float *bias_data = input_y->data<float>();
const float *input_data = input_x->data<float>();
float *output_data = Out->mutable_data<float>();
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
size_t offset = (i * channels + j) * elementwise_num;
const float *input = input_data + offset;
const float *bias = bias_data + j;
float *output = output_data + offset;
int loop = elementwise_num >> 0x4;
int remain = elementwise_num & 0xF;
for (int k = 0; k < loop; ++k) {
float32x4_t rb = vdupq_n_f32(*bias);
float32x4_t r0 = vld1q_f32(input);
float32x4_t r1 = vld1q_f32(input + 4);
float32x4_t r2 = vld1q_f32(input + 8);
float32x4_t r3 = vld1q_f32(input + 12);
r0 = vaddq_f32(r0, rb);
r1 = vaddq_f32(r1, rb);
r2 = vaddq_f32(r2, rb);
r3 = vaddq_f32(r3, rb);
vst1q_f32(output, r0);
vst1q_f32(output + 4, r1);
vst1q_f32(output + 8, r2);
vst1q_f32(output + 12, r3);
input += 16;
output += 16;
}
for (int k = 0; k < remain; ++k) {
output[k] = input[k] + *bias;
}
}
}
#else
ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis,
AddFunctor<float>(), Out);
#endif
}
template class ElementwiseAddKernel<CPU, float>;
......
......@@ -17,6 +17,9 @@ limitations under the License. */
#include <operators/math/transform.h>
#include "operators/op_param.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
......@@ -37,71 +40,100 @@ void ReluCompute(const ReluParam<CPU> &param) {
auto *out_ptr = out->mutable_data<float>();
int numel = input_x->numel();
// if (numel > 64) {
// asm volatile(
// "pld [%[input_x_ptr], #0] \n\t"
// "vmov.f32 q8, #0.0 \n\t"
// "subs %[num], %[num], #32 \n\t"
// "blt end_num_%= \n\t"
// "loop_num_%=: \n\t"
// "pld [%[input_x_ptr], #1024] \n\t"
//
// "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
//
// "vmax.f32 q0, q0, q8 \n\t"
// "vmax.f32 q1, q1, q8 \n\t"
// "vmax.f32 q2, q2, q8 \n\t"
// "vmax.f32 q3, q3, q8 \n\t"
// "vmax.f32 q4, q4, q8 \n\t"
// "vmax.f32 q5, q5, q8 \n\t"
// "vmax.f32 q6, q6, q8 \n\t"
// "vmax.f32 q7, q7, q8 \n\t"
//
// "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
// "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
// "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
// "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
//
// "subs %[num], %[num], #32 \n\t"
// "bge loop_num_%= \n\t"
// "end_num_%=: \n\t"
// "cmp %[num], #0 \n\t"
// "bge end_%= \n\t"
// "mov r6, #4 \n\t"
// "mul r5, %[num], r6 \n\t"
// "add %[input_x_ptr], %[input_x_ptr], r5 \n\t"
// "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
// "vmax.f32 q0, q0, q8 \n\t"
// "vmax.f32 q1, q1, q8 \n\t"
// "vmax.f32 q2, q2, q8 \n\t"
// "vmax.f32 q3, q3, q8 \n\t"
// "vmax.f32 q4, q4, q8 \n\t"
// "vmax.f32 q5, q5, q8 \n\t"
// "vmax.f32 q6, q6, q8 \n\t"
// "vmax.f32 q7, q7, q8 \n\t"
// "add %[out_ptr], %[out_ptr], r5 \n\t"
// "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
// "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
// "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
// "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
// "end_%=: \n\t"
// :
// :
// [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num]
// "r"(numel) : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
// "q7", "q8", "r5",
// "r6");
// } else {
ReluFunctor<float> func_;
math::Transform trans;
trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
// }
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#if __aarch64__
if (numel > 0) {
int loop = numel >> 0x4;
int remain = numel & 0xF;
float32x4_t zero = vdupq_n_f32(0.f);
for (int i = 0; i < loop; ++i) {
float32x4_t r0 = vld1q_f32(input_x_ptr);
float32x4_t r1 = vld1q_f32(input_x_ptr + 4);
float32x4_t r2 = vld1q_f32(input_x_ptr + 8);
float32x4_t r3 = vld1q_f32(input_x_ptr + 12);
r0 = vmaxq_f32(r0, zero);
r1 = vmaxq_f32(r1, zero);
r2 = vmaxq_f32(r2, zero);
r3 = vmaxq_f32(r3, zero);
vst1q_f32(out_ptr, r0);
vst1q_f32(out_ptr + 4, r1);
vst1q_f32(out_ptr + 8, r2);
vst1q_f32(out_ptr + 12, r3);
input_x_ptr += 16;
out_ptr += 16;
}
for (int i = 0; i < remain; ++i) {
out_ptr[i] = (input_x_ptr[i] > 0) * input_x_ptr[i];
}
#else
if (numel > 64) {
asm volatile(
"pld [%[input_x_ptr], #0] \n\t"
"vmov.f32 q8, #0.0 \n\t"
"subs %[num], %[num], #32 \n\t"
"blt end_num_%= \n\t"
"loop_num_%=: \n\t"
"pld [%[input_x_ptr], #1024] \n\t"
"vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
"vmax.f32 q0, q0, q8 \n\t"
"vmax.f32 q1, q1, q8 \n\t"
"vmax.f32 q2, q2, q8 \n\t"
"vmax.f32 q3, q3, q8 \n\t"
"vmax.f32 q4, q4, q8 \n\t"
"vmax.f32 q5, q5, q8 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q8 \n\t"
"vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
"vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
"subs %[num], %[num], #32 \n\t"
"bge loop_num_%= \n\t"
"end_num_%=: \n\t"
"cmp %[num], #0 \n\t"
"bge end_%= \n\t"
"mov r6, #4 \n\t"
"mul r5, %[num], r6 \n\t"
"add %[input_x_ptr], %[input_x_ptr], r5 \n\t"
"vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
"vmax.f32 q0, q0, q8 \n\t"
"vmax.f32 q1, q1, q8 \n\t"
"vmax.f32 q2, q2, q8 \n\t"
"vmax.f32 q3, q3, q8 \n\t"
"vmax.f32 q4, q4, q8 \n\t"
"vmax.f32 q5, q5, q8 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q8 \n\t"
"add %[out_ptr], %[out_ptr], r5 \n\t"
"vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
"vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
"end_%=: \n\t"
:
:
[out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] "r"(numel)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "r5",
"r6");
#endif
} else {
#endif
ReluFunctor<float> func_;
math::Transform trans;
trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
}
#endif
}
} // namespace operators
} // namespace paddle_mobile
......
......@@ -15,11 +15,14 @@ limitations under the License. */
#ifdef SUM_OP
#pragma once
#include <vector>
#include "operators/math/selected_rows_functor.h"
namespace paddle_mobile {
namespace operators {
using LoDTensorArray = std::vector<LoDTensor>;
template <typename P>
void SumCompute(const SumParam<CPU> &param) {
auto inputsvars = param.InputsVars();
......@@ -63,31 +66,21 @@ void SumCompute(const SumParam<CPU> &param) {
std::unique_ptr<framework::SelectedRows> in0;
if (in_place) {
// If is in_place, we store the input[0] to in0
auto *in_sel0 = inputsvars[0]->Get<SelectedRows>();
auto *in_sel0 = inputsvars[0]->Get<framework::SelectedRows>();
auto &rows = in_sel0->rows();
//#ifdef PADDLE_WITH_CUDA
// std::vector<int64_t> rows_in_cpu;
// rows_in_cpu.reserve(rows.size());
// for (auto item : rows) {
// rows_in_cpu.push_back(item);
// }
// in0.reset(new framework::SelectedRows(rows_in_cpu,
// in_sel0.height()));
//#else
in0.reset(new framework::SelectedRows(rows, in_sel0->height()));
//#endif
in0->mutable_value()->ShareDataWith(in_sel0->value());
}
auto get_selected_row = [&](size_t i) -> const SelectedRows & {
auto get_selected_row = [&](size_t i) -> const framework::SelectedRows & {
if (i == 0 && in0) {
return *in0.get();
} else {
return *(inputsvars[i]->Get<SelectedRows>());
return *(inputsvars[i]->Get<framework::SelectedRows>());
}
};
auto *out = outvar->GetMutable<SelectedRows>();
auto *out = outvar->GetMutable<framework::SelectedRows>();
out->mutable_rows()->clear();
auto *out_value = out->mutable_value();
......@@ -150,8 +143,6 @@ void SumCompute(const SumParam<CPU> &param) {
}
}
} else {
if (outvar->IsType<framework::Tensor>()) {
}
PADDLE_MOBILE_THROW_EXCEPTION(
"Unexpected branch, output variable type is %s", outvar->Type().name());
}
......
......@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef DEQUANT_OP
#pragma once
#include "framework/operator.h"
......@@ -30,3 +32,5 @@ class DequantizeKernel
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -23,8 +23,6 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
using namespace framework;
template <typename DeviceType, typename T>
class ElementwiseMulKernel
: public framework::OpKernelBase<DeviceType,
......
......@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef QUANT_OP
#pragma once
#include "framework/operator.h"
......@@ -30,3 +32,5 @@ class QuantizeKernel
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -21,8 +21,6 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
using namespace framework;
template <typename DeviceType, typename T>
class SumKernel
: public framework::OpKernelBase<DeviceType, SumParam<DeviceType>> {
......
此差异已折叠。
此差异已折叠。
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#pragma once
#include "framework/tensor.h"
namespace paddle_mobile {
namespace operators {
void conv3x3s1_int8(const framework::Tensor& input,
const framework::Tensor& weight, framework::Tensor* output);
void conv3x3s1_int8_4c(const framework::Tensor& input,
const framework::Tensor& weight,
framework::Tensor* output);
void conv5x5s1_int8(const framework::Tensor& input,
const framework::Tensor& weight, framework::Tensor* output);
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -3379,7 +3379,7 @@ void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
// 对 B 分块
NC = L1 / (KC * sizeof(float));
if (NC == 0) {
NC == NR;
NC = NR;
} else {
int nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
......
......@@ -22,9 +22,11 @@ limitations under the License. */
#define C(i, j) C[(i)*ldc + (j)]
#if __aarch64__
#define MR_INT8 4
#define MR 6
#define NR 16
#else
#define MR_INT8 4
#define MR 6
#define NR 8
#endif
......@@ -189,6 +191,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
// 8 bits function cluster begins
// 8 bits int small block inner product
void AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc);
void AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc);
......@@ -199,6 +203,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
int8_t *bias);
// 8 bits int pack function
void PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
......
此差异已折叠。
此差异已折叠。
......@@ -15,12 +15,31 @@ limitations under the License. */
#include "operators/math/math_function.h"
#include <cstring>
#include <string>
#include "framework/data_type.h"
#include "framework/tensor.h"
#include "operators/math/gemm.h"
namespace paddle_mobile {
namespace operators {
namespace math {
struct TensorSetConstant {
TensorSetConstant(framework::Tensor *tensor, float value)
: tensor_(tensor), value_(value) {}
template <typename T>
void apply() const {
auto *begin = tensor_->mutable_data<T>();
std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
}
framework::Tensor *tensor_;
float value_;
};
void set_constant(framework::Tensor *tensor, float value) {
framework::VisitDataType(framework::ToDataType(tensor->type()),
TensorSetConstant(tensor, value));
}
template <>
void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha,
......
......@@ -15,12 +15,15 @@ limitations under the License. */
#pragma once
#include <cmath>
#include <string>
#include "framework/tensor.h"
namespace paddle_mobile {
namespace operators {
namespace math {
void set_constant(framework::Tensor *tensor, float value);
template <typename T>
void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/math/pad.h"
namespace paddle_mobile {
namespace operators {
namespace math {
template <typename T>
class PadFunctor<CPU, T> {
public:
void operator()(const framework::Tensor &input, const int pad_h,
const int pad_w, framework::Tensor *output) {
const T *in_data = input.data<T>();
T *out_data = output->mutable_data<T>();
const framework::DDim &input_shape = input.dims();
const framework::DDim &output_shape = output->dims();
// fill output with 0
memset(out_data, 0, sizeof(T) * output->numel());
// should make sure the shape of output is match with input
for (int i = 0; i < input_shape[0]; ++i) {
for (int c = 0; c < input_shape[1]; ++c) {
out_data += pad_h * output_shape[3];
for (int h = 0; h < input_shape[2]; ++h) {
memcpy(out_data + pad_w, in_data, sizeof(T) * input_shape[3]);
out_data += output_shape[3];
in_data += input_shape[3];
}
out_data += pad_h * output_shape[3];
}
}
}
};
template class PadFunctor<CPU, float>;
template class PadFunctor<CPU, int8_t>;
} // namespace math
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/tensor.h"
namespace paddle_mobile {
namespace operators {
namespace math {
template <typename DeviceType, typename T>
class PadFunctor {
public:
void operator()(const framework::Tensor &input, const int pad_h,
const int pad_w, framework::Tensor *output);
};
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......@@ -32,9 +32,6 @@ class Vol2ColFunctor<CPU, T> {
void operator()(const Tensor &vol, const std::vector<int> &dilations,
const std::vector<int> &strides,
const std::vector<int> &paddings, Tensor *col) const {
// PADDLE_ENFORCE(vol.dims().size() == 4);
// PADDLE_ENFORCE(col->dims().size() == 7);
int input_channels = vol.dims()[0];
int input_depth = vol.dims()[1];
int input_height = vol.dims()[2];
......@@ -48,32 +45,6 @@ class Vol2ColFunctor<CPU, T> {
int channels_col =
input_channels * filter_depth * filter_height * filter_width;
// PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
// ((dilations[0] * (filter_depth - 1)
// + 1))) /
// strides[0] +
// 1,
// output_depth,
// "input_depth and output_depth are "
// "mismatching.");
// PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
// ((dilations[1] * (filter_height -
// 1) + 1))) /
// strides[1] +
// 1,
// output_height,
// "input_height and output_height are
// "
// "mismatching.");
// PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
// ((dilations[2] * (filter_width - 1)
// + 1))) /
// strides[2] +
// 1,
// output_width,
// "input_width and output_width are "
// "mismatching.");
const T *vol_data = vol.data<T>();
T *col_data = col->data<T>();
......@@ -119,9 +90,6 @@ class Col2VolFunctor<CPU, T> {
void operator()(const Tensor &col, const std::vector<int> &dilations,
const std::vector<int> &strides,
const std::vector<int> &paddings, Tensor *vol) const {
// PADDLE_ENFORCE(vol->dims().size() == 4);
// PADDLE_ENFORCE(col.dims().size() == 7);
int input_channels = vol->dims()[0];
int input_depth = vol->dims()[1];
int input_height = vol->dims()[2];
......@@ -135,31 +103,6 @@ class Col2VolFunctor<CPU, T> {
int channels_col =
input_channels * filter_depth * filter_height * filter_width;
// PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
// ((dilations[0] * (filter_depth - 1)
// + 1))) /
// strides[0] +
// 1,
// output_depth,
// "input_depth and output_depth are "
// "mismatching.");
// PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
// ((dilations[1] * (filter_height -
// 1) + 1))) /
// strides[1] +
// 1,
// output_height,
// "input_height and output_height are
// "
// "mismatching.");
// PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
// ((dilations[2] * (filter_width - 1)
// + 1))) /
// strides[2] +
// 1,
// output_width,
// "input_width and output_width are "
// "mismatching.");
T *vol_data = vol->data<T>();
const T *col_data = col.data<T>();
......@@ -195,9 +138,9 @@ class Col2VolFunctor<CPU, T> {
};
template class Vol2ColFunctor<CPU, float>;
template class Vol2ColFunctor<CPU, double>;
template class Vol2ColFunctor<CPU, int8_t>;
template class Col2VolFunctor<CPU, float>;
template class Col2VolFunctor<CPU, double>;
template class Col2VolFunctor<CPU, int8_t>;
} // namespace math
} // namespace operators
......
......@@ -1063,6 +1063,42 @@ class FetchParam : public OpParam {
RType *out_;
};
#ifdef FILL_CONSTANT_OP
template <typename Dtype>
class FillConstantParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
FillConstantParam(const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
const Scope &scope) {
out_var_ = OutVarFrom(outputs, scope);
out_ = OutFrom<GType>(outputs, scope);
dtype_ = GetAttr<int>("dtype", attrs);
shape_ = GetAttr<vector<int>>("shape", attrs);
value_ = GetAttr<float>("value", attrs);
}
Variable *OutVar() const { return out_var_; }
RType *Out() const { return out_; }
const int &DataDtype() const { return dtype_; }
const vector<int> &Shape() const { return shape_; }
const float &Value() const { return value_; }
private:
Variable *out_var_;
RType *out_;
int dtype_;
vector<int> shape_;
float value_;
};
#endif
#ifdef TRANSPOSE_OP
template <typename Dtype>
class TransposeParam : public OpParam {
......@@ -2294,6 +2330,7 @@ class ShapeParam : public OpParam {
};
#endif
#ifdef QUANT_OP
template <typename Dtype>
class QuantizeParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
......@@ -2304,14 +2341,12 @@ class QuantizeParam : public OpParam {
const AttributeMap &attrs, const Scope &scope) {
input_ = InputXFrom<GType>(inputs, scope);
out_ = OutFrom<GType>(outputs, scope);
if (HasAttr("is_static", attrs)) {
is_static_ = GetAttr<bool>("is_static", attrs);
}
// online
// scale = max(abs(x))
online_scale_ = GetVarValue<GType>("OutScale", outputs, scope);
// offline
if (HasAttr("static_scale", attrs)) {
is_static_ = true;
static_scale_ = GetAttr<float>("static_scale", attrs);
}
// x = round(scale * x)
......@@ -2333,9 +2368,11 @@ class QuantizeParam : public OpParam {
float static_scale_ = 1.0f;
// round method type
// nearest_zero and nearest_even is valid currently
RoundType round_type_ = ROUND_NEAREST_TO_EVEN;
RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
};
#endif
#ifdef DEQUANT_OP
template <typename Dtype>
class DequantizeParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
......@@ -2363,6 +2400,7 @@ class DequantizeParam : public OpParam {
RType *activation_scale_;
float weight_scale_;
};
#endif
} // namespace operators
} // namespace paddle_mobile
......@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef QUANT_OP
#include "operators/quantize_op.h"
#include <vector>
......@@ -33,3 +35,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(quantize, ops::QuantizeOp);
#endif
#endif
......@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef QUANT_OP
#pragma once
#include <string>
......@@ -40,3 +42,5 @@ class QuantizeOp : public framework::OperatorWithKernel<
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -26,7 +26,7 @@ void SumOp<Dtype, T>::InferShape() const {
auto inputs = this->param_.Inputs();
const size_t n = inputs.size();
std::vector<DDim> inputs_dims;
std::vector<framework::DDim> inputs_dims;
inputs_dims.reserve(n);
for (int i = 0; i < n; i++) {
inputs_dims.push_back(inputs[i]->dims());
......
......@@ -192,6 +192,10 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-polygon-box-transform-op operators/test_polygon_box_transform_op.cpp test_helper.h test_include.h)
target_link_libraries(test-polygon-box-transform-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-fill-constant-op operators/test_fill_constant_op.cpp test_helper.h test_include.h)
target_link_libraries(test-fill-constant-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h)
target_link_libraries(test-reshape-op paddle-mobile)
......@@ -216,6 +220,10 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-dequantize-op operators/test_dequantize_op.cpp test_helper.h test_include.h)
target_link_libraries(test-dequantize-op paddle-mobile)
# test int8 conv op
ADD_EXECUTABLE(test-int8-conv-op operators/test_int8_conv_op.cpp test_helper.h test_include.h)
target_link_libraries(test-int8-conv-op paddle-mobile)
# gen test log
ADD_EXECUTABLE(test-log common/test_log.cpp)
target_link_libraries(test-log paddle-mobile)
......
......@@ -25,27 +25,31 @@ int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
#endif
paddle_mobile.SetThreadNum(4);
bool optimize = true;
paddle_mobile.SetThreadNum(1);
bool optimize = false;
auto time1 = time();
if (paddle_mobile.Load(g_googlenet, optimize)) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
std::vector<float> input;
std::vector<float> output;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224, &input, dims);
// 预热十次
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
// // 预热十次
// for (int i = 0; i < 10; ++i) {
// output = paddle_mobile.Predict(input, dims);
// }
auto time3 = time();
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
output = paddle_mobile.Predict(input, dims);
}
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
<< std::endl;
for (int i = 0; i < output.size(); ++i) {
DLOG << "result[" << i << "] = " << output[i];
}
}
return 0;
}
......@@ -59,7 +59,7 @@ int TestDequqntizeOp() {
framework::Tensor output_cmp;
output_cmp.Resize(dim);
float dequant_scale = 1.f / (1.27 * 1.74);
float dequant_scale = 1.27 / 1.74;
dequantize(input, dequant_scale, &output_cmp);
const float* output_cmp_data = output_cmp.data<float>();
for (int i = 0; i < output->numel(); ++i) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../test_include.h"
#include "operators/fill_constant_op.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype>
class TestFillConstantOp {
public:
explicit TestFillConstantOp(const Program<Dtype> p) : program_(p) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
to_predict_program_->Blocks();
for (auto block_desc : blocks) {
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
for (auto op : ops) {
if (op->Type() == "fill_constant") {
DLOG << " attr size: " << op->GetAttrMap().size();
std::unordered_map<std::string, Attribute> attrs = op->GetAttrMap();
for (std::unordered_map<std::string, Attribute>::iterator it =
attrs.begin();
it != attrs.end(); ++it) {
DLOG << " " << it->first << " " << it->second;
}
DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size();
DLOG << " output is : " << op->Output("Out")[0];
output_var_name = op->Output("Out")[0];
std::shared_ptr<operators::FillConstantOp<Dtype, float>> op_ptr =
std::make_shared<operators::FillConstantOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(op_ptr);
}
}
}
}
std::shared_ptr<Tensor> predict() {
auto scope = program_.scope;
Variable *output = scope->Var(output_var_name);
auto *output_tensor = output->GetMutable<LoDTensor>();
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
predict(0);
return out_tensor;
}
private:
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
string output_var_name;
void predict(int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
op->Run();
}
}
};
template class TestFillConstantOp<CPU>;
} // namespace framework
} // namespace paddle_mobile
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run FillConstant Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_ocr) + "/model",
std::string(g_ocr) + "/params");
paddle_mobile::framework::TestFillConstantOp<paddle_mobile::CPU>
testFillConstantOp(program);
auto output = testFillConstantOp.predict();
auto *output_ptr = output->data<float>();
DLOG << "output : ";
for (int i = 0; i < output->numel(); ++i) {
DLOG << " index " << i << " : " << output_ptr[i];
}
return 0;
}
此差异已折叠。
......@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint-gcc.h>
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/mul_op.h"
......
......@@ -18,14 +18,6 @@ limitations under the License. */
namespace paddle_mobile {
// static float g_test_data[50] = {
// -5.55, -5.5, -5.45, -5.0, -4.55, -4.5, -4.45, -4.0, -3.55, -3.5,
// -3.45, -3.01, -2.75, -2.5, -2.501, -2.49, -2.01, -1.75, -1.5, -1.25,
// -1.0, -0.75, -0.5, -0.25, 0.0, 0.25, 0.5, 0.75, 1.0, 1.25,
// 1.5, 1.75, 2.01, 2.49, 2.501, 2.5, 2.75, 3.01, 3.45, 3.5,
// 3.55, 4.0, 4.45, 4.5, 4.55, 5.0, 5.45, 5.5, 5.55, 6.0,
// };
static float find_abs_max(const Tensor *input) {
float max_abs = 0.f;
const float *x = input->data<const float>();
......@@ -60,6 +52,16 @@ static void quantize_round_to_even(const Tensor *input, const float scale,
}
}
static void quantize_round_to_nearest(const Tensor *input, const float scale,
Tensor *output) {
const float *x = input->data<const float>();
int8_t *y = output->mutable_data<int8_t>();
size_t size = input->numel();
for (size_t i = 0; i < size; ++i) {
y[i] = round(x[i] * scale);
}
}
int TestQuqntizeOp() {
framework::DDim dim = framework::make_ddim({1, 3, 224, 224});
......@@ -88,15 +90,16 @@ int TestQuqntizeOp() {
auto output_scale = output_scale_var->template Get<framework::LoDTensor>();
const float *output_scale_data = output_scale->data<float>();
float max_abs = find_abs_max(input);
float output_scale_cmp = 127 / max_abs;
float output_scale_cmp = find_abs_max(input);
PADDLE_MOBILE_ENFORCE(output_scale_cmp == output_scale_data[0],
"output_scale = %.6f, output_scale_cmp = %.6f",
output_scale_cmp, output_scale_data[0]);
framework::Tensor output_cmp;
output_cmp.Resize(dim);
quantize_round_to_even(input, output_scale_cmp, &output_cmp);
float scale = 127 / output_scale_cmp;
// quantize_round_to_even(input, scale, &output_cmp);
quantize_round_to_nearest(input, scale, &output_cmp);
int8_t *output_cmp_data = output_cmp.data<int8_t>();
for (int i = 0; i < output->numel(); ++i) {
PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i],
......
......@@ -188,6 +188,7 @@ if(NOT FOUND_MATCH)
set(ELEMENTWISEADD_OP ON)
set(ELEMENTWISESUB_OP ON)
set(IM2SEQUENCE_OP ON)
set(FILL_CONSTANT_OP ON)
set(FUSION_CONVADD_OP ON)
set(FUSION_CONVADDPRELU_OP ON)
set(FUSION_CONVADDRELU_OP ON)
......@@ -223,6 +224,8 @@ if(NOT FOUND_MATCH)
set(SHAPE_OP ON)
set(ELEMENTWISEMUL_OP ON)
set(SUM_OP ON)
set(QUANT_OP ON)
set(DEQUANT_OP ON)
endif()
# option(BATCHNORM_OP "" ON)
......@@ -231,6 +234,7 @@ endif()
# option(CONV_OP "" ON)
# option(DEPTHWISECONV_OP "" ON)
# option(ELEMENTWISEADD_OP "" ON)
# option(FILL_CONSTANT_OP "" ON)
# option(FUSION_CONVADD_OP "" ON)
# option(FUSION_CONVADDRELU_OP "" ON)
# option(FUSION_FC_OP "" ON)
......@@ -268,6 +272,9 @@ endif()
if (ELEMENTWISESUB_OP)
add_definitions(-DELEMENTWISESUB_OP)
endif()
if (FILL_CONSTANT_OP)
add_definitions(-DFILL_CONSTANT_OP)
endif()
if (FUSION_CONVADD_OP)
add_definitions(-DFUSION_CONVADD_OP)
endif()
......@@ -406,3 +413,10 @@ if (SUM_OP)
add_definitions(-DSUM_OP)
endif()
if (QUANT_OP)
add_definitions(-DQUANT_OP)
endif()
if (DEQUANT_OP)
add_definitions(-DDEQUANT_OP)
endif()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册