未验证 提交 bbfedb25 编写于 作者: R Ruilong Liu 提交者: GitHub

Merge branch 'develop' into develop

...@@ -61,7 +61,14 @@ struct PaddleMobileException : public std::exception { ...@@ -61,7 +61,14 @@ struct PaddleMobileException : public std::exception {
} }
#else #else
#define PADDLE_MOBILE_THROW_EXCEPTION(...) #define PADDLE_MOBILE_THROW_EXCEPTION(...)
#define PADDLE_MOBILE_ENFORCE(stat, ...)
// PADDLE_MOBILE_ENFORCE variant defined in the #else branch, next to the empty
// PADDLE_MOBILE_THROW_EXCEPTION. The condition `stat` is still evaluated
// exactly once, but nothing happens whatever its value; the variadic message
// arguments are discarded. The expansion is a brace-enclosed block, so a call
// site compiles with or without a trailing semicolon.
#define PADDLE_MOBILE_ENFORCE(stat, ...) \
  {                                      \
    if (!(stat)) {                       \
    }                                    \
  }
#endif #endif
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "cstring"
#include "io/paddle_inference_api.h" #include "io/paddle_inference_api.h"
namespace paddle_mobile { namespace paddle_mobile {
......
...@@ -30,9 +30,6 @@ class FeedOp : public framework::OperatorBase<DeviceType> { ...@@ -30,9 +30,6 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
: framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs, : framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
scope), scope),
param_(inputs, outputs, attrs, scope.get()) {} param_(inputs, outputs, attrs, scope.get()) {}
void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }
void Init() {}
void InferShape() const { void InferShape() const {
auto out_dims = param_.Out()->dims(); auto out_dims = param_.Out()->dims();
...@@ -40,6 +37,29 @@ class FeedOp : public framework::OperatorBase<DeviceType> { ...@@ -40,6 +37,29 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
param_.Out()->Resize(out_dims); param_.Out()->Resize(out_dims);
} }
#ifdef PADDLE_MOBILE_FPGA
  // FPGA build: running the op passes the bypass arguments stored on param_
  // (filled in by Init below) to fpga::PerformBypass.
  void RunImpl() const { fpga::PerformBypass(param_.FpgaArgs()); }

  // Builds the fpga::BypassArgs for the feed transfer and stores them on
  // param_ via SetFpgaArgs.
  void Init() {
    const Tensor *src = param_.InputX();
    auto src_data = src->data<float>();
    Tensor *dst = param_.Out();
    auto dst_data = dst->mutable_data<half>();

    fpga::BypassArgs bypass;
    // Requested conversion: fp32 -> fp16 data, CHW -> HWC layout.
    bypass.convert_type = fpga::DATA_FP32_TO_FP16;
    bypass.layout_type = fpga::LAYOUT_CHW_TO_HWC;
    // The address field receives the input buffer as an untyped, non-const
    // pointer.
    bypass.image.address = static_cast<void *>(const_cast<float *>(src_data));
    // Dimensions 1, 2 and 3 of the input are taken as channels, height and
    // width. NOTE(review): this implies a 4-D input with dim 0 unused here
    // (presumably the batch dimension) — confirm.
    bypass.image.channels = src->dims()[1];
    bypass.image.height = src->dims()[2];
    bypass.image.width = src->dims()[3];
    bypass.output.address = dst_data;

    param_.SetFpgaArgs(bypass);
  }
#else
  // Non-FPGA build: the output tensor shares the input tensor's data.
  void RunImpl() const {
    auto out = param_.Out();
    out->ShareDataWith(*param_.InputX());
  }

  // Nothing to prepare in the non-FPGA build.
  void Init() {}
#endif
protected: protected:
FeedParam param_; FeedParam param_;
}; };
...@@ -54,4 +74,5 @@ USE_OP_CPU(feed); ...@@ -54,4 +74,5 @@ USE_OP_CPU(feed);
USE_OP_MALI_GPU(feed); USE_OP_MALI_GPU(feed);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(feed);
#endif #endif
...@@ -25,4 +25,5 @@ REGISTER_OPERATOR_CPU(fetch, ops::FetchOp); ...@@ -25,4 +25,5 @@ REGISTER_OPERATOR_CPU(fetch, ops::FetchOp);
REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp); REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fetch, ops::FetchOp);
#endif #endif
...@@ -54,4 +54,5 @@ USE_OP_CPU(fetch); ...@@ -54,4 +54,5 @@ USE_OP_CPU(fetch);
USE_OP_MALI_GPU(fetch); USE_OP_MALI_GPU(fetch);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fetch);
#endif #endif
...@@ -16,6 +16,8 @@ limitations under the License. */ ...@@ -16,6 +16,8 @@ limitations under the License. */
#pragma once #pragma once
#include <cmath> #include <cmath>
#include "framework/tensor.h"
#include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -122,7 +124,7 @@ void BoxCoderCompute(const BoxCoderParam& param) { ...@@ -122,7 +124,7 @@ void BoxCoderCompute(const BoxCoderParam& param) {
auto col = input_priorbox->dims()[0]; auto col = input_priorbox->dims()[0];
auto len = input_priorbox->dims()[1]; auto len = input_priorbox->dims()[1];
Tensor* output_box = param.OutputBox(); framework::Tensor* output_box = param.OutputBox();
auto* output_box_dataptr = output_box->mutable_data<float>({row, col, len}); auto* output_box_dataptr = output_box->mutable_data<float>({row, col, len});
if (code_type == "encode_center_size") { if (code_type == "encode_center_size") {
......
...@@ -31,12 +31,7 @@ void ConvAddBasic(const FusionConvAddParam &param) { ...@@ -31,12 +31,7 @@ void ConvAddBasic(const FusionConvAddParam &param) {
Tensor bias = *param.Bias(); Tensor bias = *param.Bias();
int axis = param.Axis(); int axis = param.Axis();
Tensor *output = param.Output(); Tensor *output = param.Output();
math::expand_bias(bias, axis, output->dims());
float *output_data = output->data<float>();
float *biase_data = bias.data<float>(); float *biase_data = bias.data<float>();
for (int k = 0; k < output->numel(); ++k) {
output_data[k] = biase_data[k];
}
int groups = param.Groups(); int groups = param.Groups();
std::vector<int> strides = param.Strides(); std::vector<int> strides = param.Strides();
...@@ -113,7 +108,7 @@ void ConvAddBasic(const FusionConvAddParam &param) { ...@@ -113,7 +108,7 @@ void ConvAddBasic(const FusionConvAddParam &param) {
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false, math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice, static_cast<float>(1), &out_slice,
static_cast<float>(1)); static_cast<float>(1), false, biase_data);
} }
} }
} }
......
...@@ -18,6 +18,9 @@ limitations under the License. */ ...@@ -18,6 +18,9 @@ limitations under the License. */
#include <vector> #include <vector>
#include "operators/math/depthwise_conv_3x3.h" #include "operators/math/depthwise_conv_3x3.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h" #include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
......
...@@ -16,6 +16,10 @@ limitations under the License. */ ...@@ -16,6 +16,10 @@ limitations under the License. */
#pragma once #pragma once
#include <vector> #include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h" #include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -28,12 +32,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam &param) { ...@@ -28,12 +32,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam &param) {
Tensor bias = *param.Bias(); Tensor bias = *param.Bias();
int axis = param.Axis(); int axis = param.Axis();
Tensor *output = param.Output(); Tensor *output = param.Output();
math::expand_bias(bias, axis, output->dims());
float *output_data = output->data<float>();
float *biase_data = bias.data<float>(); float *biase_data = bias.data<float>();
for (int k = 0; k < output->numel(); ++k) {
output_data[k] = biase_data[k];
}
int groups = param.Groups(); int groups = param.Groups();
std::vector<int> strides = param.Strides(); std::vector<int> strides = param.Strides();
...@@ -111,7 +110,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam &param) { ...@@ -111,7 +110,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam &param) {
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false, math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice, static_cast<float>(1), &out_slice,
static_cast<float>(1), true); static_cast<float>(1), true, biase_data);
} }
} }
} }
......
...@@ -17,6 +17,9 @@ limitations under the License. */ ...@@ -17,6 +17,9 @@ limitations under the License. */
#pragma once #pragma once
#include <vector> #include <vector>
#include "operators/math/depthwise_conv_3x3.h" #include "operators/math/depthwise_conv_3x3.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h" #include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
......
...@@ -17,6 +17,9 @@ limitations under the License. */ ...@@ -17,6 +17,9 @@ limitations under the License. */
#pragma once #pragma once
#include <vector> #include <vector>
#include "operators/math/depthwise_conv_3x3.h" #include "operators/math/depthwise_conv_3x3.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h" #include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
......
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#ifdef ELEMENTWISEADD_OP #ifdef ELEMENTWISEADD_OP
#pragma once #pragma once
#include "operators/math/elementwise_op_function.h"
#include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
......
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#ifdef FUSION_FC_OP #ifdef FUSION_FC_OP
#pragma once #pragma once
#include "operators/math/math_function.h"
#include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -28,6 +30,7 @@ void FusionFcCompute(const FusionFcParam &param) { ...@@ -28,6 +30,7 @@ void FusionFcCompute(const FusionFcParam &param) {
int axis = param.Axis(); int axis = param.Axis();
Tensor *out = param.Out(); Tensor *out = param.Out();
auto *out_data = out->mutable_data<float>(); auto *out_data = out->mutable_data<float>();
float *bias_data = out->mutable_data<float>();
const Tensor x_matrix = const Tensor x_matrix =
input_x->dims().size() > 2 input_x->dims().size() > 2
? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
...@@ -45,18 +48,18 @@ void FusionFcCompute(const FusionFcParam &param) { ...@@ -45,18 +48,18 @@ void FusionFcCompute(const FusionFcParam &param) {
PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0], PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0],
" out_dim.size must be 2."); " out_dim.size must be 2.");
axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis); axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ") PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ");
int64_t classes = input_z->numel(); int64_t classes = input_z->numel();
for (int i = 0; i < out_dim[0]; i++) { for (int i = 0; i < out_dim[0]; i++) {
memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes); memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
} }
for (int i = 0; i < out->numel(); i++) { // for (int i = 0; i < out->numel(); i++) {
DLOG << out_data[i]; // DLOG << out_data[i];
} // }
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1), math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(1)); out, static_cast<float>(1), false, bias_data);
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
// if (out_dim.size() != 2) { // if (out_dim.size() != 2) {
// out->Resize(out_dim); // out->Resize(out_dim);
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef LRN_OP #ifdef LRN_OP
#pragma once #pragma once
#include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
......
...@@ -19,6 +19,8 @@ limitations under the License. */ ...@@ -19,6 +19,8 @@ limitations under the License. */
#include <map> #include <map>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "framework/tensor.h"
#include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -89,7 +91,8 @@ static inline T JaccardOverlap(const T* box1, const T* box2, ...@@ -89,7 +91,8 @@ static inline T JaccardOverlap(const T* box1, const T* box2,
} }
template <typename T> template <typename T>
static inline void NMSFast(const Tensor& bbox, const Tensor& scores, static inline void NMSFast(const framework::Tensor& bbox,
const framework::Tensor& scores,
const T score_threshold, const T nms_threshold, const T score_threshold, const T nms_threshold,
const T eta, const int64_t top_k, const T eta, const int64_t top_k,
std::vector<int>* selected_indices) { std::vector<int>* selected_indices) {
...@@ -131,7 +134,8 @@ static inline void NMSFast(const Tensor& bbox, const Tensor& scores, ...@@ -131,7 +134,8 @@ static inline void NMSFast(const Tensor& bbox, const Tensor& scores,
} }
template <typename T> template <typename T>
void MultiClassNMS(const Tensor& scores, const Tensor& bboxes, void MultiClassNMS(const framework::Tensor& scores,
const framework::Tensor& bboxes,
std::map<int, std::vector<int>>* indices, int* num_nmsed_out, std::map<int, std::vector<int>>* indices, int* num_nmsed_out,
const int& background_label, const int& nms_top_k, const int& background_label, const int& nms_top_k,
const int& keep_top_k, const T& nms_threshold, const int& keep_top_k, const T& nms_threshold,
...@@ -141,7 +145,7 @@ void MultiClassNMS(const Tensor& scores, const Tensor& bboxes, ...@@ -141,7 +145,7 @@ void MultiClassNMS(const Tensor& scores, const Tensor& bboxes,
int num_det = 0; int num_det = 0;
for (int64_t c = 0; c < class_num; ++c) { for (int64_t c = 0; c < class_num; ++c) {
if (c == background_label) continue; if (c == background_label) continue;
Tensor score = scores.Slice(c, c + 1); framework::Tensor score = scores.Slice(c, c + 1);
/// [c] is key /// [c] is key
NMSFast<float>(bboxes, score, score_threshold, nms_threshold, nms_eta, NMSFast<float>(bboxes, score, score_threshold, nms_threshold, nms_eta,
nms_top_k, &((*indices)[c])); nms_top_k, &((*indices)[c]));
...@@ -181,9 +185,10 @@ void MultiClassNMS(const Tensor& scores, const Tensor& bboxes, ...@@ -181,9 +185,10 @@ void MultiClassNMS(const Tensor& scores, const Tensor& bboxes,
} }
template <typename T> template <typename T>
void MultiClassOutput(const Tensor& scores, const Tensor& bboxes, void MultiClassOutput(const framework::Tensor& scores,
const framework::Tensor& bboxes,
const std::map<int, std::vector<int>>& selected_indices, const std::map<int, std::vector<int>>& selected_indices,
Tensor* outs) { framework::Tensor* outs) {
int predict_dim = scores.dims()[1]; int predict_dim = scores.dims()[1];
auto* scores_data = scores.data<T>(); auto* scores_data = scores.data<T>();
auto* bboxes_data = bboxes.data<T>(); auto* bboxes_data = bboxes.data<T>();
...@@ -231,10 +236,10 @@ void MultiClassNMSCompute(const MultiClassNMSParam& param) { ...@@ -231,10 +236,10 @@ void MultiClassNMSCompute(const MultiClassNMSParam& param) {
std::vector<std::map<int, std::vector<int>>> all_indices; std::vector<std::map<int, std::vector<int>>> all_indices;
std::vector<size_t> batch_starts = {0}; std::vector<size_t> batch_starts = {0};
for (int64_t i = 0; i < batch_size; ++i) { for (int64_t i = 0; i < batch_size; ++i) {
Tensor ins_score = input_scores->Slice(i, i + 1); framework::Tensor ins_score = input_scores->Slice(i, i + 1);
ins_score.Resize({class_num, predict_dim}); ins_score.Resize({class_num, predict_dim});
Tensor ins_boxes = input_bboxes->Slice(i, i + 1); framework::Tensor ins_boxes = input_bboxes->Slice(i, i + 1);
ins_boxes.Resize({predict_dim, box_dim}); ins_boxes.Resize({predict_dim, box_dim});
std::map<int, std::vector<int>> indices; std::map<int, std::vector<int>> indices;
...@@ -253,16 +258,16 @@ void MultiClassNMSCompute(const MultiClassNMSParam& param) { ...@@ -253,16 +258,16 @@ void MultiClassNMSCompute(const MultiClassNMSParam& param) {
} else { } else {
outs->mutable_data<float>({num_kept, kOutputDim}); outs->mutable_data<float>({num_kept, kOutputDim});
for (int64_t i = 0; i < batch_size; ++i) { for (int64_t i = 0; i < batch_size; ++i) {
Tensor ins_score = input_scores->Slice(i, i + 1); framework::Tensor ins_score = input_scores->Slice(i, i + 1);
ins_score.Resize({class_num, predict_dim}); ins_score.Resize({class_num, predict_dim});
Tensor ins_boxes = input_bboxes->Slice(i, i + 1); framework::Tensor ins_boxes = input_bboxes->Slice(i, i + 1);
ins_boxes.Resize({predict_dim, box_dim}); ins_boxes.Resize({predict_dim, box_dim});
int64_t s = batch_starts[i]; int64_t s = batch_starts[i];
int64_t e = batch_starts[i + 1]; int64_t e = batch_starts[i + 1];
if (e > s) { if (e > s) {
Tensor out = outs->Slice(s, e); framework::Tensor out = outs->Slice(s, e);
MultiClassOutput<float>(ins_score, ins_boxes, all_indices[i], &out); MultiClassOutput<float>(ins_score, ins_boxes, all_indices[i], &out);
} }
} }
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#pragma once #pragma once
#include <operators/math/transform.h> #include <operators/math/transform.h>
#include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
......
...@@ -16,6 +16,8 @@ limitations under the License. */ ...@@ -16,6 +16,8 @@ limitations under the License. */
#pragma once #pragma once
#include <vector> #include <vector>
#include "operators/kernel/reshape_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef SOFTMAX_OP #ifdef SOFTMAX_OP
#pragma once #pragma once
#include "../../math/softmax.h" #include "../../math/softmax.h"
#include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename P> template <typename P>
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#pragma once #pragma once
#include <vector> #include <vector>
#include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
......
...@@ -24,13 +24,13 @@ template <> ...@@ -24,13 +24,13 @@ template <>
bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam *param) { bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam *param) {
bool relu_enabled = false; bool relu_enabled = false;
const Tensor *input = param->Input(); const Tensor *input = param->Input();
auto input_ptr = input->data<float>(); auto input_ptr = input->data<half>();
const Tensor *bias = param->Bias(); const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>(); auto bias_ptr = bias->data<float>();
const Tensor *filter = param->Filter(); const Tensor *filter = param->Filter();
auto filter_ptr = filter->data<float>(); auto filter_ptr = filter->data<float>();
Tensor *out = param->Output(); Tensor *out = param->Output();
auto out_ptr = out->mutable_data<float>(); auto out_ptr = out->mutable_data<half>();
auto bn_mean_ptr = param->InputMean()->data<float>(); auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>(); auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>(); auto bn_scale_ptr = param->InputScale()->data<float>();
......
...@@ -24,13 +24,13 @@ template <> ...@@ -24,13 +24,13 @@ template <>
bool ConvAddBNReluKernel<FPGA, float>::Init(FusionConvAddBNReluParam *param) { bool ConvAddBNReluKernel<FPGA, float>::Init(FusionConvAddBNReluParam *param) {
bool relu_enabled = true; bool relu_enabled = true;
const Tensor *input = param->Input(); const Tensor *input = param->Input();
auto input_ptr = input->data<float>(); auto input_ptr = input->data<half>();
const Tensor *bias = param->Bias(); const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>(); auto bias_ptr = bias->data<float>();
const Tensor *filter = param->Filter(); const Tensor *filter = param->Filter();
auto filter_ptr = filter->data<float>(); auto filter_ptr = filter->data<float>();
Tensor *out = param->Output(); Tensor *out = param->Output();
auto out_ptr = out->mutable_data<float>(); auto out_ptr = out->mutable_data<half>();
auto bn_mean_ptr = param->InputMean()->data<float>(); auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>(); auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>(); auto bn_scale_ptr = param->InputScale()->data<float>();
......
...@@ -24,13 +24,13 @@ template <> ...@@ -24,13 +24,13 @@ template <>
bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam *param) { bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam *param) {
bool relu_enabled = true; bool relu_enabled = true;
const Tensor *input = param->Input(); const Tensor *input = param->Input();
auto input_ptr = input->data<float>(); auto input_ptr = input->data<half>();
const Tensor *bias = param->Bias(); const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>(); auto bias_ptr = bias->data<float>();
const Tensor *filter = param->Filter(); const Tensor *filter = param->Filter();
auto filter_ptr = filter->data<float>(); auto filter_ptr = filter->data<float>();
Tensor *out = param->Output(); Tensor *out = param->Output();
auto out_ptr = out->mutable_data<float>(); auto out_ptr = out->mutable_data<half>();
PADDLE_MOBILE_ENFORCE(input->dims()[1] == bias->dims()[0], PADDLE_MOBILE_ENFORCE(input->dims()[1] == bias->dims()[0],
"Image channel should be equal to bias number"); "Image channel should be equal to bias number");
......
...@@ -25,9 +25,9 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init( ...@@ -25,9 +25,9 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
const Tensor *input_x = param->InputX(); const Tensor *input_x = param->InputX();
const Tensor *input_y = param->InputY(); const Tensor *input_y = param->InputY();
Tensor *out = param->Out(); Tensor *out = param->Out();
auto input_x_ptr = input_x->data<float>(); auto input_x_ptr = input_x->data<half>();
auto input_y_ptr = input_y->data<float>(); auto input_y_ptr = input_y->data<half>();
auto out_ptr = out->mutable_data<float>(); auto out_ptr = out->mutable_data<half>();
fpga::EWAddArgs ewaddArgs; fpga::EWAddArgs ewaddArgs;
ewaddArgs.relu_enabled = relu_enabled; ewaddArgs.relu_enabled = relu_enabled;
......
...@@ -22,13 +22,13 @@ template <> ...@@ -22,13 +22,13 @@ template <>
bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam *param) { bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam *param) {
bool relu_enabled = true; bool relu_enabled = true;
const Tensor *input_x = param->InputX(); const Tensor *input_x = param->InputX();
auto input_x_ptr = input_x->data<float>(); auto input_x_ptr = input_x->data<half>();
const Tensor *input_y = param->InputY(); const Tensor *input_y = param->InputY();
auto input_y_ptr = input_y->data<float>(); auto input_y_ptr = input_y->data<float>();
const Tensor *input_z = param->InputZ(); const Tensor *input_z = param->InputZ();
auto input_z_ptr = input_z->data<float>(); auto input_z_ptr = input_z->data<float>();
Tensor *out = param->Out(); Tensor *out = param->Out();
auto out_ptr = out->mutable_data<float>(); auto out_ptr = out->mutable_data<half>();
PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == input_y->dims()[0], PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == input_y->dims()[0],
"Image channel should be equal to weight number"); "Image channel should be equal to weight number");
......
...@@ -22,13 +22,13 @@ template <> ...@@ -22,13 +22,13 @@ template <>
bool FusionFcKernel<FPGA, float>::Init(FusionFcParam *param) { bool FusionFcKernel<FPGA, float>::Init(FusionFcParam *param) {
bool relu_enabled = false; bool relu_enabled = false;
const Tensor *input_x = param->InputX(); const Tensor *input_x = param->InputX();
auto input_x_ptr = input_x->data<float>(); auto input_x_ptr = input_x->data<half>();
const Tensor *input_y = param->InputY(); const Tensor *input_y = param->InputY();
auto input_y_ptr = input_y->data<float>(); auto input_y_ptr = input_y->data<float>();
const Tensor *input_z = param->InputZ(); const Tensor *input_z = param->InputZ();
auto input_z_ptr = input_z->data<float>(); auto input_z_ptr = input_z->data<float>();
Tensor *out = param->Out(); Tensor *out = param->Out();
auto out_ptr = out->mutable_data<float>(); auto out_ptr = out->mutable_data<half>();
PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == input_y->dims()[0], PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == input_y->dims()[0],
"Image channel should be equal to weight number"); "Image channel should be equal to weight number");
......
...@@ -22,9 +22,9 @@ namespace operators { ...@@ -22,9 +22,9 @@ namespace operators {
template <> template <>
bool PoolKernel<FPGA, float>::Init(PoolParam *param) { bool PoolKernel<FPGA, float>::Init(PoolParam *param) {
const Tensor *input = param->Input(); const Tensor *input = param->Input();
auto input_ptr = input->data<float>(); auto input_ptr = input->data<half>();
Tensor *output = param->Output(); Tensor *output = param->Output();
auto output_ptr = output->mutable_data<float>(); auto output_ptr = output->mutable_data<half>();
vector<int> ksize = param->Ksize(); vector<int> ksize = param->Ksize();
vector<int> strides = param->Strides(); vector<int> strides = param->Strides();
vector<int> paddings = param->Paddings(); vector<int> paddings = param->Paddings();
......
此差异已折叠。
...@@ -50,6 +50,10 @@ void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, ...@@ -50,6 +50,10 @@ void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
// `_omp` counterparts of PackMatrixA_6r / PackMatrixA_8r, declared with the
// same parameter list.
// NOTE(review): presumably the OpenMP-parallel versions of the A-matrix
// packing routines — confirm against the definitions.
void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
// 将 B 矩阵分块复制到连续内存(RowMajor) // 将 B 矩阵分块复制到连续内存(RowMajor)
void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
...@@ -58,10 +62,19 @@ void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb, ...@@ -58,10 +62,19 @@ void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
// `_omp` counterparts of PackMatrixB_8c / PackMatrixB_12c / PackMatrixB_16c,
// declared with the same parameter list.
// NOTE(review): presumably the OpenMP-parallel versions of the B-matrix
// packing routines — confirm against the definitions.
void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
// 分块矩阵乘法 // 分块矩阵乘法
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
float beta, float *c, float *C, int ldc, bool relu); float beta, float *c, float *C, int ldc, bool relu);
// Takes the same parameters as InnerKernel plus a trailing `bias` pointer.
// NOTE(review): presumably the blocked-multiply kernel used when a bias is
// supplied; the layout expected of `bias` is not visible here — confirm.
void InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C,
int ldc, bool relu, float *bias);
void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, int ldc, const float *b, float beta, float *c, float *C, int ldc,
...@@ -91,8 +104,13 @@ void WriteBasic(int mc, int nc, float *c, float *C, int ldc); ...@@ -91,8 +104,13 @@ void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc); void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + C // C = A * B + C
void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc); void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + bias
// Takes the same parameters as WriteWithAdd plus a `bias` pointer.
// NOTE(review): how `bias` is indexed (per row, per element, ...) is not
// visible in this declaration — confirm against the definition.
void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias);
// C = A * B + C, relu(C) // C = A * B + C, relu(C)
void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc); void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + bias, relu(C)
// Takes the same parameters as WriteWithAddRelu plus a `bias` pointer.
// NOTE(review): how `bias` is indexed (per row, per element, ...) is not
// visible in this declaration — confirm against the definition.
void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
float *bias);
// C = A * B, batchnorm(C) // C = A * B, batchnorm(C)
void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
float *new_bias); float *new_bias);
...@@ -120,13 +138,24 @@ void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale, ...@@ -120,13 +138,24 @@ void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
// 32位 float 矩阵乘法 // 32位 float 矩阵乘法
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, bool relu); const float *B, int ldb, float beta, float *C, int ldc, bool relu,
float *bias);
// 32位 float 矩阵乘法, 并对结果进行 batchnorm // 32位 float 矩阵乘法, 并对结果进行 batchnorm
void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias); bool relu, float *new_scale, float *new_bias);
// 32-bit float matrix multiplication (OpenMP multi-threaded version).
// Same parameter list as Sgemm, including the trailing `bias` pointer.
void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *bias);
// 32-bit float matrix multiplication followed by batchnorm on the result
// (OpenMP multi-threaded version). Same parameter list as SgemmWithBn.
void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias);
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -22,7 +22,8 @@ namespace math { ...@@ -22,7 +22,8 @@ namespace math {
template <> template <>
void matmul<float>(const framework::Tensor &matrix_a, bool trans_a, void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha, const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, float beta, bool relu) { framework::Tensor *matrix_out, float beta, bool relu,
float *bias) {
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
...@@ -41,8 +42,13 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -41,8 +42,13 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
int N = dim_out[1]; int N = dim_out[1];
int K = (!trans_a) ? dim_a[1] : dim_a[0]; int K = (!trans_a) ? dim_a[1] : dim_a[0];
#ifdef _OPENMP
Sgemm_omp(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
N, beta, matrix_out->data<float>(), N, relu, bias);
#else
Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N, Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
beta, matrix_out->data<float>(), N, relu); beta, matrix_out->data<float>(), N, relu, bias);
#endif
} }
template <> template <>
...@@ -69,10 +75,17 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -69,10 +75,17 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
int N = dim_out[1]; int N = dim_out[1];
int K = (!trans_a) ? dim_a[1] : dim_a[0]; int K = (!trans_a) ? dim_a[1] : dim_a[0];
#ifdef _OPENMP
SgemmWithBn_omp(M, N, K, alpha, matrix_a.data<float>(), K,
matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N,
relu, new_scale->data<float>() + group,
new_bias->data<float>() + group);
#else
SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
N, beta, matrix_out->data<float>(), N, relu, N, beta, matrix_out->data<float>(), N, relu,
new_scale->data<float>() + group, new_scale->data<float>() + group,
new_bias->data<float>() + group); new_bias->data<float>() + group);
#endif
} }
} // namespace math } // namespace math
......
...@@ -21,11 +21,11 @@ namespace paddle_mobile { ...@@ -21,11 +21,11 @@ namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
// matrix multiply with continuous memory
template <typename T> template <typename T>
void matmul(const framework::Tensor &matrix_a, bool trans_a, void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha, const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu = false); framework::Tensor *matrix_out, T beta, bool relu = false,
float *bias = nullptr);
template <typename T> template <typename T>
void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
......
...@@ -665,6 +665,16 @@ class FeedParam : public OpParam { ...@@ -665,6 +665,16 @@ class FeedParam : public OpParam {
Tensor *input_x_; Tensor *input_x_;
Tensor *out_; Tensor *out_;
int batch_size; int batch_size;
// FPGA-only members: storage for the bypass arguments plus accessors.
#ifdef PADDLE_MOBILE_FPGA
private:
// Bypass arguments held by value; written by SetFpgaArgs, read by FpgaArgs.
fpga::BypassArgs fpga_bypass_args;
public:
// Returns a const reference to the stored bypass arguments.
const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
// Overwrites the stored bypass arguments with a copy of `args`.
void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
#endif
}; };
class FetchParam : public OpParam { class FetchParam : public OpParam {
......
...@@ -49,9 +49,9 @@ int main() { ...@@ -49,9 +49,9 @@ int main() {
auto time1 = time(); auto time1 = time();
for (int j = 0; j < 10; ++j) { for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float>(aa, false, bb, false, paddle_mobile::operators::math::matmul<float>(
static_cast<float>(1), &cc, aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
static_cast<float>(0), false); false, biasptr);
// paddle_mobile::operators::math::matmulWithBn<float>( // paddle_mobile::operators::math::matmulWithBn<float>(
// aa, false, bb, false, static_cast<float>(1), &cc, // aa, false, bb, false, static_cast<float>(1), &cc,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册