未验证 提交 8aac7d9d 编写于 作者: X xiebaiyuan 提交者: GitHub

Merge branch 'develop' into dev-latest

...@@ -57,7 +57,12 @@ class RawData { ...@@ -57,7 +57,12 @@ class RawData {
public: public:
char data[size]; char data[size];
RawData() {} RawData() {}
RawData(const RawData &raw_data) { strcpy(data, raw_data.data); } RawData(const RawData &raw_data) { memcpy(data, raw_data.data, size); }
RawData &operator=(const RawData &raw_data) {
memcpy(data, raw_data.data, size);
return *this;
}
}; };
template <typename... Ts> template <typename... Ts>
...@@ -74,14 +79,36 @@ struct Variant { ...@@ -74,14 +79,36 @@ struct Variant {
template <typename T, typename... Args> template <typename T, typename... Args>
void Set(Args &&... args) { void Set(Args &&... args) {
helper::Destroy(type_id, &data); helper::Destroy(type_id, &data.data);
new (&data) T(std::forward<Args>(args)...); new (&data.data) T(std::forward<Args>(args)...);
type_id = typeid(T).hash_code(); type_id = typeid(T).hash_code();
} }
void SetString(std::string &string) {
// helper::Destroy(type_id, &data);
type_id = typeid(std::string).hash_code();
strcpy(data.data, string.c_str());
}
std::string GetString() const {
if (type_id == typeid(std::string).hash_code()) {
return std::string(data.data);
} else {
PADDLE_MOBILE_THROW_EXCEPTION(
" bad cast in variant data type not a string ");
exit(0);
}
}
template <typename T> template <typename T>
T &Get() const { T &Get() const {
if (type_id == typeid(T).hash_code()) { if (type_id == typeid(std::string).hash_code()) {
PADDLE_MOBILE_THROW_EXCEPTION(
"Please use getString to get an string (to avoid of an issue with "
"gcc "
"stl lib with string copy)");
exit(0);
} else if (type_id == typeid(T).hash_code()) {
return *const_cast<T *>(reinterpret_cast<const T *>(&data)); return *const_cast<T *>(reinterpret_cast<const T *>(&data));
} else { } else {
PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant"); PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant");
......
...@@ -104,7 +104,7 @@ int fpga_invalidate(void *address, size_t size) { ...@@ -104,7 +104,7 @@ int fpga_invalidate(void *address, size_t size) {
} }
half fp32_2_fp16(float fp32_num) { half fp32_2_fp16(float fp32_num) {
unsigned long tmp = *(unsigned long *)(&fp32_num); unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT
half t = ((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | half t = ((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
(((tmp & 0x7f800000) >> 13) - (112 << 10)); (((tmp & 0x7f800000) >> 13) - (112 << 10));
if (tmp & 0x1000) { if (tmp & 0x1000) {
...@@ -120,7 +120,7 @@ float fp16_2_fp32(half fp16_num) { ...@@ -120,7 +120,7 @@ float fp16_2_fp32(half fp16_num) {
int tmp = 0; int tmp = 0;
float fp32_num; float fp32_num;
tmp = s << 16 | exp << 23 | frac << 13; tmp = s << 16 | exp << 23 | frac << 13;
fp32_num = *(float *)&tmp; fp32_num = *(float *)&tmp; // NOLINT
return fp32_num; return fp32_num;
} }
...@@ -347,6 +347,20 @@ void format_filter(framework::Tensor *filter_tensor, float max_value, ...@@ -347,6 +347,20 @@ void format_filter(framework::Tensor *filter_tensor, float max_value,
filter_tensor->reset_data_ptr(new_data); filter_tensor->reset_data_ptr(new_data);
} }
void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT
filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT
auto dims = filter_tensor->dims();
auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = filter_tensor->data<float>();
size_t memory_size = num * channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
fpga_copy(new_data, data_ptr, memory_size);
filter::format_fc_filter(&new_data, num, channel, height, width, 1,
max_value);
filter_tensor->reset_data_ptr(new_data);
}
void format_bias_scale_array(float **bias_scale_array, void format_bias_scale_array(float **bias_scale_array,
int element_num_per_division, int num) { int element_num_per_division, int num) {
bias_scale::format_bias_scale_array(bias_scale_array, bias_scale::format_bias_scale_array(bias_scale_array,
......
...@@ -109,8 +109,8 @@ struct PoolingArgs { ...@@ -109,8 +109,8 @@ struct PoolingArgs {
struct EWAddArgs { struct EWAddArgs {
bool relu_enabled; bool relu_enabled;
half const0; // output0 = const0 x input0 + const1 x input1; uint32_t const0; // output0 = const0 x input0 + const1 x input1;
half const1; uint32_t const1;
struct ImageInputArgs image0; struct ImageInputArgs image0;
struct ImageInputArgs image1; struct ImageInputArgs image1;
struct ImageOutputArgs output; struct ImageOutputArgs output;
...@@ -214,6 +214,7 @@ int get_aligned_filter_element_num(int chw); ...@@ -214,6 +214,7 @@ int get_aligned_filter_element_num(int chw);
int get_aligned_filter_num(int num); int get_aligned_filter_num(int num);
void format_filter(framework::Tensor* filter_tensor, float max_value, void format_filter(framework::Tensor* filter_tensor, float max_value,
int group_num); int group_num);
void format_fc_filter(framework::Tensor* filter_tensor, float max_value);
void format_bias_scale_array(float** bias_scale_array, void format_bias_scale_array(float** bias_scale_array,
int element_num_per_division, int num); int element_num_per_division, int num);
void format_concat_output(framework::Tensor* out, int height, int width, void format_concat_output(framework::Tensor* out, int height, int width,
......
...@@ -225,6 +225,45 @@ void format_filter(float **data_in, int num, int channel, int height, int width, ...@@ -225,6 +225,45 @@ void format_filter(float **data_in, int num, int channel, int height, int width,
num_after_alignment * sizeof(char)); num_after_alignment * sizeof(char));
} }
void convert_fc_filter(char **data_in, int num, int chw) {
char *tmp = *data_in;
char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT
for (int n = 0; n < num; n++) {
for (int c = 0; c < chw; c++) {
data_tmp[n * chw + c] = (*data_in)[num * c + n];
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void format_fc_filter(float **data_in, int num, int channel, int height,
int width, int group_num, float max) {
int data_size = channel * height * width * num;
int chw = channel * height * width;
int division_capacity = calc_division_capacity(chw);
int num_per_div_before_alignment =
calc_num_per_div(num, group_num, division_capacity);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment * div_num;
quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in; // NOLINT
convert_fc_filter(quantize_data, num, chw);
align_element(quantize_data, num, chw);
align_num(quantize_data, num_per_div_before_alignment, num, chw);
reorder(quantize_data, num_after_alignment, chw);
interleave(quantize_data, num_after_alignment, chw);
fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
num_after_alignment * sizeof(char));
}
} // namespace filter } // namespace filter
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -25,7 +25,7 @@ int calc_division_capacity(int chw); ...@@ -25,7 +25,7 @@ int calc_division_capacity(int chw);
int calc_split_num(int num, int division_capacity); int calc_split_num(int num, int division_capacity);
int calc_division_number(int num, int group_num, int division_capacity); int calc_division_number(int num, int group_num, int division_capacity);
int calc_num_per_div(int num, int group_num, int division_capacity); int calc_num_per_div(int num, int group_num, int division_capacity);
void convert_to_hwc(float** data_in, int num, int channel, int height, void convert_to_hwc(char** data_in, int num, int channel, int height,
int width); int width);
float find_max(float* data_in, int data_size); float find_max(float* data_in, int data_size);
void quantize(float** data_in, int data_size, float max); void quantize(float** data_in, int data_size, float max);
...@@ -36,6 +36,11 @@ void reorder(float** data_in, int num_after_alignment, int chw); ...@@ -36,6 +36,11 @@ void reorder(float** data_in, int num_after_alignment, int chw);
void interleave(float** data_in, int num_after_alignment, int chw); void interleave(float** data_in, int num_after_alignment, int chw);
void format_filter(float** data_in, int num, int channel, int height, int width, void format_filter(float** data_in, int num, int channel, int height, int width,
int group_num, float max); int group_num, float max);
void convert_fc_filter(char** data_in, int num, int chw);
void format_fc_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max);
} // namespace filter } // namespace filter
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -51,7 +51,7 @@ class Attribute { ...@@ -51,7 +51,7 @@ class Attribute {
break; break;
} }
case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING: { case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING: {
attr.Set<std::string>(std::string(attr_desc->s)); attr.SetString(std::string(attr_desc->s));
break; break;
} }
case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS: { case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS: {
...@@ -108,6 +108,13 @@ class Attribute { ...@@ -108,6 +108,13 @@ class Attribute {
return variant_.Get<T>(); return variant_.Get<T>();
} }
Attribute &SetString(std::string string) {
variant_.SetString(string);
return *this;
}
std::string GetString() const { return variant_.GetString(); }
template <typename Vistor> template <typename Vistor>
static typename Vistor::type_t ApplyVistor(Vistor vistor, Attribute attr) { static typename Vistor::type_t ApplyVistor(Vistor vistor, Attribute attr) {
if (attr.variant_.TypeId() == typeid(int).hash_code()) { if (attr.variant_.TypeId() == typeid(int).hash_code()) {
...@@ -115,7 +122,7 @@ class Attribute { ...@@ -115,7 +122,7 @@ class Attribute {
} else if (attr.variant_.TypeId() == typeid(float).hash_code()) { } else if (attr.variant_.TypeId() == typeid(float).hash_code()) {
return vistor(attr.variant_.Get<float>()); return vistor(attr.variant_.Get<float>());
} else if (attr.variant_.TypeId() == typeid(string).hash_code()) { } else if (attr.variant_.TypeId() == typeid(string).hash_code()) {
return vistor(attr.variant_.Get<string>()); return vistor(attr.variant_.GetString());
} else if (attr.variant_.TypeId() == typeid(vector<int>).hash_code()) { } else if (attr.variant_.TypeId() == typeid(vector<int>).hash_code()) {
return vistor(attr.variant_.Get<vector<int>>()); return vistor(attr.variant_.Get<vector<int>>());
} else if (attr.variant_.TypeId() == typeid(vector<float>).hash_code()) { } else if (attr.variant_.TypeId() == typeid(vector<float>).hash_code()) {
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <cstdlib> #include <cstdlib>
#include <initializer_list> #include <initializer_list>
#include <string>
#include <typeinfo> #include <typeinfo>
#include <vector> #include <vector>
......
...@@ -33,6 +33,13 @@ class Variable { ...@@ -33,6 +33,13 @@ class Variable {
template <typename T> template <typename T>
const T GetValue() const { const T GetValue() const {
if (typeid(T) == typeid(std::string)) {
PADDLE_MOBILE_THROW_EXCEPTION(
"Please use getString to get an string (to avoid of an issue with "
"gcc "
"stl lib with string copy)");
exit(0);
}
return variant.Get<T>(); return variant.Get<T>();
} }
......
...@@ -101,6 +101,11 @@ bool PaddleMobilePredictor<Dtype, P>::Run( ...@@ -101,6 +101,11 @@ bool PaddleMobilePredictor<Dtype, P>::Run(
return true; return true;
} }
template <typename Dtype, Precision P>
PaddleMobilePredictor<Dtype, P>::~PaddleMobilePredictor() {
paddle_mobile_->Clear();
}
// A factory to help create difference predictor. // A factory to help create difference predictor.
template <> template <>
std::unique_ptr<PaddlePredictor> std::unique_ptr<PaddlePredictor>
......
...@@ -32,7 +32,7 @@ namespace paddle_mobile { ...@@ -32,7 +32,7 @@ namespace paddle_mobile {
template <typename Dtype = CPU, Precision P = Precision::FP32> template <typename Dtype = CPU, Precision P = Precision::FP32>
class PaddleMobilePredictor : public PaddlePredictor { class PaddleMobilePredictor : public PaddlePredictor {
public: public:
PaddleMobilePredictor() {} PaddleMobilePredictor() = delete;
explicit PaddleMobilePredictor(const PaddleMobileConfig& config); explicit PaddleMobilePredictor(const PaddleMobileConfig& config);
...@@ -40,7 +40,7 @@ class PaddleMobilePredictor : public PaddlePredictor { ...@@ -40,7 +40,7 @@ class PaddleMobilePredictor : public PaddlePredictor {
std::vector<PaddleTensor>* output_data, std::vector<PaddleTensor>* output_data,
int batch_size = -1) override; int batch_size = -1) override;
~PaddleMobilePredictor() override{}; ~PaddleMobilePredictor() override;
private: private:
std::unique_ptr<PaddleMobile<Dtype, P>> paddle_mobile_; std::unique_ptr<PaddleMobile<Dtype, P>> paddle_mobile_;
......
...@@ -87,7 +87,6 @@ enum class PaddleEngineKind { ...@@ -87,7 +87,6 @@ enum class PaddleEngineKind {
class PaddlePredictor { class PaddlePredictor {
public: public:
struct Config; struct Config;
PaddlePredictor() = default;
PaddlePredictor(const PaddlePredictor&) = delete; PaddlePredictor(const PaddlePredictor&) = delete;
PaddlePredictor& operator=(const PaddlePredictor&) = delete; PaddlePredictor& operator=(const PaddlePredictor&) = delete;
...@@ -107,6 +106,9 @@ class PaddlePredictor { ...@@ -107,6 +106,9 @@ class PaddlePredictor {
struct Config { struct Config {
std::string model_dir; // path to the model directory. std::string model_dir; // path to the model directory.
}; };
protected:
PaddlePredictor() = default;
}; };
struct PaddleMobileConfig : public PaddlePredictor::Config { struct PaddleMobileConfig : public PaddlePredictor::Config {
......
...@@ -46,7 +46,7 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) { ...@@ -46,7 +46,7 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
filter->Resize(framework::make_ddim({num, filter_channel, height, width})); filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
float max_value = fpga::filter_find_max(filter); float max_value = fpga::filter_find_max(filter);
fpga::format_filter(filter, max_value, 1); fpga::format_fc_filter(filter, max_value);
int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
......
...@@ -47,7 +47,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) { ...@@ -47,7 +47,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
filter->Resize(framework::make_ddim({num, filter_channel, height, width})); filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
float max_value = fpga::filter_find_max(filter); float max_value = fpga::filter_find_max(filter);
fpga::format_filter(filter, max_value, 1); fpga::format_fc_filter(filter, max_value);
int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MUL_OP
#include "operators/kernel/mul_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool MulKernel<FPGA, float>::Init(MulParam<FPGA> *param) {
bool relu_enabled = false;
auto input_x = const_cast<LoDTensor *>(param->InputX());
auto filter = const_cast<LoDTensor *>(param->InputY());
auto out = param->Out();
PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
"Image channel should be equal to weight number");
int channel = (uint32_t)out->dims()[1];
auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) {
bs_ptr[i + channel] = 1;
bs_ptr[i] = 0;
}
int num = (uint32_t)filter->dims()[1];
int chw = (uint32_t)filter->dims()[0];
PADDLE_MOBILE_ENFORCE(
chw == input_x->numel(),
"Filter element num should be equal to IFM element num");
int height = (uint32_t)input_x->dims()[2];
int width = (uint32_t)input_x->dims()[3];
int filter_channel = chw / height / width;
filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
float max_value = fpga::filter_find_max(filter);
fpga::format_fc_filter(filter, max_value);
int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
0, bs_ptr);
param->SetFpgaArgs(conv_arg);
return true;
}
template <>
void MulKernel<FPGA, float>::Compute(const MulParam<FPGA> &param) const {
fpga::ComputeFpgaConv(param.FpgaArgs());
}
} // namespace operators
} // namespace paddle_mobile
#endif
此差异已折叠。
...@@ -35,7 +35,9 @@ namespace paddle_mobile { ...@@ -35,7 +35,9 @@ namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
/* class Gemm {
public:
/*
// 将 A 矩阵分块复制到连续内存(ColMajor) // 将 A 矩阵分块复制到连续内存(ColMajor)
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
...@@ -44,138 +46,156 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, ...@@ -44,138 +46,156 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
*/ */
typedef void (Gemm::*FnPack)(int, int, int, const float *, int, float *);
// 将 A 矩阵分块复制到连续内存(RowMajor) typedef void (Gemm::*FnAddDot)(int, const float *, const float *, float *,
void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, int);
FnPack procPackA;
FnPack procPackB;
FnAddDot procAddDot;
// 将 A 矩阵分块复制到连续内存(RowMajor)
void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
// 将 B 矩阵分块复制到连续内存(RowMajor) // 将 B 矩阵分块复制到连续内存(RowMajor)
void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
// 分块矩阵乘法 // 分块矩阵乘法
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
float beta, float *c, float *C, int ldc, bool relu); float beta, float *c, float *C, int ldc, bool relu);
void InnerKernelWithBias(int mc, int nc, float alpha, const float *a, void InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, const float *b, float beta, float *c, float *C,
int ldc, bool relu, float *bias); int ldc, bool relu, float *bias);
void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, int ldc,
bool relu, float *new_scale, float *new_bias);
void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, const float *b, float beta, float *c, float *C,
int ldc, bool relu, float *new_scale, float *new_bias, int ldc, bool relu, float *new_scale, float *new_bias);
float *bias); void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, const float *b, float beta, float *c, float *C,
int ldc, bool relu, float *new_scale,
float *new_bias, float *bias);
void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
float *c, float *C, int ldc, float *p, float *c, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1); std::string mode, float *bias, float *bias1);
/* /*
// 向量矩阵乘法 (M = 1) // 向量矩阵乘法 (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, const float *B, int ldb, float beta, float *C, int ldc,
bool relu); bool relu);
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C, int lda, const float *B, int ldb, float beta, float
int ldc, bool relu, float *new_scale, float *new_bias); *C, int ldc, bool relu, float *new_scale, float *new_bias);
*/ */
// 计算一个更小的 C 矩阵分块 // 计算一个更小的 C 矩阵分块
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc); void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc); void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc); void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc);
void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc); void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc);
void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc); void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc);
// 分块矩阵乘法结果回写 // 分块矩阵乘法结果回写
// C = A * B // C = A * B
void WriteBasic(int mc, int nc, float *c, float *C, int ldc); void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
// C = alpha * A * B + beta * C // C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc); void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + C // C = A * B + C
void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc); void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + bias // C = A * B + bias
void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias); void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias);
// C = A * B + C, relu(C) // C = A * B + C, relu(C)
void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc); void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + C,prelu(C) // C = A * B + C,prelu(C)
void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1); std::string mode, float *bias, float *bias1);
// C = A * B + bias ,relu(C) // C = A * B + bias ,relu(C)
void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
float *bias); float *bias);
// C = A * B, batchnorm(C) // C = A * B, batchnorm(C)
void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, void WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
float *new_bias);
// C = A * B, batchnorm(C), relu(C)
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias); float *new_scale, float *new_bias);
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, // C = A * B, batchnorm(C), relu(C)
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias);
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias1); float *new_scale, float *new_bias, float *bias1);
/* /*
// 向量矩阵乘法结果回写 // 向量矩阵乘法结果回写
// C = A * B // C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc); void VecWriteBasic(int n, float *c, float *C, int ldc);
// C = alpha * A * B + beta * C // C = alpha * A * B + beta * C
void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc); void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
// C = A * B + C // C = A * B + C
void VecWriteWithAdd(int n, float *c, float *C, int ldc); void VecWriteWithAdd(int n, float *c, float *C, int ldc);
// C = A * B + C, relu(C) // C = A * B + C, relu(C)
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc); void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
// C = A * B, batchnorm(C) // C = A * B, batchnorm(C)
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale, void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias); float *new_bias);
// C = A * B, batchnorm(C), relu(C) // C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale, void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias); float *new_bias);
*/ */
// 32位 float 矩阵乘法 // 32位 float 矩阵乘法
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, bool relu, const float *B, int ldb, float beta, float *C, int ldc, bool relu,
float *bias); float *bias);
// 32位 float 矩阵乘法, 并对结果进行 batchnrom // 32位 float 矩阵乘法, 并对结果进行 batchnrom
void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias, float *bias); bool relu, float *new_scale, float *new_bias, float *bias);
void SgemmWithPRelu(int m, int n, int k, const float *A, int lda, void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
const float *B, int ldb, float *C, int ldc, float *p, const float *B, int ldb, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1); std::string mode, float *bias, float *bias1);
// 32位 float 矩阵乘法(openmp 多线程版本) // 32位 float 矩阵乘法(openmp 多线程版本)
void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *bias); bool relu, float *bias);
// 32位 float 矩阵乘法, 并对结果进行 batchnrom(openmp 多线程版本) // 32位 float 矩阵乘法, 并对结果进行 batchnrom(openmp 多线程版本)
void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
const float *B, int ldb, float beta, float *C, int ldc, int lda, const float *B, int ldb, float beta, float *C,
bool relu, float *new_scale, float *new_bias, float *bias); int ldc, bool relu, float *new_scale, float *new_bias,
float *bias);
void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
const float *B, int ldb, float *C, int ldc, float *p, const float *B, int ldb, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1); std::string mode, float *bias, float *bias1);
private:
int MC = 0;
int KC = 0;
int NC = 0;
float *packedA;
float *packedB;
float *packedC;
float *zero;
};
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -28,19 +28,22 @@ struct GRUUnitFunctor<CPU, T> { ...@@ -28,19 +28,22 @@ struct GRUUnitFunctor<CPU, T> {
static void compute(GRUMetaValue<T> value, int frame_size, int batch_size, static void compute(GRUMetaValue<T> value, int frame_size, int batch_size,
const ActivationType active_node, const ActivationType active_node,
const ActivationType active_gate) { const ActivationType active_gate) {
Gemm gemm;
if (value.prev_out_value) { if (value.prev_out_value) {
Sgemm(batch_size, frame_size * 2, frame_size, 1, value.prev_out_value, gemm.Sgemm(batch_size, frame_size * 2, frame_size, 1,
frame_size, value.gate_weight, frame_size * 2, 1, value.gate_value, value.prev_out_value, frame_size, value.gate_weight,
frame_size * 3, false, nullptr); frame_size * 2, 1, value.gate_value, frame_size * 3, false,
nullptr);
} }
forward_reset_output(forward::gru_resetOutput<T>(), value, frame_size, forward_reset_output(forward::gru_resetOutput<T>(), value, frame_size,
batch_size, active_gate); batch_size, active_gate);
if (value.prev_out_value) { if (value.prev_out_value) {
Sgemm(batch_size, frame_size, frame_size, 1, value.reset_output_value, gemm.Sgemm(batch_size, frame_size, frame_size, 1,
frame_size, value.state_weight, frame_size, 1, value.reset_output_value, frame_size, value.state_weight,
value.gate_value + frame_size * 2, frame_size * 3, false, nullptr); frame_size, 1, value.gate_value + frame_size * 2,
frame_size * 3, false, nullptr);
} }
forward_final_output(forward::gru_finalOutput<T>(), value, frame_size, forward_final_output(forward::gru_finalOutput<T>(), value, frame_size,
......
...@@ -36,6 +36,7 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -36,6 +36,7 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
int M = dim_out[0]; int M = dim_out[0];
int N = dim_out[1]; int N = dim_out[1];
int K = (!trans_a) ? dim_a[1] : dim_a[0]; int K = (!trans_a) ? dim_a[1] : dim_a[0];
Gemm gemm;
if (trans_a) { if (trans_a) {
int numel = matrix_a.numel(); int numel = matrix_a.numel();
...@@ -50,20 +51,24 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -50,20 +51,24 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
a[index++] = tmp[i * n + j]; a[index++] = tmp[i * n + j];
} }
} }
#ifdef _OPENMP #ifdef _OPENMP
Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
matrix_out->data<float>(), N, relu, bias); matrix_out->data<float>(), N, relu, bias);
#else #else
Sgemm(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta, gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
matrix_out->data<float>(), N, relu, bias); matrix_out->data<float>(), N, relu, bias);
#endif #endif
} else { } else {
#ifdef _OPENMP #ifdef _OPENMP
Sgemm_omp(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<float>(), K,
N, beta, matrix_out->data<float>(), N, relu, bias); matrix_b.data<float>(), N, beta, matrix_out->data<float>(),
N, relu, bias);
#else #else
Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N, gemm.Sgemm(M, N, K, alpha, matrix_a.data<float>(), K,
beta, matrix_out->data<float>(), N, relu, bias); matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N,
relu, bias);
#endif #endif
} }
} }
...@@ -74,6 +79,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -74,6 +79,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
float alpha, framework::Tensor *matrix_out, float beta, float alpha, framework::Tensor *matrix_out, float beta,
bool relu, framework::Tensor *new_scale, bool relu, framework::Tensor *new_scale,
framework::Tensor *new_bias, int group, float *bias) { framework::Tensor *new_bias, int group, float *bias) {
Gemm gemm;
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
...@@ -86,21 +92,22 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -86,21 +92,22 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
int K = (!trans_a) ? dim_a[1] : dim_a[0]; int K = (!trans_a) ? dim_a[1] : dim_a[0];
#ifdef _OPENMP #ifdef _OPENMP
SgemmWithBn_omp(M, N, K, alpha, matrix_a.data<float>(), K, gemm.SgemmWithBn_omp(
matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N, M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
relu, new_scale->data<float>() + group, beta, matrix_out->data<float>(), N, relu,
new_bias->data<float>() + group, bias); new_scale->data<float>() + group, new_bias->data<float>() + group, bias);
#else #else
SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), gemm.SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K,
N, beta, matrix_out->data<float>(), N, relu, matrix_b.data<float>(), N, beta, matrix_out->data<float>(),
new_scale->data<float>() + group, new_bias->data<float>() + group, N, relu, new_scale->data<float>() + group,
bias); new_bias->data<float>() + group, bias);
#endif #endif
} }
void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, const framework::Tensor &matrix_b, bool trans_b,
framework::Tensor *matrix_out, float *p, std::string mode, framework::Tensor *matrix_out, float *p, std::string mode,
float *bias, float *bias1) { float *bias, float *bias1) {
Gemm gemm;
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
...@@ -113,11 +120,13 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, ...@@ -113,11 +120,13 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
int K = (!trans_a) ? dim_a[1] : dim_a[0]; int K = (!trans_a) ? dim_a[1] : dim_a[0];
#ifdef _OPENMP #ifdef _OPENMP
SgemmWithPRelu_omp(M, N, K, matrix_a.data<float>(), K, matrix_b.data<float>(), gemm.SgemmWithPRelu_omp(M, N, K, matrix_a.data<float>(), K,
N, matrix_out->data<float>(), N, p, mode, bias, bias1); matrix_b.data<float>(), N, matrix_out->data<float>(),
N, p, mode, bias, bias1);
#else #else
SgemmWithPRelu(M, N, K, matrix_a.data<float>(), K, matrix_b.data<float>(), N, gemm.SgemmWithPRelu(M, N, K, matrix_a.data<float>(), K,
matrix_out->data<float>(), N, p, mode, bias, bias1); matrix_b.data<float>(), N, matrix_out->data<float>(), N,
p, mode, bias, bias1);
#endif #endif
} }
......
...@@ -61,5 +61,7 @@ REGISTER_OPERATOR_CPU(mul, ops::MulOp); ...@@ -61,5 +61,7 @@ REGISTER_OPERATOR_CPU(mul, ops::MulOp);
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp); REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(mul, ops::MulOp);
#endif
#endif #endif
...@@ -263,6 +263,10 @@ class OpParam { ...@@ -263,6 +263,10 @@ class OpParam {
static const T GetAttr(const string &key, const AttributeMap &map) { static const T GetAttr(const string &key, const AttributeMap &map) {
return ((Attribute)map.at(key)).Get<T>(); return ((Attribute)map.at(key)).Get<T>();
} }
static const std::string GetStringAttr(const string &key,
const AttributeMap &map) {
return ((Attribute)map.at(key)).GetString();
}
static const bool HasAttr(const string &key, const AttributeMap &map) { static const bool HasAttr(const string &key, const AttributeMap &map) {
return map.count(key) > 0; return map.count(key) > 0;
...@@ -438,6 +442,15 @@ class MulParam : OpParam { ...@@ -438,6 +442,15 @@ class MulParam : OpParam {
GType *out_; GType *out_;
int x_num_col_dims_; int x_num_col_dims_;
int y_num_col_dims_; int y_num_col_dims_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::WrapperConvArgs fpga_conv_args;
public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
#endif
}; };
#endif #endif
...@@ -493,7 +506,7 @@ class LrnParam : public OpParam { ...@@ -493,7 +506,7 @@ class LrnParam : public OpParam {
alpha_ = GetAttr<float>("alpha", attrs); alpha_ = GetAttr<float>("alpha", attrs);
beta_ = GetAttr<float>("beta", attrs); beta_ = GetAttr<float>("beta", attrs);
k_ = GetAttr<float>("k", attrs); k_ = GetAttr<float>("k", attrs);
data_format_ = GetAttr<string>("data_format", attrs); data_format_ = GetStringAttr("data_format", attrs);
} }
const RType *InputX() const { return input_x_; } const RType *InputX() const { return input_x_; }
...@@ -590,7 +603,7 @@ class PoolParam : public OpParam { ...@@ -590,7 +603,7 @@ class PoolParam : public OpParam {
input_ = InputXFrom<GType>(inputs, scope); input_ = InputXFrom<GType>(inputs, scope);
output_ = OutFrom<GType>(outputs, scope); output_ = OutFrom<GType>(outputs, scope);
pooling_type_ = GetAttr<string>("pooling_type", attrs); pooling_type_ = GetStringAttr("pooling_type", attrs);
ksize_ = GetAttr<vector<int>>("ksize", attrs); ksize_ = GetAttr<vector<int>>("ksize", attrs);
strides_ = GetAttr<vector<int>>("strides", attrs); strides_ = GetAttr<vector<int>>("strides", attrs);
paddings_ = GetAttr<vector<int>>("paddings", attrs); paddings_ = GetAttr<vector<int>>("paddings", attrs);
...@@ -724,7 +737,7 @@ class BoxCoderParam : public OpParam { ...@@ -724,7 +737,7 @@ class BoxCoderParam : public OpParam {
input_priorboxvar_ = InputPriorBoxVarFrom<GType>(inputs, scope); input_priorboxvar_ = InputPriorBoxVarFrom<GType>(inputs, scope);
input_targetbox_ = InputTargetBoxFrom<GType>(inputs, scope); input_targetbox_ = InputTargetBoxFrom<GType>(inputs, scope);
output_box_ = OutputBoxFrom<GType>(outputs, scope); output_box_ = OutputBoxFrom<GType>(outputs, scope);
code_type_ = GetAttr<std::string>("code_type", attrs); code_type_ = GetStringAttr("code_type", attrs);
} }
const RType *InputPriorBox() const { return input_priorbox_; } const RType *InputPriorBox() const { return input_priorbox_; }
...@@ -1199,7 +1212,7 @@ class PReluParam : public OpParam { ...@@ -1199,7 +1212,7 @@ class PReluParam : public OpParam {
alpha_ = InputAlphaFrom<GType>(inputs, scope); alpha_ = InputAlphaFrom<GType>(inputs, scope);
framework::DDim dims = alpha_->dims(); framework::DDim dims = alpha_->dims();
out_ = OutFrom<GType>(outputs, scope); out_ = OutFrom<GType>(outputs, scope);
mode_ = GetAttr<std::string>("mode", attrs); mode_ = GetStringAttr("mode", attrs);
DLOG << "PReluParam mode after" << mode_; DLOG << "PReluParam mode after" << mode_;
} }
const RType *InputX() const { return input_x_; } const RType *InputX() const { return input_x_; }
...@@ -1330,7 +1343,7 @@ class FusionConvAddPReluParam : public ConvParam<Dtype> { ...@@ -1330,7 +1343,7 @@ class FusionConvAddPReluParam : public ConvParam<Dtype> {
const AttributeMap &attrs, const Scope &scope) const AttributeMap &attrs, const Scope &scope)
: ConvParam<Dtype>(inputs, outputs, attrs, scope) { : ConvParam<Dtype>(inputs, outputs, attrs, scope) {
alpha_ = OpParam::InputAlphaFrom<GType>(inputs, scope); alpha_ = OpParam::InputAlphaFrom<GType>(inputs, scope);
mode_ = OpParam::GetAttr<std::string>("mode", attrs); mode_ = OpParam::GetStringAttr("mode", attrs);
framework::DDim dims = alpha_->dims(); framework::DDim dims = alpha_->dims();
bias_ = OpParam::InputYFrom<GType>(inputs, scope); bias_ = OpParam::InputYFrom<GType>(inputs, scope);
axis_ = OpParam::GetAttr<int>("axis", attrs); axis_ = OpParam::GetAttr<int>("axis", attrs);
...@@ -1373,7 +1386,7 @@ class FusionConvAddAddPReluParam : public ConvParam<Dtype> { ...@@ -1373,7 +1386,7 @@ class FusionConvAddAddPReluParam : public ConvParam<Dtype> {
: ConvParam<Dtype>(inputs, outputs, attrs, scope) { : ConvParam<Dtype>(inputs, outputs, attrs, scope) {
bias1_ = OpParam::InputYFrom1<GType>(inputs, scope); bias1_ = OpParam::InputYFrom1<GType>(inputs, scope);
alpha_ = OpParam::InputAlphaFrom<GType>(inputs, scope); alpha_ = OpParam::InputAlphaFrom<GType>(inputs, scope);
mode_ = OpParam::GetAttr<std::string>("mode", attrs); mode_ = OpParam::GetStringAttr("mode", attrs);
framework::DDim dims = alpha_->dims(); framework::DDim dims = alpha_->dims();
bias_ = OpParam::InputYFrom<GType>(inputs, scope); bias_ = OpParam::InputYFrom<GType>(inputs, scope);
output_ = OpParam::OutFrom<GType>(outputs, scope); output_ = OpParam::OutFrom<GType>(outputs, scope);
...@@ -1980,8 +1993,8 @@ class GruParam : public OpParam { ...@@ -1980,8 +1993,8 @@ class GruParam : public OpParam {
OutputBatchResetHiddenPrevFrom<GType>(outputs, scope); OutputBatchResetHiddenPrevFrom<GType>(outputs, scope);
output_batch_hidden_ = OutputBatchHiddenFrom<GType>(outputs, scope); output_batch_hidden_ = OutputBatchHiddenFrom<GType>(outputs, scope);
output_hidden_ = OutputHiddenFrom<GType>(outputs, scope); output_hidden_ = OutputHiddenFrom<GType>(outputs, scope);
activation_ = GetAttr<std::string>("activation", attrs); activation_ = GetStringAttr("activation", attrs);
gate_activation_ = GetAttr<std::string>("gate_activation", attrs); gate_activation_ = GetStringAttr("gate_activation", attrs);
is_reverse_ = GetAttr<bool>("is_reverse", attrs); is_reverse_ = GetAttr<bool>("is_reverse", attrs);
} }
const GType *InputInput() const { return input_input_; } const GType *InputInput() const { return input_input_; }
......
...@@ -35,8 +35,8 @@ if (CON GREATER -1) ...@@ -35,8 +35,8 @@ if (CON GREATER -1)
ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-yolo paddle-mobile) target_link_libraries(test-yolo paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test_yolo_combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-yolo-combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test_yolo_combined paddle-mobile) target_link_libraries(test-yolo-combined paddle-mobile)
set(FOUND_MATCH ON) set(FOUND_MATCH ON)
endif () endif ()
...@@ -323,5 +323,10 @@ if (NOT FOUND_MATCH) ...@@ -323,5 +323,10 @@ if (NOT FOUND_MATCH)
target_link_libraries(test-fssd paddle-mobile) target_link_libraries(test-fssd paddle-mobile)
# gen test
ADD_EXECUTABLE(test-multi-process net/test_multi_inference_predict.cpp test_helper.h test_include.h)
target_link_libraries(test-multi-process paddle-mobile)
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp) #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif () endif ()
...@@ -83,8 +83,9 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) { ...@@ -83,8 +83,9 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
} }
} }
paddle_mobile::operators::math::SgemmWithBn( paddle_mobile::operators::math::Gemm gemm;
m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias, nullptr); gemm.SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias,
nullptr);
int eq = 0; int eq = 0;
int neq = 0; int neq = 0;
for (int i = 0; i < m * n; ++i) { for (int i = 0; i < m * n; ++i) {
......
...@@ -18,8 +18,9 @@ static const char *g_resnet_combine = "../models/resnet50"; ...@@ -18,8 +18,9 @@ static const char *g_resnet_combine = "../models/resnet50";
int main() { int main() {
DLOG << paddle_mobile::fpga::open_device(); DLOG << paddle_mobile::fpga::open_device();
paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile; paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
if (paddle_mobile.Load(std::string(g_resnet_combine) + "/model", // if (paddle_mobile.Load(std::string(g_resnet_combine) + "/model",
std::string(g_resnet_combine) + "/params", true)) { // std::string(g_resnet_combine) + "/params", true)) {
if (paddle_mobile.Load(std::string(g_resnet_combine), true)) {
std::vector<int64_t> dims{1, 3, 224, 224}; std::vector<int64_t> dims{1, 3, 224, 224};
Tensor input_tensor; Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0), SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
......
...@@ -46,7 +46,12 @@ int main() { ...@@ -46,7 +46,12 @@ int main() {
tensor_out.dtype = PaddleDType::FLOAT32; tensor_out.dtype = PaddleDType::FLOAT32;
std::vector<PaddleTensor> outputs(1, tensor_out); std::vector<PaddleTensor> outputs(1, tensor_out);
assert(predictor->Run(paddle_tensor_feeds, &outputs)); std::cout << " before predict " << std::endl;
predictor->Run(paddle_tensor_feeds, &outputs);
std::cout << " after predict " << std::endl;
// assert();
float* data_o = static_cast<float*>(outputs[0].data.data()); float* data_o = static_cast<float*>(outputs[0].data.data());
for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); ++j) { for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); ++j) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include <thread> // NOLINT
#include "../test_helper.h"
#include "../test_include.h"
void fun_yolo();
int fun_mobilenet();
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile2;
// fun_yolo();
// fun_mobilenet();
std::thread t1(fun_yolo);
std::thread t2(fun_mobilenet);
t1.join();
t2.join();
return 0;
}
void fun_yolo() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
// ../../../test/models/googlenet
// ../../../test/models/mobilenet
auto time1 = time();
if (paddle_mobile.Load(g_yolo, true)) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
vector<int64_t> dims{1, 3, 227, 227};
Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0),
static_cast<float>(1));
vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
auto time3 = time();
for (int i = 0; i < 10; ++i) {
paddle_mobile.Predict(input, dims);
}
auto time4 = time();
std::cout << "thread 1: predict cost :" << time_diff(time3, time4) / 10
<< "ms" << std::endl;
}
}
int fun_mobilenet() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
auto time1 = time();
// auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
// std::string(g_mobilenet_detect) + "/params", true);
auto isok = paddle_mobile.Load(g_mobilenet, true);
if (isok) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
vector<float> input;
vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
auto vec_result = paddle_mobile.Predict(input, dims);
auto biggest = max_element(begin(vec_result), end(vec_result));
std::cout << " Max element is " << *biggest << " at position "
<< distance(begin(vec_result), biggest) << std::endl;
// 预热十次
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
DLOG << vec_result;
auto time4 = time();
std::cout << "thread 2: predict cost :" << time_diff(time3, time4) / 10
<< "ms" << std::endl;
}
std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
"是否存在?"
<< std::endl;
return 0;
}
...@@ -60,7 +60,15 @@ int main() { ...@@ -60,7 +60,15 @@ int main() {
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
// 1064 1603 644 699 2878 1219 867 1352 8 1 13 312 479 // 1064 1603 644 699 2878 1219 867 1352 8 1 13 312 479
std::vector<int64_t> ids{1791, 656, 1549, 281, 96}; std::vector<int64_t> ids{
2084, 635, 1035, 197, 990, 150, 1132, 2403, 546, 770, 4060, 3352,
1798, 1589, 1352, 98, 136, 3461, 3186, 1159, 515, 764, 278, 1178,
5044, 4060, 943, 932, 463, 1198, 3352, 374, 1198, 3352, 374, 2047,
1069, 1589, 3672, 1178, 1178, 2165, 1178, 2084, 635, 3087, 2236, 546,
2047, 1549, 546, 2047, 302, 2202, 398, 804, 397, 657, 804, 866,
932, 2084, 515, 2165, 397, 302, 2202, 526, 992, 906, 1215, 1589,
4493, 2403, 723, 932, 2084, 635, 1352, 932, 444, 2047, 1159, 1893,
1579, 59, 330, 98, 1296, 1159, 3430, 738, 3186, 1071, 2174, 3933};
paddle_mobile::framework::LoDTensor words; paddle_mobile::framework::LoDTensor words;
auto size = static_cast<int>(ids.size()); auto size = static_cast<int>(ids.size());
......
...@@ -52,8 +52,8 @@ int main() { ...@@ -52,8 +52,8 @@ int main() {
#else #else
auto time3 = time(); auto time3 = time();
paddle_mobile.FeedData(input_tensor); paddle_mobile.FeedData(input_tensor);
paddle_mobile.Predict_To(10); paddle_mobile.Predict_To(-1);
paddle_mobile.Predict_From(10); /*paddle_mobile.Predict_From(10);
auto tensor_ptr = paddle_mobile.FetchResult(9); auto tensor_ptr = paddle_mobile.FetchResult(9);
std::cout << "Tensor element number for op[9]: " << tensor_ptr->numel() std::cout << "Tensor element number for op[9]: " << tensor_ptr->numel()
<< std::endl; << std::endl;
...@@ -63,7 +63,7 @@ int main() { ...@@ -63,7 +63,7 @@ int main() {
auto time4 = time(); auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) << "ms" std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
<< std::endl; << std::endl;*/
#endif #endif
} }
return 0; return 0;
......
...@@ -46,7 +46,7 @@ class TestBoxCoderOp { ...@@ -46,7 +46,7 @@ class TestBoxCoderOp {
DLOG << " Input TargetBox is : " << op->Input("TargetBox")[0]; DLOG << " Input TargetBox is : " << op->Input("TargetBox")[0];
DLOG << " OutputBox is : " << op->Output("OutputBox")[0]; DLOG << " OutputBox is : " << op->Output("OutputBox")[0];
DLOG << " code_type : " DLOG << " code_type : "
<< op->GetAttrMap().at("code_type").Get<std::string>(); << op->GetAttrMap().at("code_type").GetString();
std::shared_ptr<operators::BoxCoderOp<Dtype, float>> boxcoder = std::shared_ptr<operators::BoxCoderOp<Dtype, float>> boxcoder =
std::make_shared<operators::BoxCoderOp<Dtype, float>>( std::make_shared<operators::BoxCoderOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(), op->Type(), op->GetInputs(), op->GetOutputs(),
......
...@@ -121,6 +121,7 @@ if (CON GREATER -1) ...@@ -121,6 +121,7 @@ if (CON GREATER -1)
set(FUSION_CONVBNRELU_OP ON) set(FUSION_CONVBNRELU_OP ON)
set(FUSION_CONVBN_OP ON) set(FUSION_CONVBN_OP ON)
set(FUSION_CONVADD_OP ON) set(FUSION_CONVADD_OP ON)
set(MUL_OP ON)
set(FOUND_MATCH ON) set(FOUND_MATCH ON)
endif() endif()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册