Unverified · Commit c38dbd3c · authored by R Ruilong Liu, committed by GitHub

Merge branch 'develop' into develop

@@ -27,7 +27,7 @@ Paddle-Moible is a project under the PaddlePaddle organization, dedicated to embedded…
- **ARM CPU**
-![](http://mms-graph.bj.bcebos.com/paddle-mobile%2F2018_07_18.png)
+![](http://mms-graph.bj.bcebos.com/paddle-mobile%2F2018_07_29.png)
  ARM CPU is the main focus of paddle-mobile; the generality of the CPU has always been its strength. Embedded deep learning needs a large amount of hand-written CPU assembly, and we are coding intensively to squeeze every bit of acceleration out of the hardware.
  ARM CPU optimization is still in progress; only conventional CPU optimizations are applied so far. On an ARM A73, paddle-mobile arm-v7 currently runs a single-core MobileNet 1.0 inference in 110+ ms. This is clearly not our final target: we are rewriting the hot paths in assembly, so there is still large headroom. Only armv7 is supported for now; armv8 support will follow.
......
@@ -17,39 +17,39 @@ limitations under the License. */
namespace paddle_mobile {
-const std::string G_OP_TYPE_CONV = "conv2d";
+const char *G_OP_TYPE_CONV = "conv2d";
-const std::string G_OP_TYPE_BATCHNORM = "batch_norm";
+const char *G_OP_TYPE_BATCHNORM = "batch_norm";
-const std::string G_OP_TYPE_BOX_CODER = "box_coder";
+const char *G_OP_TYPE_BOX_CODER = "box_coder";
-const std::string G_OP_TYPE_CONCAT = "concat";
+const char *G_OP_TYPE_CONCAT = "concat";
-const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
+const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
-const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
+const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
-const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
+const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
-const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
+const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
-const std::string G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
+const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
-const std::string G_OP_TYPE_FC = "fusion_fc";
+const char *G_OP_TYPE_FC = "fusion_fc";
-const std::string G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
+const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
-const std::string G_OP_TYPE_LRN = "lrn";
+const char *G_OP_TYPE_LRN = "lrn";
-const std::string G_OP_TYPE_MUL = "mul";
+const char *G_OP_TYPE_MUL = "mul";
-const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
+const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
-const std::string G_OP_TYPE_POOL2D = "pool2d";
+const char *G_OP_TYPE_POOL2D = "pool2d";
-const std::string G_OP_TYPE_PRIOR_BOX = "prior_box";
+const char *G_OP_TYPE_PRIOR_BOX = "prior_box";
-const std::string G_OP_TYPE_RELU = "relu";
+const char *G_OP_TYPE_RELU = "relu";
-const std::string G_OP_TYPE_RESHAPE = "reshape";
+const char *G_OP_TYPE_RESHAPE = "reshape";
-const std::string G_OP_TYPE_SIGMOID = "sigmoid";
+const char *G_OP_TYPE_SIGMOID = "sigmoid";
-const std::string G_OP_TYPE_SOFTMAX = "softmax";
+const char *G_OP_TYPE_SOFTMAX = "softmax";
-const std::string G_OP_TYPE_TRANSPOSE = "transpose";
+const char *G_OP_TYPE_TRANSPOSE = "transpose";
-const std::string G_OP_TYPE_SPLIT = "split";
+const char *G_OP_TYPE_SPLIT = "split";
-const std::string G_OP_TYPE_FEED = "feed";
+const char *G_OP_TYPE_FEED = "feed";
-const std::string G_OP_TYPE_FETCH = "fetch";
+const char *G_OP_TYPE_FETCH = "fetch";
-const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
+const char *G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
-const std::string G_OP_TYPE_IM2SEQUENCE = "im2sequence";
+const char *G_OP_TYPE_IM2SEQUENCE = "im2sequence";
-const std::string G_OP_TYPE_DROPOUT = "dropout";
+const char *G_OP_TYPE_DROPOUT = "dropout";
-const std::string G_OP_TYPE_FUSION_CONV_ADD_BN = "fusion_conv_add_bn";
+const char *G_OP_TYPE_FUSION_CONV_ADD_BN = "fusion_conv_add_bn";
-const std::string G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn";
+const char *G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn";
-const std::string G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU =
+const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU =
    "fusion_elementwise_add_relu";
-const std::string G_OP_TYPE_FUSION_FC_RELU = "fusion_fc_relu";
+const char *G_OP_TYPE_FUSION_FC_RELU = "fusion_fc_relu";
-const std::string G_OP_TYPE_REGION = "region";
+const char *G_OP_TYPE_REGION = "region";
std::unordered_map<
    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
......
@@ -20,7 +20,9 @@ limitations under the License. */
#include <vector>

namespace paddle_mobile {
-enum class Precision : int { FP32 = 0 };
+enum class Precision : int { FP32 = 0, FP16 = 1 };
typedef int16_t half;
template <Precision p>
struct PrecisionTrait {
@@ -31,6 +33,10 @@ template <>
struct PrecisionTrait<Precision::FP32> {
  typedef float ptype;
};
template <>
struct PrecisionTrait<Precision::FP16> {
typedef half ptype;
};
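
For reference, a self-contained sketch (not part of the diff) of how this trait is meant to be used: the executor's element type follows the `Precision` template parameter, and `half` here is only 16-bit storage, not an arithmetic half-float type.

```cpp
// Minimal restatement of the Precision -> storage-type mapping above.
#include <cstdint>

enum class Precision : int { FP32 = 0, FP16 = 1 };
typedef int16_t half;  // 16-bit storage slot only

template <Precision p>
struct PrecisionTrait {};

template <>
struct PrecisionTrait<Precision::FP32> {
  typedef float ptype;
};
template <>
struct PrecisionTrait<Precision::FP16> {
  typedef half ptype;
};

template <Precision P>
using ptype_t = typename PrecisionTrait<P>::ptype;

static_assert(sizeof(ptype_t<Precision::FP32>) == 4, "FP32 maps to float");
static_assert(sizeof(ptype_t<Precision::FP16>) == 2, "FP16 maps to a 2-byte slot");
```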
//! device type
enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };
@@ -73,40 +79,40 @@ enum PMStatus {
  PMWrongDevice = 0x08 /*!< un-correct device. */
};
-extern const std::string G_OP_TYPE_CONV;
+extern const char *G_OP_TYPE_CONV;
-extern const std::string G_OP_TYPE_BATCHNORM;
+extern const char *G_OP_TYPE_BATCHNORM;
-extern const std::string G_OP_TYPE_BOX_CODER;
+extern const char *G_OP_TYPE_BOX_CODER;
-extern const std::string G_OP_TYPE_CONCAT;
+extern const char *G_OP_TYPE_CONCAT;
-extern const std::string G_OP_TYPE_ELEMENTWISE_ADD;
+extern const char *G_OP_TYPE_ELEMENTWISE_ADD;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU;
-extern const std::string G_OP_TYPE_FC;
+extern const char *G_OP_TYPE_FC;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
-extern const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU;
+extern const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU;
-extern const std::string G_OP_TYPE_FUSION_CONV_BN_RELU;
+extern const char *G_OP_TYPE_FUSION_CONV_BN_RELU;
-extern const std::string G_OP_TYPE_LRN;
+extern const char *G_OP_TYPE_LRN;
-extern const std::string G_OP_TYPE_MUL;
+extern const char *G_OP_TYPE_MUL;
-extern const std::string G_OP_TYPE_MULTICLASS_NMS;
+extern const char *G_OP_TYPE_MULTICLASS_NMS;
-extern const std::string G_OP_TYPE_POOL2D;
+extern const char *G_OP_TYPE_POOL2D;
-extern const std::string G_OP_TYPE_PRIOR_BOX;
+extern const char *G_OP_TYPE_PRIOR_BOX;
-extern const std::string G_OP_TYPE_RELU;
+extern const char *G_OP_TYPE_RELU;
-extern const std::string G_OP_TYPE_RESHAPE;
+extern const char *G_OP_TYPE_RESHAPE;
-extern const std::string G_OP_TYPE_SIGMOID;
+extern const char *G_OP_TYPE_SIGMOID;
-extern const std::string G_OP_TYPE_SOFTMAX;
+extern const char *G_OP_TYPE_SOFTMAX;
-extern const std::string G_OP_TYPE_TRANSPOSE;
+extern const char *G_OP_TYPE_TRANSPOSE;
-extern const std::string G_OP_TYPE_SPLIT;
+extern const char *G_OP_TYPE_SPLIT;
-extern const std::string G_OP_TYPE_FEED;
+extern const char *G_OP_TYPE_FEED;
-extern const std::string G_OP_TYPE_FETCH;
+extern const char *G_OP_TYPE_FETCH;
-extern const std::string G_OP_TYPE_DEPTHWISE_CONV;
+extern const char *G_OP_TYPE_DEPTHWISE_CONV;
-extern const std::string G_OP_TYPE_IM2SEQUENCE;
+extern const char *G_OP_TYPE_IM2SEQUENCE;
-extern const std::string G_OP_TYPE_DROPOUT;
+extern const char *G_OP_TYPE_DROPOUT;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD_BN;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN;
-extern const std::string G_OP_TYPE_FUSION_POOL_BN;
+extern const char *G_OP_TYPE_FUSION_POOL_BN;
-extern const std::string G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU;
+extern const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU;
-extern const std::string G_OP_TYPE_FUSION_FC_RELU;
+extern const char *G_OP_TYPE_FUSION_FC_RELU;
-extern const std::string G_OP_TYPE_REGION;
+extern const char *G_OP_TYPE_REGION;
extern std::unordered_map<
    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
......
@@ -29,15 +29,15 @@ limitations under the License. */
#include "fpga/api/fpga_api.h"
-namespace paddle {
-namespace mobile {
+namespace paddle_mobile {
namespace fpga {
-namespace api {
static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
-static inline int do_ioctl(int req, void *arg) { return ioctl(req, arg); }
+static inline int do_ioctl(int req, void *arg) {
return ioctl(req, (long unsigned int)arg);
}
int open_device() {
  if (fd == -1) {
@@ -48,8 +48,8 @@ int open_device() {
// memory management;
void *fpga_malloc(size_t size) {
-  return reinterpret_cast<(void *)> mmap64(NULL, size, PROT_READ | PROT_WRITE,
-                                            MAP_SHARED, fd, 0);
+  return reinterpret_cast<void *>(
+      mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
}
void fpga_free(void *ptr) { munmap(ptr, 0); }
@@ -58,7 +58,9 @@ void fpga_copy(void *dest, const void *src, size_t num) {
  memcpy(dest, src, num);
}
-}  // namespace api
+int ComputeFpgaConv(struct FpgaConvArgs args) {}
int ComputeFpgaPool(struct FpgaPoolArgs args) {}
int ComputeFpgaEWAdd(struct FpgaEWAddArgs args) {}
}  // namespace fpga
-}  // namespace mobile
-}  // namespace paddle
+}  // namespace paddle_mobile
@@ -14,44 +14,107 @@ limitations under the License. */
#pragma once
#include <stdint.h>
#include <cstddef>
#include <iostream>
#include <limits>

// memory management;

-namespace paddle {
-namespace mobile {
+namespace paddle_mobile {
namespace fpga {
-namespace api {

int open_device();
int close_device();

-void *fpga_malloc(size_t size);
-void fpga_free(void *ptr);
-void fpga_copy(void *dst, const void *src, size_t num);
+void* fpga_malloc(size_t size);
+void fpga_free(void* ptr);
+void fpga_copy(void* dst, const void* src, size_t num);

-struct CnnVersionArgs {
-  void *buf;
+struct FpgaVersionArgs {
+  void* buf;
};

-struct QuantArgs {
+struct MemoryToPhysicalArgs {
const void* src;
uint64_t physical;
};
struct MemoryCopyArgs {
void* src;
void* dst;
size_t size;
};
struct FpgaQuantArgs {
  float scale;
};

-struct BatchNormalizationArgs {
-  bool enable;
+struct FpgaBNArgs {
+  bool enabled = false;
void* bias_addr;
void* scale_addr;
};
struct FpgaKernelArgs {
uint32_t width;
uint32_t height;
uint32_t stride_h;
uint32_t stride_w;
};
struct FpgaImageArgs {
uint32_t width;
uint32_t height;
uint32_t channels;
uint32_t pad_h;
uint32_t pad_w;
};
struct FpgaConvArgs {
bool relu_enabled;
struct FpgaBNArgs BNargs;
void* image_addr;
void* filter_addr;
void* bias_addr;
void* output_addr;
float quant_scale;
struct FpgaImageArgs image;
uint32_t filter_num;
uint32_t group_num;
struct FpgaKernelArgs kernel;
};
struct FpgaPoolArgs {
void* image_addr;
void* output_addr;
struct FpgaImageArgs image;
struct FpgaKernelArgs kernel;
};
struct FpgaEWAddArgs {
bool relu_enabled;
void* image0_addr;
void* image1_addr;
void* result_addr;
uint32_t const0;
uint32_t const1;
uint32_t data_len; // aligned element count
};

-struct ScaleArgs {};
+int ComputeFpgaConv(struct FpgaConvArgs args);
int ComputeFpgaPool(struct FpgaPoolArgs args);
int ComputeFpgaEWAdd(struct FpgaEWAddArgs args);
-#define IOCTL_CNN_MAGIC 'CNN'
-#define IOCTL_VERSION _IOW(IOCTL_CNN_MAGIC, 1, struct CnnVersionArgs)
-#define IOCTL_GET_QUANT _IOW(IOCTL_CNN_MAGIC, 2, struct QuantArgs)
-#define IOCTL_SET_QUANT _IOW(IOCTL_CNN_MAGIC, 3, struct QuantArgs)
+#define IOCTL_FPGA_MAGIC 'CNN'
+#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 1, struct FpgaVersionArgs)
+#define IOCTL_GET_QUANT _IOW(IOCTL_FPGA_MAGIC, 2, struct FpgaQuantArgs)
+#define IOCTL_SET_QUANT _IOW(IOCTL_FPGA_MAGIC, 3, struct FpgaQuantArgs)
#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct FpgaConvArgs)
#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct FpgaPoolArgs)
#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct FpgaEWAddArgs)
-}  // namespace api
}  // namespace fpga
-}  // namespace mobile
-}  // namespace paddle
+}  // namespace paddle_mobile
@@ -14,11 +14,13 @@ limitations under the License. */
#pragma once

#include <algorithm>
#include <map>
#include <string>
#include <vector>

#include "framework/operator.h"
-#include "node.h"
+#include "framework/program/program-optimize/node.h"

namespace paddle_mobile {
namespace framework {
......
@@ -16,14 +16,15 @@ limitations under the License. */
#include <cstdint>
#include <cstring>
#include <fstream>
#include <memory>
#include <string>
#include <type_traits>
#include <typeindex>
#include <vector>

#include "common/enforce.h"
#include "common/types.h"
#include "framework/data_layout.h"
#include "framework/ddim.h"
#include "memory/t_malloc.h"
@@ -63,7 +64,8 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> {
};

static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool, size_t> functor;
+  SizeOfTypeFunctor<int, half, float, double, int16_t, int64_t, bool, size_t>
+      functor;
  size_t size = functor(type);
  PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
......
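
With `half` typedef'd to `int16_t` and added to the functor's type list, FP16 tensors now resolve to a 2-byte element size. A small usage sketch (assumes the `framework/tensor.h` from this commit is included):

```cpp
// Illustrative check of SizeOfType after adding `half` to the type list.
#include <typeindex>

inline void size_of_type_example() {
  size_t fp32_size = SizeOfType(std::type_index(typeid(float)));  // == 4
  size_t fp16_size = SizeOfType(std::type_index(typeid(half)));   // == 2
  (void)fp32_size;
  (void)fp16_size;
}
```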
@@ -187,7 +187,7 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
  memcpy(&max_value, *data + sizeof(float), sizeof(float));
  *data += 2 * sizeof(float);
  const float factor = (max_value - min_value) / 255.0;
-  uint8_t *uint8_data = (uint8_t *)(*data);
+  uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
  for (int k = 0; k < memory_size; ++k) {
    static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
  }
@@ -419,7 +419,7 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
}

template class Executor<CPU, Precision::FP32>;
-template class Executor<FPGA, Precision::FP32>;
template class Executor<GPU_MALI, Precision::FP32>;
+template class Executor<FPGA, Precision::FP32>;

}  // namespace paddle_mobile
@@ -27,17 +27,17 @@ namespace memory {
const int MALLOC_ALIGN = 64;

#ifdef PADDLE_MOBILE_FPGA
-namespace api = paddle::mobile::fpga::api;
+namespace fpga = paddle_mobile::fpga;

void Copy(void *dst, const void *src, size_t num) {
  std::memcpy(dst, src, num);
}

-void *Alloc(size_t size) { return api::malloc(size); }
+void *Alloc(size_t size) { return fpga::fpga_malloc(size); }

void Free(void *ptr) {
  if (ptr) {
-    api::fpga_free(ptr);
+    fpga::fpga_free(ptr);
  }
}
......
@@ -14,7 +14,9 @@ limitations under the License. */
#ifdef CONCAT_OP

-#include "concat_op.h"
+#include <vector>
+
+#include "operators/concat_op.h"

namespace paddle_mobile {
namespace operators {
@@ -68,6 +70,7 @@ REGISTER_OPERATOR_CPU(concat, ops::ConcatOp);
REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(concat, ops::ConcatOp);
#endif

#endif
@@ -53,6 +53,7 @@ USE_OP_CPU(concat);
USE_OP_MALI_GPU(concat);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(concat);
#endif

#endif
@@ -29,7 +29,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
         std::shared_ptr<framework::Scope> scope)
      : framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
                                            scope),
-        param_(inputs, outputs, attrs, *scope) {}
+        param_(inputs, outputs, attrs, scope.get()) {}
  void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }

  void Init() {}
......
@@ -14,8 +14,6 @@ limitations under the License. */
#ifdef DROPOUT_OP

-#pragma once

#include "operators/kernel/dropout_kernel.h"
#include <operators/math/transform.h>
......
@@ -17,7 +17,7 @@ limitations under the License. */
#include "framework/operator.h"
#include "operators/op_param.h"

-#pragma once;
+#pragma once

namespace paddle_mobile {
namespace operators {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONCAT_OP
#include "operators/kernel/concat_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConcatKernel<FPGA, half>::Init(ConcatParam *param) {
return true;
}
template <>
void ConcatKernel<FPGA, half>::Compute(const ConcatParam &param) const {
auto inputs = param.Inputs();
auto *out = param.Out();
int64_t axis = param.Axis();
out->mutable_data<half>();
DDim out_dim = out->dims();
int pixels = out_dim[1] * out_dim[2];
auto out_channel = out_dim[3];
auto out_offset = 0;
for (int i = 0; i < inputs.size(); ++i) {
auto input = inputs[i];
auto channels = input[3];
out_offset += channels;
auto src = input->data<half>();
for (int j = 0; j < pixels; ++j) {
auto dst = out->data<half>() + out_offset;
memory::Copy(dst, src, sizeof(half));
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
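
For comparison, a standalone sketch of what this kernel is aiming at: channel-axis concatenation over NHWC buffers, where each input shares the output's N, H and W and only differs in channel count. This is not the committed kernel; the function name, `half_t` alias and per-pixel copy loop are assumptions made for illustration.

```cpp
// Standalone sketch: concatenate NHWC tensors along the channel axis.
// Assumes fp16 values stored as int16_t, matching `typedef int16_t half`.
#include <cstdint>
#include <cstring>
#include <vector>

using half_t = int16_t;

// pixels = N * H * W; channels[i] = channel count of input i.
void ConcatChannelsNHWC(const std::vector<const half_t *> &inputs,
                        const std::vector<int> &channels, int pixels,
                        half_t *out) {
  int out_channels = 0;
  for (int c : channels) out_channels += c;

  int out_offset = 0;  // channel offset of the current input inside `out`
  for (size_t i = 0; i < inputs.size(); ++i) {
    const half_t *src = inputs[i];
    for (int p = 0; p < pixels; ++p) {
      half_t *dst = out + p * out_channels + out_offset;
      std::memcpy(dst, src + p * channels[i], channels[i] * sizeof(half_t));
    }
    out_offset += channels[i];
  }
}
```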
@@ -20,13 +20,11 @@ limitations under the License. */
#include "operators/math/vol2col.h"
#include "operators/op_param.h"

-#pragma once;
+#pragma once

namespace paddle_mobile {
namespace operators {

-using namespace framework;

template <typename DeviceType, typename T>
class Im2SequenceKernel
    : public framework::OpKernelBase<DeviceType, Im2SequenceParam> {
......
@@ -14,8 +14,6 @@ limitations under the License. */
#ifdef FUSION_FC_OP

-#pragma once

#include "operators/kernel/fusion_fc_kernel.h"

namespace paddle_mobile {
......
@@ -15,7 +15,7 @@ limitations under the License. */
#include "framework/operator.h"
#include "operators/op_param.h"

-#pragma once;
+#pragma once

namespace paddle_mobile {
namespace operators {
......
@@ -15,7 +15,7 @@ limitations under the License. */
#include "framework/operator.h"
#include "operators/op_param.h"

-#pragma once;
+#pragma once

namespace paddle_mobile {
namespace operators {
......
@@ -15,7 +15,7 @@ limitations under the License. */
#include "framework/operator.h"
#include "operators/op_param.h"

-#pragma once;
+#pragma once

namespace paddle_mobile {
namespace operators {
......
@@ -92,8 +92,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
 */
// Pack a block of matrix A into contiguous memory (RowMajor)
-void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
-                  float *buffer) {
+void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
+                    float *buffer) {
  const float *a0, *a1, *a2, *a3;
  for (int i = 0; i < m - m_tail; i += MR) {
    a0 = A + i * lda;
@@ -131,9 +131,62 @@ void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
  }
}
void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
float *buffer) {
const float *a0, *a1, *a2, *a3, *a4, *a5;
for (int i = 0; i < m - m_tail; i += MR) {
a0 = A + i * lda;
a1 = A + (i + 1) * lda;
a2 = A + (i + 2) * lda;
a3 = A + (i + 3) * lda;
a4 = A + (i + 4) * lda;
a5 = A + (i + 5) * lda;
for (int j = 0; j < k; ++j) {
*buffer++ = *a0++;
*buffer++ = *a1++;
*buffer++ = *a2++;
*buffer++ = *a3++;
*buffer++ = *a4++;
*buffer++ = *a5++;
}
}
int i = m - m_tail;
a0 = &A(i, 0);
a1 = a0 + lda;
a2 = a0 + 2 * lda;
a3 = a0 + 3 * lda;
a4 = a0 + 4 * lda;
a5 = a0 + 5 * lda;
if (m_tail != 0) {
if (m_tail <= 5) {
a5 = zero;
}
if (m_tail <= 4) {
a4 = zero;
}
if (m_tail <= 3) {
a3 = zero;
}
if (m_tail <= 2) {
a2 = zero;
}
if (m_tail <= 1) {
a1 = zero;
}
for (int j = 0; j < k; ++j) {
*buffer++ = *a0++;
*buffer++ = *a1++;
*buffer++ = *a2++;
*buffer++ = *a3++;
*buffer++ = *a4++;
*buffer++ = *a5++;
}
}
}
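
The 6-row packing above walks MR = 6 consecutive rows of A and interleaves them column by column, so the micro-kernel can load one 6-float sliver per k step; a tail panel with fewer than 6 rows is padded with zero rows. A small sketch of the resulting index math (helper name assumed):

```cpp
// Sketch: after PackMatrixA_6r, element A(i, p) of a full 6-row panel lives at
// buffer[panel * 6 * k + p * 6 + r], with panel = i / 6 and r = i % 6.
float packed_a_at(const float *buffer, int k, int i, int p) {
  int panel = i / 6;  // which 6-row panel of A
  int r = i % 6;      // row inside that panel
  return buffer[panel * 6 * k + p * 6 + r];
}
```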
// Pack a block of matrix B into contiguous memory (RowMajor)
-void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
-                  float *buffer) {
+void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
+                    float *buffer) {
  const float *b0;
  for (int j = 0; j < n - n_tail; j += NR) {
    for (int i = 0; i < k; ++i) {
@@ -188,7 +241,8 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
  for (int j = 0; j < nc; j += NR) {
    for (int i = 0; i < mc; i += MR) {
      // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
-      AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+      // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+      AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
    }
  }
@@ -218,7 +272,8 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
  for (int j = 0; j < nc; j += NR) {
    for (int i = 0; i < mc; i += MR) {
      // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
-      AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+      // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+      AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
    }
  }
@@ -1868,22 +1923,22 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
           const float *B, int ldb, float beta, float *C, int ldc, bool relu) {
  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
-  int L1 = 30 * 1024;
-  int L2 = 1 * 1024 * 1024;
+  int L1 = 32 * 1024;
+  int L2 = 0.5 * 1024 * 1024;

  KC = k;
-  MC = L2 / (2 * KC * sizeof(float));
-  NC = MC;
+  MC = L1 / (KC * sizeof(float));
+  NC = L2 / (KC * sizeof(float));

-  // make sure MC is multiple of 4, and NC is multiple of 8
+  // make sure MC is multiple of MR, and NC is multiple of NR
  int mblock_num = (m + MC - 1) / MC;
  MC = (m + mblock_num - 1) / mblock_num;
-  MC = (MC + 4 - 1) / 4 * 4;
+  MC = (MC + MR - 1) / MR * MR;
  // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";

  int nblock_num = (n + NC - 1) / NC;
  NC = (n + nblock_num - 1) / nblock_num;
-  NC = (NC + 8 - 1) / 8 * 8;
+  NC = (NC + NR - 1) / NR * NR;
  // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";

  packedA = static_cast<float *>(
@@ -1901,10 +1956,10 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
  int mc, nc;
  for (int j = 0; j < n; j += NC) {
    nc = s_min(n - j, NC);
-    PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+    PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
    for (int i = 0; i < m; i += MC) {
      mc = s_min(m - i, MC);
-      PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
      InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc,
                  relu);
    }
@@ -1921,22 +1976,22 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
                 bool relu, float *new_scale, float *new_bias) {
  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
-  int L1 = 30 * 1024;
-  int L2 = 1 * 1024 * 1024;
+  int L1 = 32 * 1024;
+  int L2 = 0.5 * 1024 * 1024;

  KC = k;
-  MC = L2 / (2 * KC * sizeof(float));
-  NC = MC;
+  MC = L1 / (KC * sizeof(float));
+  NC = L2 / (KC * sizeof(float));

-  // make sure MC is multiple of 4, and NC is multiple of 8
+  // make sure MC is multiple of MR, and NC is multiple of NR
  int mblock_num = (m + MC - 1) / MC;
  MC = (m + mblock_num - 1) / mblock_num;
-  MC = (MC + 4 - 1) / 4 * 4;
+  MC = (MC + MR - 1) / MR * MR;
  // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";

  int nblock_num = (n + NC - 1) / NC;
  NC = (n + nblock_num - 1) / nblock_num;
-  NC = (NC + 8 - 1) / 8 * 8;
+  NC = (NC + NR - 1) / NR * NR;
  // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";

  packedA = static_cast<float *>(
@@ -1954,10 +2009,10 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
  int mc, nc;
  for (int j = 0; j < n; j += NC) {
    nc = s_min(n - j, NC);
-    PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+    PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
    for (int i = 0; i < m; i += MC) {
      mc = s_min(m - i, MC);
-      PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
      InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
                        &C(i, j), ldc, relu, new_scale + i, new_bias + i);
    }
@@ -1969,6 +2024,221 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
  paddle_mobile::memory::Free(zero);
}
void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
#if __ARM_NEON
#if __aarch64__
// init C
float32x4_t cv0 = vdupq_n_f32(0.0);
float32x4_t cv1 = vdupq_n_f32(0.0);
float32x4_t cv2 = vdupq_n_f32(0.0);
float32x4_t cv3 = vdupq_n_f32(0.0);
float32x4_t cv4 = vdupq_n_f32(0.0);
float32x4_t cv5 = vdupq_n_f32(0.0);
float32x4_t cv6 = vdupq_n_f32(0.0);
float32x4_t cv7 = vdupq_n_f32(0.0);
float32x4_t cv8 = vdupq_n_f32(0.0);
float32x4_t cv9 = vdupq_n_f32(0.0);
float32x4_t cv10 = vdupq_n_f32(0.0);
float32x4_t cv11 = vdupq_n_f32(0.0);
float32x4_t av;
float32x4_t bv0;
float32x4_t bv1;
float32x2_t av01;
float32x2_t av23;
float32x2_t av45;
for (int p = 0; p < k; p += 1) {
av = vld1q_f32(a);
av01 = vget_low_f32(av);
av23 = vget_high_f32(av);
av45 = vld1_f32(a + 4);
bv0 = vld1q_f32(b);
bv1 = vld1q_f32(b + 4);
cv0 = vmlaq_lane_f32(cv0, bv0, av01, 0);
cv1 = vmlaq_lane_f32(cv1, bv1, av01, 0);
cv2 = vmlaq_lane_f32(cv2, bv0, av01, 1);
cv3 = vmlaq_lane_f32(cv3, bv1, av01, 1);
cv4 = vmlaq_lane_f32(cv4, bv0, av23, 0);
cv5 = vmlaq_lane_f32(cv5, bv1, av23, 0);
cv6 = vmlaq_lane_f32(cv6, bv0, av23, 1);
cv7 = vmlaq_lane_f32(cv7, bv1, av23, 1);
cv8 = vmlaq_lane_f32(cv8, bv0, av45, 0);
cv9 = vmlaq_lane_f32(cv9, bv1, av45, 0);
cv10 = vmlaq_lane_f32(cv10, bv0, av45, 1);
cv11 = vmlaq_lane_f32(cv11, bv1, av45, 1);
a += MR;
b += NR;
}
vst1q_f32(c, cv0);
vst1q_f32(c + 4, cv1);
vst1q_f32(c + ldc, cv2);
vst1q_f32(c + ldc + 4, cv3);
vst1q_f32(c + 2 * ldc, cv4);
vst1q_f32(c + 2 * ldc + 4, cv5);
vst1q_f32(c + 3 * ldc, cv6);
vst1q_f32(c + 3 * ldc + 4, cv7);
vst1q_f32(c + 4 * ldc, cv8);
vst1q_f32(c + 4 * ldc + 4, cv9);
vst1q_f32(c + 5 * ldc, cv10);
vst1q_f32(c + 5 * ldc + 4, cv11);
#else
const float *a_ptr, *b_ptr;
a_ptr = a;
b_ptr = b;
int kc1 = k / 4;
int kc2 = k % 4;
int step = 4 * ldc;
asm volatile(
"pld [%[a_ptr]] \n\t"
"pld [%[b_ptr]] \n\t"
"pld [%[a_ptr], #64] \n\t"
"pld [%[b_ptr], #64] \n\t"
"vmov.f32 q4, #0.0 \n\t"
"vmov.f32 q5, #0.0 \n\t"
"vmov.f32 q6, #0.0 \n\t"
"vmov.f32 q7, #0.0 \n\t"
"vmov.f32 q8, #0.0 \n\t"
"vmov.f32 q9, #0.0 \n\t"
"vmov.f32 q10, #0.0 \n\t"
"vmov.f32 q11, #0.0 \n\t"
"vmov.f32 q12, #0.0 \n\t"
"vmov.f32 q13, #0.0 \n\t"
"vmov.f32 q14, #0.0 \n\t"
"vmov.f32 q15, #0.0 \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"blt end_kc1_%= \n\t"
"loop_kc1_%=: \n\t"
// "pld [%[a_ptr], #128] \n\t"
// "pld [%[b_ptr], #128] \n\t"
// "pld [%[a_ptr], #192] \n\t"
// "pld [%[b_ptr], #192] \n\t"
"vld1.32 {d0-d2}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
"vmla.f32 q4, q2, d0[0] \n\t"
"vmla.f32 q5, q3, d0[0] \n\t"
"vmla.f32 q6, q2, d0[1] \n\t"
"vmla.f32 q7, q3, d0[1] \n\t"
"vmla.f32 q8, q2, d1[0] \n\t"
"vmla.f32 q9, q3, d1[0] \n\t"
"vmla.f32 q10, q2, d1[1] \n\t"
"vmla.f32 q11, q3, d1[1] \n\t"
"vmla.f32 q12, q2, d2[0] \n\t"
"vmla.f32 q13, q3, d2[0] \n\t"
"vmla.f32 q14, q2, d2[1] \n\t"
"vmla.f32 q15, q3, d2[1] \n\t"
"vld1.32 {d0-d2}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
"vmla.f32 q4, q2, d0[0] \n\t"
"vmla.f32 q5, q3, d0[0] \n\t"
"vmla.f32 q6, q2, d0[1] \n\t"
"vmla.f32 q7, q3, d0[1] \n\t"
"vmla.f32 q8, q2, d1[0] \n\t"
"vmla.f32 q9, q3, d1[0] \n\t"
"vmla.f32 q10, q2, d1[1] \n\t"
"vmla.f32 q11, q3, d1[1] \n\t"
"vmla.f32 q12, q2, d2[0] \n\t"
"vmla.f32 q13, q3, d2[0] \n\t"
"vmla.f32 q14, q2, d2[1] \n\t"
"vmla.f32 q15, q3, d2[1] \n\t"
"vld1.32 {d0-d2}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
"vmla.f32 q4, q2, d0[0] \n\t"
"vmla.f32 q5, q3, d0[0] \n\t"
"vmla.f32 q6, q2, d0[1] \n\t"
"vmla.f32 q7, q3, d0[1] \n\t"
"vmla.f32 q8, q2, d1[0] \n\t"
"vmla.f32 q9, q3, d1[0] \n\t"
"vmla.f32 q10, q2, d1[1] \n\t"
"vmla.f32 q11, q3, d1[1] \n\t"
"vmla.f32 q12, q2, d2[0] \n\t"
"vmla.f32 q13, q3, d2[0] \n\t"
"vmla.f32 q14, q2, d2[1] \n\t"
"vmla.f32 q15, q3, d2[1] \n\t"
"vld1.32 {d0-d2}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
"vmla.f32 q4, q2, d0[0] \n\t"
"vmla.f32 q5, q3, d0[0] \n\t"
"vmla.f32 q6, q2, d0[1] \n\t"
"vmla.f32 q7, q3, d0[1] \n\t"
"vmla.f32 q8, q2, d1[0] \n\t"
"vmla.f32 q9, q3, d1[0] \n\t"
"vmla.f32 q10, q2, d1[1] \n\t"
"vmla.f32 q11, q3, d1[1] \n\t"
"vmla.f32 q12, q2, d2[0] \n\t"
"vmla.f32 q13, q3, d2[0] \n\t"
"vmla.f32 q14, q2, d2[1] \n\t"
"vmla.f32 q15, q3, d2[1] \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"bge loop_kc1_%= \n\t"
"end_kc1_%=: \n\t"
"subs %[kc2], %[kc2], #1 \n\t"
"blt end_kc2_%= \n\t"
"loop_kc2_%=: \n\t"
"vld1.32 {d0-d2}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
"vmla.f32 q4, q2, d0[0] \n\t"
"vmla.f32 q5, q3, d0[0] \n\t"
"vmla.f32 q6, q2, d0[1] \n\t"
"vmla.f32 q7, q3, d0[1] \n\t"
"vmla.f32 q8, q2, d1[0] \n\t"
"vmla.f32 q9, q3, d1[0] \n\t"
"vmla.f32 q10, q2, d1[1] \n\t"
"vmla.f32 q11, q3, d1[1] \n\t"
"vmla.f32 q12, q2, d2[0] \n\t"
"vmla.f32 q13, q3, d2[0] \n\t"
"vmla.f32 q14, q2, d2[1] \n\t"
"vmla.f32 q15, q3, d2[1] \n\t"
"subs %[kc2], %[kc2], #1 \n\t"
"bge loop_kc2_%= \n\t"
"end_kc2_%=: \n\t"
"mov r5, %[c] \n\t"
"mov r6, %[step] \n\t"
"vst1.32 {q4, q5}, [r5], r6 \n\t"
"vst1.32 {q6, q7}, [r5], r6 \n\t"
"vst1.32 {q8, q9}, [r5], r6 \n\t"
"vst1.32 {q10, q11}, [r5], r6 \n\t"
"vst1.32 {q12, q13}, [r5], r6 \n\t"
"vst1.32 {q14, q15}, [r5] \n\t"
:
: [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
[kc2] "r"(kc2), [step] "r"(step)
: "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else
#endif // __ARM_NEON
}
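
For readers not fluent in NEON, the following scalar reference (not part of the commit) computes the same 6x8 block as `AddDot6x8`: the packed A panel advances MR = 6 floats per k step, the packed B panel advances NR = 8, and the 6x8 result overwrites C, matching the stores above.

```cpp
// Scalar reference of the 6x8 micro-kernel: c[6][8] = sum_p a_panel * b_panel.
void AddDot6x8_reference(int k, const float *a, const float *b, float *c,
                         int ldc) {
  float acc[6][8] = {{0}};
  for (int p = 0; p < k; ++p) {
    for (int r = 0; r < 6; ++r) {          // 6 packed rows of A (MR)
      for (int col = 0; col < 8; ++col) {  // 8 packed columns of B (NR)
        acc[r][col] += a[p * 6 + r] * b[p * 8 + col];
      }
    }
  }
  for (int r = 0; r < 6; ++r) {
    for (int col = 0; col < 8; ++col) {
      c[r * ldc + col] = acc[r][col];  // overwrite, no accumulation into C
    }
  }
}
```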
}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile
@@ -19,7 +19,7 @@ limitations under the License. */
#define B(i, j) B[(i)*ldb + (j)]
#define C(i, j) C[(i)*ldc + (j)]

-#define MR 4
+#define MR 6
#define NR 8

#define s_min(i, j) ((i) < (j) ? (i) : (j))
@@ -39,12 +39,14 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
 */
// Pack a block of matrix A into contiguous memory (RowMajor)
-void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
-                  float *buffer);
+void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
+                    float *buffer);
void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
// Pack a block of matrix B into contiguous memory (RowMajor)
-void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
-                  float *buffer);
+void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
+                    float *buffer);
// Blocked matrix multiplication
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
@@ -67,6 +69,7 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
// Compute a smaller block of the C matrix
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc);
// Write the blocked matrix-multiplication result back
// C = A * B
......
@@ -22,6 +22,9 @@ limitations under the License. */
#include "framework/scope.h"
#include "framework/tensor.h"
#include "framework/variable.h"
#ifdef PADDLE_MOBILE_FPGA
#include "fpga/api/fpga_api.h"
#endif
namespace paddle_mobile {
namespace operators {
@@ -256,6 +259,15 @@ class ElementwiseAddParam : OpParam {
  Tensor *input_y_;
  Tensor *out_;
  int axis_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::FpgaEWAddArgs fpga_EW_add_args;
public:
const fpga::FpgaEWAddArgs &FpgaArgs() const { return fpga_EW_add_args; }
void SetFpgaArgs(const fpga::FpgaEWAddArgs &args) { fpga_EW_add_args = args; }
#endif
};
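
The same private-args / `SetFpgaArgs` pattern is repeated below for the pooling and conv-fusion params. A hedged sketch of how a caller might populate it; only `SetFpgaArgs`, `FpgaArgs` and the `FpgaEWAddArgs` fields come from this diff, and the raw pointers are placeholders rather than real tensor accessors.

```cpp
// Hypothetical helper showing the SetFpgaArgs / FpgaArgs round trip.
void ConfigureEWAdd(paddle_mobile::operators::ElementwiseAddParam *param,
                    void *x, void *y, void *out, uint32_t len) {
  paddle_mobile::fpga::FpgaEWAddArgs args;
  args.relu_enabled = false;
  args.image0_addr = x;
  args.image1_addr = y;
  args.result_addr = out;
  args.const0 = 1;  // assumed constants
  args.const1 = 1;
  args.data_len = len;
  param->SetFpgaArgs(args);
  // Later, the FPGA compute path reads them back:
  // const auto &fpga_args = param->FpgaArgs();
}
```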
#ifdef FUSION_ELEMENTWISEADDRELU_OP
@@ -450,80 +462,15 @@ class PoolParam : public OpParam {
  vector<int> paddings_;
  bool ceil_mode_;
  bool global_pooling_ = false;
+#ifdef PADDLE_MOBILE_FPGA
+
+ private:
+  fpga::FpgaPoolArgs fpga_pool_args;
+
+ public:
+  const fpga::FpgaPoolArgs &FpgaArgs() const { return fpga_pool_args; }
+  void SetFpgaArgs(const fpga::FpgaPoolArgs &args) { fpga_pool_args = args; }
+#endif
};
#endif
-
-#ifdef FUSION_POOLBN_OP
-class FusionPoolBNParam : OpParam {
- public:
-  FusionPoolBNParam(const VariableNameMap &inputs,
-                    const VariableNameMap &outputs, const AttributeMap &attrs,
-                    const Scope &scope) {
-    input_ = InputXFrom<LoDTensor>(inputs, scope);
-    pooling_type_ = GetAttr<string>("pooling_type", attrs);
-    ksize_ = GetAttr<vector<int>>("ksize", attrs);
-    strides_ = GetAttr<vector<int>>("strides", attrs);
-    paddings_ = GetAttr<vector<int>>("paddings", attrs);
-    ceil_mode_ = GetAttr<bool>("ceil_mode", attrs);
-    global_pooling_ = GetAttr<bool>("global_pooling", attrs);
-    output_y_ = OutputYFrom<LoDTensor>(outputs, scope);
-    input_bias_ = InputBiasFrom<LoDTensor>(inputs, scope);
-    input_mean_ = InputMeanFrom<LoDTensor>(inputs, scope);
-    input_scale_ = InputScaleFrom<LoDTensor>(inputs, scope);
-    input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
-    epsilon_ = GetAttr<float>("epsilon", attrs);
-    momentum_ = GetAttr<float>("momentum", attrs);
-    //    is_test_ = GetAttr<bool>("is_test", attrs);
-  }
-
-  const Tensor *Input() const { return input_; }
-  const string &PoolingType() const { return pooling_type_; }
-  const vector<int> &Ksize() const { return ksize_; }
-  const vector<int> &Strides() const { return strides_; }
-  const vector<int> &Paddings() const { return paddings_; }
-  bool isCeilMode() const { return ceil_mode_; }
-  bool isGlobalPooling() const { return global_pooling_; }
-  Tensor *OutputY() const { return output_y_; }
-  const Tensor *InputBias() const { return input_bias_; }
-  const Tensor *InputMean() const { return input_mean_; }
-  const Tensor *InputScale() const { return input_scale_; }
-  const Tensor *InputVariance() const { return input_variance_; }
-  const float &Epsilon() const { return epsilon_; }
-  const float &Momentum() const { return momentum_; }
-  const bool &IsTest() const { return is_test_; }
-  const string &DataFormat() const { return data_format_; }
-
- private:
-  Tensor *input_;
-  string pooling_type_;
-  vector<int> ksize_;
-  vector<int> strides_;
-  vector<int> paddings_;
-  bool ceil_mode_;
-  bool global_pooling_ = false;
-  Tensor *output_y_;
-  Tensor *input_bias_;
-  Tensor *input_mean_;
-  Tensor *input_scale_;
-  Tensor *input_variance_;
-  float epsilon_;
-  float momentum_;
-  bool is_test_;
-  string data_format_;
-};
-#endif
@@ -704,10 +651,10 @@ class MultiClassNMSParam : public OpParam {
class FeedParam : public OpParam {
 public:
  FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-            const AttributeMap &attrs, Scope &scope) {
-    input_x_ = InputXFrom<LoDTensor>(inputs, scope);
-    out_ = OutFrom<LoDTensor>(outputs, scope);
-    auto var = scope.Var("batch_size");
+            const AttributeMap &attrs, Scope *scope) {
+    input_x_ = InputXFrom<LoDTensor>(inputs, *scope);
+    out_ = OutFrom<LoDTensor>(outputs, *scope);
+    auto var = scope->Var("batch_size");
    batch_size = var->GetValue<int>();
  }
  const Tensor *InputX() const { return input_x_; }
@@ -983,6 +930,15 @@ class FusionFcParam : public OpParam {
  int x_num_col_dims_;
  int y_num_col_dims_;
  int axis_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::FpgaConvArgs fpga_conv_args;
public:
const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
#endif
};

#ifdef FUSION_FCRELU_OP
@@ -1032,6 +988,15 @@ class FusionConvAddParam : public OpParam {
  vector<int> paddings_;
  vector<int> dilations_;
  int groups;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::FpgaConvArgs fpga_conv_args;
public:
const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
#endif
};

Print &operator<<(Print &printer, const FusionConvAddParam &conv_param);
@@ -1128,6 +1093,15 @@ class FusionConvAddBNReluParam : public OpParam {
  bool is_test_;
  Tensor *new_bias_;
  Tensor *new_scale_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::FpgaConvArgs fpga_conv_args;
public:
const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
#endif
};
#endif
@@ -1213,6 +1187,15 @@ class FusionConvAddBNParam : public OpParam {
  bool is_test_;
  Tensor *new_bias_;
  Tensor *new_scale_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::FpgaConvArgs fpga_conv_args;
public:
const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
#endif
};
#endif
@@ -1426,9 +1409,5 @@ class DropoutParam : public OpParam {
};
#endif
#ifdef REGION_OP
class RegionParam : public OpParam {};
#endif
}  // namespace operators
}  // namespace paddle_mobile
@@ -33,7 +33,7 @@ class ResizeOp
          DeviceType, ResizeParam, operators::ResizeKernel<DeviceType, T>> {
 public:
  ResizeOp(const std::string &type, const VariableNameMap &inputs,
-           const VariableNameMap &outputs, const framework::AttributeMap attrs,
+           const VariableNameMap &outputs, const framework::AttributeMap &attrs,
           std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<DeviceType, ResizeParam,
                                      operators::ResizeKernel<DeviceType, T>>(
......
@@ -145,6 +145,10 @@ else ()
    ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-conv-add-relu-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-bn-relu-op paddle-mobile)
    #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif()
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstdlib>
#include <ctime>
#include <iostream>
#include "../test_helper.h"
#include "common/log.h"
@@ -20,13 +22,21 @@ limitations under the License. */
#define a(i, j) a[(i)*lda + (j)]
#define b(i, j) b[(i)*ldb + (j)]
#define c(i, j) c[(i)*ldc + (j)]
#define c1(i, j) c1[(i)*ldc + (j)]

-#define m 62
-#define n 63
-#define k 74
+void print_matirx(int m, int n, int ldc, float *c) {
+  for (int i = 0; i < m; ++i) {
+    std::cout << c(i, 0);
for (int j = 1; j < n; ++j) {
std::cout << " | " << c(i, j);
}
std::cout << std::endl;
}
std::cout << std::endl;
}
-int main() {
+int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
  int lda = k;
  int ldb = n;
  int ldc = n;
@@ -39,44 +49,88 @@ int main() {
      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
  float *c1 =
      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
float *scale =
static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m));
float *bias =
static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m));
srand(unsigned(time(0)));
  for (int i = 0; i < m * k; ++i) {
-    a[i] = 2;
+    a[i] = t1 + rand() % t2;
  }
  for (int i = 0; i < k * n; ++i) {
-    b[i] = 2;
+    b[i] = t1 + rand() % t2;
  }
-  for (int i = 0; i < m * n; ++i) {
-    c[i] = 2;
-    c1[i] = 2;
+  for (int i = 0; i < m; ++i) {
+    scale[i] = t1 + rand() % t2;
  }
+  for (int i = 0; i < m; ++i) {
+    bias[i] = t1 + rand() % t2;
+  }

-  auto time1 = time();
-  //  paddle_mobile::operators::math::Sgemm(m, n, k, 0.9, a, lda, b, ldb, 0.3,
-  //  c,
-  //  ldc);
-  auto time2 = time();
-  DLOG << "gemm cost :" << time_diff(time1, time2) << "ms\n";
-  for (int i = 0; i < m * n; ++i) {
-    std::cout << c[i] << " | ";
-    if (i % n == (n - 1)) {
-      std::cout << std::endl;
-    }
-  }
-  for (int j = 0; j < n; ++j) {
-    for (int i = 0; i < m; ++i) {
-      c1(i, j) *= 0.3;
-      for (int p = 0; p < k; ++p) {
-        c1(i, j) += 0.9 * a(i, p) * b(p, j);
-      }
-    }
-  }
+  for (int i = 0; i < m; ++i) {
+    for (int j = 0; j < n; ++j) {
+      float r = 0;
+      for (int p = 0; p < k; p++) {
+        r += a(i, p) * b(p, j);
+      }
+      r *= scale[i];
+      r += bias[i];
+      if (relu && (r < 0)) {
+        r = 0;
+      }
+      c1(i, j) = r;
+    }
+  }
std::cout << "正确结果对比:" << std::endl;
paddle_mobile::operators::math::SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb, 0.3,
c, ldc, relu, scale, bias);
int eq = 0;
int neq = 0;
for (int i = 0; i < m * n; ++i) { for (int i = 0; i < m * n; ++i) {
std::cout << c1[i] << " | "; if (static_cast<int>(c[i]) == static_cast<int>(c1[i])) {
if (i % n == (n - 1)) { ++eq;
std::cout << std::endl; } else {
++neq;
} }
} }
if (pr > 0) {
std::cout << "A:" << std::endl;
print_matirx(m, k, lda, a);
std::cout << "B:" << std::endl;
print_matirx(k, n, ldb, b);
std::cout << "C:" << std::endl;
print_matirx(m, n, ldc, c);
std::cout << "C1:" << std::endl;
print_matirx(m, n, ldc, c1);
}
std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
<< " eq=" << eq << " neq=" << neq << std::endl;
paddle_mobile::memory::Free(a);
paddle_mobile::memory::Free(b);
paddle_mobile::memory::Free(c);
paddle_mobile::memory::Free(c1);
paddle_mobile::memory::Free(scale);
paddle_mobile::memory::Free(bias);
return 0;
}
int main() {
do_sgemm(9, 9, 9, true, 10, 10, 10);
do_sgemm(10, 6, 12, false, 10, 10, 0);
do_sgemm(512, 256, 384, false, 10, 10, 0);
do_sgemm(1366, 768, 256, false, 10, 10, 0);
do_sgemm(1255, 755, 333, false, 10, 10, 0);
do_sgemm(555, 777, 999, false, 10, 10, 0);
do_sgemm(10, 6, 12, true, -4, 10, 0);
do_sgemm(512, 256, 384, true, -4, 10, 0);
do_sgemm(1366, 768, 256, true, -4, 10, 0);
do_sgemm(1255, 755, 333, true, -4, 10, 0);
do_sgemm(555, 777, 999, true, -4, 10, 0);
  return 0;
}
@@ -43,7 +43,7 @@ template <typename DeviceType, typename OpType>
class Executor4Test : public Executor<DeviceType> {
 public:
  Executor4Test(Program<DeviceType> p, string op_type,
-                bool use_optimize = false)
+                bool use_optimize = false, int predict_op_count = 1)
      : Executor<DeviceType>() {
    this->use_optimize_ = use_optimize;
    this->program_ = p;
@@ -57,12 +57,14 @@ class Executor4Test : public Executor<DeviceType> {
      LOG(paddle_mobile::LogLevel::kLOG_ERROR)
          << "to_predict_program_ == nullptr";
    }

    const std::vector<std::shared_ptr<BlockDesc>> blocks =
        this->to_predict_program_->Blocks();
    for (std::shared_ptr<BlockDesc> block_desc : blocks) {
      std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
-      for (std::shared_ptr<OpDesc> op : ops) {
-        if (op->Type() == op_type) {
+      for (int i = 0; i < ops.size(); ++i) {
+        auto op = ops[i];
if (op->Type() == op_type && i < predict_op_count) {
DLOG << "匹配到: " << op->Type(); DLOG << "匹配到: " << op->Type();
/// test first meeting op in program /// test first meeting op in program
...@@ -72,11 +74,17 @@ class Executor4Test : public Executor<DeviceType> { ...@@ -72,11 +74,17 @@ class Executor4Test : public Executor<DeviceType> {
op->Type(), op->GetInputs(), op->GetOutputs(), op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), this->program_.scope); op->GetAttrMap(), this->program_.scope);
this->ops_of_block_[*block_desc.get()].push_back(op_ptr); this->ops_of_block_[*block_desc.get()].push_back(op_ptr);
break;
} }
} }
} }
this->InitMemory(); this->InitMemory();
std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
this->to_predict_program_->Block(0);
auto &ops = this->ops_of_block_[*to_predict_block.get()];
for (const auto &op : ops) {
op->Init();
}
  }

  template <typename T = LoDTensor>
@@ -130,9 +138,6 @@ class Executor4Test : public Executor<DeviceType> {
    auto *output_tensor = con_output->GetMutable<LoDTensor>();
    output_tensor->mutable_data<float>(dDim);

-    std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
-    out_tensor.reset(output_tensor);

    std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
        this->to_predict_program_->Block(0);
    for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size();
@@ -141,6 +146,7 @@ class Executor4Test : public Executor<DeviceType> {
      op->Run();
    }

-    return out_tensor;
+    return std::make_shared<paddle_mobile::framework::Tensor>(
paddle_mobile::framework::Tensor(*output_tensor));
  }
};
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "../test_helper.h" #include "../test_helper.h"
#include "io/loader.h" #include "io/loader.h"
@@ -20,12 +22,10 @@ int main() {
  //  ../../../test/models/googlenet
  //  ../../../test/models/mobilenet
  //  auto program = loader.Load(g_googlenet, true);
// auto program = loader.Load(g_mobilenet_ssd, true);
-  auto program = loader.Load(g_mobilenet_ssd, true);
-  //  auto program = loader.Load(g_googlenet_combine + "/model",
-  //  g_googlenet_combine +
-  //  "/params", true);
+  auto program = loader.Load(std::string(g_ocr) + "/model",
+                             std::string(g_ocr) + "/params", false);
  //  program.originProgram->Description("program desc: ");
  return 0;
}
...@@ -20,22 +20,20 @@ int main() { ...@@ -20,22 +20,20 @@ int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile; paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4); paddle_mobile.SetThreadNum(4);
auto time1 = time(); auto time1 = time();
auto isok = paddle_mobile.Load(g_mobilenet_ssd_gesture + "/model", auto isok = paddle_mobile.Load(
g_mobilenet_ssd_gesture + "/params", true); std::string(g_mobilenet_ssd_gesture) + "/model",
std::string(g_mobilenet_ssd_gesture) + "/params", true);
// auto isok = paddle_mobile.Load(g_mobilenet_ssd, false); // auto isok = paddle_mobile.Load(g_mobilenet_ssd, false);
if (isok) { if (isok) {
auto time2 = time(); auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 300, 300}; std::vector<int64_t> dims{1, 3, 300, 300};
Tensor input_tensor; GetInput<float>(g_hand, &input, dims);
SetupTensor<float>(&input_tensor, {1, 3, 300, 300}, static_cast<float>(0),
static_cast<float>(1));
std::vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
auto time3 = time(); auto time3 = time();
paddle_mobile.Predict(input, dims); auto output = paddle_mobile.Predict(input, dims);
auto time4 = time(); auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) << "ms" std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
<< std::endl; << std::endl;
......
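The tests above switch from SetupTensor (random values) to GetInput, which feeds a real image dump such as g_hand. A minimal sketch of what such a helper could look like, assuming the file is a raw float32 dump matching the given dims — this is an illustration, not the actual test_helper implementation:

#include <cstdint>
#include <fstream>
#include <string>
#include <vector>

// Illustrative reader for a raw float32 dump; the real GetInput helper may differ.
void ReadRawFloatInput(const std::string &path, std::vector<float> *out,
                       const std::vector<int64_t> &dims) {
  int64_t numel = 1;
  for (int64_t d : dims) numel *= d;
  out->resize(static_cast<size_t>(numel));
  std::ifstream in(path, std::ios::binary);
  in.read(reinterpret_cast<char *>(out->data()),
          static_cast<std::streamsize>(numel * sizeof(float)));
}

int main() {
  std::vector<float> input;
  ReadRawFloatInput("../images/hand_image", &input, {1, 3, 300, 300});
  return 0;
}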
...@@ -24,19 +24,21 @@ int main() { ...@@ -24,19 +24,21 @@ int main() {
auto time2 = time(); auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224}; std::vector<int64_t> dims{1, 3, 224, 224};
Tensor input_tensor; GetInput<float>(g_test_image_1x3x224x224, &input, dims);
SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
static_cast<float>(1)); for (int i = 0; i < 10; ++i) {
auto time3 = time();
std::vector<float> input(input_tensor.data<float>(), auto vec_result = paddle_mobile.Predict(input, dims);
input_tensor.data<float>() + input_tensor.numel()); auto time4 = time();
auto time3 = time(); std::vector<float>::iterator biggest =
auto vec_result = paddle_mobile.Predict(input, dims); std::max_element(std::begin(vec_result), std::end(vec_result));
auto time4 = time(); std::cout << " Max element is " << *biggest << " at position "
<< std::distance(std::begin(vec_result), biggest) << std::endl;
std::cout << "predict cost :" << time_diff(time3, time4) << "ms" std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
<< std::endl; << std::endl;
}
} }
return 0; return 0;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/fusion_conv_add_bn_relu_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../models/image_classification_resnet.inference.model
auto program = loader.Load(g_mobilenet, true);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::FusionConvAddBNReluOp<
paddle_mobile::CPU, float>>
executor(program, "fusion_conv_add_bn_relu", true);
std::cout << "executor 4 test: " << std::endl;
paddle_mobile::framework::Tensor input;
GetInput<float>(g_test_image_1x3x224x224_banana, &input, {1, 3, 224, 224});
// // use SetupTensor if not has local input image .
// SetupTensor<float>(&input, {1, 3, 224, 224}, static_cast<float>(0),
// static_cast<float>(1));
DLOG << " fuck: " << input;
auto out_ddim = paddle_mobile::framework::make_ddim({1, 32, 112, 112});
std::cout << "before predict: " << std::endl;
auto output =
executor.Predict(input, "data", "conv2_1_dw_bn.tmp_2", out_ddim);
std::cout << "after predict " << std::endl;
auto output_ptr = output->data<float>();
int stride = output->numel() / 100;
for (int i = 0; i < 100; i++) {
DLOG << " index:" << i * stride << " value: " << output_ptr[i * stride];
}
// for (int i = 0; i < 100; i++) {
// DLOG << " index:" << i << " value: "<< output_ptr[i];
// }
// for (int j = 0; j < output->numel(); ++j) {
// std::cout << " (index: " << j << " value: " << output_ptr[j] << ") ";
// }
std::cout << std::endl;
return 0;
}
...@@ -24,18 +24,22 @@ limitations under the License. */ ...@@ -24,18 +24,22 @@ limitations under the License. */
#include "framework/ddim.h" #include "framework/ddim.h"
#include "framework/tensor.h" #include "framework/tensor.h"
static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd"; static const char *g_ocr = "../models/ocr";
static const std::string g_mobilenet_ssd_gesture = static const char *g_mobilenet_ssd = "../models/mobilenet+ssd";
"../models/mobilenet+ssd_gesture"; static const char *g_mobilenet_ssd_gesture = "../models/mobilenet+ssd_gesture";
static const std::string g_squeezenet = "../models/squeezenet"; static const char *g_squeezenet = "../models/squeezenet";
static const std::string g_googlenet = "../models/googlenet"; static const char *g_googlenet = "../models/googlenet";
static const std::string g_mobilenet = "../models/mobilenet"; static const char *g_mobilenet = "../models/mobilenet";
static const std::string g_resnet_50 = "../models/resnet_50"; static const char *g_resnet_50 = "../models/resnet_50";
static const std::string g_resnet = "../models/resnet"; static const char *g_resnet = "../models/resnet";
static const std::string g_googlenet_combine = "../models/googlenet_combine"; static const char *g_googlenet_combine = "../models/googlenet_combine";
static const std::string g_yolo = "../models/yolo"; static const char *g_yolo = "../models/yolo";
static const std::string g_test_image_1x3x224x224 = static const char *g_test_image_1x3x224x224 =
"../images/test_image_1x3x224x224_float"; "../images/test_image_1x3x224x224_float";
static const char *g_test_image_1x3x224x224_banana =
"../images/input_3x224x224_banana";
static const char *g_hand = "../images/hand_image";
using paddle_mobile::framework::DDim; using paddle_mobile::framework::DDim;
using paddle_mobile::framework::Tensor; using paddle_mobile::framework::Tensor;
......
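With the model-path constants changed from std::string to const char *, call sites wrap them in std::string before appending a suffix, since operator+ is not defined for two raw character pointers. A minimal sketch with an illustrative constant name:

#include <iostream>
#include <string>

// Illustrative constant; the real header defines g_ocr, g_mobilenet, etc.
static const char *g_demo_model_dir = "../models/demo";

int main() {
  // std::string model = g_demo_model_dir + "/model";            // does not compile
  std::string model = std::string(g_demo_model_dir) + "/model";  // pattern used in the tests
  std::string params = std::string(g_demo_model_dir) + "/params";
  std::cout << model << "\n" << params << std::endl;
  return 0;
}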
...@@ -65,6 +65,8 @@ endif() ...@@ -65,6 +65,8 @@ endif()
file(TO_CMAKE_PATH "${ANDROID_NDK}" ANDROID_NDK) file(TO_CMAKE_PATH "${ANDROID_NDK}" ANDROID_NDK)
# Android NDK revision # Android NDK revision
message("${ANDROID_NDK}")
file(READ "${ANDROID_NDK}/source.properties" ANDROID_NDK_SOURCE_PROPERTIES) file(READ "${ANDROID_NDK}/source.properties" ANDROID_NDK_SOURCE_PROPERTIES)
set(ANDROID_NDK_SOURCE_PROPERTIES_REGEX set(ANDROID_NDK_SOURCE_PROPERTIES_REGEX
"^Pkg\\.Desc = Android NDK\nPkg\\.Revision = ([0-9]+)\\.") "^Pkg\\.Desc = Android NDK\nPkg\\.Revision = ([0-9]+)\\.")
...@@ -159,7 +161,7 @@ endif() ...@@ -159,7 +161,7 @@ endif()
# Default values for configurable variables. # Default values for configurable variables.
if(NOT ANDROID_TOOLCHAIN) if(NOT ANDROID_TOOLCHAIN)
set(ANDROID_TOOLCHAIN clang) set(ANDROID_TOOLCHAIN gcc)
endif() endif()
if(NOT ANDROID_ABI) if(NOT ANDROID_ABI)
set(ANDROID_ABI armeabi-v7a) set(ANDROID_ABI armeabi-v7a)
......
...@@ -63,7 +63,7 @@ build_for_android() { ...@@ -63,7 +63,7 @@ build_for_android() {
TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake" TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake"
ANDROID_ARM_MODE="arm" ANDROID_ARM_MODE="arm"
if [ "${#NETS}" > 1 ]; then if [ "${#NETS}" -gt 1 ]; then
cmake .. \ cmake .. \
-B"../build/release/${PLATFORM}" \ -B"../build/release/${PLATFORM}" \
-DANDROID_ABI="${ABI}" \ -DANDROID_ABI="${ABI}" \
...@@ -99,7 +99,7 @@ build_for_ios() { ...@@ -99,7 +99,7 @@ build_for_ios() {
BUILD_DIR=../build/release/"${PLATFORM}"/ BUILD_DIR=../build/release/"${PLATFORM}"/
TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake" TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake"
mkdir -p "${BUILD_DIR}" mkdir -p "${BUILD_DIR}"
if [ "${#NETS}" > 1 ]; then if [ "${#NETS}" -gt 1 ]; then
cmake .. \ cmake .. \
-B"${BUILD_DIR}" \ -B"${BUILD_DIR}" \
-DCMAKE_BUILD_TYPE="${MODE}" \ -DCMAKE_BUILD_TYPE="${MODE}" \
......
...@@ -75,11 +75,9 @@ if ("FPGAnets" IN_LIST NET) ...@@ -75,11 +75,9 @@ if ("FPGAnets" IN_LIST NET)
set(FUSION_CONVADDRELU_OP ON) set(FUSION_CONVADDRELU_OP ON)
set(FUSION_CONVADDBNRELU_OP ON) set(FUSION_CONVADDBNRELU_OP ON)
set(FUSION_CONVADDBN_OP ON) set(FUSION_CONVADDBN_OP ON)
set(FUSION_POOLBN_OP ON)
set(FUSION_ELEMENTWISEADDRELU_OP ON) set(FUSION_ELEMENTWISEADDRELU_OP ON)
set(FUSION_FC_OP ON) set(FUSION_FC_OP ON)
set(FUSION_FCRELU_OP ON) set(FUSION_FCRELU_OP ON)
set(REGION_OP ON)
set(POOL_OP ON) set(POOL_OP ON)
set(CONCAT_OP ON) set(CONCAT_OP ON)
set(SOFTMAX_OP ON) set(SOFTMAX_OP ON)
......