Commit f14da1e1 authored by qnqinan

Merge remote-tracking branch 'origin/develop' into develop

......@@ -20,7 +20,9 @@ limitations under the License. */
#include <vector>
namespace paddle_mobile {
enum class Precision : int { FP32 = 0 };
enum class Precision : int { FP32 = 0, FP16 = 1 };
typedef int16_t half;
template <Precision p>
struct PrecisionTrait {
......@@ -31,6 +33,10 @@ template <>
struct PrecisionTrait<Precision::FP32> {
typedef float ptype;
};
template <>
struct PrecisionTrait<Precision::FP16> {
typedef half ptype;
};
//! device type
enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };
......
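A minimal sketch of how the new trait resolves, assuming only the declarations above (the static_assert is illustrative and not part of this patch):

#include <type_traits>

#include "common/types.h"  // the header shown above

// PrecisionTrait maps the Precision tag to the element type an
// Executor<Device, P> stores; FP16 values travel as raw 16-bit words.
static_assert(std::is_same<paddle_mobile::PrecisionTrait<
                               paddle_mobile::Precision::FP16>::ptype,
                           paddle_mobile::half>::value,
              "FP16 tensors are stored as int16_t words");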
......@@ -14,36 +14,35 @@ limitations under the License. */
#pragma once
#include <stdint.h>
#include <cstddef>
#include <iostream>
#include <limits>
// memory management;
namespace paddle {
namespace mobile {
namespace paddle_mobile {
namespace fpga {
namespace api {
int open_device();
int close_device();
void *fpga_malloc(size_t size);
void fpga_free(void *ptr);
void fpga_copy(void *dst, const void *src, size_t num);
void* fpga_malloc(size_t size);
void fpga_free(void* ptr);
void fpga_copy(void* dst, const void* src, size_t num);
struct FpgaVersionArgs {
void *buf;
void* buf;
};
struct MemoryToPhysicalArgs {
const void *src;
const void* src;
uint64_t physical;
};
struct MemoryCopyArgs {
void *src;
void *dst;
void* src;
void* dst;
size_t size;
};
......@@ -51,38 +50,71 @@ struct FpgaQuantArgs {
float scale;
};
struct FpgaBNArgs {};
struct FpgaBNArgs {
bool enabled = false;
void* bias_addr;
void* scale_addr;
};
struct FpgaKernelArgs {
uint32_t width;
uint32_t height;
uint32_t stride_h;
uint32_t stride_w;
};
struct FpgaImageArgs {
uint32_t width;
uint32_t height;
uint32_t channels;
uint32_t pad_h;
uint32_t pad_w;
};
struct FpgaConvArgs {
bool enable_BN = false;
bool enable_Relu = false;
struct FpgaBNParam bn_parm;
bool relu_enabled;
struct FpgaBNArgs BNargs;
void* image_addr;
void* filter_addr;
void* bias_addr;
void* output_addr;
float quant_scale;
struct FpgaImageArgs image;
uint32_t filter_num;
uint32_t group_num;
struct FpgaKernelArgs kernel;
};
struct FpgaPoolArgs {
bool enable_BN = false;
struct FpgaBNParam bn_parm;
void* image_addr;
void* output_addr;
struct FpgaImageArgs image;
struct FpgaKernelArgs kernel;
};
struct FpgaEWAddArgs { // only support X + Y
bool enable_Relu = false;
struct FpgaEWAddArgs {
bool relu_enabled;
void* image0_addr;
void* image1_addr;
void* result_addr;
uint32_t const0;
uint32_t const1;
uint32_t data_len; // aligned element count
};
int ComputeFpgaConv(struct FpgaConvArgs);
int ComputeFpgaPool(struct FpgaPoolArgs);
int ComputeFpgaEWAdd(struct FpgaEWAddArgs);
int ComputeFpgaConv(struct FpgaConvArgs args);
int ComputeFpgaPool(struct FpgaPoolArgs args);
int ComputeFpgaEWAdd(struct FpgaEWAddArgs args);
#define IOCTL_FPGA_MAGIC 'FPGA'
#define IOCTL_FPGA_MAGIC 'CNN'
#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 1, struct FpgaVersionArgs)
#define IOCTL_GET_QUANT _IOW(IOCTL_FPGA_MAGIC, 2, struct FpgaQuantArgs)
#define IOCTL_SET_QUANT _IOW(IOCTL_FPGA_MAGIC, 3, struct FpgaArgs)
#define IOCTL_SET_QUANT _IOW(IOCTL_FPGA_MAGIC, 3, struct FpgaQuantArgs)
#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
#define IOCTL_MEM_TOPHY _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryToPhysicalArgs)
#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct FpgaConvArgs)
#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct FpgaPoolArgs)
#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct FpgaEWAddArgs)
} // namespace api
} // namespace fpga
} // namespace mobile
} // namespace paddle
} // namespace paddle_mobile
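Taken together, the API suggests a host-side flow of open, allocate, stage, launch. A hedged end-to-end sketch follows; the return-value conventions and the assumption that ComputeFpgaConv forwards to IOCTL_CONFIG_CONV are guesses, since this header does not show the driver side:

#include "fpga/api/fpga_api.h"

using namespace paddle_mobile::fpga::api;

int RunConvOnce(FpgaConvArgs args, const void *host_in, size_t in_bytes) {
  if (open_device() != 0) return -1;     // assumed: 0 on success
  void *dev_in = fpga_malloc(in_bytes);  // device-visible input buffer
  fpga_copy(dev_in, host_in, in_bytes);  // stage input: host -> device
  args.image_addr = dev_in;
  int ret = ComputeFpgaConv(args);       // presumably IOCTL_CONFIG_CONV
  fpga_free(dev_in);
  close_device();
  return ret;
}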
......@@ -14,11 +14,13 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include <map>
#include <string>
#include <vector>
#include "framework/operator.h"
#include "node.h"
#include "framework/program/program-optimize/node.h"
namespace paddle_mobile {
namespace framework {
......
......@@ -16,14 +16,15 @@ limitations under the License. */
#include <cstdint>
#include <cstring>
#include <fstream>
#include <memory>
#include <string>
#include <type_traits>
#include <typeindex>
#include <vector>
#include "common/enforce.h"
#include <fstream>
#include "common/enforce.h"
#include "common/types.h"
#include "framework/data_layout.h"
#include "framework/ddim.h"
#include "memory/t_malloc.h"
......@@ -63,7 +64,8 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> {
};
static inline size_t SizeOfType(std::type_index type) {
SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool, size_t> functor;
SizeOfTypeFunctor<int, half, float, double, int16_t, int64_t, bool, size_t>
functor;
size_t size = functor(type);
PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
......
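With half added to the functor's type list, size queries on FP16 tensor data now resolve instead of tripping the enforce; for instance (namespace assumed from the surrounding file):

size_t s = paddle_mobile::framework::SizeOfType(typeid(paddle_mobile::half));
// s == 2; before this change the lookup returned 0 and the enforce fired.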
......@@ -187,7 +187,7 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
memcpy(&max_value, *data + sizeof(float), sizeof(float));
*data += 2 * sizeof(float);
const float factor = (max_value - min_value) / 255.0;
uint8_t *uint8_data = (uint8_t *)(*data);
uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
for (int k = 0; k < memory_size; ++k) {
static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
}
......@@ -419,7 +419,7 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
}
template class Executor<CPU, Precision::FP32>;
template class Executor<FPGA, Precision::FP32>;
template class Executor<GPU_MALI, Precision::FP32>;
template class Executor<FPGA, Precision::FP16>;
} // namespace paddle_mobile
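The new explicit instantiation makes an FP16 FPGA executor linkable. A hypothetical caller-side sketch, with constructor arguments assumed by analogy with the CPU path rather than taken from this patch:

paddle_mobile::Executor<paddle_mobile::FPGA, paddle_mobile::Precision::FP16>
    executor(program, /*batch_size=*/1, /*use_optimize=*/true);
// Predict() now traffics in PrecisionTrait<Precision::FP16>::ptype, i.e. half.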
......@@ -14,7 +14,9 @@ limitations under the License. */
#ifdef CONCAT_OP
#include "concat_op.h"
#include <vector>
#include "operators/concat_op.h"
namespace paddle_mobile {
namespace operators {
......@@ -68,6 +70,7 @@ REGISTER_OPERATOR_CPU(concat, ops::ConcatOp);
REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(concat, ops::ConcatOp);
#endif
#endif
......@@ -53,6 +53,7 @@ USE_OP_CPU(concat);
USE_OP_MALI_GPU(concat);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(concat);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONCAT_OP
#include "operators/kernel/concat_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConcatKernel<FPGA, half>::Init(ConcatParam *param) {
return true;
}
template <>
void ConcatKernel<FPGA, half>::Compute(const ConcatParam &param) const {
auto inputs = param.Inputs();
auto *out = param.Out();
int64_t axis = param.Axis();
out->mutable_data<half>();
DDim out_dim = out->dims();
int pixels = out_dim[1] * out_dim[2];
auto out_channel = out_dim[3];
auto out_offset = 0;
for (size_t i = 0; i < inputs.size(); ++i) {
auto *input = inputs[i];
auto channels = input->dims()[3];  // channel count of this input
auto *src = input->data<half>();
// Copy this input's channel block into its slice of every output pixel.
for (int j = 0; j < pixels; ++j) {
auto *dst = out->data<half>() + j * out_channel + out_offset;
memory::Copy(dst, src + j * channels, sizeof(half) * channels);
}
out_offset += channels;
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
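To make the copy pattern concrete: for two NHWC inputs with C1 and C2 channels, each output pixel row is the input rows laid side by side. A self-contained toy check of that layout (plain C++, independent of the framework):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  typedef int16_t half;
  const int pixels = 2, c1 = 2, c2 = 3, out_c = c1 + c2;
  half in0[pixels * c1] = {1, 2, 3, 4};
  half in1[pixels * c2] = {5, 6, 7, 8, 9, 10};
  half out[pixels * out_c];
  // Same loop structure as the kernel above.
  const half *srcs[] = {in0, in1};
  const int cs[] = {c1, c2};
  int offset = 0;
  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < pixels; ++j) {
      std::memcpy(out + j * out_c + offset, srcs[i] + j * cs[i],
                  sizeof(half) * cs[i]);
    }
    offset += cs[i];
  }
  // Row 0 is {1,2,5,6,7}; row 1 is {3,4,8,9,10}.
  assert(out[0] == 1 && out[2] == 5 && out[5] == 3 && out[7] == 8);
  return 0;
}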
......@@ -22,6 +22,9 @@ limitations under the License. */
#include "framework/scope.h"
#include "framework/tensor.h"
#include "framework/variable.h"
#ifdef PADDLE_MOBILE_FPGA
#include "fpga/api/fpga_api.h"
#endif
namespace paddle_mobile {
namespace operators {
......@@ -256,6 +259,15 @@ class ElementwiseAddParam : OpParam {
Tensor *input_y_;
Tensor *out_;
int axis_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::FpgaEWAddArgs fpga_EW_add_args;
public:
const fpga::FpgaEWAddArgs &FpgaArgs() const { return fpga_EW_add_args; }
void SetFpgaArgs(const fpga::FpgaEWAddArgs &args) { fpga_EW_add_args = args; }
#endif
};
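The FpgaArgs/SetFpgaArgs pair added here (and repeated for the conv and pool params below) lets a kernel's Init cache a ready-to-launch descriptor on the param. A hedged sketch of that wiring; the helper below is hypothetical, not from this patch:

void AttachEWAddArgs(paddle_mobile::operators::ElementwiseAddParam *param,
                     void *x, void *y, void *out, uint32_t aligned_len) {
  // Namespace qualification follows this file's usage; the field
  // semantics (identity coefficients in const0/const1) are assumptions.
  paddle_mobile::fpga::FpgaEWAddArgs args;
  args.relu_enabled = false;
  args.image0_addr = x;         // first summand image
  args.image1_addr = y;         // second summand image
  args.result_addr = out;
  args.const0 = 1;              // assumed scalar coefficient for image0
  args.const1 = 1;              // assumed scalar coefficient for image1
  args.data_len = aligned_len;  // aligned element count, per the header
  param->SetFpgaArgs(args);
}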
#ifdef FUSION_ELEMENTWISEADDRELU_OP
......@@ -450,80 +462,15 @@ class PoolParam : public OpParam {
vector<int> paddings_;
bool ceil_mode_;
bool global_pooling_ = false;
};
#endif
#ifdef FUSION_POOLBN_OP
class FusionPoolBNParam : OpParam {
public:
FusionPoolBNParam(const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
const Scope &scope) {
input_ = InputXFrom<LoDTensor>(inputs, scope);
pooling_type_ = GetAttr<string>("pooling_type", attrs);
ksize_ = GetAttr<vector<int>>("ksize", attrs);
strides_ = GetAttr<vector<int>>("strides", attrs);
paddings_ = GetAttr<vector<int>>("paddings", attrs);
ceil_mode_ = GetAttr<bool>("ceil_mode", attrs);
global_pooling_ = GetAttr<bool>("global_pooling", attrs);
output_y_ = OutputYFrom<LoDTensor>(outputs, scope);
input_bias_ = InputBiasFrom<LoDTensor>(inputs, scope);
input_mean_ = InputMeanFrom<LoDTensor>(inputs, scope);
input_scale_ = InputScaleFrom<LoDTensor>(inputs, scope);
input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
momentum_ = GetAttr<float>("momentum", attrs);
// is_test_ = GetAttr<bool>("is_test", attrs);
}
const Tensor *Input() const { return input_; }
const string &PoolingType() const { return pooling_type_; }
const vector<int> &Ksize() const { return ksize_; }
const vector<int> &Strides() const { return strides_; }
const vector<int> &Paddings() const { return paddings_; }
bool isCeilMode() const { return ceil_mode_; }
bool isGlobalPooling() const { return global_pooling_; }
Tensor *OutputY() const { return output_y_; }
const Tensor *InputBias() const { return input_bias_; }
const Tensor *InputMean() const { return input_mean_; }
const Tensor *InputScale() const { return input_scale_; }
const Tensor *InputVariance() const { return input_variance_; }
const float &Epsilon() const { return epsilon_; }
const float &Momentum() const { return momentum_; }
const bool &IsTest() const { return is_test_; }
const string &DataFormat() const { return data_format_; }
private:
Tensor *input_;
string pooling_type_;
vector<int> ksize_;
vector<int> strides_;
vector<int> paddings_;
bool ceil_mode_;
bool global_pooling_ = false;
Tensor *output_y_;
Tensor *input_bias_;
Tensor *input_mean_;
Tensor *input_scale_;
Tensor *input_variance_;
float epsilon_;
float momentum_;
bool is_test_;
string data_format_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::FpgaPoolArgs fpga_pool_args;
public:
const fpga::FpgaPoolArgs &FpgaArgs() const { return fpga_pool_args; }
void SetFpgaArgs(const fpga::FpgaPoolArgs &args) { fpga_pool_args = args; }
#endif
};
#endif
......@@ -704,7 +651,7 @@ class MultiClassNMSParam : public OpParam {
class FeedParam : public OpParam {
public:
FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, Scope &scope) {
const AttributeMap &attrs, Scope const &scope) {
input_x_ = InputXFrom<LoDTensor>(inputs, scope);
out_ = OutFrom<LoDTensor>(outputs, scope);
auto var = scope.Var("batch_size");
......@@ -983,6 +930,15 @@ class FusionFcParam : public OpParam {
int x_num_col_dims_;
int y_num_col_dims_;
int axis_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::FpgaConvArgs fpga_conv_args;
public:
const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
#endif
};
#ifdef FUSION_FCRELU_OP
......@@ -1032,6 +988,15 @@ class FusionConvAddParam : public OpParam {
vector<int> paddings_;
vector<int> dilations_;
int groups;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::FpgaConvArgs fpga_conv_args;
public:
const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
#endif
};
Print &operator<<(Print &printer, const FusionConvAddParam &conv_param);
......@@ -1128,6 +1093,15 @@ class FusionConvAddBNReluParam : public OpParam {
bool is_test_;
Tensor *new_bias_;
Tensor *new_scale_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::FpgaConvArgs fpga_conv_args;
public:
const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
#endif
};
#endif
......@@ -1213,6 +1187,15 @@ class FusionConvAddBNParam : public OpParam {
bool is_test_;
Tensor *new_bias_;
Tensor *new_scale_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::FpgaConvArgs fpga_conv_args;
public:
const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
#endif
};
#endif
......@@ -1426,9 +1409,5 @@ class DropoutParam : public OpParam {
};
#endif
#ifdef REGION_OP
class RegionParam : public OpParam {};
#endif
} // namespace operators
} // namespace paddle_mobile
......@@ -33,7 +33,7 @@ class ResizeOp
DeviceType, ResizeParam, operators::ResizeKernel<DeviceType, T>> {
public:
ResizeOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap attrs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, ResizeParam,
operators::ResizeKernel<DeviceType, T>>(
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include <cstdlib>
#include <ctime>
#include "../test_helper.h"
#include "common/log.h"
#include "memory/t_malloc.h"
#include "operators/math/gemm.h"
#define a(i, j) a[(i)*lda + (j)]
#define b(i, j) b[(i)*ldb + (j)]
#define c(i, j) c[(i)*ldc + (j)]
#define c1(i, j) c1[(i)*ldc + (j)]
void print_matrix(int m, int n, int ldc, float *c) {
for (int i = 0; i < m; ++i) {
std::cout << c(i, 0);
for (int j = 1; j < n; ++j) {
std::cout << " | " << c(i, j);
}
std::cout << std::endl;
}
std::cout << std::endl;
}
int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
int lda = k;
int ldb = n;
int ldc = n;
float *a = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * k));
float *b = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * k * n));
float *c = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
float *c1 = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
float *scale = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m));
float *bias = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m));
srand(unsigned(time(0)));
for (int i = 0; i < m * k; ++i) {
a[i] = t1 + rand() % t2;
}
for (int i = 0; i < k * n; ++i) {
b[i] = t1 + rand() % t2;
}
for (int i = 0; i < m; ++i) {
scale[i] = t1 + rand() % t2;
}
for (int i = 0; i < m; ++i) {
bias[i] = t1 + rand() % t2;
}
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
float r = 0;
for (int p = 0; p < k; p++) {
r += a(i, p) * b(p, j);
}
r *= scale[i];
r += bias[i];
if (relu && (r < 0)) {
r = 0;
}
c1(i, j) = r;
}
}
paddle_mobile::operators::math::SgemmWithBn(m, n, k, 0.9, a, lda,
b, ldb, 0.3, c, ldc, relu, scale, bias);
int eq = 0;
int neq = 0;
for (int i = 0; i < m * n; ++i) {
if (static_cast<int>(c[i]) == static_cast<int>(c1[i])) {
++eq;
} else {
++neq;
}
}
if (pr > 0) {
std::cout << "A:" << std::endl;
print_matrix(m, k, lda, a);
std::cout << "B:" << std::endl;
print_matrix(k, n, ldb, b);
std::cout << "C:" << std::endl;
print_matrix(m, n, ldc, c);
std::cout << "C1:" << std::endl;
print_matrix(m, n, ldc, c1);
}
std::cout << "mnk=" << m << " " << n << " " << k <<
" relu=" << relu <<
" eq=" << eq << " neq=" << neq << std::endl;
paddle_mobile::memory::Free(a);
paddle_mobile::memory::Free(b);
paddle_mobile::memory::Free(c);
paddle_mobile::memory::Free(c1);
paddle_mobile::memory::Free(scale);
paddle_mobile::memory::Free(bias);
return 0;
}
int main() {
do_sgemm(9, 9, 9, true, 10, 10, 10);
do_sgemm(10, 6, 12, false, 10, 10, 0);
do_sgemm(512, 256, 384, false, 10, 10, 0);
do_sgemm(1366, 768, 256, false, 10, 10, 0);
do_sgemm(1255, 755, 333, false, 10, 10, 0);
do_sgemm(555, 777, 999, false, 10, 10, 0);
do_sgemm(10, 6, 12, true, -4, 10, 0);
do_sgemm(512, 256, 384, true, -4, 10, 0);
do_sgemm(1366, 768, 256, true, -4, 10, 0);
do_sgemm(1255, 755, 333, true, -4, 10, 0);
do_sgemm(555, 777, 999, true, -4, 10, 0);
return 0;
}
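In equation form, the reference loop above checks, per output element (with s_i = scale[i], beta_i = bias[i], and the max applied only when relu is set):

$$c_{ij} = \max\Bigl(0,\ s_i \sum_{p=0}^{k-1} a_{ip}\, b_{pj} + \beta_i\Bigr)$$

Note that the 0.9 and 0.3 passed to SgemmWithBn as alpha/beta do not enter this reference computation, and the eq/neq comparison casts both sides to int, so only integer-level agreement is verified.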
......@@ -65,6 +65,8 @@ endif()
file(TO_CMAKE_PATH "${ANDROID_NDK}" ANDROID_NDK)
# Android NDK revision
message("${ANDROID_NDK}")
file(READ "${ANDROID_NDK}/source.properties" ANDROID_NDK_SOURCE_PROPERTIES)
set(ANDROID_NDK_SOURCE_PROPERTIES_REGEX
"^Pkg\\.Desc = Android NDK\nPkg\\.Revision = ([0-9]+)\\.")
......@@ -159,7 +161,7 @@ endif()
# Default values for configurable variables.
if(NOT ANDROID_TOOLCHAIN)
set(ANDROID_TOOLCHAIN clang)
set(ANDROID_TOOLCHAIN gcc)
endif()
if(NOT ANDROID_ABI)
set(ANDROID_ABI armeabi-v7a)
......
......@@ -40,8 +40,8 @@ build_for_android() {
fi
if [ -z "$PLATFORM" ]; then
PLATFORM="arm-v7a" # Users could choose "arm-v8a" platform.
# PLATFORM="arm-v8a"
# PLATFORM="arm-v7a" # Users could choose "arm-v8a" platform.
PLATFORM="arm-v8a"
fi
if [ "${PLATFORM}" = "arm-v7a" ]; then
......@@ -63,7 +63,7 @@ build_for_android() {
TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake"
ANDROID_ARM_MODE="arm"
if [ "${#NETS}" > 1 ]; then
if [ "${#NETS}" -gt 1 ]; then
cmake .. \
-B"../build/release/${PLATFORM}" \
-DANDROID_ABI="${ABI}" \
......@@ -99,7 +99,7 @@ build_for_ios() {
BUILD_DIR=../build/release/"${PLATFORM}"/
TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake"
mkdir -p "${BUILD_DIR}"
if [ "${#NETS}" > 1 ]; then
if [ "${#NETS}" -gt 1 ]; then
cmake .. \
-B"${BUILD_DIR}" \
-DCMAKE_BUILD_TYPE="${MODE}" \
......
......@@ -75,11 +75,9 @@ if ("FPGAnets" IN_LIST NET)
set(FUSION_CONVADDRELU_OP ON)
set(FUSION_CONVADDBNRELU_OP ON)
set(FUSION_CONVADDBN_OP ON)
set(FUSION_POOLBN_OP ON)
set(FUSION_ELEMENTWISEADDRELU_OP ON)
set(FUSION_FC_OP ON)
set(FUSION_FCRELU_OP ON)
set(REGION_OP ON)
set(POOL_OP ON)
set(CONCAT_OP ON)
set(SOFTMAX_OP ON)
......