Commit fc659486 authored by T tienfeek

fix fpga quant bug

......@@ -104,10 +104,3 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models
metal/MobileNetDemo/MobileNetDemo/Resources
# generated files
lite/api/paddle_use_kernels.h
lite/api/paddle_use_ops.h
lite/backends/arm/math/dotprod/gemm_sdot.h
lite/tools/cmake_tools/ast.pyc
......@@ -22,8 +22,6 @@ if (WITH_PADDLE_MOBILE)
return()
endif(WITH_PADDLE_MOBILE)
# set(CMAKE_BUILD_TYPE DEBUG)
set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
set(CMAKE_CXX_STANDARD 11)
......
./lite/tools/build.sh \
--arm_os=armlinux \
--arm_abi=armv8 \
--arm_lang=gcc \
test
......@@ -41,6 +41,7 @@ USE_MIR_PASS(lite_quant_dequant_fuse_pass);
USE_MIR_PASS(type_precision_cast_pass);
USE_MIR_PASS(type_layout_cast_pass);
USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(kernel_place_correct_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass);
USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass);
......@@ -33,7 +33,7 @@ class Debugger {
void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
if (op_config[op_type]) {
// tensor->saveToFile(op_type, true);
tensor->saveToFile(op_type, true);
}
}
......@@ -43,6 +43,8 @@ class Debugger {
op_config["concat"] = true;
op_config["pooling"] = true;
op_config["conv"] = true;
op_config["dwconv"] = true;
op_config["ew_add"] = true;
op_config["crop"] = true;
op_config["feed"] = true;
op_config["mul"] = true;
......
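A hedged usage sketch: once an op type is switched on in op_config, a kernel makes its output visible to the dumper by registering it under that op type. This mirrors what the FPGA ConvCompute::Run() hunk further down in this commit actually does:

#ifdef FPGA_PRINT_TENSOR
  // Registered tensors are dumped via saveToFile when op_config["conv"] is true.
  zynqmp::ConvParam& conv_param = conv_pe_.param();
  Debugger::get_instance().registerOutput("conv", conv_param.output);
#endif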
......@@ -61,7 +61,9 @@ void reset_device() {
// memory management
void *fpga_malloc(size_t size) {
#ifdef PADDLE_MOBILE_OS_LINUX
void *ptr = reinterpret_cast<void *>(
mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
if (ptr == MAP_FAILED) {
......
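The fpga_malloc hunk above switches to mmap64 and checks for MAP_FAILED. A standalone sketch of the same pattern, under the assumption that an anonymous mapping stands in for the FPGA driver fd used by the real code:

#include <sys/mman.h>
#include <cstdio>
#include <cstddef>

// Sketch only: MAP_ANONYMOUS with fd = -1 replaces the device fd mapped above.
void* fpga_malloc_sketch(size_t size) {
  void* ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
                   MAP_SHARED | MAP_ANONYMOUS, -1, 0);
  if (ptr == MAP_FAILED) {  // mmap signals failure with MAP_FAILED, not NULL
    perror("mmap");
    return NULL;
  }
  return ptr;
}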
......@@ -656,7 +656,6 @@ inline bool compute_conv(const ConvParam& c_conv_params) {
}
size_t size = params.size();
if (ret == 0 && size > 1) {
// Tensor* output = conv_params.output;
Tensor& img = params[0]->output;
for (int i = 0; i < 1; i++) {
for (int i = 0; i < img.shape().numel(); i++) {
......
......@@ -96,6 +96,7 @@ class DepthwiseConvPE : public PE {
float16* scale_data = param_.scale()->data<float16>();
float16* filter_data = param.quantizedFilter()->mutableData<float16>(
FP16, param.filter->shape());
// memcpy(filter_data, scale_data, channel * sizeof(float16));
memcpy(filter_data,
scale_data,
......
File mode changed from 100644 to 100755
File mode changed from 100755 to 100644
......@@ -121,7 +121,6 @@ class GRUPE : public PE {
prev_hidden_.copyFrom(value.pre_output);
}
mul_pe_.dispatch();
// reset_hidden_.saveToFile("reset_hidden_.txt");
update_gate_data += stride_update;
reset_gate_data += stride_update;
......@@ -172,7 +171,6 @@ class GRUPE : public PE {
zynqmp::Tensor bias_;
zynqmp::Tensor weight_;
zynqmp::Tensor state_weight_;
zynqmp::Tensor update_gate_;
zynqmp::Tensor reset_gate_;
zynqmp::Tensor cell_state_;
......
......@@ -346,19 +346,9 @@ class Tensor {
if (placeHolder_ == nullptr) {
return;
}
std::cout << scale()[0] << " , " << scale()[1] << std::endl;
}
void printScale(std::string type) {
std::cout << type << " : "
<< std::to_string(shape_->num()) + "_" +
std::to_string(shape_->channel()) + "_" +
std::to_string(shape_->height()) + "_" +
std::to_string(shape_->width())
<< std::endl;
std::cout << type << " \n";
printScale();
}
void printScale(std::string type) { printScale(); }
std::string dimsFileName() {
return std::to_string(shape_->num()) + "_" +
......@@ -386,7 +376,6 @@ class Tensor {
static int counter = 0;
std::string npath = std::to_string(counter) + "_" + path;
counter++;
std::cout << "======== saving file:" << npath << " ============\n";
save_file_with_name(npath);
}
......
......@@ -165,9 +165,6 @@ class TensorLite {
TargetType target() const { return target_; }
// template <typename T>
// TensorLite Slice(int64_t begin, int64_t end) const;
zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
......@@ -257,7 +254,6 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
int64_t base = numel() / dims_[0];
TensorLite dst;
dst.target_ = target_;
auto dst_dims = dims_;
dst_dims[0] = end - begin;
......
File mode changed from 100755 to 100644
......@@ -25,6 +25,7 @@ lite_cc_library(mir_passes
elimination/elementwise_mul_constant_eliminate_pass.cc
static_kernel_pick_pass.cc
variable_place_inference_pass.cc
kernel_place_correct_pass.cc
type_target_cast_pass.cc
type_layout_cast_pass.cc
type_precision_cast_pass.cc
......
......@@ -27,10 +27,24 @@ namespace mir {
void QuantDequantFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// delete quant node
std::vector<std::string> quant_op_types = {
"fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"};
"fake_quantize_abs_max",
"fake_quantize_range_abs_max",
"fake_quantize_moving_average_abs_max"};
/*
for (auto& op_type : {"conv2d", "mul", "depthwise_conv2d"}) {
for (int i = 5; i >= 1; --i){
fusion::DynamicQuantDequantOpFuser fuser("fake_quantize_abs_max", op_type,
i);
fuser(graph.get());
}
}
*/
for (auto& op_type : quant_op_types) {
fusion::DeleteQuantOpFuser fuser(op_type);
fuser(graph.get());
fusion::DeleteDynamicQuantOpFuser dfuser(op_type);
dfuser(graph.get());
}
// fuse quantized node and dequant node
......
......@@ -77,6 +77,55 @@ cpp::OpDesc DeleteQuantOpFuser::GenOpDesc(const key2nodes_t& matched) {
return op_desc;
}
void DeleteDynamicQuantOpFuser::BuildPattern() {
auto* input_act_node =
VarNode("input_act_node")->assert_is_op_input(quant_op_type_, "X");
auto* quant_node =
OpNode("quant_node", quant_op_type_)->assert_is_op(quant_op_type_);
auto* output_scale_node =
VarNode("output_scale_node")
->assert_is_op_output(quant_op_type_, "OutScale");
auto* output_act_node =
VarNode("output_act_node")->assert_is_op_output(quant_op_type_, "Out");
quant_node->LinksFrom({input_act_node});
output_scale_node->LinksFrom({quant_node});
output_act_node->LinksFrom({quant_node});
VLOG(4) << "DeleteQuantOpFuser BuildPattern quant_op_type:" << quant_op_type_;
}
void DeleteDynamicQuantOpFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto* input_act_node = matched.at("input_act_node");
auto* quant_node = matched.at("quant_node");
auto* output_scale_node = matched.at("output_scale_node");
auto* output_act_node = matched.at("output_act_node");
// obtain values, save values and relink node
int bit_length = quant_node->stmt()->op_info()->GetAttr<int>("bit_length");
int range = ((1 << (bit_length - 1)) - 1);
auto* scope = quant_node->stmt()->op()->scope();
auto* scale_tensor = scope->FindVar(output_scale_node->arg()->name)
->GetMutable<lite::Tensor>();
float scale_value = scale_tensor->data<float>()[0] / range;
auto outlinks = output_act_node->outlinks;
for (auto* quantized_node : outlinks) {
auto* op_desc = quantized_node->stmt()->mutable_op_info();
op_desc->SetAttr<int>("bit_length", bit_length);
IR_NODE_LINK_TO(input_act_node, quantized_node)
}
// delete nodes and edges
std::unordered_set<const Node*> nodes2rm = {
quant_node, output_scale_node, output_act_node};
GraphSafeRemoveNodes(graph, nodes2rm);
}
cpp::OpDesc DeleteDynamicQuantOpFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc;
return op_desc;
}
void DequantOpFuser::BuildPattern() {
std::string weight_name = "";
if (quantized_op_type_ == "conv2d" ||
......@@ -130,8 +179,11 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
auto& valid_places = quantized_op->stmt()->op()->valid_places();
int bit_length = quantized_op->stmt()->op_info()->GetAttr<int>("bit_length");
int range = ((1 << (bit_length - 1)) - 1);
float input_scale =
quantized_op->stmt()->op_info()->GetAttr<float>("input_scale");
float input_scale = 0;
if (quantized_op->stmt()->op_info()->HasAttr("input_scale")) {
input_scale =
quantized_op->stmt()->op_info()->GetAttr<float>("input_scale");
}
float max_range = dequant_op->stmt()->op_info()->GetAttr<float>("max_range");
float whole_weight_scale =
static_cast<float>(range * range) / max_range / range;
......@@ -162,8 +214,12 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
for (int i = 0; i < weight_scale_size; i++) {
weight_scale.push_back(whole_weight_scale);
}
#ifndef LITE_WITH_FPGA
op_desc.SetAttr("enable_int8", true);
op_desc.SetAttr("input_scale", input_scale);
#endif
if (quantized_op->stmt()->op_info()->HasAttr("input_scale")) {
op_desc.SetAttr("input_scale", input_scale);
}
op_desc.SetAttr("weight_scale", weight_scale);
// change the weight from the float type to int8 type.
......@@ -171,12 +227,30 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
temp_tensor.CopyDataFrom(*quantized_weight_t);
float* temp_data = temp_tensor.mutable_data<float>();
size_t weight_num = quantized_weight_t->data_size();
#ifdef LITE_WITH_FPGA
float* quantized_weight_data = quantized_weight_t->mutable_data<float>();
for (size_t i = 0; i < weight_num; i++) {
quantized_weight_data[i] = temp_data[i] * whole_weight_scale;
}
quantized_weight_t->set_persistable(true);
quantized_weight_t->set_precision(PRECISION(kFloat));
#else
int8_t* quantized_weight_data = quantized_weight_t->mutable_data<int8_t>();
for (size_t i = 0; i < weight_num; i++) {
quantized_weight_data[i] = static_cast<int8_t>(temp_data[i]);
}
quantized_weight_t->set_persistable(true);
quantized_weight_t->set_precision(PRECISION(kInt8));
#endif
// int8_t* quantized_weight_data = quantized_weight_t->mutable_data<int8_t>();
// for (size_t i = 0; i < weight_num; i++) {
// quantized_weight_data[i] = static_cast<int8_t>(temp_data[i]);
// }
// quantized_weight_t->set_persistable(true);
// quantized_weight_t->set_precision(PRECISION(kInt8));
// new op and relink nodes
auto new_quantized_op = LiteOpRegistry::Global().Create(quantized_op_type_);
......@@ -464,6 +538,194 @@ cpp::OpDesc DeleteQuantDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc;
return op_desc;
}
// ================dynamic quant fuse==============
// #define DYNAMIC_RANGE
void DynamicQuantDequantOpFuser::BuildPattern() {
const int kNumFields = 5;
const int kQuantizedWeightOffset = 0;
const int kQuantizedOpOffset = 1;
const int kQuantizedOpOutOffset = 2;
const int kDequantOpOffset = 3;
const int kDequantOpOutOffset = 4;
std::string weight_name = "";
if (op_type_ == "conv2d" || op_type_ == "depthwise_conv2d") {
weight_name = "Filter";
} else {
weight_name = "Y";
}
auto* quant_op_input = VarNode("quant_op_input")
->assert_is_op_input(quant_type_, "X")
->AsInput();
#ifdef DYNAMIC_RANGE
auto* quant_op_in_scale = VarNode("quant_op_in_scale")
->assert_is_op_input(quant_type_, "InScale")
->AsIntermediate();
#endif
auto* quant_op = OpNode("quant_op", quant_type_)
->assert_is_op(quant_type_)
->AsIntermediate();
auto* quant_op_out_scale =
VarNode("quant_op_out_scale")
->assert_is_op_output(quant_type_, "OutScale")
->assert_is_op_input("fake_dequantize_max_abs", "Scale")
->AsIntermediate();
auto* quant_op_out = VarNode("quant_op_out")
->assert_is_op_output(quant_type_, "Out")
->assert_is_op_input(op_type_)
->AsIntermediate();
std::vector<PMNode*> nodes;
for (int i = 0; i < times_; i++) {
nodes.push_back(VarNode(string_format("quantized_op_weight%d", i))
->assert_is_op_input(op_type_, weight_name)
->AsInput());
nodes.push_back(OpNode(string_format("quantized_op%d", i), op_type_)
->assert_is_op(op_type_)
->AsIntermediate());
nodes.push_back(VarNode(string_format("quantized_op_out%d", i))
->assert_is_op_output(op_type_)
->assert_is_op_input("fake_dequantize_max_abs", "X")
->AsIntermediate());
nodes.push_back(
OpNode(string_format("dequant_op%d", i), "fake_dequantize_max_abs")
->assert_is_op("fake_dequantize_max_abs")
->AsIntermediate());
nodes.push_back(VarNode(string_format("dequant_op_out%d", i))
->assert_is_op_output("fake_dequantize_max_abs", "Out")
->AsOutput());
}
#ifdef DYNAMIC_RANGE
quant_op->LinksFrom({quant_op_input, quant_op_in_scale});
#endif
quant_op->LinksFrom({quant_op_input});
quant_op_out->LinksFrom({quant_op});
quant_op_out_scale->LinksFrom({quant_op});
for (int i = 0; i < times_; i++) {
nodes[i * kNumFields + kQuantizedOpOffset]->LinksFrom(
{quant_op_out, nodes[i * kNumFields + kQuantizedWeightOffset]});
nodes[i * kNumFields + kQuantizedOpOutOffset]->LinksFrom(
{nodes[i * kNumFields + kQuantizedOpOffset]});
nodes[i * kNumFields + kDequantOpOffset]->LinksFrom(
{nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale});
nodes[i * kNumFields + kDequantOpOutOffset]->LinksFrom(
{nodes[i * kNumFields + kDequantOpOffset]});
}
}
void DynamicQuantDequantOpFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
const int kNumFields = 5;
const int kQuantizedWeightOffset = 0;
const int kQuantizedOpOffset = 1;
const int kDequantOpOffset = 3;
const int kDequantOpOutOffset = 4;
auto* quant_op_input = matched.at("quant_op_input");
#ifdef DYNAMIC_RANGE
auto* quant_op_in_scale = matched.at("quant_op_in_scale");
#endif
auto* quant_op = matched.at("quant_op");
std::vector<Node*> nodes;
for (int i = 0; i < times_; i++) {
nodes.push_back(matched.at(string_format("quantized_op_weight%d", i)));
nodes.push_back(matched.at(string_format("quantized_op%d", i)));
nodes.push_back(matched.at(string_format("quantized_op_out%d", i)));
nodes.push_back(matched.at(string_format("dequant_op%d", i)));
nodes.push_back(matched.at(string_format("dequant_op_out%d", i)));
}
int bit_length = quant_op->stmt()->op_info()->GetAttr<int>("bit_length");
auto* scope = quant_op->stmt()->op()->scope();
auto& valid_places = quant_op->stmt()->op()->valid_places();
int range = ((1 << (bit_length - 1)) - 1);
#ifdef DYNAMIC_RANGE
auto input_scale_t = scope->FindVar(quant_op_in_scale->arg()->name)
->GetMutable<lite::Tensor>();
float input_scale = input_scale_t->data<float>()[0] / range;
VLOG(4) << "range: " << range << " input_scale: " << input_scale;
#endif
for (int i = 0; i < times_; i++) {
float max_range = nodes[i * kNumFields + kDequantOpOffset]
->stmt()
->op_info()
->GetAttr<float>("max_range");
// weight_scale = max(abs(weight))
float whole_weight_scale =
static_cast<float>(range * range) / max_range / range;
cpp::OpDesc op_desc =
*nodes[i * kNumFields + kQuantizedOpOffset]->stmt()->op_info();
auto quantized_weight_var_name =
nodes[i * kNumFields + kQuantizedWeightOffset]->arg()->name;
auto quantized_weight_t =
scope->FindVar(quantized_weight_var_name)->GetMutable<lite::Tensor>();
std::vector<float> weight_scale;
int weight_scale_size;
if (op_type_ == "conv2d" || op_type_ == "depthwise_conv2d") {
op_desc.SetInput("Input", {matched.at("quant_op_input")->arg()->name});
op_desc.SetOutput(
"Output", {nodes[i * kNumFields + kDequantOpOutOffset]->arg()->name});
// Conv weight shape: Cout * Cin * kh * kw; the weight_scale_size should
// be Cout.
weight_scale_size = quantized_weight_t->dims()[0];
} else if (op_type_ == "mul") {
op_desc.SetInput("X", {matched.at("quant_op_input")->arg()->name});
op_desc.SetOutput(
"Out", {nodes[i * kNumFields + kDequantOpOutOffset]->arg()->name});
// Fc weight: Cin * Cout, the weight_scale_size should be Cout.
weight_scale_size = quantized_weight_t->dims()[1];
}
for (int i = 0; i < weight_scale_size; i++) {
weight_scale.push_back(whole_weight_scale);
}
// op_desc.SetAttr("enable_int8", true);
// op_desc.SetAttr("input_scale", input_scale);
op_desc.SetAttr("weight_scale", weight_scale);
Tensor temp_tensor;
temp_tensor.CopyDataFrom(*quantized_weight_t);
float* temp_data = temp_tensor.mutable_data<float>();
size_t weight_num = quantized_weight_t->data_size();
quantized_weight_t->set_persistable(true);
std::cout << "DynamicQuantDequantOpFuser::InsertNewNode============================================================" << std::endl;
#ifdef LITE_WITH_FPGA
float* quantized_weight_data = quantized_weight_t->mutable_data<float>();
for (size_t i = 0; i < weight_num; i++) {
quantized_weight_data[i] = temp_data[i] * whole_weight_scale;
std::cout << whole_weight_scale << "," << temp_data[i] << ","
          << quantized_weight_data[i] << std::endl;
}
quantized_weight_t->set_precision(PRECISION(kFloat));
#else
int8_t* quantized_weight_data = quantized_weight_t->mutable_data<int8_t>();
for (size_t i = 0; i < weight_num; i++) {
quantized_weight_data[i] = static_cast<int8_t>(temp_data[i]);
}
quantized_weight_t->set_precision(PRECISION(kInt8));
#endif
auto quantized_op = LiteOpRegistry::Global().Create(op_type_);
quantized_op->Attach(op_desc, scope);
auto* new_op_node =
graph->GraphCreateInstructNode(quantized_op, valid_places);
IR_NODE_LINK_TO(quant_op_input, new_op_node);
IR_NODE_LINK_TO(nodes[i * kNumFields + kQuantizedWeightOffset],
new_op_node);
IR_NODE_LINK_TO(new_op_node, nodes[i * kNumFields + kDequantOpOutOffset]);
}
}
cpp::OpDesc DynamicQuantDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc;
return op_desc;
}
} // namespace fusion
} // namespace mir
......
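To make the fusers' scale arithmetic above concrete, here is a standalone worked example. The values are illustrative only; in the real pass the activation scale comes from the model's OutScale tensor and the weight scale from the dequant op's max_range attribute:

#include <iostream>

int main() {
  int bit_length = 8;
  int range = (1 << (bit_length - 1)) - 1;  // 127 for int8
  // DeleteDynamicQuantOpFuser: per-tensor activation scale.
  float out_scale_raw = 12.7f;              // example OutScale tensor value
  float scale_value = out_scale_raw / range;                  // 0.1f
  // DequantOpFuser: per-layer weight scale recovered from max_range.
  float max_range = 127.0f;                 // example dequant attribute
  float whole_weight_scale =
      static_cast<float>(range * range) / max_range / range;  // 127/127 = 1.0f
  std::cout << scale_value << " " << whole_weight_scale << "\n";
  return 0;
}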
......@@ -52,6 +52,19 @@ class DeleteQuantOpFuser : public FuseBase {
private:
std::string quant_op_type_{};
};
class DeleteDynamicQuantOpFuser : public FuseBase {
public:
explicit DeleteDynamicQuantOpFuser(const std::string& quant_op_type)
: quant_op_type_(quant_op_type) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
private:
std::string quant_op_type_{};
};
/* DequantOpFuser process conv2d/depthwise_conv2d/mul + fake_dequantize_max_abs.
*/
......@@ -106,6 +119,24 @@ class DeleteQuantDequantOpFuser : public FuseBase {
private:
std::string quantized_op_type_{};
};
// dynamic quantdequant op fuser
class DynamicQuantDequantOpFuser : public FuseBase {
public:
explicit DynamicQuantDequantOpFuser(const std::string& quantized_op_type,
const std::string& op_type,
int i)
: op_type_(op_type), quant_type_(quantized_op_type), times_(i) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
private:
std::string op_type_{};
std::string quant_type_{};
int times_{1};
};
} // namespace fusion
} // namespace mir
......
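The commit declares DynamicQuantDequantOpFuser here but leaves its driver loop commented out in quant_dequant_fuse_pass.cc. For reference, this is how it would be applied, copied from that disabled block (so treat it as inactive):

for (auto& op_type : {"conv2d", "mul", "depthwise_conv2d"}) {
  for (int times = 5; times >= 1; --times) {
    fusion::DynamicQuantDequantOpFuser fuser("fake_quantize_abs_max",
                                             op_type, times);
    fuser(graph.get());
  }
}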
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/kernel_place_correct_pass.h"
#include <memory>
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void KernelPlaceCorrectPass::Apply(const std::unique_ptr<SSAGraph> &graph) {
CorrectArgumentPlace(graph.get());
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(kernel_place_correct_pass,
paddle::lite::mir::KernelPlaceCorrectPass)
.BindTargets({TARGET(kFPGA)});
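REGISTER_MIR_PASS only registers the pass inside the library; a binary that links Paddle-Lite statically also has to pull it in via the declaration added to lite/api/paddle_use_passes.h earlier in this commit:

// Consumer side, mirroring the paddle_use_passes.h hunk above:
USE_MIR_PASS(kernel_place_correct_pass);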
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "lite/core/mir/pass.h"
#include "lite/core/target_wrapper.h"
namespace paddle {
namespace lite {
namespace mir {
/*
* Correct the place of the variables in the SSAGraph: infer each variable's
* place from the kernel that outputs it.
*/
class KernelPlaceCorrectPass : public DebugPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
private:
void CorrectArgumentPlace(SSAGraph* graph) {
auto& valid_places = graph->valid_places();
auto valid_places_has_target = [&](TargetType t) -> bool {
for (auto& p : valid_places) {
if (p.target == t) {
return true;
}
}
return false;
};
std::map<std::string, bool> lite_with_targets{
{"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
{"kFPGA", valid_places_has_target(TARGET(kFPGA))}};
VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"];
VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global();
for (auto& x : graph->StmtTopologicalOrder()) {
auto& inst = x->AsStmt();
// The IoCopyOp is a tool operator and does not support type inference.
// On FPGA we have io_copy+cali+layout tool ops, so we need type inference
// for tool operators.
if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) {
  if (inst.op_type() == "io_copy") {
    VLOG(3) << "inst.op_type() == 'io_copy', continue";
    continue;
  }
}
// deal with inputs
VLOG(4) << "checking op " << inst.op_info()->Repr();
auto get_argname = [&](
const std::string& node_name,
const std::map<std::string, std::vector<std::string>>& argname_map)
-> std::string {
for (auto& ele : argname_map) {
auto it =
std::find(ele.second.begin(), ele.second.end(), node_name);
if (it != ele.second.end()) return ele.first;
}
return "";
};
bool need_correct_place = true;
std::vector<TargetType> in_types;
std::vector<TargetType> out_types;
for (auto* x_in : x->inlinks) {
std::string node_name = x_in->AsArg().name;
std::string arg_name = get_argname(node_name, inst.op_info()->inputs());
CHECK(arg_name.size() > 0) << "cannot find op arguments for node "
                           << node_name;
VLOG(4) << "-- input arg_name:" << arg_name << " "
<< "-- node name:" << node_name;
auto type = inst.picked_kernel().GetInputDeclType(arg_name);
if (!x_in->AsArg().type) {
need_correct_place &= false;
} else {
if (in_types.empty()) {
in_types.push_back(x_in->AsArg().type->target());
} else {
if (in_types[0] != x_in->AsArg().type->target()) {
need_correct_place &= false;
}
}
}
}
for (auto* x_out : x->outlinks) {
std::string node_name = x_out->AsArg().name;
std::string arg_name =
get_argname(node_name, inst.op_info()->outputs());
CHECK(arg_name.size() > 0) << "cannot find op arguments for node "
                           << node_name << " in Inst "
                           << inst.op_type();
VLOG(4) << "-- output arg_name " << arg_name;
auto type = inst.picked_kernel().GetOutputDeclType(arg_name);
if (!x_out->AsArg().type) {
need_correct_place &= false;
} else {
if (out_types.empty()) {
out_types.push_back(x_out->AsArg().type->target());
} else {
if (out_types[0] != x_out->AsArg().type->target()) {
need_correct_place &= false;
}
}
}
}
auto this_type = inst.picked_kernel().target();
bool io_target_same = (in_types[0] == out_types[0]);
need_correct_place &= (io_target_same && (in_types[0] != this_type));
if (need_correct_place) {
// update this kernel's valid place;
UpdateTarget(inst, in_types[0]);
}
}
}
// Update the instruction's target and rebuild its kernels for the new place.
void UpdateTarget(mir::Node::Stmt& inst, TargetType new_target) { // NOLINT
auto new_place = inst.place();
new_place.target = new_target;
std::vector<Place> places;
places.push_back(new_place);
inst.ResetKernels(places);
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
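The heart of CorrectArgumentPlace can be isolated as a small rule: retarget a kernel only when every input and every output already agree on a single target that differs from the kernel's own. A self-contained sketch of that rule (toy Target enum; the real TargetType lives in lite/core/target_wrapper.h):

#include <vector>

enum class Target { kARM, kFPGA, kHost };  // toy stand-in for TargetType

bool NeedCorrect(const std::vector<Target>& in_types,
                 const std::vector<Target>& out_types,
                 Target kernel_target) {
  if (in_types.empty() || out_types.empty()) return false;
  for (auto t : in_types)
    if (t != in_types[0]) return false;   // inputs disagree: leave kernel alone
  for (auto t : out_types)
    if (t != out_types[0]) return false;  // outputs disagree: leave kernel alone
  return in_types[0] == out_types[0] && in_types[0] != kernel_target;
}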
......@@ -53,6 +53,11 @@ void mir::Node::Stmt::ResetOp(const cpp::OpDesc &op_desc,
}
valid_kernels_ = op_->CreateKernels(valid_places);
}
void mir::Node::Stmt::ResetKernels(const std::vector<Place> &valid_places) {
CHECK(op_) << "change valid place failed, not created op";
valid_kernels_.clear();
valid_kernels_ = op_->CreateKernels(valid_places);
}
mir::Node::Arg &mir::Node::AsArg(const std::string &name, int id) {
auto &x = AsArg();
......
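ResetKernels is the hook the new pass relies on: once CorrectArgumentPlace settles on a new target, UpdateTarget rebuilds the statement's candidate kernels. A sketch of that call path, taken from the pass header above:

// From KernelPlaceCorrectPass::UpdateTarget (see kernel_place_correct_pass.h):
auto new_place = inst.place();      // copy the current place
new_place.target = new_target;      // swap in the corrected target
inst.ResetKernels({new_place});     // recreate candidate kernels for it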
......@@ -53,6 +53,7 @@ class Node {
const std::vector<Place>& valid_places,
lite::Scope* scope = nullptr);
void ResetKernels(const std::vector<Place>& valid_places);
std::string op_type() const { return op_info()->Type(); }
const OpInfo* op_info() const;
OpInfo* mutable_op_info();
......
File mode changed from 100644 to 100755
......@@ -76,6 +76,7 @@ class Optimizer {
#endif
"static_kernel_pick_pass", // pick original kernel from graph
"variable_place_inference_pass", // inference arg/var's
"kernel_place_correct_pass",
// info(target/precision/layout/device)
// using kernel info
"argument_type_display_pass", // debug pass: show arg-type-node's
......
......@@ -148,7 +148,7 @@ void RuntimeProgram::Run() {
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
#ifndef LITE_WITH_FPGA
// LITE_PRECISION_PROFILE(inst)
LITE_PRECISION_PROFILE(inst)
#endif
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
......
......@@ -28,7 +28,6 @@ namespace arm {
void LookupTableCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
// inputs
auto w = param.W;
auto ids = param.Ids;
......@@ -76,3 +75,13 @@ REGISTER_LITE_KERNEL(lookup_table,
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(lookup_table_v2,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::LookupTableCompute,
def)
.BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
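Since lookup_table_v2 now reuses LookupTableCompute, a statically linked application would also need the matching kernel declaration. This is an assumption mirroring the existing USE_LITE_KERNEL lines (cf. the end of the elementwise test below):

USE_LITE_KERNEL(lookup_table_v2, kARM, kFloat, kNCHW, def);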
File mode changed from 100755 to 100644
......@@ -71,6 +71,13 @@ void ConvCompute::PrepareForRun() {
if (param.fuse_relu) {
conv_param.activeParam.type = zynqmp::TYPE_RELU;
}
// conv_param.filter->saveToFile("conv_filter_", true);
// if (param.bias != nullptr) {
// std::cout << "param.bias != nullptr" << std::endl;
// conv_param.bias()->saveToFile("conv_bias_", true);
// }
conv_pe_.init();
conv_pe_.apply();
}
......@@ -79,26 +86,18 @@ void ConvCompute::PrepareForRun() {
void ConvCompute::Run() {
auto& param = this->Param<param_t>();
if (param.x->ZynqTensor()->shape().channel() != 1 &&
param.groups == param.x->ZynqTensor()->shape().channel()) {
param.groups == param.x->ZynqTensor()->shape().channel()) {
dw_conv_pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::DepthwiseConvParam& dwconv_param = dw_conv_pe_.param();
Debugger::get_instance().registerOutput("dwconv", dwconv_param.output);
#endif
} else {
zynqmp::ConvParam& conv_param = conv_pe_.param();
if (conv_param.output->shape().channel() == 12 &&
conv_param.output->shape().height() == 13) {
conv_param.input->saveToFile("conv_in", true);
conv_param.output->saveToFile("conv_o", true);
}
// zynqmp::ConvParam& conv_param = conv_pe_.param();
conv_pe_.dispatch();
if (conv_param.output->shape().channel() == 12 &&
conv_param.output->shape().height() == 13) {
// conv_param.input->saveToFile("conv_in", true);
conv_param.output->saveToFile("conv_out", true);
}
#ifdef FPGA_PRINT_TENSOR
// zynqmp::ConvParam& conv_param = conv_pe_.param();
zynqmp::ConvParam& conv_param = conv_pe_.param();
Debugger::get_instance().registerOutput("conv", conv_param.output);
#endif
}
......@@ -122,3 +121,17 @@ REGISTER_LITE_KERNEL(
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(depthwise_conv2d,
                     kFPGA,
                     kFP16,
                     kNHWC,
                     paddle::lite::kernels::fpga::ConvCompute,
                     def)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
......@@ -125,7 +125,10 @@ REGISTER_LITE_KERNEL(elementwise_add,
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
......
......@@ -93,18 +93,22 @@ void elementwise_compute_ref(const operators::ElementwiseParam& param,
}
// do elementwise add/sub/max...
if (elt_type == "add") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype* din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype* dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = sum(*din_ptr, diny_data);
dout_ptr++;
din_ptr++;
}
}
// for (int i = 0; i < batch; ++i) {
// for (int j = 0; j < channels; ++j) {
// int offset = (i * channels + j) * num;
// const dtype* din_ptr = x_data + offset;
// const dtype diny_data = y_data[j];
// dtype* dout_ptr = out_data + offset;
// for (int k = 0; k < num; ++k) {
// *dout_ptr = zynqmp::float_to_half(sum(zynqmp::half_to_float(*din_ptr), zynqmp::half_to_float(diny_data)));
// dout_ptr++;
// din_ptr++;
// }
// }
// }
int count = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
for (int i = 0; i < count; ++i) {
  out_data[i] = zynqmp::float_to_half(sum(zynqmp::half_to_float(x_data[i]),
                                          zynqmp::half_to_float(y_data[i])));
}
} else if (elt_type == "sub") {
for (int i = 0; i < batch; ++i) {
......@@ -148,9 +152,9 @@ TEST(elementwise_add, compute) {
lite::Tensor x, y, output, output_ref;
for (auto n : {1}) {
for (auto c : {8}) {
for (auto h : {8}) {
for (auto w : {8}) {
for (auto h : {72}) {
for (auto w : {192}) {
for (auto c : {24}) {
for (auto axis : {0}) {
for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
......@@ -174,10 +178,16 @@ TEST(elementwise_add, compute) {
auto* output_ref_data =
output_ref.mutable_data<float16>(TARGET(kFPGA));
for (int i = 0; i < x_dim.production(); i++) {
x_data[i] = zynqmp::float_to_half(i);
float sign = i % 3 == 0 ? -0.03f : 0.05f;
float x = sign * (i % 128);
std::cout << "x:" << x << std::endl;
x_data[i] = zynqmp::float_to_half(x);
}
for (int i = 0; i < y_dim.production(); i++) {
y_data[i] = zynqmp::float_to_half(i);
float sign = i % 3 == 0 ? -0.03f : 0.05f;
float y = sign * (i % 128);
std::cout << "y:" << y << std::endl;
y_data[i] = zynqmp::float_to_half(y);
}
param.X = &x;
param.Y = &y;
......@@ -190,7 +200,8 @@ TEST(elementwise_add, compute) {
elementwise_compute_ref<float16>(param, "add", "");
for (int i = 0; i < output.dims().production(); i++) {
EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
std::cout << "output_data:" << zynqmp::half_to_float(output_data[i]) << ",output_ref_data:" << zynqmp::half_to_float(output_ref_data[i]) << std::endl;
EXPECT_NEAR(zynqmp::half_to_float(output_data[i]), zynqmp::half_to_float(output_ref_data[i]), 1e-5);
}
}
}
......@@ -209,73 +220,73 @@ TEST(fusion_elementwise_add_activation_fpga, retrive_op) {
ASSERT_TRUE(fusion_elementwise_add_activation.front());
}
TEST(fusion_elementwise_add_activation_fpga, init) {
ElementwiseAddActivationCompute fusion_elementwise_add_activation;
ASSERT_EQ(fusion_elementwise_add_activation.precision(), PRECISION(kFP16));
ASSERT_EQ(fusion_elementwise_add_activation.target(), TARGET(kFPGA));
}
// TEST(fusion_elementwise_add_activation_fpga, init) {
// ElementwiseAddActivationCompute fusion_elementwise_add_activation;
// ASSERT_EQ(fusion_elementwise_add_activation.precision(), PRECISION(kFP16));
// ASSERT_EQ(fusion_elementwise_add_activation.target(), TARGET(kFPGA));
// }
TEST(fusion_elementwise_add_activation_fpga, compute) {
ElementwiseAddActivationCompute fusion_elementwise_add_activation;
operators::FusionElementwiseActivationParam param;
lite::Tensor x, y, output, output_ref;
// TEST(fusion_elementwise_add_activation_fpga, compute) {
// ElementwiseAddActivationCompute fusion_elementwise_add_activation;
// operators::FusionElementwiseActivationParam param;
// lite::Tensor x, y, output, output_ref;
for (auto act_type : {"relu"}) {
for (auto n : {1}) {
for (auto c : {8}) {
for (auto h : {8}) {
for (auto w : {8}) {
for (auto axis : {0}) {
for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
auto y_dim = DDim(yd);
int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
// for (auto act_type : {"relu"}) {
// for (auto n : {1}) {
// for (auto c : {8}) {
// for (auto h : {8}) {
// for (auto w : {8}) {
// for (auto axis : {0}) {
// for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
// auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
// auto y_dim = DDim(yd);
// int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
if (axis_t + y_dim.size() > 4) continue;
bool flag = false;
for (int i = 0; i < y_dim.size(); i++) {
if (x_dim[i + axis_t] != y_dim[i]) flag = true;
}
if (flag) continue;
// if (axis_t + y_dim.size() > 4) continue;
// bool flag = false;
// for (int i = 0; i < y_dim.size(); i++) {
// if (x_dim[i + axis_t] != y_dim[i]) flag = true;
// }
// if (flag) continue;
x.Resize(x_dim);
y.Resize(y_dim);
output.Resize(x_dim);
output_ref.Resize(x_dim);
auto* x_data = x.mutable_data<float16>(TARGET(kFPGA));
auto* y_data = y.mutable_data<float16>(TARGET(kFPGA));
auto* output_data = output.mutable_data<float16>(TARGET(kFPGA));
auto* output_ref_data =
output_ref.mutable_data<float16>(TARGET(kFPGA));
for (int i = 0; i < x_dim.production(); i++) {
float sign = i % 3 == 0 ? -1.0f : 1.0f;
x_data[i] = zynqmp::float_to_half(i * sign);
}
for (int i = 0; i < y_dim.production(); i++) {
float sign = i % 2 == 0 ? 0.5f : -0.5f;
y_data[i] = zynqmp::float_to_half(i * sign);
}
param.X = &x;
param.Y = &y;
param.axis = axis;
param.Out = &output;
param.act_type = act_type;
fusion_elementwise_add_activation.SetParam(param);
fusion_elementwise_add_activation.PrepareForRun();
fusion_elementwise_add_activation.Run();
param.Out = &output_ref;
elementwise_compute_ref<float16>(param, "add", act_type);
for (int i = 0; i < output.dims().production(); i++) {
EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
}
}
}
}
}
}
}
}
}
// x.Resize(x_dim);
// y.Resize(y_dim);
// output.Resize(x_dim);
// output_ref.Resize(x_dim);
// auto* x_data = x.mutable_data<float16>(TARGET(kFPGA));
// auto* y_data = y.mutable_data<float16>(TARGET(kFPGA));
// auto* output_data = output.mutable_data<float16>(TARGET(kFPGA));
// auto* output_ref_data =
// output_ref.mutable_data<float16>(TARGET(kFPGA));
// for (int i = 0; i < x_dim.production(); i++) {
// float sign = i % 3 == 0 ? -1.0f : 1.0f;
// x_data[i] = zynqmp::float_to_half(i * sign);
// }
// for (int i = 0; i < y_dim.production(); i++) {
// float sign = i % 2 == 0 ? 0.5f : -0.5f;
// y_data[i] = zynqmp::float_to_half(i * sign);
// }
// param.X = &x;
// param.Y = &y;
// param.axis = axis;
// param.Out = &output;
// param.act_type = act_type;
// fusion_elementwise_add_activation.SetParam(param);
// fusion_elementwise_add_activation.PrepareForRun();
// fusion_elementwise_add_activation.Run();
// param.Out = &output_ref;
// elementwise_compute_ref<float16>(param, "add", act_type);
// for (int i = 0; i < output.dims().production(); i++) {
// EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
// }
// }
// }
// }
// }
// }
// }
// }
// }
} // namespace fpga
} // namespace kernels
......@@ -283,4 +294,4 @@ TEST(fusion_elementwise_add_activation_fpga, compute) {
} // namespace paddle
USE_LITE_KERNEL(elementwise_add, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def);
// USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def);
File mode changed from 100755 to 100644
......@@ -191,8 +191,6 @@ class IoCopyFpgaToHostCHWCompute
param.y->ZynqTensor()->flush();
auto out_lod = param.y->mutable_lod();
*out_lod = param.x->lod();
// param.x->ZynqTensor()->saveToFile("io_x", true);
// param.y->ZynqTensor()->saveToFile("io_y", true);
}
std::string doc() const override { return "Copy IO from FPGA to HOST"; }
};
......
......@@ -78,7 +78,6 @@ void PriorBoxCompute::PrepareForRun() {
param.boxes->mutable_data<float>();
param.variances->mutable_data<float>();
zynqmp::PriorBoxParam& priobox_param = pe_.param();
priobox_param.input = param.input->ZynqTensor();
priobox_param.image = param.image->ZynqTensor();
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
......@@ -63,26 +63,6 @@ REGISTER_LITE_KERNEL(reshape,
DATALAYOUT(kAny))})
.Finalize();
// REGISTER_LITE_KERNEL(reshape,
// kFPGA,
// kFP16,
// kNHWC,
// paddle::lite::kernels::host::ReshapeCompute,
// def)
// .BindInput("X",
// {LiteType::GetTensorTy(
// TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
// .BindInput("ShapeTensor",
// {LiteType::GetTensorTy(
// TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))})
// .BindInput("Shape",
// {LiteType::GetTensorTy(
// TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))})
// .BindOutput("Out",
// {LiteType::GetTensorTy(
// TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW))})
// .Finalize();
REGISTER_LITE_KERNEL(reshape2,
kHost,
kAny,
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -23,3 +23,5 @@ namespace operators {} // namespace operators
REGISTER_LITE_OP(fake_quantize_range_abs_max,
paddle::lite::operators::FakeQuantizeRangeMaxAbsOpLite);
REGISTER_LITE_OP(fake_quantize_abs_max,
paddle::lite::operators::FakeQuantizeRangeMaxAbsOpLite);
......@@ -40,13 +40,15 @@ class FakeQuantizeRangeMaxAbsOpLite : public OpLite {
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override {
auto x = op_desc.Input("X").front();
auto in_scale = op_desc.Input("InScale").front();
if (op_desc.HasInput("InScale")) {
auto in_scale = op_desc.Input("InScale").front();
param_.in_scale = scope->FindVar(in_scale)->GetMutable<lite::Tensor>();
}
auto out = op_desc.Output("Out").front();
auto out_scale = op_desc.Output("OutScale").front();
param_.x = scope->FindVar(x)->GetMutable<lite::Tensor>();
param_.in_scale = scope->FindVar(in_scale)->GetMutable<lite::Tensor>();
param_.out = scope->FindVar(out)->GetMutable<lite::Tensor>();
param_.out_scale = scope->FindVar(out_scale)->GetMutable<lite::Tensor>();
......
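The AttachImpl change above guards the optional InScale input instead of reading it unconditionally. A self-contained sketch of that pattern with a toy op description (the real cpp::OpDesc lives in lite/model_parser):

#include <map>
#include <string>
#include <vector>

struct ToyOpDesc {  // stand-in for cpp::OpDesc
  std::map<std::string, std::vector<std::string>> inputs;
  bool HasInput(const std::string& key) const {
    auto it = inputs.find(key);
    return it != inputs.end() && !it->second.empty();
  }
};

// Only dereference the optional input when it is actually present;
// fake_quantize_abs_max carries no InScale, unlike the range variant.
std::string InScaleName(const ToyOpDesc& desc) {
  if (desc.HasInput("InScale")) return desc.inputs.at("InScale").front();
  return "";
}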
......@@ -13,7 +13,7 @@ readonly NUM_PROC=${LITE_BUILD_THREADS:-4}
# global variables
BUILD_EXTRA=ON
BUILD_EXTRA=OFF
BUILD_JAVA=ON
BUILD_PYTHON=OFF
BUILD_DIR=$(pwd)
......
......@@ -29,7 +29,6 @@ namespace zynqmp {
class ConvPE : public PE {
public:
bool init() {
std::cout << "Conv init" << std::endl;
return true;
}
......