提交 8642fb8a 编写于 作者: T tensor-tang

Merge remote-tracking branch 'gitlab/develop' into incubate/lite

......@@ -10,7 +10,10 @@ paddle/fluid/operators/distributed/send_recv.proto
*.vs
build/
build_doc/
build.*
*.user
*.sh
*.bkp
.vscode
.idea
......
......@@ -43,7 +43,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
if(NOT DEFINED TARGET_ARCH_ABI)
set(ARCH_ABI "arm64-v8a" CACHE STRING "Choose android platform")
endif()
include(cross_compiling/host)
include(cross_compiling/armlinux)
include(cross_compiling/android)
......
......@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_desc.h"
#include <glog/logging.h>
#include <algorithm>
#include <functional>
#include <mutex> // NOLINT
#include <string>
#include <unordered_map>
#include <utility>
#include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
......
......@@ -32,9 +32,9 @@ void Run(const char* model_dir) {
valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({100, 100})));
input_tensor->Resize(DDim(std::vector<DDim::value_type>({3, 224, 224})));
auto* data = input_tensor->mutable_data<float>();
for (int i = 0; i < 100 * 100; i++) {
for (int i = 0; i < 3 * 224 * 224; i++) {
data[i] = i;
}
......@@ -65,6 +65,14 @@ USE_LITE_OP(feed);
USE_LITE_OP(fetch);
USE_LITE_OP(io_copy);
USE_LITE_OP(con2d);
// USE_LITE_OP(batch_norm);
USE_LITE_OP(relu);
USE_LITE_OP(depthwise_conv2d);
USE_LITE_OP(pool2d);
USE_LITE_OP(elementwise_add);
USE_LITE_OP(softmax);
USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
......@@ -72,7 +80,15 @@ USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(con2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(depthwise_con2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
// USE_LITE_KERNEL(feed, kARM, kAny, kAny, def);
// USE_LITE_KERNEL(fetch, kARM, kAny, kAny, def);
#endif // LITE_WITH_ARM
......
......@@ -72,8 +72,9 @@ class LightPredictor {
// Create the kernels of the target places, and filter out the specific
// kernel with the target alias.
for (auto& op : program.ops()) {
auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
for (auto& op : program.ops_) {
lite::pb::OpDesc desc(op->op_info()->desc());
auto kernel_type = desc.GetAttr(kKernelTypeAttr).get<std::string>();
std::string op_type, alias;
Place place;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
......@@ -88,8 +89,8 @@ class LightPredictor {
insts.emplace_back(op, std::move(*it));
}
program_.reset(new RuntimeProgram(std::move(insts)));
CHECK(program.exec_scope());
program_->set_exec_scope(program.exec_scope());
CHECK(program.exec_scope_);
program_->set_exec_scope(program.exec_scope_);
}
private:
......
......@@ -6,4 +6,31 @@ if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))
return()
endif()
cc_library(math_arm SRCS funcs.cc packed_sgemm.cc softmax.cc scale.cc elementwise.cc DEPS ${lite_kernel_deps} eigen3)
# TODO(xxx): seperate them
cc_library(math_arm SRCS
funcs.cc
packed_sgemm.cc
softmax.cc
scale.cc
pooling.cc
elementwise.cc
sgemv.cc
type_trans.cpp
conv_impl.cc
conv_direct_3x3s1.cc
conv_direct_3x3s2.cc
conv_direct.cc
conv_depthwise_3x3_int7.cc
conv_depthwise_3x3_int8.cc
conv_depthwise_5x5s1_int8.cc
conv_depthwise_3x3p0.cc
conv_depthwise_3x3p1.cc
conv_depthwise_5x5s1.cc
conv_depthwise_5x5s2.cc
conv_depthwise.cc
conv_gemmlike.cc
conv_winograd_3x3.cc
conv_winograd.cc
split.cc
DEPS ${lite_kernel_deps} eigen3)
此差异已折叠。
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
// !pooling fp32 Op
void pooling_basic(const void* din, void* dout, int num, int chout, int hout,
int wout, int chin, int hin, int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, bool global_pooling,
bool exclusive, bool adaptive, bool ceil_mode,
bool use_quantizer, const std::string& pooling_type);
void pooling_global(const void* din, void* dout, int num, int chout, int hout,
int wout, int chin, int hin, int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, bool global_pooling,
bool exclusive, bool adaptive, bool ceil_mode,
bool use_quantizer, const std::string& pooling_type);
void pooling2x2s2_max(const void* din, void* dout, int num, int chout, int hout,
int wout, int chin, int hin, int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, bool global_pooling,
bool exclusive, bool adaptive, bool ceil_mode,
bool use_quantizer, const std::string& pooling_type);
void pooling2x2s2_ave(const void* din, void* dout, int num, int chout, int hout,
int wout, int chin, int hin, int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, bool global_pooling,
bool exclusive, bool adaptive, bool ceil_mode,
bool use_quantizer, const std::string& pooling_type);
void pooling3x3s1p1_max(const void* din, void* dout, int num, int chout,
int hout, int wout, int chin, int hin, int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, bool global_pooling,
bool exclusive, bool adaptive, bool ceil_mode,
bool use_quantizer, const std::string& pooling_type);
void pooling3x3s1p1_ave(const void* din, void* dout, int num, int chout,
int hout, int wout, int chin, int hin, int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, bool global_pooling,
bool exclusive, bool adaptive, bool ceil_mode,
bool use_quantizer, const std::string& pooling_type);
void pooling3x3s2p1_max(const void* din, void* dout, int num, int chout,
int hout, int wout, int chin, int hin, int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, bool global_pooling,
bool exclusive, bool adaptive, bool ceil_mode,
bool use_quantizer, const std::string& pooling_type);
void pooling3x3s2p0_max(const void* din, void* dout, int num, int chout,
int hout, int wout, int chin, int hin, int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, bool global_pooling,
bool exclusive, bool adaptive, bool ceil_mode,
bool use_quantizer, const std::string& pooling_type);
void pooling3x3s2p1_ave(const void* din, void* dout, int num, int chout,
int hout, int wout, int chin, int hin, int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, bool global_pooling,
bool exclusive, bool adaptive, bool ceil_mode,
bool use_quantizer, const std::string& pooling_type);
void pooling3x3s2p0_ave(const void* din, void* dout, int num, int chout,
int hout, int wout, int chin, int hin, int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, bool global_pooling,
bool exclusive, bool adaptive, bool ceil_mode,
bool use_quantizer, const std::string& pooling_type);
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/arm/math/split.h"
#include <algorithm>
#include "paddle/fluid/lite/arm/math/funcs.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
template <>
void split_cpy<float>(const float* din, float* dout, int num) {
int cnt = num >> 4;
int remain = num % 16;
#pragma omp parallel for
for (int i = 0; i < cnt; i++) {
const float* din_ptr = din + (i << 4);
float* dout_ptr = dout + (i << 4);
float32x4_t din0 = vld1q_f32(din_ptr);
float32x4_t din1 = vld1q_f32(din_ptr + 4);
float32x4_t din2 = vld1q_f32(din_ptr + 8);
float32x4_t din3 = vld1q_f32(din_ptr + 12);
vst1q_f32(dout_ptr, din0);
vst1q_f32(dout_ptr + 4, din1);
vst1q_f32(dout_ptr + 8, din2);
vst1q_f32(dout_ptr + 12, din3);
}
if (remain > 0) {
const float* din_ptr = din + (cnt << 4);
float* dout_ptr = dout + (cnt << 4);
for (int i = 0; i < remain; i++) {
*dout_ptr = *din_ptr;
dout_ptr++;
din_ptr++;
}
}
}
template <>
void split<float>(const float* din, std::vector<lite::Tensor*>* dout,
const int axis, const std::vector<int>& in_strides) {
int input_offset = 0;
for (auto out : *dout) {
auto out_dim = out->dims();
std::vector<int> out_strides(out_dim.size());
out_strides[out_dim.size() - 1] = out_dim[out_dim.size() - 1];
for (int i = out_dim.size() - 2; i >= 0; --i) {
out_strides[i] = out_strides[i + 1] * out_dim[i];
}
float* out_data = out->mutable_data<float>();
int before = out_strides[0] / out_strides[axis];
int in_after = in_strides[axis];
int out_after = out_strides[axis];
for (int i = 0; i < before; ++i) {
split_cpy(din + input_offset + i * in_after, out_data + i * out_after,
out_after);
}
input_offset += out_strides[axis];
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/lite/core/op_lite.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
template <typename T>
void split_cpy(const T* din, T* dout, int num);
template <typename T>
void split(const T* din, std::vector<lite::Tensor*>* dout, const int axis,
const std::vector<int>& in_strides);
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
此差异已折叠。
......@@ -65,6 +65,8 @@ class Buffer {
TargetCopy(target_, data_, other.data_, nbytes);
}
~Buffer() { Free(); }
private:
// memory it actually malloced.
size_t space_{0};
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/core/mir/pattern_matcher.h"
#include <gtest/gtest.h>
namespace paddle {
namespace lite {
namespace mir {
void BuildGraph(SSAGraph* g) {
g->mutable_nodes().emplace_back();
Node& o1 = g->mutable_nodes().back();
o1.AsStmt().op_type = "op1";
g->mutable_nodes().emplace_back();
Node& o2 = g->mutable_nodes().back();
o2.AsStmt().op_type = "op2";
g->mutable_nodes().emplace_back();
Node& o3 = g->mutable_nodes().back();
o3.AsStmt().op_type = "op3";
g->mutable_nodes().emplace_back();
Node& o4 = g->mutable_nodes().back();
o4.AsStmt().op_type = "op4";
g->mutable_nodes().emplace_back();
Node& o5 = g->mutable_nodes().back();
o5.AsStmt().op_type = "op5";
g->mutable_nodes().emplace_back();
Node& v1 = g->mutable_nodes().back();
v1.AsArg("var1");
g->mutable_nodes().emplace_back();
Node& v2 = g->mutable_nodes().back();
v2.AsArg("var2");
g->mutable_nodes().emplace_back();
Node& v3 = g->mutable_nodes().back();
v3.AsArg("var3");
g->mutable_nodes().emplace_back();
Node& v4 = g->mutable_nodes().back();
v4.AsArg("var4");
// o1->v1->o2
o1.outlinks.push_back(&v1);
o2.inlinks.push_back(&v1);
v1.inlinks.push_back(&o1);
v1.outlinks.push_back(&o2);
// o2->v2->o3
// o2->v2->o4
o2.outlinks.push_back(&v2);
o3.inlinks.push_back(&v2);
o4.inlinks.push_back(&v2);
v2.inlinks.push_back(&o2);
v2.outlinks.push_back(&o3);
v2.outlinks.push_back(&o4);
// o2->v3->o5
o2.outlinks.push_back(&v3);
o5.inlinks.push_back(&v3);
v3.inlinks.push_back(&o2);
v3.outlinks.push_back(&o5);
// o3-v4->o5
o3.outlinks.push_back(&v4);
o5.inlinks.push_back(&v4);
v4.inlinks.push_back(&o3);
v4.outlinks.push_back(&o5);
}
TEST(PMPattern, NewNode) {
PMPattern x;
auto* n = x.NewNode([](const Node* x) { return true; });
ASSERT_TRUE(n);
ASSERT_EQ(x.nodes_.size(), 1UL);
}
TEST(PMPattern, AddEdge) {
PMPattern x;
auto* a = x.NewNode([](const Node* x) { return true; });
auto* b = x.NewNode([](const Node* x) { return true; });
ASSERT_TRUE(a);
ASSERT_TRUE(b);
x.AddEdge(a, b);
ASSERT_EQ(x.nodes_.size(), 2UL);
ASSERT_EQ(x.edges_.size(), 1UL);
ASSERT_EQ(x.edges_.front().first, a);
ASSERT_EQ(x.edges_.front().second, b);
ASSERT_EQ(x.nodes().size(), 2UL);
ASSERT_EQ(x.edges().size(), 1UL);
ASSERT_EQ(x.edges().front().first, a);
ASSERT_EQ(x.edges().front().second, b);
}
TEST(PatternMatcher, MarkPMNodesInGraph) {
PatternMatcher x;
// mark o2, o3, v2
// The pattern is a graph:
// o2(a node named o2) -> v2(a node named v2)
// v2 -> o3(a node named o3)
auto* o2 = x.pattern_.NewNode([](const Node* node) {
// The teller can be any condition, such as op type, or variable's shape.
return node && node->IsStmt() && node->stmt()->op_type == "op2";
});
auto* o3 = x.pattern_.NewNode([](const Node* node) {
// The teller can be any condition, such as op type, or variable's shape.
return node && node->IsStmt() && node->stmt()->op_type == "op3";
});
auto* v2 = x.pattern_.NewNode([](const Node* node) {
// The teller can be any condition, such as op type, or variable's shape.
return node && node->IsArg() && node->arg()->name == "var2";
});
ASSERT_FALSE(o2->Tell(nullptr));
ASSERT_FALSE(o3->Tell(nullptr));
ASSERT_FALSE(v2->Tell(nullptr));
x.pattern_.AddEdge(o2, v2);
x.pattern_.AddEdge(v2, o3);
ASSERT_EQ(x.pattern_.edges().size(), 2UL);
ASSERT_EQ(x.pattern_.edges()[0].first, o2);
ASSERT_EQ(x.pattern_.edges()[0].second, v2);
ASSERT_EQ(x.pattern_.edges()[1].first, v2);
ASSERT_EQ(x.pattern_.edges()[1].second, o3);
SSAGraph graph;
BuildGraph(&graph);
x.MarkPMNodesInGraph(&graph);
ASSERT_EQ(x.pmnodes2nodes_.size(), 3UL);
auto subgraphs = x.DetectPatterns();
ASSERT_EQ(subgraphs.size(), 1UL);
}
TEST(PatternMatcher, MultiSubgraph) {
SSAGraph graph;
BuildGraph(&graph);
PatternMatcher x;
// The pattern is a graph:
// op -> var
auto* any_op = x.mutable_pattern()->NewNode(
[](const Node* node) {
return node->IsStmt() && (node->stmt()->op_type == "op2" ||
node->stmt()->op_type == "op3");
},
"OP0");
auto* any_var =
x.mutable_pattern()
->NewNode([](const Node* node) { return node->IsArg(); }, "VAR")
->AsIntermediate();
auto* any_op1 = x.mutable_pattern()->NewNode(
[](const Node* node) { return node->IsStmt(); }, "OP1");
x.mutable_pattern()->AddEdge(any_op, any_var);
x.mutable_pattern()->AddEdge(any_var, any_op1);
int count = 0;
PatternMatcher::handle_t handle = [&](const PatternMatcher::subgraph_t& s,
SSAGraph* g) {
LOG(INFO) << "Detect " << s.at(any_op)->stmt()->op_type << " -> "
<< s.at(any_var)->arg()->name << " -> "
<< s.at(any_op1)->stmt()->op_type;
count++;
};
x(&graph, handle);
// 1. Detect op3 -> var4 -> op5
// 2. Detect op2 -> var2 -> op3
// 3. Detect op2 -> var2 -> op4
// 4. Detect op2 -> var3 -> op5
// But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2
ASSERT_GE(count, 1);
ASSERT_LE(count, 2);
}
TEST(PatternMatcher, IntermediateCheck) {
SSAGraph graph;
BuildGraph(&graph);
// o2->v2->o3
// o2->v2->o4
// check o2+o3 fuse, should fail because v2 also link to o4.
PatternMatcher matcher;
auto* op2 = matcher.mutable_pattern()->NewNode(
[](const Node* x) {
return x && x->IsStmt() && x->stmt()->op_type == "op2";
},
"op2");
auto* op3 = matcher.mutable_pattern()->NewNode(
[](const Node* x) {
return x && x->IsStmt() && x->stmt()->op_type == "op3";
},
"op3");
auto* v2 = matcher.mutable_pattern()
->NewNode(
[](const Node* x) {
return x && x->IsArg() && x->arg()->name == "var2";
},
"var2")
->AsIntermediate();
v2->LinksFrom({op2}).LinksTo({op3});
int count = 0;
matcher(&graph, [&](const PatternMatcher::subgraph_t& g, SSAGraph* graph) {
++count;
});
EXPECT_EQ(count, 0);
count = 0;
v2->AsInput();
matcher(&graph, [&](const PatternMatcher::subgraph_t& g, SSAGraph* graph) {
++count;
});
ASSERT_EQ(count, 1);
}
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -91,9 +91,9 @@ class KernelRegistry final {
void Register(const std::string &name,
typename KernelRegistryForTarget<Target, Precision,
Layout>::creator_t &&creator) {
// VLOG(3) << "register for " << TargetToStr(Target) << ":"
//<< PrecisionToStr(Precision) << "//"
//<< GetKernelOffset<Target, Precision, Layout>();
VLOG(3) << "register for " << TargetToStr(Target) << ":"
<< PrecisionToStr(Precision) << "//"
<< GetKernelOffset<Target, Precision, Layout>();
using kernel_registor_t =
KernelRegistryForTarget<Target, Precision, Layout>;
auto &varient = registries_[GetKernelOffset<Target, Precision, Layout>()];
......@@ -153,6 +153,12 @@ class KernelRegistor : public lite::Registor<KernelType> {
public:
KernelRegistor(const std::string &op_type, const std::string &alias)
: Registor<KernelType>([=] {
<<<<<<< HEAD
=======
VLOG(3) << "Register kernel " << op_type << " for "
<< TargetToStr(target) << " " << PrecisionToStr(precision)
<< " " << DataLayoutToStr(layout) << " alias " << alias;
>>>>>>> gitlab/develop
KernelRegistry::Global().Register<target, precision, layout>(
op_type, [=]() -> std::unique_ptr<KernelType> {
std::unique_ptr<KernelType> x(new KernelType);
......
......@@ -21,6 +21,7 @@
* looks the same.
*/
#include <string>
#include <vector>
#include "paddle/fluid/lite/core/target_wrapper.h"
......
......@@ -9,12 +9,18 @@ cc_library(relu_compute_arm SRCS relu_compute.cc DEPS ${lite_kernel_deps})
cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} eigen3)
cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(conv_compute_arm SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(elementwise_add_compute_arm SRCS elementwise_add_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(split_compute_arm SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm)
lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm)
lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm)
lite_cc_test(test_conv_compute_arm SRCS conv_compute_test.cc DEPS conv_compute_arm)
lite_cc_test(test_elementwise_add_compute_arm SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_arm)
lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm)
lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm)
set(arm_kernels
fc_compute_arm
......@@ -22,6 +28,11 @@ set(arm_kernels
mul_compute_arm
scale_compute_arm
softmax_compute_arm
elementwise_add_compute_arm)
conv_compute_arm
elementwise_add_compute_arm
pool_compute_arm
split_compute_arm
)
set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/arm/conv_compute.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
void ConvCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
auto x_dims = param.x->dims();
auto w_dims = param.filter->dims();
auto o_dims = param.output->dims();
auto& ctx = this->ctx_->template As<ARMContext>();
int win = x_dims[3]; // nchw
int hin = x_dims[2];
int ic = x_dims[1];
int bs = x_dims[0];
int ow = o_dims[3];
int oh = o_dims[2];
int oc = o_dims[1];
int kh = w_dims[2]; // oihw
int kw = w_dims[3];
int pad = param.paddings[0];
int stride = param.strides[0];
const auto* i_data = param.x->data<float>();
const auto* w_data = param.filter->data<float>();
const auto* b_data = param.bias ? param.bias->data<float>() : nullptr;
auto* o_data = param.output->mutable_data<float>();
bool kps_equal = (param.paddings[0] == param.paddings[1]) &&
(param.strides[0] == param.strides[1]) && (kw == kh);
bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1);
bool flag_dw_3x3 =
(kw == 3 && (pad == 0 || pad == 1) && (stride == 1 || stride == 2));
bool flag_dw_5x5 =
(kw == 5 && stride == 1) || (kw == 5 && stride == 2 && pad == 2);
bool flag_dw = flag_dw_3x3 || flag_dw_5x5;
// select conv impl
if (param.groups == ic && ic == oc && kps_equal && no_dilation && flag_dw) {
// dw conv impl
impl_ = new lite::arm::math::DepthwiseConv<PRECISION(kFloat)>;
VLOG(3) << "invoking dw conv";
} else if (param.groups == 1 && kw == 3 && stride == 1 && kps_equal &&
no_dilation) {
if (ic >= 32 && oc >= 32 && oh > 16 && ow > 16) {
// winograd conv impl
impl_ = new lite::arm::math::WinogradConv<PRECISION(kFloat)>;
VLOG(3) << "invoking winograd conv";
} else {
// direct conv impl
impl_ = new lite::arm::math::DirectConv<PRECISION(kFloat)>;
VLOG(3) << "invoking direct conv";
}
} else if (param.groups == 1 && kw == 3 && stride == 2 && kps_equal &&
no_dilation) {
// direct conv impl
impl_ = new lite::arm::math::DirectConv<PRECISION(kFloat)>;
VLOG(3) << "invoking direct conv";
} else {
impl_ = new lite::arm::math::GemmLikeConv<PRECISION(kFloat)>;
VLOG(3) << "invoking gemm like conv";
}
CHECK(this->impl_->create(param, &ctx));
}
void ConvCompute::Run() {
auto& param = this->Param<param_t>();
CHECK(impl_);
impl_->run(param);
// if (this->act_ != nullptr) {
// this->act_->run(outputs, outputs, param.activation_param);
// }
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW,
paddle::lite::kernels::arm::ConvCompute, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW,
paddle::lite::kernels::arm::ConvCompute, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/lite/arm/math/funcs.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/operators/conv_op.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class ConvCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::ConvParam;
void PrepareForRun() override;
void Run() override;
~ConvCompute() {
if (impl_ != nullptr) {
delete impl_;
}
}
private:
lite::arm::math::ImplBase<TARGET(kARM), PRECISION(kFloat), param_t>* impl_{
nullptr};
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/arm/conv_compute.h"
#include <gtest/gtest.h>
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
template <typename dtype>
void conv_compute_ref(const operators::ConvParam& param) {
auto input = param.x;
auto filter = param.filter;
auto output = param.output;
DDim input_dims = param.x->dims();
DDim filter_dims = param.filter->dims();
DDim output_dims = param.output->dims();
std::vector<int> paddings = param.paddings;
std::vector<int> strides = param.strides;
std::vector<int> dilations = param.dilations;
int groups = param.groups;
auto input_data = param.x->data<float>();
auto output_data = param.output->mutable_data<float>();
auto filter_data = param.filter->mutable_data<float>();
const float* bias_data = nullptr;
if (param.bias != nullptr) {
bias_data = param.bias->mutable_data<float>();
}
bool flag_bias = bias_data != nullptr;
bool flag_relu = false; // TODO(hong19860320) param.relu
int num = input_dims[0];
int chout = output_dims[1];
int hout = output_dims[2];
int wout = output_dims[3];
int chin = input_dims[1];
int hin = input_dims[2];
int win = input_dims[3];
int out_c_group = chout / groups;
int in_c_group = chin / groups;
int stride_h = strides[0];
int stride_w = strides[1];
int dilation_h = dilations[0];
int dilation_w = dilations[1];
int padding_h = paddings[0];
int padding_w = paddings[1];
int kernel_h = filter_dims[2];
int kernel_w = filter_dims[3];
for (int n = 0; n < num; ++n) {
for (int g = 0; g < groups; ++g) {
for (int oc = 0; oc < out_c_group; ++oc) {
for (int oh = 0; oh < hout; ++oh) {
for (int ow = 0; ow < wout; ++ow) {
int out_idx = n * groups * out_c_group * hout * wout +
g * out_c_group * hout * wout + oc * hout * wout +
oh * wout + ow;
output_data[out_idx] =
flag_bias ? static_cast<float>(bias_data[g * out_c_group + oc])
: 0.f;
for (int ic = 0; ic < in_c_group; ++ic) {
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
int iw = ow * stride_w - padding_w + kw * (dilation_w);
int ih = oh * stride_h - padding_h + kh * (dilation_h);
if (iw < 0 || iw >= win) continue;
if (ih < 0 || ih >= hin) continue;
int iidx = n * chin * hin * win + g * in_c_group * hin * win +
ic * hin * win + ih * win + iw;
int widx =
g * out_c_group * in_c_group * kernel_h * kernel_w +
oc * in_c_group * kernel_h * kernel_w +
ic * kernel_h * kernel_w + kh * kernel_w + kw;
output_data[out_idx] +=
(dtype)input_data[iidx] * (dtype)filter_data[widx];
}
}
}
if (flag_relu) {
output_data[out_idx] =
output_data[out_idx] > 0.f ? output_data[out_idx] : 0.f;
}
}
}
}
}
}
}
TEST(conv_arm, retrive_op) {
auto conv = KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>(
"conv2d");
ASSERT_FALSE(conv.empty());
ASSERT_TRUE(conv.front());
}
TEST(conv_arm, init) {
ConvCompute conv;
ASSERT_EQ(conv.precision(), PRECISION(kFloat));
ASSERT_EQ(conv.target(), TARGET(kARM));
}
TEST(conv_arm, compute) {
DeviceInfo::Init();
for (auto n : {1, 2}) {
for (auto ic : {6, 32 /*, 128*/}) {
for (auto oc : {6, 32 /*, 128*/}) {
for (auto ih : {9, 18 /*, 56 , 112, 224, 512*/}) {
for (auto iw : {9, 18 /*, 56, 112, 224, 512*/}) {
for (auto flag_bias : {false, true}) {
for (auto flag_relu : {false, true}) {
for (auto depthwise : {false, true}) {
for (auto dilation : {1, 2}) {
for (auto stride : {1, 2}) {
for (auto padding : {0, 1, 2}) {
for (auto ks : {1, 3, 5}) {
int group = 1;
if (depthwise) { // depthwise convolution ?
group = oc = ic;
}
// get input, filter and output shape
std::vector<int64_t> input_shape = {n, ic, ih, iw};
std::vector<int64_t> filter_shape = {oc, ic / group,
ks, ks};
const int dks = dilation * (ks - 1) + 1;
int oh = (ih + 2 * padding - dks) / stride + 1;
int ow = (iw + 2 * padding - dks) / stride + 1;
std::vector<int64_t> output_shape({n, oc, oh, ow});
// resize input, filter and output
Tensor input;
Tensor filter;
Tensor bias;
Tensor output;
Tensor output_ref;
input.Resize(input_shape);
filter.Resize(filter_shape);
output.Resize(output_shape);
output_ref.Resize(output_shape);
VLOG(3) << "input: " << input.dims();
VLOG(3) << "filter: " << filter.dims()
<< " padding:" << padding
<< " stride:" << stride
<< " dilation:" << dilation;
VLOG(3) << "output: " << output.dims();
auto* input_data = input.mutable_data<float>();
auto* filter_data = filter.mutable_data<float>();
auto* output_data = output.mutable_data<float>();
for (int i = 0; i < input.dims().production(); i++) {
input_data[i] = static_cast<float>(i % 128);
}
for (int i = 0; i < filter.dims().production(); i++) {
filter_data[i] =
i * 0.001f /
static_cast<float>(filter.dims().production());
}
// prepare kernel params and run
ConvCompute conv;
std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<ARMContext>();
conv.SetContext(std::move(ctx));
operators::ConvParam param;
param.x = &input;
param.filter = &filter;
param.output = &output;
param.bias = nullptr;
if (flag_bias) {
bias.Resize({oc});
auto* bias_data = bias.mutable_data<float>();
for (int i = 0; i < bias.dims().production(); i++) {
bias_data[i] = static_cast<float>(i);
}
param.bias = &bias;
}
// TODO(hong19860320) param.relu = flag_relu;
param.paddings = std::vector<int>({padding, padding});
param.strides = std::vector<int>({stride, stride});
param.dilations =
std::vector<int>({dilation, dilation});
param.groups = group;
conv.SetParam(param);
conv.Launch();
// invoking ref implementation and compare results
param.output = &output_ref;
conv_compute_ref<float>(param);
auto* output_ref_data =
output_ref.mutable_data<float>();
for (int i = 0; i < output.dims().production(); i++) {
EXPECT_NEAR(output_data[i], output_ref_data[i],
1e-3);
}
}
}
}
}
}
}
}
}
}
}
}
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
......@@ -19,5 +19,6 @@ USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(feed, kARM, kAny, kAny, def);
USE_LITE_KERNEL(fetch, kARM, kAny, kAny, def);
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册