提交 0f03e23b 编写于 作者: C Chunwei

framework support cl

上级 17ef40f3
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <unordered_set> #include <unordered_set>
#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/inference/analysis/dot.h"
#include "paddle/fluid/lite/utils/string.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
namespace paddle { namespace paddle {
...@@ -84,7 +85,8 @@ void GraphVizPass::ApplyImpl(ir::Graph* graph) const { ...@@ -84,7 +85,8 @@ void GraphVizPass::ApplyImpl(ir::Graph* graph) const {
auto marked_nodes = ConsumeMarkedNodes(graph); auto marked_nodes = ConsumeMarkedNodes(graph);
// Create nodes // Create nodes
for (const Node* n : graph->Nodes()) { for (const Node* n : graph->Nodes()) {
std::string node_id = FormatName(n) + "(" + std::to_string(n->id()) + ")"; std::string node_id =
lite::string_format("%s(%d)", FormatName(n).c_str(), n->id());
if (n->IsOp()) { if (n->IsOp()) {
decltype(op_attrs) attr = decltype(op_attrs) attr =
marked_nodes.count(n) ? marked_op_attrs : op_attrs; marked_nodes.count(n) ? marked_op_attrs : op_attrs;
......
...@@ -58,9 +58,11 @@ class Dot { ...@@ -58,9 +58,11 @@ class Dot {
std::vector<Attr> attrs; std::vector<Attr> attrs;
Node(const std::string& name, const std::vector<Attr>& attrs) Node(const std::string& name, const std::vector<Attr>& attrs)
: name(name), : name(name), attrs(attrs) {
attrs(attrs), std::stringstream ss;
id_("node_" + std::to_string(dot_node_counter++)) {} ss << "node_" << dot_node_counter++;
id_ = ss.str();
}
std::string id() const { return id_; } std::string id() const { return id_; }
......
...@@ -37,7 +37,7 @@ endfunction() ...@@ -37,7 +37,7 @@ endfunction()
function (lite_deps TARGET) function (lite_deps TARGET)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS ARGS) set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS}) set(deps ${lite_deps_DEPS})
...@@ -78,6 +78,12 @@ function (lite_deps TARGET) ...@@ -78,6 +78,12 @@ function (lite_deps TARGET)
endforeach(var) endforeach(var)
endif() endif()
if (LITE_WITH_OPENCL)
foreach(var ${lite_deps_CL_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
set(${TARGET} ${deps} PARENT_SCOPE) set(${TARGET} ${deps} PARENT_SCOPE)
endfunction() endfunction()
......
...@@ -53,6 +53,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) ...@@ -53,6 +53,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
lite_cc_test(test_mobilenetv1_lite SRCS mobilenetv1_test.cc lite_cc_test(test_mobilenetv1_lite SRCS mobilenetv1_test.cc
DEPS ${lite_model_test_DEPS} DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 SERIAL) ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 SERIAL)
add_dependencies(test_mobilenetv1_lite extern_lite_download_mobilenet_v1_tar_gz) add_dependencies(test_mobilenetv1_lite extern_lite_download_mobilenet_v1_tar_gz)
......
...@@ -23,10 +23,7 @@ namespace paddle { ...@@ -23,10 +23,7 @@ namespace paddle {
namespace lite { namespace lite {
void Predictor::SaveModel(const std::string &dir) { void Predictor::SaveModel(const std::string &dir) {
#ifndef LITE_WITH_ARM
MkDirRecur(dir); MkDirRecur(dir);
#else
#endif
program_->PersistModel(dir, program_desc_); program_->PersistModel(dir, program_desc_);
LOG(INFO) << "Save model to " << dir; LOG(INFO) << "Save model to " << dir;
} }
......
...@@ -25,16 +25,13 @@ ...@@ -25,16 +25,13 @@
namespace paddle { namespace paddle {
namespace lite { namespace lite {
#ifdef LITE_WITH_ARM void TestModel(const std::vector<Place>& valid_places,
TEST(MobileNetV1, test) { const Place& preferred_place) {
DeviceInfo::Init(); DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads); DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor; lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
predictor.Build(FLAGS_model_dir, Place{TARGET(kARM), PRECISION(kFloat)}, predictor.Build(FLAGS_model_dir, preferred_place, valid_places);
valid_places);
auto* input_tensor = predictor.GetInput(0); auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224}))); input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
...@@ -70,7 +67,26 @@ TEST(MobileNetV1, test) { ...@@ -70,7 +67,26 @@ TEST(MobileNetV1, test) {
ASSERT_EQ(out->dims()[0], 1); ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000); ASSERT_EQ(out->dims()[1], 1000);
} }
#endif
TEST(MobileNetV1, test_arm) {
std::vector<Place> valid_places({
Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
// Place{TARGET(kOpenCL), PRECISION(kFloat)},
});
TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)}));
}
TEST(MobileNetV1, test_opencl) {
std::vector<Place> valid_places({
Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
Place{TARGET(kOpenCL), PRECISION(kFloat)},
});
TestModel(valid_places, Place({TARGET(kOpenCL), PRECISION(kFloat)}));
}
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
...@@ -14,6 +14,10 @@ ...@@ -14,6 +14,10 @@
#include "paddle/fluid/lite/core/context.h" #include "paddle/fluid/lite/core/context.h"
#ifdef LITE_WITH_OPENCL
DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path.");
#endif
namespace paddle { namespace paddle {
namespace lite {} // namespace lite namespace lite {} // namespace lite
} // namespace paddle } // namespace paddle
...@@ -23,6 +23,11 @@ ...@@ -23,6 +23,11 @@
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#endif #endif
#ifdef LITE_WITH_OPENCL
#include "paddle/fluid/lite/opencl/cl_context.h"
#include "paddle/fluid/lite/opencl/cl_engine.h"
#include "paddle/fluid/lite/opencl/cl_helper.h"
#endif
#include <map> #include <map>
#include <memory> #include <memory>
#include <set> #include <set>
...@@ -34,6 +39,10 @@ ...@@ -34,6 +39,10 @@
#include "paddle/fluid/lite/core/target_wrapper.h" #include "paddle/fluid/lite/core/target_wrapper.h"
#include "paddle/fluid/lite/utils/all.h" #include "paddle/fluid/lite/utils/all.h"
#ifdef LITE_WITH_OPENCL
DECLARE_string(cl_path);
#endif
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -44,6 +53,7 @@ using HostContext = Context<TargetType::kHost>; ...@@ -44,6 +53,7 @@ using HostContext = Context<TargetType::kHost>;
using X86Context = Context<TargetType::kX86>; using X86Context = Context<TargetType::kX86>;
using CUDAContext = Context<TargetType::kCUDA>; using CUDAContext = Context<TargetType::kCUDA>;
using ARMContext = Context<TargetType::kARM>; using ARMContext = Context<TargetType::kARM>;
using OpenClContext = Context<TargetType::kOpenCL>;
template <> template <>
class Context<TargetType::kHost> { class Context<TargetType::kHost> {
...@@ -51,7 +61,7 @@ class Context<TargetType::kHost> { ...@@ -51,7 +61,7 @@ class Context<TargetType::kHost> {
// NOTE: InitOnce should only be used by ContextScheduler // NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {} void InitOnce() {}
void CopyShared(const HostContext* ctx) {} void CopySharedTo(const HostContext* ctx) {}
std::string name() const { return "HostContext"; } std::string name() const { return "HostContext"; }
}; };
...@@ -69,7 +79,7 @@ class Context<TargetType::kARM> { ...@@ -69,7 +79,7 @@ class Context<TargetType::kARM> {
// NOTE: InitOnce should only be used by ContextScheduler // NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() { DeviceInfo::Init(); } void InitOnce() { DeviceInfo::Init(); }
void CopyShared(const ARMContext* ctx) {} void CopySharedTo(const ARMContext* ctx) {}
void SetRunMode(PowerMode mode, int threads) { void SetRunMode(PowerMode mode, int threads) {
return DeviceInfo::Global().SetRunMode(mode, threads); return DeviceInfo::Global().SetRunMode(mode, threads);
...@@ -109,7 +119,7 @@ class Context<TargetType::kCUDA> { ...@@ -109,7 +119,7 @@ class Context<TargetType::kCUDA> {
cublas_fp32_ = std::make_shared<lite::cuda::Blas<float>>(); cublas_fp32_ = std::make_shared<lite::cuda::Blas<float>>();
} }
void CopyShared(const CUDAContext* ctx) { void CopySharedTo(const CUDAContext* ctx) {
CHECK(ctx); CHECK(ctx);
CHECK(cublas_fp32_) << "cublas_fp32 should be set first"; CHECK(cublas_fp32_) << "cublas_fp32 should be set first";
ctx->cublas_fp32_ = cublas_fp32_; ctx->cublas_fp32_ = cublas_fp32_;
...@@ -175,7 +185,7 @@ class Context<TargetType::kX86> { ...@@ -175,7 +185,7 @@ class Context<TargetType::kX86> {
// NOTE: InitOnce should only be used by ContextScheduler // NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {} void InitOnce() {}
void CopyShared(const X86Context* ctx) {} void CopySharedTo(const X86Context* ctx) {}
const device_ctx_t* x86_device_context() { return x86_device_context_.get(); } const device_ctx_t* x86_device_context() { return x86_device_context_.get(); }
void SetX86DeviceContext(std::unique_ptr<device_ctx_t>&& ctx) { void SetX86DeviceContext(std::unique_ptr<device_ctx_t>&& ctx) {
...@@ -202,6 +212,40 @@ class Context<TargetType::kX86> { ...@@ -202,6 +212,40 @@ class Context<TargetType::kX86> {
}; };
#endif #endif
#ifdef LITE_WITH_OPENCL
template <>
class Context<TargetType::kOpenCL> {
mutable std::shared_ptr<CLContext> cl_context_;
mutable std::shared_ptr<CLHelper> cl_helper_;
public:
CLContext* cl_context() { return cl_context_.get(); }
CLHelper* cl_helper() { return cl_helper_.get(); }
void InitOnce() {
// Init cl engine.
CHECK(CLEngine::Global()->IsInitSuccess()) << "OpenCL engine init failed";
CLEngine::Global()->set_cl_path(FLAGS_cl_path);
cl_context_ = std::make_shared<CLContext>();
cl_helper_ = std::make_shared<CLHelper>();
cl_helper_->set_context(cl_context_.get());
PrepareKernels();
}
void CopySharedTo(const OpenClContext* ctx) {
ctx->cl_context_ = cl_context_;
}
private:
void PrepareKernels() {
cl_helper_->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
cl_helper_->AddKernel("pool_max", "pool_kernel.cl");
}
};
#endif
// Context for running a kernel. // Context for running a kernel.
// Holds the necessary resource and information. // Holds the necessary resource and information.
class KernelContext { class KernelContext {
...@@ -230,26 +274,32 @@ class ContextScheduler { ...@@ -230,26 +274,32 @@ class ContextScheduler {
std::unique_ptr<KernelContext> ctx(new KernelContext); std::unique_ptr<KernelContext> ctx(new KernelContext);
switch (target) { switch (target) {
case TARGET(kHost): case TARGET(kHost):
kernel_contexts_[TargetType::kHost].As<HostContext>().CopyShared( kernel_contexts_[TargetType::kHost].As<HostContext>().CopySharedTo(
&ctx->As<HostContext>()); &ctx->As<HostContext>());
break; break;
#ifdef LITE_WITH_X86 #ifdef LITE_WITH_X86
case TARGET(kX86): case TARGET(kX86):
kernel_contexts_[TargetType::kX86].As<X86Context>().CopyShared( kernel_contexts_[TargetType::kX86].As<X86Context>().CopySharedTo(
&ctx->As<X86Context>()); &ctx->As<X86Context>());
break; break;
#endif #endif
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
case TARGET(kCUDA): case TARGET(kCUDA):
kernel_contexts_[TargetType::kCUDA].As<CUDAContext>().CopyShared( kernel_contexts_[TargetType::kCUDA].As<CUDAContext>().CopySharedTo(
&ctx->As<CUDAContext>()); &ctx->As<CUDAContext>());
break; break;
#endif #endif
#ifdef LITE_WITH_ARM #ifdef LITE_WITH_ARM
case TARGET(kARM): case TARGET(kARM):
kernel_contexts_[TargetType::kARM].As<ARMContext>().CopyShared( kernel_contexts_[TargetType::kARM].As<ARMContext>().CopySharedTo(
&ctx->As<ARMContext>()); &ctx->As<ARMContext>());
break; break;
#endif
#ifdef LITE_WITH_OPENCL
case TARGET(kOpenCL):
kernel_contexts_[TargetType::kOpenCL].As<OpenClContext>().CopySharedTo(
&ctx->As<OpenClContext>());
break;
#endif #endif
default: default:
LOG(FATAL) << "unsupported target " << TargetToStr(target); LOG(FATAL) << "unsupported target " << TargetToStr(target);
...@@ -273,6 +323,9 @@ class ContextScheduler { ...@@ -273,6 +323,9 @@ class ContextScheduler {
#endif #endif
#ifdef LITE_WITH_ARM #ifdef LITE_WITH_ARM
InitContext<TargetType::kARM, ARMContext>(); InitContext<TargetType::kARM, ARMContext>();
#endif
#ifdef LITE_WITH_OPENCL
InitContext<TargetType::kOpenCL, OpenClContext>();
#endif #endif
} }
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/lite/core/mir/fusion/quant_dequant_op_fuser.h" #include "paddle/fluid/lite/core/mir/fusion/quant_dequant_op_fuser.h"
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "paddle/fluid/lite/utils/string.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -57,24 +58,24 @@ void QuantDequantOpFuser::BuildPattern() { ...@@ -57,24 +58,24 @@ void QuantDequantOpFuser::BuildPattern() {
->AsIntermediate(); ->AsIntermediate();
std::vector<PMNode*> nodes; std::vector<PMNode*> nodes;
for (int i = 0; i < times_; i++) { for (int i = 0; i < times_; i++) {
nodes.push_back(VarNode("quantized_op_weight" + std::to_string(i)) nodes.push_back(VarNode(string_format("quantized_op_weight%d", i))
->assert_is_op_input(op_type_, weight_name) ->assert_is_op_input(op_type_, weight_name)
->AsInput()); ->AsInput());
nodes.push_back(OpNode("quantized_op" + std::to_string(i), op_type_) nodes.push_back(OpNode(string_format("quantized_op%d", i), op_type_)
->assert_is_op(op_type_) ->assert_is_op(op_type_)
->AsIntermediate()); ->AsIntermediate());
nodes.push_back(VarNode("quantized_op_out" + std::to_string(i)) nodes.push_back(VarNode(string_format("quantized_op_out%d", i))
->assert_is_op_output(op_type_) ->assert_is_op_output(op_type_)
->assert_is_op_input("fake_dequantize_max_abs", "X") ->assert_is_op_input("fake_dequantize_max_abs", "X")
->AsIntermediate()); ->AsIntermediate());
nodes.push_back( nodes.push_back(
OpNode("dequant_op" + std::to_string(i), "fake_dequantize_max_abs") OpNode(string_format("dequant_op%d", i), "fake_dequantize_max_abs")
->assert_is_op("fake_dequantize_max_abs") ->assert_is_op("fake_dequantize_max_abs")
->AsIntermediate()); ->AsIntermediate());
nodes.push_back(VarNode("dequant_op_out" + std::to_string(i)) nodes.push_back(VarNode(string_format("dequant_op_out%d", i))
->assert_is_op_output("fake_dequantize_max_abs", "Out") ->assert_is_op_output("fake_dequantize_max_abs", "Out")
->AsOutput()); ->AsOutput());
} }
...@@ -108,11 +109,11 @@ void QuantDequantOpFuser::InsertNewNode(SSAGraph* graph, ...@@ -108,11 +109,11 @@ void QuantDequantOpFuser::InsertNewNode(SSAGraph* graph,
std::vector<Node*> nodes; std::vector<Node*> nodes;
for (int i = 0; i < times_; i++) { for (int i = 0; i < times_; i++) {
nodes.push_back(matched.at("quantized_op_weight" + std::to_string(i))); nodes.push_back(matched.at(string_format("quantized_op_weight%d", i)));
nodes.push_back(matched.at("quantized_op" + std::to_string(i))); nodes.push_back(matched.at(string_format("quantized_op%d", i)));
nodes.push_back(matched.at("quantized_op_out" + std::to_string(i))); nodes.push_back(matched.at(string_format("quantized_op_out%d", i)));
nodes.push_back(matched.at("dequant_op" + std::to_string(i))); nodes.push_back(matched.at(string_format("dequant_op%d", i)));
nodes.push_back(matched.at("dequant_op_out" + std::to_string(i))); nodes.push_back(matched.at(string_format("dequant_op_out%d", i)));
} }
int bit_length = quant_op->stmt()->op_info()->GetAttr<int>("bit_length"); int bit_length = quant_op->stmt()->op_info()->GetAttr<int>("bit_length");
auto* scope = quant_op->stmt()->op()->scope(); auto* scope = quant_op->stmt()->op()->scope();
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <set> #include <set>
#include <string> #include <string>
#include "paddle/fluid/lite/core/mir/pass_registry.h" #include "paddle/fluid/lite/core/mir/pass_registry.h"
#include "paddle/fluid/lite/utils/string.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -39,7 +40,7 @@ std::string Visualize(mir::SSAGraph* graph) { ...@@ -39,7 +40,7 @@ std::string Visualize(mir::SSAGraph* graph) {
if (node.IsArg()) { if (node.IsArg()) {
key = node.AsArg().name; key = node.AsArg().name;
} else { } else {
key = node.AsStmt().op_type() + std::to_string(id++); key = string_format("%s%d", node.AsStmt().op_type().c_str(), id++);
} }
if (node.IsStmt()) { if (node.IsStmt()) {
......
...@@ -325,7 +325,7 @@ std::string PMPattern::DotString() const { ...@@ -325,7 +325,7 @@ std::string PMPattern::DotString() const {
// Create Nodes // Create Nodes
std::unordered_map<PMNode *, std::string> node2dot; std::unordered_map<PMNode *, std::string> node2dot;
for (const auto &node : nodes()) { for (const auto &node : nodes()) {
std::string node_id = "Node" + std::to_string(id++); std::string node_id = string_format("Node%d", id++);
dot.AddNode(node_id, {}, node->name()); dot.AddNode(node_id, {}, node->name());
node2dot[node.get()] = node_id; node2dot[node.get()] = node_id;
} }
......
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#include "paddle/fluid/lite/core/mir/node.h" #include "paddle/fluid/lite/core/mir/node.h"
#include "paddle/fluid/lite/core/mir/ssa_graph.h" #include "paddle/fluid/lite/core/mir/ssa_graph.h"
#include "paddle/fluid/lite/model_parser/pb/op_desc.h" #include "paddle/fluid/lite/model_parser/pb/op_desc.h"
#include "paddle/fluid/lite/utils/string.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -228,7 +229,7 @@ class PMPattern { ...@@ -228,7 +229,7 @@ class PMPattern {
FRIEND_TEST(PMPattern, NewNode); FRIEND_TEST(PMPattern, NewNode);
#endif #endif
static std::string NewID() { return "pmnode-" + std::to_string(id_++); } static std::string NewID() { return string_format("pmnode-%d", id_++); }
std::vector<std::unique_ptr<PMNode>> nodes_; std::vector<std::unique_ptr<PMNode>> nodes_;
std::vector<edge_t> edges_; std::vector<edge_t> edges_;
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include <vector> #include <vector>
#include "paddle/fluid/lite/core/mir/graph_visualize_pass.h" #include "paddle/fluid/lite/core/mir/graph_visualize_pass.h"
#include "paddle/fluid/lite/core/mir/pass_registry.h" #include "paddle/fluid/lite/core/mir/pass_registry.h"
#include "paddle/fluid/lite/utils/string.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -80,7 +81,7 @@ void TypeTargetTransformPass::AddIoCopyInst( ...@@ -80,7 +81,7 @@ void TypeTargetTransformPass::AddIoCopyInst(
CHECK(in->IsArg()); CHECK(in->IsArg());
auto node_id = [&] { return graph->nodes().size(); }; auto node_id = [&] { return graph->nodes().size(); };
auto io_copy_output_name = auto io_copy_output_name =
in->AsArg().name + "/trans/" + std::to_string(node_id()); string_format("%s/trans/%d", in->AsArg().name.c_str(), node_id());
auto* io_copy_output_arg = graph->NewArgumentNode(io_copy_output_name); auto* io_copy_output_arg = graph->NewArgumentNode(io_copy_output_name);
auto* io_copy_inst = graph->NewInstructNode(); auto* io_copy_inst = graph->NewInstructNode();
......
...@@ -30,6 +30,8 @@ std::vector<std::unique_ptr<KernelBase>> OpLite::CreateKernels( ...@@ -30,6 +30,8 @@ std::vector<std::unique_ptr<KernelBase>> OpLite::CreateKernels(
auto pick_kernel = [&](const Place &place) { auto pick_kernel = [&](const Place &place) {
auto ks = KernelRegistry::Global().Create(op_type_, place.target, auto ks = KernelRegistry::Global().Create(op_type_, place.target,
place.precision, place.layout); place.precision, place.layout);
VLOG(5) << "pick kernel for " << op_info()->Type() << " " << place
<< " get " << ks.size() << " kernels";
for (auto &&it : ks) { for (auto &&it : ks) {
AttachKernel(it.get()); AttachKernel(it.get());
kernels.emplace_back(std::move(it)); kernels.emplace_back(std::move(it));
......
...@@ -62,6 +62,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create( ...@@ -62,6 +62,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
case TARGET(kARM): { case TARGET(kARM): {
CREATE_KERNEL(kARM); CREATE_KERNEL(kARM);
} break; } break;
case TARGET(kOpenCL): {
CREATE_KERNEL(kOpenCL);
} break;
default: default:
CHECK(false) << "not supported kernel target " << TargetToStr(target); CHECK(false) << "not supported kernel target " << TargetToStr(target);
} }
...@@ -99,6 +102,10 @@ KernelRegistry::KernelRegistry() ...@@ -99,6 +102,10 @@ KernelRegistry::KernelRegistry()
INIT_FOR(kARM, kInt8, kNCHW); INIT_FOR(kARM, kInt8, kNCHW);
INIT_FOR(kARM, kAny, kNCHW); INIT_FOR(kARM, kAny, kNCHW);
INIT_FOR(kARM, kAny, kAny); INIT_FOR(kARM, kAny, kAny);
INIT_FOR(kOpenCL, kFloat, kNCHW);
INIT_FOR(kOpenCL, kAny, kNCHW);
INIT_FOR(kOpenCL, kAny, kAny);
#undef INIT_FOR #undef INIT_FOR
} }
......
...@@ -82,6 +82,10 @@ class KernelRegistry final { ...@@ -82,6 +82,10 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kARM), PRECISION(kFloat), KernelRegistryForTarget<TARGET(kARM), PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kARM), PRECISION(kInt8), KernelRegistryForTarget<TARGET(kARM), PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kOpenCL), PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kOpenCL), PRECISION(kInt8),
DATALAYOUT(kNCHW)> * // DATALAYOUT(kNCHW)> * //
>; >;
......
...@@ -52,11 +52,20 @@ class Optimizer { ...@@ -52,11 +52,20 @@ class Optimizer {
RunPasses(std::vector<std::string>{{ RunPasses(std::vector<std::string>{{
"lite_quant_dequant_fuse_pass", // "lite_quant_dequant_fuse_pass", //
"lite_conv_bn_fuse_pass", // "lite_conv_bn_fuse_pass", //
// This pass is disabled to force some opencl kernels selected for final
// running, otherwise, they will be fused to ARM fusion kernels, and the OpenCL
// devices will be discarded.
// TODO(Superjomn) Refine the fusion related design to select fusion kernels for
// devices automatically.
#ifndef LITE_WITH_OPENCL
"lite_conv_elementwise_add_activation_fuse_pass", // "lite_conv_elementwise_add_activation_fuse_pass", //
#endif
"lite_fc_fuse_pass", // "lite_fc_fuse_pass", //
"identity_scale_eliminate_pass", // "identity_scale_eliminate_pass", //
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
#ifndef LITE_WITH_OPENCL
"lite_elementwise_add_activation_fuse_pass", // "lite_elementwise_add_activation_fuse_pass", //
#endif
#endif #endif
"static_kernel_pick_pass", // "static_kernel_pick_pass", //
"variable_place_inference_pass", // "variable_place_inference_pass", //
......
...@@ -140,7 +140,8 @@ class RuntimeProgram { ...@@ -140,7 +140,8 @@ class RuntimeProgram {
void Run() { void Run() {
for (auto& inst : instructions_) { for (auto& inst : instructions_) {
VLOG(3) << ">> Running kernel: " << inst.op()->op_info()->Repr(); VLOG(4) << ">> Running kernel: " << inst.op()->op_info()->Repr()
<< " on Target " << TargetToStr(inst.kernel()->target());
inst.Run(); inst.Run();
} }
} }
......
...@@ -31,6 +31,7 @@ enum class TargetType : int { ...@@ -31,6 +31,7 @@ enum class TargetType : int {
kX86, kX86,
kCUDA, kCUDA,
kARM, kARM,
kOpenCL,
kAny, // any target kAny, // any target
NUM, // number of fields. NUM, // number of fields.
}; };
...@@ -69,8 +70,8 @@ static size_t PrecisionTypeLength(PrecisionType type) { ...@@ -69,8 +70,8 @@ static size_t PrecisionTypeLength(PrecisionType type) {
#define DATALAYOUT(item__) paddle::lite::DataLayoutType::item__ #define DATALAYOUT(item__) paddle::lite::DataLayoutType::item__
static const std::string& TargetToStr(TargetType target) { static const std::string& TargetToStr(TargetType target) {
static const std::string target2string[] = {"unk", "host", "x86", static const std::string target2string[] = {"unk", "host", "x86", "cuda",
"cuda", "arm", "any"}; "arm", "opencl", "any"};
auto x = static_cast<int>(target); auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM))); CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x]; return target2string[x];
...@@ -92,8 +93,8 @@ static const std::string& DataLayoutToStr(DataLayoutType layout) { ...@@ -92,8 +93,8 @@ static const std::string& DataLayoutToStr(DataLayoutType layout) {
} }
static const std::string& TargetRepr(TargetType target) { static const std::string& TargetRepr(TargetType target) {
static const std::string target2string[] = {"kUnk", "kHost", "kX86", "kCUDA", static const std::string target2string[] = {
"kAny"}; "kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny"};
auto x = static_cast<int>(target); auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM))); CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x]; return target2string[x];
......
...@@ -4,5 +4,5 @@ add_subdirectory(host) ...@@ -4,5 +4,5 @@ add_subdirectory(host)
add_subdirectory(arm) add_subdirectory(arm)
add_subdirectory(cuda) add_subdirectory(cuda)
add_subdirectory(x86) add_subdirectory(x86)
add_subdirectory(opencl)
if (NOT LITE_WITH_OPENCL)
return ()
endif()
set(cl_kernel_deps op_params_lite cl_caller cl_engine cl_context cl_wrapper)
cc_library(elementwise_add_opencl SRCS elementwise_add_compute.cc DEPS ${cl_kernel_deps})
lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc DEPS elementwise_add_opencl
op_registry_lite program_lite
context_lite
)
set(opencl_kernels
elementwise_add_opencl
CACHE INTERNAL "")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/operators/op_params.h"
// NOTE ugly here, hide these.
#include "paddle/fluid/lite/opencl/cl_caller.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
class ElementwiseAddCompute
: public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
public:
using param_t = operators::ElementwiseParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<OpenClContext>();
CHECK(context.cl_context());
elementwise_add(
context.cl_context(), static_cast<const float*>(param.X->raw_data()),
param.X->dims(), static_cast<const float*>(param.Y->raw_data()),
param.Y->dims(), param.Out->mutable_data<float>(), param.Out->dims());
}
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(elementwise_add, kOpenCL, kFloat, kNCHW,
paddle::lite::kernels::opencl::ElementwiseAddCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
TEST(elementwise_add, init) {
LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create(
"elementwise_add", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
LOG(INFO) << "get kernel";
lite::Tensor X, Y, Out;
operators::ElementwiseParam param;
param.X = &X;
param.Y = &Y;
param.Out = &Out;
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenClContext>().InitOnce();
kernel->SetParam(param);
kernel->SetContext(std::move(context));
X.Resize({1, 10});
Y.Resize({1, 10});
Out.Resize({1, 10});
auto* x_data = X.mutable_data<float>();
auto* y_data = Y.mutable_data<float>();
auto* out_data = Out.mutable_data<float>();
for (int i = 0; i < 10; i++) {
x_data[i] = 1.1 * i;
y_data[i] = 2.3 * i;
}
kernel->Launch();
for (int i = 0; i < 10; i++) {
EXPECT_NEAR(out_data[i], 3.4 * i, 1e-1);
}
}
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(elementwise_add, kOpenCL, kFloat, kNCHW, def);
...@@ -61,3 +61,6 @@ USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def); ...@@ -61,3 +61,6 @@ USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def);
USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device); USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device);
USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, device_to_host); USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, device_to_host);
#endif #endif
#ifdef LITE_WITH_OPENCL
USE_LITE_KERNEL(elementwise_add, kOpenCL, kFloat, kNCHW, def);
#endif
...@@ -2,18 +2,16 @@ if (NOT LITE_WITH_OPENCL) ...@@ -2,18 +2,16 @@ if (NOT LITE_WITH_OPENCL)
return() return()
endif() endif()
if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) cc_library(cl_wrapper SRCS cl_wrapper.cc)
cc_library(cl_wrapper SRCS cl_wrapper.cc) cc_library(cl_tool SRCS cl_tool.cc)
cc_library(cl_tool SRCS cl_tool.cc) target_compile_options(cl_tool BEFORE PUBLIC -Wno-ignored-qualifiers)
target_compile_options(cl_tool BEFORE PUBLIC -Wno-ignored-qualifiers) cc_library(cl_half SRCS cl_half.cc)
cc_library(cl_half SRCS cl_half.cc) target_compile_options(cl_half BEFORE PUBLIC -fno-strict-aliasing)
target_compile_options(cl_half BEFORE PUBLIC -fno-strict-aliasing) cc_library(cl_engine SRCS cl_engine.cc DEPS cl_tool)
cc_library(cl_engine SRCS cl_engine.cc DEPS cl_tool) cc_library(cl_context SRCS cl_context.cc DEPS cl_engine)
cc_library(cl_context SRCS cl_context.cc DEPS cl_engine) cc_library(cl_helper SRCS cl_helper.cc DEPS cl_context)
cc_library(cl_helper SRCS cl_helper.cc DEPS cl_context) cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS cl_half lite_tensor)
cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS cl_half lite_tensor) cc_library(cl_image SRCS cl_image.cc DEPS cl_half lite_tensor cl_image_converter cl_engine)
cc_library(cl_image SRCS cl_image.cc DEPS cl_half lite_tensor cl_image_converter cl_engine) cc_library(cl_caller SRCS cl_caller.cc DEPS cl_helper cl_image)
cc_library(cl_caller SRCS cl_caller.cc DEPS cl_helper cl_image) lite_cc_test(test_cl_runtime SRCS cl_test.cc DEPS cl_helper cl_image cl_caller cl_wrapper)
lite_cc_test(test_cl_runtime SRCS cl_test.cc DEPS cl_helper cl_image cl_caller cl_wrapper) add_dependencies(cl_tool opencl_clhpp)
add_dependencies(cl_tool opencl_clhpp)
endif()
...@@ -49,12 +49,12 @@ bool InitOpenCLEngine(std::string cl_path) { ...@@ -49,12 +49,12 @@ bool InitOpenCLEngine(std::string cl_path) {
return engine->IsInitSuccess(); return engine->IsInitSuccess();
} }
void elementwise_add(CLContext* context, float* in, const DDim& in_dim, void elementwise_add(CLContext* context, const float* in, const DDim& in_dim,
float* bias, const DDim& bias_dim, float* out, const float* bias, const DDim& bias_dim, float* out,
const DDim& out_dim) { const DDim& out_dim) {
CLHelper helper(context); CLHelper helper(context);
helper.AddKernel("elementwise_add", "elementwise_add_kernel.cl"); helper.AddKernel("elementwise_add", "elementwise_add_kernel.cl");
auto kernel = helper.KernelAt(0); auto kernel = helper.GetKernel(0);
CLImage in_image; CLImage in_image;
in_image.set_tensor_data(in, in_dim); in_image.set_tensor_data(in, in_dim);
in_image.InitNormalCLImage(helper.OpenCLContext()); in_image.InitNormalCLImage(helper.OpenCLContext());
......
...@@ -22,8 +22,13 @@ namespace paddle { ...@@ -22,8 +22,13 @@ namespace paddle {
namespace lite { namespace lite {
bool InitOpenCLEngine(std::string cl_path); bool InitOpenCLEngine(std::string cl_path);
void elementwise_add(CLContext* context, float* in, const DDim& in_dim,
float* bias, const DDim& bias_dim, float* out, /// An elementwise_add method to embed OpenCL logic inside, it is used as a
/// black box so that the framework can remain simple.
/// NOTE Currently, these methods are quite expensive, we will optimize them
/// latter.
void elementwise_add(CLContext* context, const float* in, const DDim& in_dim,
const float* bias, const DDim& bias_dim, float* out,
const DDim& out_dim); const DDim& out_dim);
} // namespace lite } // namespace lite
......
...@@ -29,17 +29,18 @@ void CLHelper::AddKernel(const std::string &kernel_name, ...@@ -29,17 +29,18 @@ void CLHelper::AddKernel(const std::string &kernel_name,
CHECK(context_ != nullptr) << "Please use set_context first!"; CHECK(context_ != nullptr) << "Please use set_context first!";
VLOG(3) << " --- begin to add kernel ---"; VLOG(3) << " --- begin to add kernel ---";
auto kernel = context_->GetKernel(kernel_name, file_name, options); auto kernel = context_->GetKernel(kernel_name, file_name, options);
kernels.emplace_back(std::move(kernel)); kernels_.emplace_back(std::move(kernel));
kernel_offset_[kernel_name] = kernels_.size() - 1;
VLOG(3) << " --- end to add kernel --- "; VLOG(3) << " --- end to add kernel --- ";
} }
cl::Kernel &CLHelper::KernelAt(const int index) { cl::Kernel &CLHelper::GetKernel(const int index) {
VLOG(3) << " --- kernel count: " << kernels.size() << " --- "; VLOG(3) << " --- kernel count: " << kernels_.size() << " --- ";
CHECK(static_cast<size_t>(index) < kernels.size()) CHECK(static_cast<size_t>(index) < kernels_.size())
<< "The index must be less than the size of kernels."; << "The index must be less than the size of kernels.";
CHECK(kernels[index] != nullptr) CHECK(kernels_[index] != nullptr)
<< "The target kernel pointer cannot be null."; << "The target kernel pointer cannot be null.";
return *(kernels[index]); return *(kernels_[index]);
} }
cl::CommandQueue &CLHelper::OpenCLCommandQueue() { cl::CommandQueue &CLHelper::OpenCLCommandQueue() {
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -35,7 +36,12 @@ class CLHelper { ...@@ -35,7 +36,12 @@ class CLHelper {
void AddKernel(const std::string &kernel_name, const std::string &file_name, void AddKernel(const std::string &kernel_name, const std::string &file_name,
const std::string &options = ""); const std::string &options = "");
cl::Kernel &KernelAt(const int index); cl::Kernel &GetKernel(const int index);
cl::Kernel &GetKernel(const std::string &name) {
auto it = kernel_offset_.find(name);
CHECK(it != kernel_offset_.end());
return GetKernel(it->second);
}
cl::CommandQueue &OpenCLCommandQueue(); cl::CommandQueue &OpenCLCommandQueue();
...@@ -45,7 +51,8 @@ class CLHelper { ...@@ -45,7 +51,8 @@ class CLHelper {
private: private:
CLContext *context_{nullptr}; CLContext *context_{nullptr};
std::vector<std::unique_ptr<cl::Kernel>> kernels; std::map<std::string, int> kernel_offset_;
std::vector<std::unique_ptr<cl::Kernel>> kernels_;
}; };
} // namespace lite } // namespace lite
......
...@@ -53,7 +53,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) { ...@@ -53,7 +53,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) {
return os; return os;
} }
void CLImage::set_tensor_data(float* tensor_data, const DDim& dim) { void CLImage::set_tensor_data(const float* tensor_data, const DDim& dim) {
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
auto numel = dim.product(); auto numel = dim.product();
#else #else
......
...@@ -33,7 +33,7 @@ class CLImage { ...@@ -33,7 +33,7 @@ class CLImage {
/* /*
* Will not hold input tensor data, memcpy in this method. * Will not hold input tensor data, memcpy in this method.
* */ * */
void set_tensor_data(float* tensor_data, const DDim& dim); void set_tensor_data(const float* tensor_data, const DDim& dim);
bool IsInit() { return initialized_; } bool IsInit() { return initialized_; }
/* /*
......
...@@ -65,7 +65,7 @@ TEST(cl_test, kernel_test) { ...@@ -65,7 +65,7 @@ TEST(cl_test, kernel_test) {
helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl"); helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
helper->AddKernel("pool_max", "pool_kernel.cl"); helper->AddKernel("pool_max", "pool_kernel.cl");
helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl"); helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
auto kernel = helper->KernelAt(2); auto kernel = helper->GetKernel(2);
std::unique_ptr<float[]> in_data(new float[1024 * 512]); std::unique_ptr<float[]> in_data(new float[1024 * 512]);
for (int i = 0; i < 1024 * 512; i++) { for (int i = 0; i < 1024 * 512; i++) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册