From 0f03e23bc8cf636d15e8bcd730a62abb154e1a7a Mon Sep 17 00:00:00 2001 From: Chunwei Date: Mon, 24 Jun 2019 03:19:46 +0000 Subject: [PATCH] framework support cl --- paddle/fluid/framework/ir/graph_viz_pass.cc | 4 +- paddle/fluid/inference/analysis/dot.h | 8 ++- paddle/fluid/lite/CMakeLists.txt | 8 ++- paddle/fluid/lite/api/CMakeLists.txt | 9 +-- paddle/fluid/lite/api/cxx_api.cc | 3 - paddle/fluid/lite/api/mobilenetv1_test.cc | 30 ++++++-- paddle/fluid/lite/core/context.cc | 4 ++ paddle/fluid/lite/core/context.h | 69 ++++++++++++++++--- .../core/mir/fusion/quant_dequant_op_fuser.cc | 21 +++--- .../lite/core/mir/graph_visualize_pass.cc | 3 +- paddle/fluid/lite/core/mir/pattern_matcher.cc | 2 +- paddle/fluid/lite/core/mir/pattern_matcher.h | 3 +- .../core/mir/type_target_transform_pass.cc | 3 +- paddle/fluid/lite/core/op_lite.cc | 2 + paddle/fluid/lite/core/op_registry.cc | 7 ++ paddle/fluid/lite/core/op_registry.h | 4 ++ paddle/fluid/lite/core/optimizer.h | 17 +++-- paddle/fluid/lite/core/program.h | 3 +- paddle/fluid/lite/core/target_wrapper.h | 9 +-- paddle/fluid/lite/kernels/CMakeLists.txt | 2 +- .../fluid/lite/kernels/opencl/CMakeLists.txt | 16 +++++ .../kernels/opencl/elementwise_add_compute.cc | 53 ++++++++++++++ .../opencl/elementwise_add_compute_test.cc | 66 ++++++++++++++++++ paddle/fluid/lite/kernels/use_kernels.h | 3 + paddle/fluid/lite/opencl/CMakeLists.txt | 28 ++++---- paddle/fluid/lite/opencl/cl_caller.cc | 6 +- paddle/fluid/lite/opencl/cl_caller.h | 9 ++- paddle/fluid/lite/opencl/cl_helper.cc | 13 ++-- paddle/fluid/lite/opencl/cl_helper.h | 11 ++- paddle/fluid/lite/opencl/cl_image.cc | 2 +- paddle/fluid/lite/opencl/cl_image.h | 2 +- paddle/fluid/lite/opencl/cl_test.cc | 2 +- 32 files changed, 340 insertions(+), 82 deletions(-) create mode 100644 paddle/fluid/lite/kernels/opencl/CMakeLists.txt create mode 100644 paddle/fluid/lite/kernels/opencl/elementwise_add_compute.cc create mode 100644 paddle/fluid/lite/kernels/opencl/elementwise_add_compute_test.cc diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index f4df4cfeb..3722aaab1 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/inference/analysis/dot.h" +#include "paddle/fluid/lite/utils/string.h" #include "paddle/fluid/string/printf.h" namespace paddle { @@ -84,7 +85,8 @@ void GraphVizPass::ApplyImpl(ir::Graph* graph) const { auto marked_nodes = ConsumeMarkedNodes(graph); // Create nodes for (const Node* n : graph->Nodes()) { - std::string node_id = FormatName(n) + "(" + std::to_string(n->id()) + ")"; + std::string node_id = + lite::string_format("%s(%d)", FormatName(n).c_str(), n->id()); if (n->IsOp()) { decltype(op_attrs) attr = marked_nodes.count(n) ? marked_op_attrs : op_attrs; diff --git a/paddle/fluid/inference/analysis/dot.h b/paddle/fluid/inference/analysis/dot.h index 1cb790f18..5790d55d7 100644 --- a/paddle/fluid/inference/analysis/dot.h +++ b/paddle/fluid/inference/analysis/dot.h @@ -58,9 +58,11 @@ class Dot { std::vector attrs; Node(const std::string& name, const std::vector& attrs) - : name(name), - attrs(attrs), - id_("node_" + std::to_string(dot_node_counter++)) {} + : name(name), attrs(attrs) { + std::stringstream ss; + ss << "node_" << dot_node_counter++; + id_ = ss.str(); + } std::string id() const { return id_; } diff --git a/paddle/fluid/lite/CMakeLists.txt b/paddle/fluid/lite/CMakeLists.txt index dfd85d00a..9da0973c6 100644 --- a/paddle/fluid/lite/CMakeLists.txt +++ b/paddle/fluid/lite/CMakeLists.txt @@ -37,7 +37,7 @@ endfunction() function (lite_deps TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS ARGS) + set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS ARGS) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps ${lite_deps_DEPS}) @@ -78,6 +78,12 @@ function (lite_deps TARGET) endforeach(var) endif() + if (LITE_WITH_OPENCL) + foreach(var ${lite_deps_CL_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + set(${TARGET} ${deps} PARENT_SCOPE) endfunction() diff --git a/paddle/fluid/lite/api/CMakeLists.txt b/paddle/fluid/lite/api/CMakeLists.txt index 84da4757b..d4094b4dc 100644 --- a/paddle/fluid/lite/api/CMakeLists.txt +++ b/paddle/fluid/lite/api/CMakeLists.txt @@ -52,22 +52,23 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) set(lite_model_test_DEPS cxx_api_lite mir_passes ${ops_lite} ${host_kernels} ${arm_kernels}) lite_cc_test(test_mobilenetv1_lite SRCS mobilenetv1_test.cc - DEPS ${lite_model_test_DEPS} + DEPS ${lite_model_test_DEPS} + CL_DEPS ${opencl_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 SERIAL) add_dependencies(test_mobilenetv1_lite extern_lite_download_mobilenet_v1_tar_gz) lite_cc_test(test_mobilenetv2_lite SRCS mobilenetv2_test.cc - DEPS ${lite_model_test_DEPS} + DEPS ${lite_model_test_DEPS} ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v2 SERIAL) add_dependencies(test_mobilenetv2_lite extern_lite_download_mobilenet_v2_tar_gz) lite_cc_test(test_resnet50_lite SRCS resnet50_test.cc - DEPS ${lite_model_test_DEPS} + DEPS ${lite_model_test_DEPS} ARGS --model_dir=${LITE_MODEL_DIR}/resnet50 SERIAL) add_dependencies(test_resnet50_lite extern_lite_download_resnet50_tar_gz) lite_cc_test(test_inceptionv4_lite SRCS inceptionv4_test.cc - DEPS ${lite_model_test_DEPS} + DEPS ${lite_model_test_DEPS} ARGS --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL) add_dependencies(test_inceptionv4_lite extern_lite_download_inception_v4_tar_gz) endif() diff --git a/paddle/fluid/lite/api/cxx_api.cc b/paddle/fluid/lite/api/cxx_api.cc index e86ddf04d..4db6f99ae 100644 --- a/paddle/fluid/lite/api/cxx_api.cc +++ b/paddle/fluid/lite/api/cxx_api.cc @@ -23,10 +23,7 @@ namespace paddle { namespace lite { void Predictor::SaveModel(const std::string &dir) { -#ifndef LITE_WITH_ARM MkDirRecur(dir); -#else -#endif program_->PersistModel(dir, program_desc_); LOG(INFO) << "Save model to " << dir; } diff --git a/paddle/fluid/lite/api/mobilenetv1_test.cc b/paddle/fluid/lite/api/mobilenetv1_test.cc index 9b7d6dc40..d72fc7690 100644 --- a/paddle/fluid/lite/api/mobilenetv1_test.cc +++ b/paddle/fluid/lite/api/mobilenetv1_test.cc @@ -25,16 +25,13 @@ namespace paddle { namespace lite { -#ifdef LITE_WITH_ARM -TEST(MobileNetV1, test) { +void TestModel(const std::vector& valid_places, + const Place& preferred_place) { DeviceInfo::Init(); DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads); lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}}); - predictor.Build(FLAGS_model_dir, Place{TARGET(kARM), PRECISION(kFloat)}, - valid_places); + predictor.Build(FLAGS_model_dir, preferred_place, valid_places); auto* input_tensor = predictor.GetInput(0); input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); @@ -70,7 +67,26 @@ TEST(MobileNetV1, test) { ASSERT_EQ(out->dims()[0], 1); ASSERT_EQ(out->dims()[1], 1000); } -#endif + +TEST(MobileNetV1, test_arm) { + std::vector valid_places({ + Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kARM), PRECISION(kFloat)}, + // Place{TARGET(kOpenCL), PRECISION(kFloat)}, + }); + + TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); +} + +TEST(MobileNetV1, test_opencl) { + std::vector valid_places({ + Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kARM), PRECISION(kFloat)}, + Place{TARGET(kOpenCL), PRECISION(kFloat)}, + }); + + TestModel(valid_places, Place({TARGET(kOpenCL), PRECISION(kFloat)})); +} } // namespace lite } // namespace paddle diff --git a/paddle/fluid/lite/core/context.cc b/paddle/fluid/lite/core/context.cc index aae36b4a2..97d98a357 100644 --- a/paddle/fluid/lite/core/context.cc +++ b/paddle/fluid/lite/core/context.cc @@ -14,6 +14,10 @@ #include "paddle/fluid/lite/core/context.h" +#ifdef LITE_WITH_OPENCL +DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path."); +#endif + namespace paddle { namespace lite {} // namespace lite } // namespace paddle diff --git a/paddle/fluid/lite/core/context.h b/paddle/fluid/lite/core/context.h index a79ce04fa..55268c4b1 100644 --- a/paddle/fluid/lite/core/context.h +++ b/paddle/fluid/lite/core/context.h @@ -23,6 +23,11 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/device_context.h" #endif +#ifdef LITE_WITH_OPENCL +#include "paddle/fluid/lite/opencl/cl_context.h" +#include "paddle/fluid/lite/opencl/cl_engine.h" +#include "paddle/fluid/lite/opencl/cl_helper.h" +#endif #include #include #include @@ -34,6 +39,10 @@ #include "paddle/fluid/lite/core/target_wrapper.h" #include "paddle/fluid/lite/utils/all.h" +#ifdef LITE_WITH_OPENCL +DECLARE_string(cl_path); +#endif + namespace paddle { namespace lite { @@ -44,6 +53,7 @@ using HostContext = Context; using X86Context = Context; using CUDAContext = Context; using ARMContext = Context; +using OpenClContext = Context; template <> class Context { @@ -51,7 +61,7 @@ class Context { // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() {} - void CopyShared(const HostContext* ctx) {} + void CopySharedTo(const HostContext* ctx) {} std::string name() const { return "HostContext"; } }; @@ -69,7 +79,7 @@ class Context { // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() { DeviceInfo::Init(); } - void CopyShared(const ARMContext* ctx) {} + void CopySharedTo(const ARMContext* ctx) {} void SetRunMode(PowerMode mode, int threads) { return DeviceInfo::Global().SetRunMode(mode, threads); @@ -109,7 +119,7 @@ class Context { cublas_fp32_ = std::make_shared>(); } - void CopyShared(const CUDAContext* ctx) { + void CopySharedTo(const CUDAContext* ctx) { CHECK(ctx); CHECK(cublas_fp32_) << "cublas_fp32 should be set first"; ctx->cublas_fp32_ = cublas_fp32_; @@ -175,7 +185,7 @@ class Context { // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() {} - void CopyShared(const X86Context* ctx) {} + void CopySharedTo(const X86Context* ctx) {} const device_ctx_t* x86_device_context() { return x86_device_context_.get(); } void SetX86DeviceContext(std::unique_ptr&& ctx) { @@ -202,6 +212,40 @@ class Context { }; #endif +#ifdef LITE_WITH_OPENCL +template <> +class Context { + mutable std::shared_ptr cl_context_; + mutable std::shared_ptr cl_helper_; + + public: + CLContext* cl_context() { return cl_context_.get(); } + CLHelper* cl_helper() { return cl_helper_.get(); } + + void InitOnce() { + // Init cl engine. + CHECK(CLEngine::Global()->IsInitSuccess()) << "OpenCL engine init failed"; + CLEngine::Global()->set_cl_path(FLAGS_cl_path); + + cl_context_ = std::make_shared(); + cl_helper_ = std::make_shared(); + cl_helper_->set_context(cl_context_.get()); + + PrepareKernels(); + } + + void CopySharedTo(const OpenClContext* ctx) { + ctx->cl_context_ = cl_context_; + } + + private: + void PrepareKernels() { + cl_helper_->AddKernel("elementwise_add", "elementwise_add_kernel.cl"); + cl_helper_->AddKernel("pool_max", "pool_kernel.cl"); + } +}; +#endif + // Context for running a kernel. // Holds the necessary resource and information. class KernelContext { @@ -230,26 +274,32 @@ class ContextScheduler { std::unique_ptr ctx(new KernelContext); switch (target) { case TARGET(kHost): - kernel_contexts_[TargetType::kHost].As().CopyShared( + kernel_contexts_[TargetType::kHost].As().CopySharedTo( &ctx->As()); break; #ifdef LITE_WITH_X86 case TARGET(kX86): - kernel_contexts_[TargetType::kX86].As().CopyShared( + kernel_contexts_[TargetType::kX86].As().CopySharedTo( &ctx->As()); break; #endif #ifdef LITE_WITH_CUDA case TARGET(kCUDA): - kernel_contexts_[TargetType::kCUDA].As().CopyShared( + kernel_contexts_[TargetType::kCUDA].As().CopySharedTo( &ctx->As()); break; #endif #ifdef LITE_WITH_ARM case TARGET(kARM): - kernel_contexts_[TargetType::kARM].As().CopyShared( + kernel_contexts_[TargetType::kARM].As().CopySharedTo( &ctx->As()); break; +#endif +#ifdef LITE_WITH_OPENCL + case TARGET(kOpenCL): + kernel_contexts_[TargetType::kOpenCL].As().CopySharedTo( + &ctx->As()); + break; #endif default: LOG(FATAL) << "unsupported target " << TargetToStr(target); @@ -273,6 +323,9 @@ class ContextScheduler { #endif #ifdef LITE_WITH_ARM InitContext(); +#endif +#ifdef LITE_WITH_OPENCL + InitContext(); #endif } diff --git a/paddle/fluid/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/paddle/fluid/lite/core/mir/fusion/quant_dequant_op_fuser.cc index 59325ed9d..0c4eb0334 100644 --- a/paddle/fluid/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ b/paddle/fluid/lite/core/mir/fusion/quant_dequant_op_fuser.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/lite/core/mir/fusion/quant_dequant_op_fuser.h" #include #include +#include "paddle/fluid/lite/utils/string.h" namespace paddle { namespace lite { @@ -57,24 +58,24 @@ void QuantDequantOpFuser::BuildPattern() { ->AsIntermediate(); std::vector nodes; for (int i = 0; i < times_; i++) { - nodes.push_back(VarNode("quantized_op_weight" + std::to_string(i)) + nodes.push_back(VarNode(string_format("quantized_op_weight%d", i)) ->assert_is_op_input(op_type_, weight_name) ->AsInput()); - nodes.push_back(OpNode("quantized_op" + std::to_string(i), op_type_) + nodes.push_back(OpNode(string_format("quantized_op%d", i), op_type_) ->assert_is_op(op_type_) ->AsIntermediate()); - nodes.push_back(VarNode("quantized_op_out" + std::to_string(i)) + nodes.push_back(VarNode(string_format("quantized_op_out%d", i)) ->assert_is_op_output(op_type_) ->assert_is_op_input("fake_dequantize_max_abs", "X") ->AsIntermediate()); nodes.push_back( - OpNode("dequant_op" + std::to_string(i), "fake_dequantize_max_abs") + OpNode(string_format("dequant_op%d", i), "fake_dequantize_max_abs") ->assert_is_op("fake_dequantize_max_abs") ->AsIntermediate()); - nodes.push_back(VarNode("dequant_op_out" + std::to_string(i)) + nodes.push_back(VarNode(string_format("dequant_op_out%d", i)) ->assert_is_op_output("fake_dequantize_max_abs", "Out") ->AsOutput()); } @@ -108,11 +109,11 @@ void QuantDequantOpFuser::InsertNewNode(SSAGraph* graph, std::vector nodes; for (int i = 0; i < times_; i++) { - nodes.push_back(matched.at("quantized_op_weight" + std::to_string(i))); - nodes.push_back(matched.at("quantized_op" + std::to_string(i))); - nodes.push_back(matched.at("quantized_op_out" + std::to_string(i))); - nodes.push_back(matched.at("dequant_op" + std::to_string(i))); - nodes.push_back(matched.at("dequant_op_out" + std::to_string(i))); + nodes.push_back(matched.at(string_format("quantized_op_weight%d", i))); + nodes.push_back(matched.at(string_format("quantized_op%d", i))); + nodes.push_back(matched.at(string_format("quantized_op_out%d", i))); + nodes.push_back(matched.at(string_format("dequant_op%d", i))); + nodes.push_back(matched.at(string_format("dequant_op_out%d", i))); } int bit_length = quant_op->stmt()->op_info()->GetAttr("bit_length"); auto* scope = quant_op->stmt()->op()->scope(); diff --git a/paddle/fluid/lite/core/mir/graph_visualize_pass.cc b/paddle/fluid/lite/core/mir/graph_visualize_pass.cc index 90a99b5de..141c1af8e 100644 --- a/paddle/fluid/lite/core/mir/graph_visualize_pass.cc +++ b/paddle/fluid/lite/core/mir/graph_visualize_pass.cc @@ -17,6 +17,7 @@ #include #include #include "paddle/fluid/lite/core/mir/pass_registry.h" +#include "paddle/fluid/lite/utils/string.h" namespace paddle { namespace lite { @@ -39,7 +40,7 @@ std::string Visualize(mir::SSAGraph* graph) { if (node.IsArg()) { key = node.AsArg().name; } else { - key = node.AsStmt().op_type() + std::to_string(id++); + key = string_format("%s%d", node.AsStmt().op_type().c_str(), id++); } if (node.IsStmt()) { diff --git a/paddle/fluid/lite/core/mir/pattern_matcher.cc b/paddle/fluid/lite/core/mir/pattern_matcher.cc index 3cda96c30..6250ee8ee 100644 --- a/paddle/fluid/lite/core/mir/pattern_matcher.cc +++ b/paddle/fluid/lite/core/mir/pattern_matcher.cc @@ -325,7 +325,7 @@ std::string PMPattern::DotString() const { // Create Nodes std::unordered_map node2dot; for (const auto &node : nodes()) { - std::string node_id = "Node" + std::to_string(id++); + std::string node_id = string_format("Node%d", id++); dot.AddNode(node_id, {}, node->name()); node2dot[node.get()] = node_id; } diff --git a/paddle/fluid/lite/core/mir/pattern_matcher.h b/paddle/fluid/lite/core/mir/pattern_matcher.h index 76ed5f1dd..adbe7efd1 100644 --- a/paddle/fluid/lite/core/mir/pattern_matcher.h +++ b/paddle/fluid/lite/core/mir/pattern_matcher.h @@ -30,6 +30,7 @@ #include "paddle/fluid/lite/core/mir/node.h" #include "paddle/fluid/lite/core/mir/ssa_graph.h" #include "paddle/fluid/lite/model_parser/pb/op_desc.h" +#include "paddle/fluid/lite/utils/string.h" namespace paddle { namespace lite { @@ -228,7 +229,7 @@ class PMPattern { FRIEND_TEST(PMPattern, NewNode); #endif - static std::string NewID() { return "pmnode-" + std::to_string(id_++); } + static std::string NewID() { return string_format("pmnode-%d", id_++); } std::vector> nodes_; std::vector edges_; diff --git a/paddle/fluid/lite/core/mir/type_target_transform_pass.cc b/paddle/fluid/lite/core/mir/type_target_transform_pass.cc index 951e3423e..93ac94b38 100644 --- a/paddle/fluid/lite/core/mir/type_target_transform_pass.cc +++ b/paddle/fluid/lite/core/mir/type_target_transform_pass.cc @@ -20,6 +20,7 @@ #include #include "paddle/fluid/lite/core/mir/graph_visualize_pass.h" #include "paddle/fluid/lite/core/mir/pass_registry.h" +#include "paddle/fluid/lite/utils/string.h" namespace paddle { namespace lite { @@ -80,7 +81,7 @@ void TypeTargetTransformPass::AddIoCopyInst( CHECK(in->IsArg()); auto node_id = [&] { return graph->nodes().size(); }; auto io_copy_output_name = - in->AsArg().name + "/trans/" + std::to_string(node_id()); + string_format("%s/trans/%d", in->AsArg().name.c_str(), node_id()); auto* io_copy_output_arg = graph->NewArgumentNode(io_copy_output_name); auto* io_copy_inst = graph->NewInstructNode(); diff --git a/paddle/fluid/lite/core/op_lite.cc b/paddle/fluid/lite/core/op_lite.cc index 31c339a5e..bbcf3d309 100644 --- a/paddle/fluid/lite/core/op_lite.cc +++ b/paddle/fluid/lite/core/op_lite.cc @@ -30,6 +30,8 @@ std::vector> OpLite::CreateKernels( auto pick_kernel = [&](const Place &place) { auto ks = KernelRegistry::Global().Create(op_type_, place.target, place.precision, place.layout); + VLOG(5) << "pick kernel for " << op_info()->Type() << " " << place + << " get " << ks.size() << " kernels"; for (auto &&it : ks) { AttachKernel(it.get()); kernels.emplace_back(std::move(it)); diff --git a/paddle/fluid/lite/core/op_registry.cc b/paddle/fluid/lite/core/op_registry.cc index 131bdc641..8d91072b0 100644 --- a/paddle/fluid/lite/core/op_registry.cc +++ b/paddle/fluid/lite/core/op_registry.cc @@ -62,6 +62,9 @@ std::list> KernelRegistry::Create( case TARGET(kARM): { CREATE_KERNEL(kARM); } break; + case TARGET(kOpenCL): { + CREATE_KERNEL(kOpenCL); + } break; default: CHECK(false) << "not supported kernel target " << TargetToStr(target); } @@ -99,6 +102,10 @@ KernelRegistry::KernelRegistry() INIT_FOR(kARM, kInt8, kNCHW); INIT_FOR(kARM, kAny, kNCHW); INIT_FOR(kARM, kAny, kAny); + + INIT_FOR(kOpenCL, kFloat, kNCHW); + INIT_FOR(kOpenCL, kAny, kNCHW); + INIT_FOR(kOpenCL, kAny, kAny); #undef INIT_FOR } diff --git a/paddle/fluid/lite/core/op_registry.h b/paddle/fluid/lite/core/op_registry.h index fc4cd25fa..8f670faa0 100644 --- a/paddle/fluid/lite/core/op_registry.h +++ b/paddle/fluid/lite/core/op_registry.h @@ -82,6 +82,10 @@ class KernelRegistry final { KernelRegistryForTarget *, // KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget * // >; diff --git a/paddle/fluid/lite/core/optimizer.h b/paddle/fluid/lite/core/optimizer.h index c42699ff1..34666cc54 100644 --- a/paddle/fluid/lite/core/optimizer.h +++ b/paddle/fluid/lite/core/optimizer.h @@ -50,13 +50,22 @@ class Optimizer { if (passes.empty()) { RunPasses(std::vector{{ - "lite_quant_dequant_fuse_pass", // - "lite_conv_bn_fuse_pass", // + "lite_quant_dequant_fuse_pass", // + "lite_conv_bn_fuse_pass", // +// This pass is disabled to force some opencl kernels selected for final +// running, otherwise, they will be fused to ARM fusion kernels, and the OpenCL +// devices will be discarded. +// TODO(Superjomn) Refine the fusion related design to select fusion kernels for +// devices automatically. +#ifndef LITE_WITH_OPENCL "lite_conv_elementwise_add_activation_fuse_pass", // - "lite_fc_fuse_pass", // - "identity_scale_eliminate_pass", // +#endif + "lite_fc_fuse_pass", // + "identity_scale_eliminate_pass", // #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK +#ifndef LITE_WITH_OPENCL "lite_elementwise_add_activation_fuse_pass", // +#endif #endif "static_kernel_pick_pass", // "variable_place_inference_pass", // diff --git a/paddle/fluid/lite/core/program.h b/paddle/fluid/lite/core/program.h index 5aef3c280..6c997e579 100644 --- a/paddle/fluid/lite/core/program.h +++ b/paddle/fluid/lite/core/program.h @@ -140,7 +140,8 @@ class RuntimeProgram { void Run() { for (auto& inst : instructions_) { - VLOG(3) << ">> Running kernel: " << inst.op()->op_info()->Repr(); + VLOG(4) << ">> Running kernel: " << inst.op()->op_info()->Repr() + << " on Target " << TargetToStr(inst.kernel()->target()); inst.Run(); } } diff --git a/paddle/fluid/lite/core/target_wrapper.h b/paddle/fluid/lite/core/target_wrapper.h index 66fbc6522..1d395b569 100644 --- a/paddle/fluid/lite/core/target_wrapper.h +++ b/paddle/fluid/lite/core/target_wrapper.h @@ -31,6 +31,7 @@ enum class TargetType : int { kX86, kCUDA, kARM, + kOpenCL, kAny, // any target NUM, // number of fields. }; @@ -69,8 +70,8 @@ static size_t PrecisionTypeLength(PrecisionType type) { #define DATALAYOUT(item__) paddle::lite::DataLayoutType::item__ static const std::string& TargetToStr(TargetType target) { - static const std::string target2string[] = {"unk", "host", "x86", - "cuda", "arm", "any"}; + static const std::string target2string[] = {"unk", "host", "x86", "cuda", + "arm", "opencl", "any"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -92,8 +93,8 @@ static const std::string& DataLayoutToStr(DataLayoutType layout) { } static const std::string& TargetRepr(TargetType target) { - static const std::string target2string[] = {"kUnk", "kHost", "kX86", "kCUDA", - "kAny"}; + static const std::string target2string[] = { + "kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; diff --git a/paddle/fluid/lite/kernels/CMakeLists.txt b/paddle/fluid/lite/kernels/CMakeLists.txt index 0d2178382..1fad136cb 100644 --- a/paddle/fluid/lite/kernels/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/CMakeLists.txt @@ -4,5 +4,5 @@ add_subdirectory(host) add_subdirectory(arm) add_subdirectory(cuda) add_subdirectory(x86) - +add_subdirectory(opencl) diff --git a/paddle/fluid/lite/kernels/opencl/CMakeLists.txt b/paddle/fluid/lite/kernels/opencl/CMakeLists.txt new file mode 100644 index 000000000..179882628 --- /dev/null +++ b/paddle/fluid/lite/kernels/opencl/CMakeLists.txt @@ -0,0 +1,16 @@ +if (NOT LITE_WITH_OPENCL) + return () +endif() + +set(cl_kernel_deps op_params_lite cl_caller cl_engine cl_context cl_wrapper) + +cc_library(elementwise_add_opencl SRCS elementwise_add_compute.cc DEPS ${cl_kernel_deps}) + +lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc DEPS elementwise_add_opencl + op_registry_lite program_lite + context_lite + ) + +set(opencl_kernels + elementwise_add_opencl + CACHE INTERNAL "") diff --git a/paddle/fluid/lite/kernels/opencl/elementwise_add_compute.cc b/paddle/fluid/lite/kernels/opencl/elementwise_add_compute.cc new file mode 100644 index 000000000..c5230cbaf --- /dev/null +++ b/paddle/fluid/lite/kernels/opencl/elementwise_add_compute.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/operators/op_params.h" +// NOTE ugly here, hide these. +#include "paddle/fluid/lite/opencl/cl_caller.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace opencl { + +class ElementwiseAddCompute + : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto& context = ctx_->As(); + CHECK(context.cl_context()); + + elementwise_add( + context.cl_context(), static_cast(param.X->raw_data()), + param.X->dims(), static_cast(param.Y->raw_data()), + param.Y->dims(), param.Out->mutable_data(), param.Out->dims()); + } +}; + +} // namespace opencl +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(elementwise_add, kOpenCL, kFloat, kNCHW, + paddle::lite::kernels::opencl::ElementwiseAddCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/opencl/elementwise_add_compute_test.cc b/paddle/fluid/lite/kernels/opencl/elementwise_add_compute_test.cc new file mode 100644 index 000000000..9f382244a --- /dev/null +++ b/paddle/fluid/lite/kernels/opencl/elementwise_add_compute_test.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { + +TEST(elementwise_add, init) { + LOG(INFO) << "to get kernel ..."; + auto kernels = KernelRegistry::Global().Create( + "elementwise_add", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)); + ASSERT_FALSE(kernels.empty()); + + auto kernel = std::move(kernels.front()); + + LOG(INFO) << "get kernel"; + + lite::Tensor X, Y, Out; + operators::ElementwiseParam param; + param.X = &X; + param.Y = &Y; + param.Out = &Out; + + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + kernel->SetParam(param); + kernel->SetContext(std::move(context)); + + X.Resize({1, 10}); + Y.Resize({1, 10}); + Out.Resize({1, 10}); + + auto* x_data = X.mutable_data(); + auto* y_data = Y.mutable_data(); + auto* out_data = Out.mutable_data(); + + for (int i = 0; i < 10; i++) { + x_data[i] = 1.1 * i; + y_data[i] = 2.3 * i; + } + + kernel->Launch(); + + for (int i = 0; i < 10; i++) { + EXPECT_NEAR(out_data[i], 3.4 * i, 1e-1); + } +} + +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(elementwise_add, kOpenCL, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/use_kernels.h b/paddle/fluid/lite/kernels/use_kernels.h index 3e32f05da..df39ab98e 100644 --- a/paddle/fluid/lite/kernels/use_kernels.h +++ b/paddle/fluid/lite/kernels/use_kernels.h @@ -61,3 +61,6 @@ USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def); USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device); USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, device_to_host); #endif +#ifdef LITE_WITH_OPENCL +USE_LITE_KERNEL(elementwise_add, kOpenCL, kFloat, kNCHW, def); +#endif diff --git a/paddle/fluid/lite/opencl/CMakeLists.txt b/paddle/fluid/lite/opencl/CMakeLists.txt index b0d20bef8..39282aeff 100644 --- a/paddle/fluid/lite/opencl/CMakeLists.txt +++ b/paddle/fluid/lite/opencl/CMakeLists.txt @@ -2,18 +2,16 @@ if (NOT LITE_WITH_OPENCL) return() endif() -if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - cc_library(cl_wrapper SRCS cl_wrapper.cc) - cc_library(cl_tool SRCS cl_tool.cc) - target_compile_options(cl_tool BEFORE PUBLIC -Wno-ignored-qualifiers) - cc_library(cl_half SRCS cl_half.cc) - target_compile_options(cl_half BEFORE PUBLIC -fno-strict-aliasing) - cc_library(cl_engine SRCS cl_engine.cc DEPS cl_tool) - cc_library(cl_context SRCS cl_context.cc DEPS cl_engine) - cc_library(cl_helper SRCS cl_helper.cc DEPS cl_context) - cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS cl_half lite_tensor) - cc_library(cl_image SRCS cl_image.cc DEPS cl_half lite_tensor cl_image_converter cl_engine) - cc_library(cl_caller SRCS cl_caller.cc DEPS cl_helper cl_image) - lite_cc_test(test_cl_runtime SRCS cl_test.cc DEPS cl_helper cl_image cl_caller cl_wrapper) - add_dependencies(cl_tool opencl_clhpp) -endif() +cc_library(cl_wrapper SRCS cl_wrapper.cc) +cc_library(cl_tool SRCS cl_tool.cc) +target_compile_options(cl_tool BEFORE PUBLIC -Wno-ignored-qualifiers) +cc_library(cl_half SRCS cl_half.cc) +target_compile_options(cl_half BEFORE PUBLIC -fno-strict-aliasing) +cc_library(cl_engine SRCS cl_engine.cc DEPS cl_tool) +cc_library(cl_context SRCS cl_context.cc DEPS cl_engine) +cc_library(cl_helper SRCS cl_helper.cc DEPS cl_context) +cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS cl_half lite_tensor) +cc_library(cl_image SRCS cl_image.cc DEPS cl_half lite_tensor cl_image_converter cl_engine) +cc_library(cl_caller SRCS cl_caller.cc DEPS cl_helper cl_image) +lite_cc_test(test_cl_runtime SRCS cl_test.cc DEPS cl_helper cl_image cl_caller cl_wrapper) +add_dependencies(cl_tool opencl_clhpp) diff --git a/paddle/fluid/lite/opencl/cl_caller.cc b/paddle/fluid/lite/opencl/cl_caller.cc index fbb970dea..6db41aadb 100644 --- a/paddle/fluid/lite/opencl/cl_caller.cc +++ b/paddle/fluid/lite/opencl/cl_caller.cc @@ -49,12 +49,12 @@ bool InitOpenCLEngine(std::string cl_path) { return engine->IsInitSuccess(); } -void elementwise_add(CLContext* context, float* in, const DDim& in_dim, - float* bias, const DDim& bias_dim, float* out, +void elementwise_add(CLContext* context, const float* in, const DDim& in_dim, + const float* bias, const DDim& bias_dim, float* out, const DDim& out_dim) { CLHelper helper(context); helper.AddKernel("elementwise_add", "elementwise_add_kernel.cl"); - auto kernel = helper.KernelAt(0); + auto kernel = helper.GetKernel(0); CLImage in_image; in_image.set_tensor_data(in, in_dim); in_image.InitNormalCLImage(helper.OpenCLContext()); diff --git a/paddle/fluid/lite/opencl/cl_caller.h b/paddle/fluid/lite/opencl/cl_caller.h index ed9b879fa..221c52f65 100644 --- a/paddle/fluid/lite/opencl/cl_caller.h +++ b/paddle/fluid/lite/opencl/cl_caller.h @@ -22,8 +22,13 @@ namespace paddle { namespace lite { bool InitOpenCLEngine(std::string cl_path); -void elementwise_add(CLContext* context, float* in, const DDim& in_dim, - float* bias, const DDim& bias_dim, float* out, + +/// An elementwise_add method to embed OpenCL logic inside, it is used as a +/// black box so that the framework can remain simple. +/// NOTE Currently, these methods are quite expensive, we will optimize them +/// latter. +void elementwise_add(CLContext* context, const float* in, const DDim& in_dim, + const float* bias, const DDim& bias_dim, float* out, const DDim& out_dim); } // namespace lite diff --git a/paddle/fluid/lite/opencl/cl_helper.cc b/paddle/fluid/lite/opencl/cl_helper.cc index 116828c15..7e8ffa623 100644 --- a/paddle/fluid/lite/opencl/cl_helper.cc +++ b/paddle/fluid/lite/opencl/cl_helper.cc @@ -29,17 +29,18 @@ void CLHelper::AddKernel(const std::string &kernel_name, CHECK(context_ != nullptr) << "Please use set_context first!"; VLOG(3) << " --- begin to add kernel ---"; auto kernel = context_->GetKernel(kernel_name, file_name, options); - kernels.emplace_back(std::move(kernel)); + kernels_.emplace_back(std::move(kernel)); + kernel_offset_[kernel_name] = kernels_.size() - 1; VLOG(3) << " --- end to add kernel --- "; } -cl::Kernel &CLHelper::KernelAt(const int index) { - VLOG(3) << " --- kernel count: " << kernels.size() << " --- "; - CHECK(static_cast(index) < kernels.size()) +cl::Kernel &CLHelper::GetKernel(const int index) { + VLOG(3) << " --- kernel count: " << kernels_.size() << " --- "; + CHECK(static_cast(index) < kernels_.size()) << "The index must be less than the size of kernels."; - CHECK(kernels[index] != nullptr) + CHECK(kernels_[index] != nullptr) << "The target kernel pointer cannot be null."; - return *(kernels[index]); + return *(kernels_[index]); } cl::CommandQueue &CLHelper::OpenCLCommandQueue() { diff --git a/paddle/fluid/lite/opencl/cl_helper.h b/paddle/fluid/lite/opencl/cl_helper.h index f6f89fb6f..6f9a28ced 100644 --- a/paddle/fluid/lite/opencl/cl_helper.h +++ b/paddle/fluid/lite/opencl/cl_helper.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include #include @@ -35,7 +36,12 @@ class CLHelper { void AddKernel(const std::string &kernel_name, const std::string &file_name, const std::string &options = ""); - cl::Kernel &KernelAt(const int index); + cl::Kernel &GetKernel(const int index); + cl::Kernel &GetKernel(const std::string &name) { + auto it = kernel_offset_.find(name); + CHECK(it != kernel_offset_.end()); + return GetKernel(it->second); + } cl::CommandQueue &OpenCLCommandQueue(); @@ -45,7 +51,8 @@ class CLHelper { private: CLContext *context_{nullptr}; - std::vector> kernels; + std::map kernel_offset_; + std::vector> kernels_; }; } // namespace lite diff --git a/paddle/fluid/lite/opencl/cl_image.cc b/paddle/fluid/lite/opencl/cl_image.cc index 2c551d6cd..04dfc07b9 100644 --- a/paddle/fluid/lite/opencl/cl_image.cc +++ b/paddle/fluid/lite/opencl/cl_image.cc @@ -53,7 +53,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) { return os; } -void CLImage::set_tensor_data(float* tensor_data, const DDim& dim) { +void CLImage::set_tensor_data(const float* tensor_data, const DDim& dim) { #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK auto numel = dim.product(); #else diff --git a/paddle/fluid/lite/opencl/cl_image.h b/paddle/fluid/lite/opencl/cl_image.h index 627e50316..006bc10c4 100644 --- a/paddle/fluid/lite/opencl/cl_image.h +++ b/paddle/fluid/lite/opencl/cl_image.h @@ -33,7 +33,7 @@ class CLImage { /* * Will not hold input tensor data, memcpy in this method. * */ - void set_tensor_data(float* tensor_data, const DDim& dim); + void set_tensor_data(const float* tensor_data, const DDim& dim); bool IsInit() { return initialized_; } /* diff --git a/paddle/fluid/lite/opencl/cl_test.cc b/paddle/fluid/lite/opencl/cl_test.cc index 4a4ac965c..39392a8f1 100644 --- a/paddle/fluid/lite/opencl/cl_test.cc +++ b/paddle/fluid/lite/opencl/cl_test.cc @@ -65,7 +65,7 @@ TEST(cl_test, kernel_test) { helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl"); helper->AddKernel("pool_max", "pool_kernel.cl"); helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl"); - auto kernel = helper->KernelAt(2); + auto kernel = helper->GetKernel(2); std::unique_ptr in_data(new float[1024 * 512]); for (int i = 0; i < 1024 * 512; i++) { -- GitLab