diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 385921f704cf48c6c6a463c6800b4ec992f73084..64592d73e1741c2bc93a2c90b58b1824b2c887f9 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -904,6 +904,16 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope, this->InferShape(&infer_shape_ctx); } +std::vector* OperatorWithKernel::GetKernelConfig( + const OpKernelType& key) const { + auto config_iter = kernel_configs_map_.find(key); + std::vector* kernel_configs = nullptr; + if (config_iter != kernel_configs_map_.end()) { + kernel_configs = &(config_iter->second); + } + return kernel_configs; +} + void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { RuntimeContext ctx(Inputs(), Outputs(), scope); @@ -940,11 +950,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, KernelTypeToString(expected_kernel_key)); } - auto config_iter = kernel_configs_map_.find(expected_kernel_key); - std::vector* kernel_configs = nullptr; - if (config_iter != kernel_configs_map_.end()) { - kernel_configs = &(config_iter->second); - } + std::vector* kernel_configs = + GetKernelConfig(expected_kernel_key); // do data transformScope &transfer_scope; std::vector transfered_inplace_vars; diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 8109739caeffe1e50e9c6a8261550fae4c16a7fc..8a86813e9362d7b82c2023428a35a1982adb0508 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -28,6 +28,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor.h" @@ -184,98 +185,6 @@ class OperatorBase { const platform::Place& place) const = 0; }; -template -class AlgorithmsCache { - public: - AlgorithmsCache() : search_times_(0) { hash_.clear(); } - // Caches the best algorithm for a given - // combination of tensor dimensions & compute data type. - TAlgorithm GetAlgorithm( - const std::vector& dims1, const std::vector& dims2, - const std::vector& strides, const std::vector& paddings, - const std::vector& dilations, - int algorithmFlags, // can set for different data type - std::function gen_func); - - TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags, - std::function gen_func); - - private: - std::unordered_map hash_; - int search_times_; -}; - -template -TAlgorithm framework::AlgorithmsCache::GetAlgorithm( - const std::vector& dims1, const std::vector& dims2, - const std::vector& strides, const std::vector& paddings, - const std::vector& dilations, int algorithmFlags, - std::function gen_func) { - int64_t seed = 0; - // Hash all of the inputs, use to try and look up a previously - // discovered algorithm, or fall back to generating a new one. 
- std::hash hashFn; - // do hash like boost - // https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x - for (const auto num : dims1) { - seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2); - } - - for (const auto num : dims2) { - seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1; - } - - for (const auto num : strides) { - seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + - (seed >> 2) + 2; - } - - for (const auto num : paddings) { - seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + - (seed >> 2) + 3; - } - - for (const auto num : dilations) { - seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + - (seed >> 2) + 4; - } - - seed ^= hashFn(static_cast(algorithmFlags)) + 0x9e3779b9 + - (seed << 6) + (seed >> 2) + 5; - - if (seed == 0) return gen_func(); - - if (hash_.find(seed) == hash_.end()) { - TAlgorithm value = gen_func(); - hash_[seed] = value; - } - return hash_[seed]; -} - -template -TAlgorithm AlgorithmsCache::GetAlgorithm( - int64_t area, int search_times, int algorithmFlags, - std::function gen_func) { - if (hash_.find(area) != hash_.end()) { - return hash_[area]; - } - if (search_times_ < search_times) { - auto algo = gen_func(); - hash_[area] = algo; - ++search_times_; - return algo; - } - TAlgorithm algo; - int64_t min = static_cast(INT_MAX); - for (const auto& m : hash_) { - if (m.first < min) { - min = m.first; - algo = m.second; - } - } - return algo; -} - #ifdef PADDLE_WITH_CUDA using KernelConfig = boost::variant< std::shared_ptr>, @@ -602,6 +511,8 @@ class OperatorWithKernel : public OperatorBase { virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; + std::vector* GetKernelConfig(const OpKernelType& key) const; + protected: virtual OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, diff --git a/paddle/fluid/framework/operator_kernel_configs.h b/paddle/fluid/framework/operator_kernel_configs.h new file 
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <algorithm>
#include <cstdint>
#include <functional>
#include <unordered_map>
#include <utility>
#include <vector>

namespace paddle {
namespace framework {

// Not thread-safe. Should be owned per-kernel.
//
// Memoizes the result of an algorithm search (`gen_func`) so that the search
// is not repeated for inputs that were already seen. TAlgorithm must be
// copy-constructible, since results are returned by value and a copy is kept
// in the cache.
template <typename TAlgorithm>
class AlgorithmsCache {
 public:
  AlgorithmsCache() = default;

  // Caches the best algorithm for a given
  // combination of tensor dimensions & compute data type.
  //
  // All arguments except gen_func are hashed into a single cache key. On a
  // miss, gen_func() is invoked once and its result is stored under that key.
  // A key that happens to hash to 0 is never cached: gen_func() is invoked on
  // every such call.
  TAlgorithm GetAlgorithm(
      const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
      const std::vector<int>& strides, const std::vector<int>& paddings,
      const std::vector<int>& dilations,
      int algorithmFlags,  // can set for different data type
      std::function<TAlgorithm()> gen_func);

  // Caches algorithms keyed by `area`, running at most `search_times`
  // searches over the lifetime of this cache.
  //
  //  * If `area` is already cached, the cached algorithm is returned.
  //  * Otherwise, while fewer than `search_times` searches have been run,
  //    gen_func() is invoked, its result cached under `area` and returned.
  //  * Otherwise the algorithm cached under the smallest key is returned.
  //    If nothing is cached at all, gen_func() is invoked and its result is
  //    returned without being cached.
  //
  // `algorithmFlags` is accepted for symmetry with the overload above but is
  // not part of the cache key.
  TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags,
                          std::function<TAlgorithm()> gen_func);

 private:
  using CacheMap = std::unordered_map<int64_t, TAlgorithm>;

  // Folds `value` into `*seed` in the style of boost::hash_combine
  // (https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x).
  // `salt` distinguishes the argument group the value came from. The
  // arithmetic is done on an unsigned seed on purpose: shifting a negative
  // signed value left is undefined behaviour before C++20.
  static void HashCombine(uint64_t* seed, int64_t value, uint64_t salt) {
    const uint64_t hashed =
        static_cast<uint64_t>(std::hash<int64_t>()(value));
    *seed ^= hashed + 0x9e3779b9ULL + (*seed << 6) + (*seed >> 2) + salt;
  }

  CacheMap hash_;
  int search_times_ = 0;
};

template <typename TAlgorithm>
TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
    const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
    const std::vector<int>& strides, const std::vector<int>& paddings,
    const std::vector<int>& dilations, int algorithmFlags,
    std::function<TAlgorithm()> gen_func) {
  // Hash all of the inputs, use to try and look up a previously
  // discovered algorithm, or fall back to generating a new one.
  uint64_t seed = 0;
  for (const auto num : dims1) {
    HashCombine(&seed, num, 0);
  }
  for (const auto num : dims2) {
    HashCombine(&seed, num, 1);
  }
  for (const auto num : strides) {
    HashCombine(&seed, static_cast<int64_t>(num), 2);
  }
  for (const auto num : paddings) {
    HashCombine(&seed, static_cast<int64_t>(num), 3);
  }
  for (const auto num : dilations) {
    HashCombine(&seed, static_cast<int64_t>(num), 4);
  }
  HashCombine(&seed, static_cast<int64_t>(algorithmFlags), 5);

  const int64_t key = static_cast<int64_t>(seed);
  if (key == 0) return gen_func();

  // Single lookup on a hit; gen_func() runs before the insertion, so nothing
  // is stored if it throws.
  auto cached = hash_.find(key);
  if (cached == hash_.end()) {
    cached = hash_.emplace(key, gen_func()).first;
  }
  return cached->second;
}

template <typename TAlgorithm>
TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
    int64_t area, int search_times, int algorithmFlags,
    std::function<TAlgorithm()> gen_func) {
  (void)algorithmFlags;  // not part of the key, see the declaration

  const auto hit = hash_.find(area);
  if (hit != hash_.end()) {
    return hit->second;
  }

  if (search_times_ < search_times) {
    auto algo = gen_func();
    hash_.emplace(area, algo);
    ++search_times_;
    return algo;
  }

  // Search budget exhausted. With nothing cached there is no algorithm to
  // reuse, so search once instead of returning an indeterminate value.
  if (hash_.empty()) {
    return gen_func();
  }

  // Reuse the algorithm that was found for the smallest cached key.
  const auto smallest = std::min_element(
      hash_.begin(), hash_.end(),
      [](const typename CacheMap::value_type& lhs,
         const typename CacheMap::value_type& rhs) {
        return lhs.first < rhs.first;
      });
  return smallest->second;
}

}  // namespace framework
}  // namespace paddle
ctx(ctx), func(func), dev_ctx(dev_ctx) {} + platform::DeviceContext* dev_ctx, + std::vector* kernel_configs) + : op(op), + ctx(ctx), + func(func), + dev_ctx(dev_ctx), + kernel_configs(kernel_configs) {} static PreparedOp Prepare(const framework::RuntimeContext& ctx, const framework::OperatorWithKernel& op, @@ -84,7 +89,9 @@ class PreparedOp { PADDLE_THROW("op %s does not have kernel for %s", op.Type(), KernelTypeToString(expected_kernel_key)); } - return PreparedOp(op, ctx, kernel_iter->second, dev_ctx); + std::vector* kernel_configs = + op.GetKernelConfig(expected_kernel_key); + return PreparedOp(op, ctx, kernel_iter->second, dev_ctx, kernel_configs); } inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx; } @@ -93,6 +100,7 @@ class PreparedOp { const framework::RuntimeContext& ctx; framework::OperatorWithKernel::OpKernelFunc func; platform::DeviceContext* dev_ctx; + std::vector* kernel_configs; }; class OpBase; diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 1982fdb1c79b1eb1547835d1cfaac64c2f7fb5ac..a77c842bd8958ba55f0927b3dc2999be9bb34ba5 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -138,8 +138,9 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, op->place_ = GetExpectedPlace(expected_place, inputs); PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_); prepared_op.op.RuntimeInferShape(scope, op->place_, ctx); - prepared_op.func(framework::ExecutionContext( - prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx, nullptr)); + prepared_op.func( + framework::ExecutionContext(prepared_op.op, scope, *prepared_op.dev_ctx, + prepared_op.ctx, prepared_op.kernel_configs)); if (!stop_gradient) { std::unique_ptr> grad_to_var( diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index 705ce41a3ff869d1ac1bfe89790d55e964940db2..64152829b4f000e545054e528edca33dfe96ec56 100644 --- 
a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -154,8 +154,6 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { algo = algo_cache.GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0, search_func); } else { - // Cache searched algo in Var(kCUDNNFwdAlgoCache). - // all conv ops use the same kCUDNNFwdAlgoCache variable. algo = algo_cache.GetAlgorithm(x_dims, f_dims, strides, paddings, dilations, 0, search_func); }