Commit 9cf7ddc8 authored by J jingqinghe
@@ -34,7 +34,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
  set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
  if(NOT LITE_GIT_TAG)
-    set(LITE_GIT_TAG 42ab4d559f6659edfc35040fb30fdcec3dc3f8aa)
    set(LITE_GIT_TAG dfdfa6440c83bf0b415f9f5a9ff84842ce0bb0fa)
  endif()
  if(NOT CUDA_ARCH_NAME)
......
@@ -385,7 +385,11 @@ function(cc_test_run TARGET_NAME)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
    # No unit test should exceed 2 minutes.
-    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
    if (APPLE OR WIN32)
      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
    else()
      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
    endif()
  endif()
endfunction()
@@ -742,9 +746,14 @@ function(py_test TARGET_NAME)
      ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
    endif()
    if (APPLE OR WIN32)
      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
    else()
      # No unit test should exceed 2 minutes in Linux.
      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
    endif()
-    # No unit test should exceed 2 minutes.
-    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
  endif()
endfunction()
......
@@ -189,7 +189,7 @@ copy(inference_lib_dist
        SRCS ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
        DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/internal)
copy(inference_lib_dist
-        SRCS ${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io/crypto/cipher.h
        SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io/crypto/cipher.h
        DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/crypto/)
include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io)
......
@@ -154,10 +154,17 @@ func (config *AnalysisConfig) EnableMkldnnQuantizer() {
	C.PD_EnableMkldnnQuantizer(config.c)
}

func (config *AnalysisConfig) EnableMkldnnBfloat16() {
	C.PD_EnableMkldnnBfloat16(config.c)
}

func (config *AnalysisConfig) MkldnnQuantizerEnabled() bool {
	return ConvertCBooleanToGo(C.PD_MkldnnQuantizerEnabled(config.c))
}

func (config *AnalysisConfig) MkldnnBfloat16Enabled() bool {
	return ConvertCBooleanToGo(C.PD_MkldnnBfloat16Enabled(config.c))
}

// SetModelBuffer
// ModelFromMemory
......
@@ -119,7 +119,7 @@ cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_
cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
        framework_proto selected_rows data_device_transform data_type_transform data_layout_transform)

-cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
cc_library(attribute SRCS attribute.cc DEPS framework_proto boost enforce)
cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
        device_context)
......
@@ -1170,6 +1170,21 @@ void FleetWrapper::LoadModelOneTable(const uint64_t table_id,
#endif
}
void FleetWrapper::LoadWithWhitelist(const uint64_t table_id,
const std::string& path, const int mode) {
#ifdef PADDLE_WITH_PSLIB
auto ret = pslib_ptr_->_worker_ptr->load_with_whitelist(table_id, path,
std::to_string(mode));
ret.wait();
if (ret.get() != 0) {
LOG(ERROR) << "load model of table id: " << table_id
<< ", from path: " << path << " failed";
}
#else
VLOG(0) << "FleetWrapper::LoadWhitelist does nothing when no pslib";
#endif
}
void FleetWrapper::SaveModel(const std::string& path, const int mode) {
#ifdef PADDLE_WITH_PSLIB
  auto ret = pslib_ptr_->_worker_ptr->save(path, std::to_string(mode));

@@ -1285,6 +1300,26 @@ int32_t FleetWrapper::SaveCache(int table_id, const std::string& path,
#endif
}
int32_t FleetWrapper::SaveWithWhitelist(int table_id, const std::string& path,
const int mode,
const std::string& whitelist_path) {
#ifdef PADDLE_WITH_PSLIB
auto ret = pslib_ptr_->_worker_ptr->save_with_whitelist(
table_id, path, std::to_string(mode), whitelist_path);
ret.wait();
int32_t feasign_cnt = ret.get();
if (feasign_cnt == -1) {
LOG(ERROR) << "table save cache failed";
sleep(sleep_seconds_before_fail_exit_);
exit(-1);
}
return feasign_cnt;
#else
VLOG(0) << "FleetWrapper::SaveCache does nothing when no pslib";
return -1;
#endif
}
void FleetWrapper::ShrinkSparseTable(int table_id) {
#ifdef PADDLE_WITH_PSLIB
  auto ret = pslib_ptr_->_worker_ptr->shrink(table_id);
......
@@ -273,6 +273,11 @@ class FleetWrapper {
  // save cache model
  // cache model can speed up online predict
  int32_t SaveCache(int table_id, const std::string& path, const int mode);
  // save sparse table filtered by user-defined whitelist
  int32_t SaveWithWhitelist(int table_id, const std::string& path,
                            const int mode, const std::string& whitelist_path);
  void LoadWithWhitelist(const uint64_t table_id, const std::string& path,
                         const int mode);
  // copy feasign key/value from src_table_id to dest_table_id
  int32_t CopyTable(const uint64_t src_table_id, const uint64_t dest_table_id);
  // copy feasign key/value from src_table_id to dest_table_id
......
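For illustration only (not part of this commit), a minimal caller-side sketch of the new whitelist entry points, assuming the existing FleetWrapper::GetInstance() singleton accessor; the table id and paths are placeholders.

#include <cstdint>
#include <string>
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"

void SaveAndReloadWithWhitelist() {
  // Hypothetical example values; real callers pass their own table id and paths.
  auto fleet = paddle::framework::FleetWrapper::GetInstance();
  const uint64_t table_id = 0;
  const std::string path = "hdfs:/demo/model";
  const std::string whitelist_path = "hdfs:/demo/whitelist";
  // Save only the feasigns listed in the whitelist, then load them back.
  fleet->SaveWithWhitelist(table_id, path, /*mode=*/0, whitelist_path);
  fleet->LoadWithWhitelist(table_id, path, /*mode=*/0);
}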
@@ -105,6 +105,11 @@ enum GlooStoreType { HDFS, HTTP };

class GlooWrapper {
 public:
  static std::shared_ptr<GlooWrapper> GetInstance() {
    static auto s_instance = std::make_shared<GlooWrapper>();
    return s_instance;
  }

  GlooWrapper() {}
  virtual ~GlooWrapper() {}

@@ -153,6 +158,11 @@ class GlooWrapper {
#endif
  }

  bool IsInitialized() { return is_initialized_; }
#ifdef PADDLE_WITH_GLOO
  std::shared_ptr<gloo::Context> GetContext() { return context_; }
#endif

  template <typename T>
  std::vector<T> AllReduce(std::vector<T>& sendbuf,             // NOLINT
                           const std::string& mode = "sum") {   // NOLINT
......
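As an illustration (not from the commit), a short sketch that uses the new GetInstance() and IsInitialized() members declared above; the data being reduced is a placeholder.

#include <cstdint>
#include <vector>
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"

void AllReduceIfReady(std::vector<int64_t>* local) {
  auto gloo = paddle::framework::GlooWrapper::GetInstance();
  if (!gloo->IsInitialized()) {
    return;  // rendezvous has not been set up yet
  }
  // AllReduce sums the buffer across trainers ("sum" is the default mode).
  *local = gloo->AllReduce(*local, "sum");
}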
@@ -115,6 +115,7 @@ message VarType {
    SIZE_T = 19;
    UINT8 = 20;
    INT8 = 21;
    BF16 = 22;
    // Other types that may need additional descriptions
    LOD_TENSOR = 7;
......
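The new BF16 enum value registers bfloat16 as a framework data type. For reference, bfloat16 keeps float32's sign bit and 8-bit exponent but only 7 mantissa bits; a minimal conversion sketch (truncating rather than round-to-nearest, and purely illustrative, not code from this commit) looks like this:

#include <cstdint>
#include <cstring>

// Keep the upper 16 bits of an IEEE-754 float: 1 sign, 8 exponent, 7 mantissa bits.
uint16_t FloatToBfloat16Truncate(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);  // drop the low 16 mantissa bits
}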
@@ -12,67 +12,122 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/generator.h"

#include <glog/logging.h>

#include <deque>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <utility>
-#include "paddle/fluid/framework/generator.h"

namespace paddle {
namespace framework {

-std::shared_ptr<Generator> Generator::gen_instance_ = NULL;
const std::shared_ptr<Generator>& DefaultCPUGenerator() {
static auto default_cpu_generator =
std::make_shared<Generator>(GetRandomSeed());
VLOG(4) << "initial seed: " << default_cpu_generator->GetCurrentSeed()
<< ", cpu engine: " << default_cpu_generator->GetCPUEngine().get();
return default_cpu_generator;
}
std::shared_ptr<std::mt19937_64> OpDefaultCPUEngine() {
static auto op_default_cpu_engine = std::make_shared<std::mt19937_64>();
return op_default_cpu_engine;
}
// NOTE(zhiqiu): there are 3 conditions:
// (1) op seed is not set and DefaultCPUGenerator is inited, use
// DefaultCPUGenerator
// (2) op seed is not set and DefaultCPUGenerator is not inited, use se
// OpDefaultCPUEngine() and set a radnom seed
// (3) op seed is set, use OpDefaultCPUEngine() and set the seed
std::shared_ptr<std::mt19937_64> GetCPURandomEngine(uint64_t seed) {
if (DefaultCPUGenerator()->GetIsInitPy() && seed == 0) {
VLOG(4) << "Use random engine from generator";
return DefaultCPUGenerator()->GetCPUEngine();
} else {
// NOTE(zhiqiu): creating an engine instance everytime instead of using
// OpDefaultCPUEngine(), this is the legacy behavior of random operators.
// The benefit is that when runing PE with fixed-seed in multiple thrads,
// each thread has their own engine, and doesn't affect each other.
//
// And we need to measure the determinacy of Generator in PE.
auto engine = std::make_shared<std::mt19937_64>();
if (seed == 0) {
seed = GetRandomSeed();
VLOG(4) << "Use default random engine with random seed = " << seed;
} else {
VLOG(4) << "Use default random engine with fixed random seed = " << seed;
}
static std::mutex mu_;
{
std::lock_guard<std::mutex> lock(mu_);
engine->seed(seed);
}
return engine;
}
}
-GeneratorState* Generator::GetState() {
GeneratorState Generator::GetState() {
-  std::lock_guard<std::mutex> lock(this->mutex);
  std::lock_guard<std::mutex> lock(this->mu_);
-  return this->state_.get();
  state_.cpu_engine = *engine_;
  return this->state_;
}

-void Generator::SetState(GeneratorState* state_in) {
void Generator::SetState(const GeneratorState& state) {
-  std::lock_guard<std::mutex> lock(this->mutex);
  std::lock_guard<std::mutex> lock(this->mu_);
-  *this->state_ = *state_in;
  this->state_ = state;
  this->engine_ = std::make_shared<std::mt19937_64>(state.cpu_engine);
}

uint64_t Generator::GetCurrentSeed() {
-  std::lock_guard<std::mutex> lock(this->mutex);
  std::lock_guard<std::mutex> lock(this->mu_);
-  return this->state_->current_seed;
  return this->state_.current_seed;
}

uint64_t Generator::Seed() {
-  std::lock_guard<std::mutex> lock(this->mutex);
  std::lock_guard<std::mutex> lock(this->mu_);
  uint64_t seed;
  std::random_device de;
  seed = ((((uint64_t)de()) << 32) + de()) & 0x1FFFFFFFFFFFFF;
-  this->state_->current_seed = seed;
  this->state_.current_seed = seed;
  std::seed_seq seq({seed});
-  this->state_->cpu_engine.seed(seq);
  this->engine_->seed(seq);
-  return this->state_->current_seed;
  return this->state_.current_seed;
}

void Generator::SetCurrentSeed(uint64_t seed) {
-  std::lock_guard<std::mutex> lock(this->mutex);
  std::lock_guard<std::mutex> lock(this->mu_);
-  this->state_->current_seed = uint64_t(seed);
  this->state_.current_seed = seed;
  std::seed_seq seq({seed});
-  this->state_->cpu_engine.seed(seq);
  this->engine_->seed(seq);
}

-std::mt19937_64& Generator::GetCPUEngine() {
std::shared_ptr<std::mt19937_64> Generator::GetCPUEngine() {
-  std::lock_guard<std::mutex> lock(this->mutex);
  std::lock_guard<std::mutex> lock(this->mu_);
-  return this->state_->cpu_engine;
  return this->engine_;
}

-void Generator::SetCPUEngine(std::mt19937_64 engine) {
void Generator::SetCPUEngine(std::shared_ptr<std::mt19937_64> engine) {
-  std::lock_guard<std::mutex> lock(this->mutex);
  std::lock_guard<std::mutex> lock(this->mu_);
-  this->state_->cpu_engine = std::mt19937_64(engine);
  this->engine_ = engine;
}

uint64_t Generator::Random64() {
-  std::lock_guard<std::mutex> lock(this->mutex);
  std::lock_guard<std::mutex> lock(this->mu_);
-  return this->state_->cpu_engine();
  auto engine = this->engine_;
  return (*engine)();
}

void Generator::SetIsInitPy(bool is_init_py) {
  this->is_init_py_ = is_init_py;
  VLOG(4) << "SetIsInitPy:" << this->is_init_py_;
}

bool Generator::GetIsInitPy() const { return this->is_init_py_; }

}  // namespace framework
}  // namespace paddle
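For illustration (not part of the commit), a kernel-side sketch of the GetCPURandomEngine() helper defined above: passing seed == 0 picks up the global generator when it has been initialized from Python, otherwise a freshly seeded engine is returned.

#include <cstdint>
#include <random>
#include "paddle/fluid/framework/generator.h"

void FillUniform(float* data, int64_t n, uint64_t op_seed /* 0 means unset */) {
  auto engine = paddle::framework::GetCPURandomEngine(op_seed);
  std::uniform_real_distribution<float> dist(0.0f, 1.0f);
  for (int64_t i = 0; i < n; ++i) {
    data[i] = dist(*engine);  // draw from the shared or per-call engine
  }
}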
@@ -14,7 +14,9 @@ limitations under the License. */

#pragma once

#include <glog/logging.h>
#include <stdint.h>

#include <atomic>
#include <deque>
#include <iostream>  // temp for debug

@@ -27,6 +29,12 @@ limitations under the License. */

namespace paddle {
namespace framework {
static uint64_t GetRandomSeed() {
std::random_device rd;
// double has 53 bit significant, so limit uint64 to 53 bits
return ((((uint64_t)rd()) << 32) + rd()) & 0x1FFFFFFFFFFFFF;
}
struct GeneratorState {
  int64_t device = -1;
  uint64_t current_seed = 34342423252;

@@ -35,62 +43,67 @@ struct GeneratorState {
struct Generator {
  Generator() {
-    GeneratorState default_gen_state_cpu;
-    default_gen_state_cpu.device = -1;
-    default_gen_state_cpu.current_seed = 34342423252;
-    std::seed_seq seq({34342423252});
-    default_gen_state_cpu.cpu_engine = std::mt19937_64(seq);
-    this->state_ = std::make_shared<GeneratorState>(default_gen_state_cpu);
    auto seed = GetRandomSeed();
    std::seed_seq seq({seed});
    auto engine = std::make_shared<std::mt19937_64>(seq);
    this->state_.cpu_engine = *engine;
    this->state_.device = -1;
    this->state_.current_seed = seed;
    this->engine_ = engine;
    VLOG(4) << "initial seed: " << this->state_.current_seed
            << ", cpu engine: " << &this->state_.cpu_engine;
  }

  explicit Generator(uint64_t seed) {
    std::seed_seq seq({seed});
    auto engine = std::make_shared<std::mt19937_64>(seq);
    this->state_.cpu_engine = *engine;
    this->state_.device = -1;
    this->state_.current_seed = seed;
    this->engine_ = engine;
    VLOG(4) << "initial seed: " << this->state_.current_seed
            << ", cpu engine: " << &this->state_.cpu_engine;
    this->is_init_py_ = true;  // TODO(zhiqiu): remove it in future
  }

-  explicit Generator(GeneratorState state_in)
-      : state_{std::make_shared<GeneratorState>(state_in)} {}
-  Generator(const Generator& other)
-      : Generator(other, std::lock_guard<std::mutex>(other.mutex)) {}
  Generator(const Generator& other) = delete;

  // get random state
-  GeneratorState* GetState();
  GeneratorState GetState();
  // set random state
-  void SetState(GeneratorState* state_in);
  void SetState(const GeneratorState&);
  // get current seed
  uint64_t GetCurrentSeed();
  // random a seed and get
  uint64_t Seed();
  // set seed
  void SetCurrentSeed(uint64_t seed);
  // get cpu engine
-  std::mt19937_64& GetCPUEngine();
  std::shared_ptr<std::mt19937_64> GetCPUEngine();
  // set cpu engine
-  void SetCPUEngine(std::mt19937_64 engine);
  void SetCPUEngine(std::shared_ptr<std::mt19937_64>);

  uint64_t Random64();

-  bool is_init_py = false;
-
-  // CPU Generator singleton
-  static std::shared_ptr<Generator> GetInstance() {
-    if (NULL == gen_instance_) {
-      gen_instance_.reset(new paddle::framework::Generator());
-    }
-    return gen_instance_;
-  }
-
-  static std::shared_ptr<Generator> GetInstanceX() {
-    if (NULL == gen_instance_) {
-      gen_instance_.reset(new paddle::framework::Generator());
-    }
-    gen_instance_->is_init_py = true;
-    return gen_instance_;
-  }
-
- private:
-  static std::shared_ptr<Generator> gen_instance_;
-  std::shared_ptr<GeneratorState> state_;
-  mutable std::mutex mutex;
-
-  Generator(const Generator& other, const std::lock_guard<std::mutex>&)
-      : state_(std::make_shared<GeneratorState>(*(other.state_))) {}
-};
  void SetIsInitPy(bool);
  bool GetIsInitPy() const;

 private:
  GeneratorState state_;
  std::shared_ptr<std::mt19937_64> engine_;
  mutable std::mutex mu_;

  // NOTE(zhiqiu): is_init_py_ is used to make generator be compatible with
  // old seed, and it should be removed after all random-related operators
  // and unittests upgrades to use generator.
  bool is_init_py_ = false;
};

// The DefaultCPUGenerator is used in manual_seed()
const std::shared_ptr<Generator>& DefaultCPUGenerator();

// If op seed is set or global is not set, the OpDefaultCPUEngine is used.
std::shared_ptr<std::mt19937_64> OpDefaultCPUEngine();

std::shared_ptr<std::mt19937_64> GetCPURandomEngine(uint64_t);

}  // namespace framework
}  // namespace paddle
@@ -13,6 +13,7 @@
// limitations under the License.

#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h"

#include <cmath>
#include <functional>
#include <string>
#include <vector>

@@ -74,12 +75,17 @@ void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight,
  auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
  auto weights_shape = weights->dims();
  auto weights_shape_2d = flatten_to_2d(weights_shape, 1);
  auto* weights_data = weights->mutable_data<float>(platform::CPUPlace());

-  EigenMatrixArrayMap weights_array_2d(
-      weights->mutable_data<float>(platform::CPUPlace()), weights_shape_2d[0],
-      weights_shape_2d[1]);
  EigenMatrixArrayMap weights_array_2d(weights_data, weights_shape_2d[0],
                                       weights_shape_2d[1]);

  weights_array_2d.colwise() *= scale_array;

  // Check for subnormal values that slows down convolution execution
  for (int i = 0; i < weights->numel(); ++i) {
    if (std::fpclassify(weights_data[i]) == FP_SUBNORMAL) weights_data[i] = 0;
  }
}

void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {

@@ -108,13 +114,6 @@ void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
  GET_CONV_BN_NODES(conv_ac_pattern);

-  // check if fuse can be done and if MKL-DNN should be used
-  FuseOptions fuse_option = FindFuseOption(*conv, *affine_channel);
-  if (fuse_option == DO_NOT_FUSE) {
-    VLOG(3) << "do not perform conv+affinechannel fuse";
-    return;
-  }
-
  // Create eltwise_y (conv bias) variable
  VarDesc eltwise_y_in_desc(
      patterns::PDNodeName(name_scope_, "eltwise_y_in"));

@@ -143,6 +142,7 @@ void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
  desc.SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
  desc.SetType("elementwise_add");
  desc.SetAttr("axis", 1);
  desc.SetAttr("use_mkldnn", conv->Op()->GetAttrIfExists<bool>("use_mkldnn"));
  auto eltwise_op = g->CreateOpNode(&desc);  // OpDesc will be copied.
  GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel});
......
@@ -1879,6 +1879,19 @@ PDNode *patterns::MultipleQuantize::operator()() {
  return prev_out;
}
PDNode *patterns::QuantizePlacement::operator()(
const std::unordered_set<std::string> &quantize_enabled_op_types) {
std::unordered_set<std::string> supported_op_types =
std::unordered_set<std::string>({"concat", "conv2d", "elementwise_add",
"fc", "matmul", "pool2d", "prior_box",
"relu", "reshape2", "transpose2"});
if (!quantize_enabled_op_types.empty()) {
supported_op_types = quantize_enabled_op_types;
}
auto *op = pattern->NewNode(op_repr())->assert_is_ops(supported_op_types);
return op;
}
PDNode *patterns::MKLDNNInPlace::operator()() {
  const std::unordered_set<std::string> &supported_op_types = {
      "abs",
......
@@ -1120,6 +1120,15 @@ struct MultipleQuantize : public PatternBase {
  PATTERN_DECL_NODE(prev_out);
};
struct QuantizePlacement : public PatternBase {
QuantizePlacement(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "quantize_placement") {}
PDNode* operator()(
const std::unordered_set<std::string>& quantize_enabled_op_types);
PATTERN_DECL_NODE(op);
};
// Pattern used for enforcing inplace computation for in-place computation
// supporting DNNL ops. softmax, batch_norm and layer_norm
struct MKLDNNInPlace : public PatternBase {
......
@@ -26,30 +26,33 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const {
      Get<std::unordered_set<int>>("quantize_excluded_op_ids");
  const auto& op_types_list =
      Get<std::unordered_set<std::string>>("quantize_enabled_op_types");
-  for (const Node* n : graph->Nodes()) {
-    if (n->IsOp()) {
-      if (std::find(excluded_ids_list.begin(), excluded_ids_list.end(),
-                    n->id()) != excluded_ids_list.end())
-        continue;
-      auto* op = n->Op();
-      if (op->HasAttr("mkldnn_data_type") ||
-          op->HasProtoAttr("mkldnn_data_type")) {
-        // use_quantizer is no longer used
-        // assign value for compatibility
-        if (op->GetAttrIfExists<bool>("use_quantizer")) {
-          op->SetAttr("mkldnn_data_type", std::string("int8"));
-        }
-        if (op_types_list.empty()) {
-          op->SetAttr("mkldnn_data_type", std::string("int8"));
-          op->SetAttr("use_quantizer", true);
-        } else if (std::find(op_types_list.begin(), op_types_list.end(),
-                             op->Type()) != op_types_list.end()) {
-          op->SetAttr("mkldnn_data_type", std::string("int8"));
-          op->SetAttr("use_quantizer", true);
-        }
-      }
-    }
-  }
  Init(name_scope_, graph);
  GraphPatternDetector gpd;
  patterns::QuantizePlacement quantize_placement_pattern{gpd.mutable_pattern(),
                                                         "quantize_placement"};
  quantize_placement_pattern(op_types_list);

  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    GET_IR_NODE_FROM_SUBGRAPH(op, op, quantize_placement_pattern);

    if (std::find(excluded_ids_list.begin(), excluded_ids_list.end(),
                  op->id()) != excluded_ids_list.end()) {
      return;
    }

    if (op->Op()->HasAttr("mkldnn_data_type") ||
        op->Op()->HasProtoAttr("mkldnn_data_type")) {
      // use_quantizer is no longer used
      // assign value for compatibility
      if (op->Op()->GetAttrIfExists<bool>("use_quantizer")) {
        op->Op()->SetAttr("mkldnn_data_type", std::string("int8"));
      }
      op->Op()->SetAttr("mkldnn_data_type", std::string("int8"));
      op->Op()->SetAttr("use_quantizer", true);
    }
  };
  gpd(graph, handler);
}

}  // namespace ir
......
@@ -15,7 +15,10 @@ limitations under the License. */

#pragma once

#include <memory>
-#include "paddle/fluid/framework/ir/pass.h"
#include <string>

#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"

namespace paddle {
namespace framework {

@@ -23,9 +26,10 @@ namespace ir {
/*
 * Specifies which operators should be quantized.
 */
-class CPUQuantizePlacementPass : public Pass {
class CPUQuantizePlacementPass : public FusePassBase {
 protected:
  void ApplyImpl(ir::Graph* graph) const override;
  const std::string name_scope_{"cpu_quantize_placement_pass"};
};

}  // namespace ir
......
@@ -130,7 +130,7 @@ TEST(QuantizerPlacementPass, enabled_conv_excluded_one) {
  MainTest({"conv2d"}, {4}, 1);
}

-TEST(QuantizerPlacementPass, excluded_none) {
TEST(QuantizerPlacementPass, empty_list) {
  // all operators quantized
  MainTest({}, {}, 6);
}
......
@@ -81,7 +81,8 @@ void DeleteQuant(ir::Graph* graph, Scope* scope,
      if (quantized_op_type == "conv2d" ||
          quantized_op_type == "conv2d_fusion" ||
          quantized_op_type == "depthwise_conv2d" ||
-          quantized_op_type == "fc") {
          quantized_op_type == "fc" ||
          quantized_op_type == "conv2d_transpose") {
        op_desc->SetAttr("Input_scale", scale_value);
      } else if (quantized_op_type == "mul") {

@@ -111,7 +112,8 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
  std::string input_name = "";
  if (quantized_op_type == "conv2d" ||
      quantized_op_type == "depthwise_conv2d" ||
-      quantized_op_type == "conv2d_fusion") {
      quantized_op_type == "conv2d_fusion" ||
      quantized_op_type == "conv2d_transpose") {
    weight_name = "Filter";
    input_name = "Input";
  } else if (quantized_op_type == "mul") {

@@ -122,7 +124,8 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
    input_name = "Input";
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
-        "QuantDequantFuse: We only support conv2d, conv2d_fusion, fc, mul for "
        "QuantDequantFuse: We only support conv2d, conv2d_fusion, "
        "conv2d_transpose, fc, mul for "
        "now."));
  }
  const std::string pattern_name = "dequant_fuse";

@@ -192,10 +195,12 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
        scope->Var(quantized_op_weight_node->Name())->GetMutable<LoDTensor>();
    auto w_dims = weight_tensor->dims();
    // If quantized op is fc, weight scale size = 1;
-    // If quantized op is conv, weight scale size = weight dims[0]
    // If quantized op is conv2d, weight scale size = weight dims[0]
    // If quantized op is conv2d_transpose, weight scale size = weight dims[1]
    bool valid_scale_size =
        (weight_scale.size() == 1 ||
-         weight_scale.size() == static_cast<size_t>(w_dims[0]));
         weight_scale.size() == static_cast<size_t>(w_dims[0]) ||
         weight_scale.size() == static_cast<size_t>(w_dims[1]));
    PADDLE_ENFORCE_EQ(
        valid_scale_size, true,
        platform::errors::InvalidArgument(

@@ -206,8 +211,14 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
      if (weight_scale.size() == 1) {
        quantized_weight_data[j] *= weight_scale[0];
      } else {
-        int inner_size = w_dims[1] * w_dims[2] * w_dims[3];
-        quantized_weight_data[j] *= weight_scale[j / inner_size];
        if (quantized_op_type == "conv2d_transpose") {
          int inner_size = w_dims[2] * w_dims[3];
          quantized_weight_data[j] *=
              weight_scale[(j / inner_size) % w_dims[1]];
        } else {
          int inner_size = w_dims[1] * w_dims[2] * w_dims[3];
          quantized_weight_data[j] *= weight_scale[j / inner_size];
        }
      }
    }

@@ -220,7 +231,8 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
  new_op_desc.SetType(quantized_op_type);
  new_op_desc.SetAttr("enable_int8", true);
  if (quantized_op_type == "conv2d" || quantized_op_type == "conv2d_fusion" ||
-      quantized_op_type == "depthwise_conv2d") {
      quantized_op_type == "depthwise_conv2d" ||
      quantized_op_type == "conv2d_transpose") {
    new_op_desc.SetInput("Input", {new_input});
    new_op_desc.SetOutput("Output", {new_output});
  } else if (quantized_op_type == "fc") {

@@ -253,7 +265,7 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const {
  std::unordered_set<std::string> quant_types = {
      "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"};
  std::unordered_set<std::string> quantized_op_types = {
-      "conv2d", "mul", "depthwise_conv2d", "fc"};
      "conv2d", "mul", "depthwise_conv2d", "fc", "conv2d_transpose"};
  auto* scope = param_scope();
  for (auto& quant_type : quant_types) {
......
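The new dequantization branch reflects the different weight layouts: conv2d weights are [Cout, Cin, kh, kw], so the per-output-channel scale index is j / (Cin*kh*kw), while conv2d_transpose weights are [Cin, Cout, kh, kw], so the index becomes (j / (kh*kw)) % Cout, exactly what the added code computes. A small standalone check of that arithmetic (illustration only, hypothetical shape values):

#include <cassert>
#include <cstdint>

int main() {
  const int64_t cout = 3, kh = 2, kw = 2;
  const int64_t inner_size = kh * kw;
  // Flattened index of input channel 1, output channel 2, kernel position (0, 1)
  // in a [Cin, Cout, kh, kw] tensor:
  const int64_t j = ((1 * cout + 2) * kh + 0) * kw + 1;  // = 21
  assert((j / inner_size) % cout == 2);  // maps back to output channel 2
  return 0;
}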
@@ -34,7 +34,8 @@ struct OpUpdateRecord {
    kModifyAttr,
    kNewAttr,
    kNewInput,
-    kNewOutput
    kNewOutput,
    kBugfixWithBehaviorChanged,
  };
  Type type_;
  std::string remark_;

@@ -82,6 +83,11 @@ struct NewOutput : OpUpdateRecord {
  std::string name_;
};
struct BugfixWithBehaviorChanged : OpUpdateRecord {
explicit BugfixWithBehaviorChanged(const std::string& remark)
: OpUpdateRecord({Type::kBugfixWithBehaviorChanged, remark}) {}
};
class OpVersionDesc {
 public:
  OpVersionDesc& ModifyAttr(const std::string& name, const std::string& remark,

@@ -110,6 +116,12 @@ class OpVersionDesc {
    return *this;
  }
OpVersionDesc& BugfixWithBehaviorChanged(const std::string& remark) {
infos_.push_back(std::shared_ptr<OpUpdateRecord>(
new compatible::BugfixWithBehaviorChanged(remark)));
return *this;
}
 private:
  std::vector<std::shared_ptr<OpUpdateRecord>> infos_;
};
......
@@ -23,6 +23,10 @@ namespace compatible {

TEST(test_operator_version, test_operator_version) {
  REGISTER_OP_VERSION(test__)
.AddCheckpoint(
R"ROC(Fix the bug of reshape op, support the case of axis < 0)ROC",
framework::compatible::OpVersionDesc().BugfixWithBehaviorChanged(
"Support the case of axis < 0"))
      .AddCheckpoint(
          R"ROC(
      Upgrade reshape, modified one attribute [axis] and add a new attribute [size].
......
@@ -913,10 +913,20 @@ std::ostream& print_tensor(std::ostream& os, const framework::Tensor& tensor) {
  auto element_num = tensor.numel();

  os << "  - data: [";
-  if (element_num > 0) {
-    os << inspect[0];
-    for (int j = 1; j < element_num; ++j) {
-      os << " " << inspect[j];
-    }
-  }
  // Note: int8_t && uint8_t is typedf of char, ostream unable to print properly
  if (typeid(int8_t) == typeid(T) || typeid(uint8_t) == typeid(T)) {
    if (element_num > 0) {
      os << signed(inspect[0]);
      for (int j = 1; j < element_num; ++j) {
        os << " " << signed(inspect[j]);
      }
    }
  } else {
    if (element_num > 0) {
      os << inspect[0];
      for (int j = 1; j < element_num; ++j) {
        os << " " << inspect[j];
      }
    }
  }
  os << "]";
......
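The special case above exists because int8_t and uint8_t are character types, so streaming them prints glyphs rather than numbers; casting through signed() restores numeric output. A minimal standalone demonstration (not from the commit):

#include <cstdint>
#include <iostream>

int main() {
  int8_t v = 65;
  std::cout << v << "\n";          // prints "A", the character with code 65
  std::cout << signed(v) << "\n";  // prints "65"
  return 0;
}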
@@ -30,12 +30,12 @@
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/profiler.h"

DECLARE_bool(sort_sum_gradient);

namespace paddle {
namespace imperative {

-void BasicEngine::Init(VarBase* var, const detail::BackwardStrategy& strategy,
-                       bool retain_graph) {
-  backward_strategy_ = strategy;
void BasicEngine::Init(VarBase* var, bool retain_graph) {
  retain_graph_ = retain_graph;
  init_node_ = var->GradVarBase()->GradNode();
  var->GradVarBase()->ClearGradNode();

@@ -105,7 +105,7 @@ void BasicEngine::PrepareGradAccumulators(const OpBase& op) {
      auto& accumulator = accumulators_[var.get()];
      if (!accumulator) {
-        if (backward_strategy_.sorted_sum_gradient_) {
        if (FLAGS_sort_sum_gradient) {
          accumulator.reset(new SortedGradientAccumulator(var.get()));
        } else {
          accumulator.reset(new EagerGradientAccumulator(var.get()));
......
@@ -18,7 +18,6 @@
#include <unordered_map>
#include <utility>
#include <vector>
-#include "paddle/fluid/imperative/backward_strategy.h"
#include "paddle/fluid/imperative/engine.h"
#include "paddle/fluid/imperative/gradient_accumulator.h"

@@ -30,8 +29,7 @@ class OpBase;

class BasicEngine : public Engine {
 public:
-  void Init(VarBase* var, const detail::BackwardStrategy& strategy,
-            bool retain_graph = false);
  void Init(VarBase* var, bool retain_graph = false);

  void Execute() override;

@@ -46,7 +44,6 @@ class BasicEngine : public Engine {
 private:
  std::shared_ptr<GradOpNode> init_node_;
-  detail::BackwardStrategy backward_strategy_;
  std::unordered_map<GradOpNode*, size_t> node_deps_;
  std::unordered_map<VariableWrapper*, std::unique_ptr<GradientAccumulator>>
      accumulators_;
......
@@ -33,6 +33,8 @@
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/string_helper.h"

DECLARE_bool(sort_sum_gradient);

namespace paddle {
namespace imperative {

@@ -529,8 +531,7 @@ class PartialGradTask {
                  const std::vector<std::shared_ptr<VarBase>> &output_targets,
                  const std::vector<std::shared_ptr<VarBase>> &output_grads,
                  const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
-                  const platform::Place &place,
-                  const detail::BackwardStrategy &strategy, bool create_graph,
                  const platform::Place &place, bool create_graph,
                  bool retain_graph, bool allow_unused, bool only_inputs);

  std::vector<std::shared_ptr<VarBase>> Run();

@@ -577,7 +578,6 @@ class PartialGradTask {
  bool retain_graph_;
  bool allow_unused_;
  bool only_inputs_;
-  detail::BackwardStrategy strategy_;
};

PartialGradTask::PartialGradTask(

@@ -585,15 +585,14 @@ PartialGradTask::PartialGradTask(
    const std::vector<std::shared_ptr<VarBase>> &output_targets,
    const std::vector<std::shared_ptr<VarBase>> &output_grads,
    const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
-    const platform::Place &place, const detail::BackwardStrategy &strategy,
-    bool create_graph, bool retain_graph, bool allow_unused, bool only_inputs) {
    const platform::Place &place, bool create_graph, bool retain_graph,
    bool allow_unused, bool only_inputs) {
  input_targets_ = input_targets;
  place_ = place;
  create_graph_ = create_graph;
  retain_graph_ = retain_graph;
  allow_unused_ = allow_unused;
  only_inputs_ = only_inputs;
-  strategy_ = strategy;
  PADDLE_ENFORCE_EQ(only_inputs_, true,
                    platform::errors::Unimplemented(

@@ -981,7 +980,7 @@ void PartialGradTask::PrepareInitialGradientAccumulators(const OpBase *op) {
      if (!accumulator) {
        accumulator.reset(new GradientAccumulationInfo(
-            var, strategy_.sorted_sum_gradient_, create_graph_));
            var, FLAGS_sort_sum_gradient, create_graph_));
      }
      accumulator->IncreaseTotalRefCnt();

@@ -1033,11 +1032,11 @@ PartialGradEngine::PartialGradEngine(
    const std::vector<std::shared_ptr<VarBase>> &output_targets,
    const std::vector<std::shared_ptr<VarBase>> &output_grads,
    const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
-    const platform::Place &place, const detail::BackwardStrategy &strategy,
-    bool create_graph, bool retain_graph, bool allow_unused, bool only_inputs)
    const platform::Place &place, bool create_graph, bool retain_graph,
    bool allow_unused, bool only_inputs)
    : task_(new PartialGradTask(input_targets, output_targets, output_grads,
-                                no_grad_vars, place, strategy, create_graph,
-                                retain_graph, allow_unused, only_inputs)) {}
                                no_grad_vars, place, create_graph, retain_graph,
                                allow_unused, only_inputs)) {}

PartialGradEngine::~PartialGradEngine() { Clear(); }
......
@@ -16,7 +16,6 @@
#include <memory>
#include <vector>
-#include "paddle/fluid/imperative/backward_strategy.h"
#include "paddle/fluid/imperative/engine.h"
#include "paddle/fluid/platform/place.h"

@@ -33,8 +32,7 @@ class PartialGradEngine : public Engine {
                    const std::vector<std::shared_ptr<VarBase>> &output_targets,
                    const std::vector<std::shared_ptr<VarBase>> &output_grads,
                    const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
-                    const platform::Place &place,
-                    const detail::BackwardStrategy &strategy, bool create_graph,
                    const platform::Place &place, bool create_graph,
                    bool retain_graph, bool allow_unused, bool only_inputs);

  ~PartialGradEngine();
......
@@ -240,9 +240,8 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) {
  framework::AttributeMap reduce_attr_map;
  tracer.TraceOp("reduce_sum", reduce_in, reduce_out, reduce_attr_map,
                 gpu_place, true);
-  detail::BackwardStrategy back_st;
  imperative::BasicEngine engine;
-  engine.Init(reduce_sum_out.get(), back_st);
  engine.Init(reduce_sum_out.get());
  engine.Execute();

  framework::LoDTensor rlt;

@@ -356,9 +355,8 @@ TEST(test_tracer, test_var_without_grad_var) {
  ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL);
  ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL);
-  detail::BackwardStrategy back_st;
  imperative::BasicEngine engine;
-  engine.Init(vout.get(), back_st);
  engine.Init(vout.get());
  engine.Execute();

  // check the grad
......
@@ -21,6 +21,8 @@
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/string_helper.h"

DECLARE_bool(use_mkldnn);

namespace paddle {
namespace imperative {

@@ -47,6 +49,9 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
                     const NameVarBaseMap& outs, framework::AttributeMap attrs,
                     const platform::Place& place, bool trace_backward) {
  VLOG(1) << "Trace Op: " << type;
  if (FLAGS_use_mkldnn) {
    attrs["use_mkldnn"] = true;
  }
  auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false);
  const auto& op_info = op->Info();
  auto* attr_checker = op_info.Checker();
......
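DECLARE_bool(use_mkldnn) only references a gflags flag; the matching DEFINE_bool lives in another translation unit of Paddle. A self-contained sketch of the pattern (the definition here is a stand-in, not the one Paddle actually uses):

#include <gflags/gflags.h>

DEFINE_bool(use_mkldnn, false, "Run traced ops with the MKL-DNN backend");

// Mirrors the check added to Tracer::TraceOp above: when the flag is on,
// every traced op gets attrs["use_mkldnn"] = true before execution.
bool ShouldInjectMkldnnAttr() { return FLAGS_use_mkldnn; }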
@@ -15,7 +15,6 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_pass_builder.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"

@@ -103,8 +102,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
  // params_file_ fields.
  CP_MEMBER(opt_cache_dir_);
-  prog_file_ = std::move(other.prog_file_);
-  params_file_ = std::move(other.params_file_);
  CP_MEMBER(prog_file_);
  CP_MEMBER(params_file_);
  CP_MEMBER(use_fc_padding_);
  // GPU related.

@@ -218,6 +217,17 @@ void AnalysisConfig::EnableMkldnnQuantizer() {
  Update();
}
void AnalysisConfig::EnableMkldnnBfloat16() {
#ifdef PADDLE_WITH_MKLDNN
use_mkldnn_bfloat16_ = true;
#else
LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnBfloat16";
use_mkldnn_bfloat16_ = false;
#endif
Update();
}
MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const {
  PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_,
                          "MkldnnQuantizer was not enabled yet.");

@@ -331,6 +341,12 @@ void AnalysisConfig::Update() {
#endif
  }
if (use_mkldnn_bfloat16_) {
#ifdef PADDLE_WITH_MKLDNN
pass_builder()->EnableMkldnnBfloat16();
#endif
}
#ifdef PADDLE_WITH_MKLDNN
  // Do not optimize when mkldnn is on
  if (enable_memory_optim_ && !use_mkldnn_) {

@@ -399,6 +415,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
  ss << ";";
  ss << use_mkldnn_quantizer_;
  ss << use_mkldnn_bfloat16_;
  ss << model_from_memory_;
  ss << with_profile_;
......
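A usage sketch for the new switch (illustration only; assumes a build with MKLDNN and uses a placeholder model path):

#include "paddle/fluid/inference/api/paddle_analysis_config.h"

paddle::AnalysisConfig MakeBf16Config() {
  paddle::AnalysisConfig config;
  config.SetModel("/path/to/model");  // placeholder
  config.SwitchIrOptim(true);
  config.EnableMKLDNN();
  // Falls back (and logs an error) when the build has no MKLDNN support.
  config.EnableMkldnnBfloat16();
  return config;
}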
@@ -32,7 +32,6 @@
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h"

@@ -517,6 +516,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
    AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
  // TODO(NHZlX): Should add the link to the doc of
  // paddle_infer::CreatePredictor<paddle_infer::Config>
  if (config.glog_info_disabled()) {
    FLAGS_logtostderr = 1;
    FLAGS_minloglevel = 2;  // GLOG_ERROR

@@ -1058,3 +1059,122 @@ USE_TRT_CONVERTER(skip_layernorm);
USE_TRT_CONVERTER(slice);
USE_TRT_CONVERTER(scale);
#endif
namespace paddle_infer {
void Tensor::Reshape(const std::vector<int> &shape) { tensor_->Reshape(shape); }
std::vector<int> Tensor::shape() const { return tensor_->shape(); }
void Tensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
return tensor_->SetLoD(x);
}
std::vector<std::vector<size_t>> Tensor::lod() const { return tensor_->lod(); }
const std::string &Tensor::name() const { return tensor_->name(); }
DataType Tensor::type() const { return tensor_->type(); }
Predictor::Predictor(const Config &config) {
const_cast<Config *>(&config)->SwitchUseFeedFetchOps(false);
// The second parameter indicates that the discard log is not printed
predictor_ = paddle::CreatePaddlePredictor<
Config, paddle::PaddleEngineKind::kAnalysis>(config);
}
std::vector<std::string> Predictor::GetInputNames() {
return predictor_->GetInputNames();
}
std::unique_ptr<Tensor> Predictor::GetInputHandle(const std::string &name) {
auto zero_copy_tensor = predictor_->GetInputTensor(name);
std::unique_ptr<Tensor> tensor(new Tensor(std::move(zero_copy_tensor)));
return tensor;
}
std::vector<std::string> Predictor::GetOutputNames() {
return predictor_->GetOutputNames();
}
std::unique_ptr<Tensor> Predictor::GetOutputHandle(const std::string &name) {
auto zero_copy_tensor = predictor_->GetOutputTensor(name);
std::unique_ptr<Tensor> tensor(new Tensor(std::move(zero_copy_tensor)));
return tensor;
}
bool Predictor::Run() { return predictor_->ZeroCopyRun(); }
std::unique_ptr<Predictor> Predictor::Clone() {
auto analysis_pred = predictor_->Clone();
std::unique_ptr<Predictor> pred(new Predictor(std::move(analysis_pred)));
return pred;
}
void Predictor::ClearIntermediateTensor() {
predictor_->ClearIntermediateTensor();
}
int GetNumBytesOfDataType(DataType dtype) {
switch (dtype) {
case DataType::FLOAT32:
return sizeof(float);
case DataType::INT64:
return sizeof(int64_t);
case DataType::INT32:
return sizeof(int32_t);
case DataType::UINT8:
return sizeof(uint8_t);
default:
assert(false);
return -1;
}
}
std::string GetVersion() { return paddle::get_version(); }
std::string UpdateDllFlag(const char *name, const char *value) {
return paddle::UpdateDllFlag(name, value);
}
} // namespace paddle_infer
namespace paddle_infer {
std::shared_ptr<Predictor> CreatePredictor(const Config &config) { // NOLINT
std::shared_ptr<Predictor> predictor(new Predictor(config));
return predictor;
}
namespace services {
PredictorPool::PredictorPool(const Config &config, size_t size) {
PADDLE_ENFORCE_GE(
size, 1UL,
paddle::platform::errors::InvalidArgument(
"The predictor pool size should be greater than 1, but it's (%d)",
size));
Config copy_config(config);
main_pred_.reset(new Predictor(config));
for (size_t i = 0; i < size - 1; i++) {
if (config.tensorrt_engine_enabled()) {
Config config_tmp(copy_config);
preds_.push_back(
std::move(std::unique_ptr<Predictor>(new Predictor(config_tmp))));
} else {
preds_.push_back(std::move(main_pred_->Clone()));
}
}
}
Predictor *PredictorPool::Retrive(size_t idx) {
PADDLE_ENFORCE_LT(
idx, preds_.size() + 1,
paddle::platform::errors::InvalidArgument(
"There are (%d) predictors in the pool, but the idx is (%d)", idx,
preds_.size() + 1));
if (idx == 0) {
return main_pred_.get();
}
return preds_[idx - 1].get();
}
} // namespace services
} // namespace paddle_infer
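For illustration (not part of the commit), an end-to-end sketch of the paddle_infer API implemented above; the model path, input name, and shape are placeholders, and the tensor copy helpers (CopyFromCpu/CopyToCpu) are assumed from the released Paddle Inference API:

#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void RunOnce() {
  paddle_infer::Config config;
  config.SetModel("/path/to/model");  // placeholder
  auto predictor = paddle_infer::CreatePredictor(config);

  // Feed a dummy input through the handle returned by the new API.
  auto input = predictor->GetInputHandle(predictor->GetInputNames().front());
  std::vector<float> data(1 * 3 * 224 * 224, 0.f);  // placeholder shape
  input->Reshape({1, 3, 224, 224});
  input->CopyFromCpu(data.data());

  predictor->Run();

  // Fetch the first output and copy it back to host memory.
  auto output = predictor->GetOutputHandle(predictor->GetOutputNames().front());
  std::vector<int> shape = output->shape();
  int numel = 1;
  for (int d : shape) numel *= d;
  std::vector<float> result(numel);
  output->CopyToCpu(result.data());
}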
@@ -485,4 +485,25 @@ TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) {
}
#endif
#ifdef PADDLE_WITH_CUDA
TEST(AnalysisPredictor, bf16_gpu_pass_strategy) {
AnalysisConfig config;
config.SetModel(FLAGS_dirname);
config.SwitchIrOptim(true);
config.EnableUseGpu(100, 0);
config.EnableMkldnnBfloat16();
#ifdef PADDLE_WITH_MKLDNN
ASSERT_EQ(config.mkldnn_bfloat16_enabled(), true);
#else
ASSERT_EQ(config.mkldnn_bfloat16_enabled(), false);
#endif
}
#endif
TEST(AnalysisPredictor, bf16_pass_strategy) {
std::vector<std::string> passes;
PassStrategy passStrategy(passes);
passStrategy.EnableMkldnnBfloat16();
}
}  // namespace paddle
@@ -112,6 +112,12 @@ void PaddleBuf::Free() {
  }
}
NativeConfig::NativeConfig() {
LOG(WARNING) << "The paddle::NativeConfig interface is going to be "
"deprecated in the next release, plase use the latest "
"paddle_infer::Config instead.";
}
std::string get_version() {
  std::stringstream ss;
  ss << "version: " << framework::paddle_version() << "\n";
......
@@ -15,6 +15,7 @@ limitations under the License. */
#include <glog/logging.h>
#include <algorithm>
#include <map>
#include <memory>
#include <set>
#include <sstream>
#include <string>

@@ -25,6 +26,7 @@ limitations under the License. */
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/profiler.h"

@@ -311,6 +313,8 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
    NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) {
  // TODO(NHZlX): Should add the link to the doc of
  // paddle_infer::CreatePredictor<paddle_infer::Config>
  VLOG(3) << "create NativePaddlePredictor";
  if (config.use_gpu) {
    // 1. GPU memory
......
...@@ -401,6 +401,19 @@ struct PD_INFER_DECL AnalysisConfig { ...@@ -401,6 +401,19 @@ struct PD_INFER_DECL AnalysisConfig {
/// ///
void EnableMkldnnQuantizer(); void EnableMkldnnQuantizer();
///
/// \brief Turn on MKLDNN bfloat16.
///
///
void EnableMkldnnBfloat16();
///
/// \brief A boolean state telling whether to use the MKLDNN Bfloat16.
///
/// \return bool Whether to use the MKLDNN Bfloat16.
///
bool mkldnn_bfloat16_enabled() const { return use_mkldnn_bfloat16_; }
/// ///
/// \brief A boolean state telling whether the thread local CUDA stream is /// \brief A boolean state telling whether the thread local CUDA stream is
/// enabled. /// enabled.
...@@ -592,6 +605,7 @@ struct PD_INFER_DECL AnalysisConfig { ...@@ -592,6 +605,7 @@ struct PD_INFER_DECL AnalysisConfig {
int mkldnn_cache_capacity_{0}; int mkldnn_cache_capacity_{0};
bool use_mkldnn_quantizer_{false}; bool use_mkldnn_quantizer_{false};
std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config_; std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config_;
bool use_mkldnn_bfloat16_{false};
// If the config is already used on a predictor, it becomes invalid. // If the config is already used on a predictor, it becomes invalid.
// Any config can only be used with one predictor. // Any config can only be used with one predictor.
......
...@@ -347,6 +347,7 @@ class PD_INFER_DECL PaddlePredictor { ...@@ -347,6 +347,7 @@ class PD_INFER_DECL PaddlePredictor {
/// place of inference, etc.) /// place of inference, etc.)
/// ///
struct PD_INFER_DECL NativeConfig : public PaddlePredictor::Config { struct PD_INFER_DECL NativeConfig : public PaddlePredictor::Config {
NativeConfig();
/// GPU related fields. /// GPU related fields.
bool use_gpu{false}; bool use_gpu{false};
int device{0}; int device{0};
...@@ -421,7 +422,8 @@ enum class PaddleEngineKind { ...@@ -421,7 +422,8 @@ enum class PaddleEngineKind {
}; };
template <typename ConfigT, PaddleEngineKind engine> template <typename ConfigT, PaddleEngineKind engine>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config); PD_INFER_DECL std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(
const ConfigT& config);
template <> template <>
PD_INFER_DECL std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< PD_INFER_DECL std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
...@@ -437,6 +439,4 @@ PD_INFER_DECL std::string get_version(); ...@@ -437,6 +439,4 @@ PD_INFER_DECL std::string get_version();
PD_INFER_DECL std::string UpdateDllFlag(const char* name, const char* value); PD_INFER_DECL std::string UpdateDllFlag(const char* name, const char* value);
PD_INFER_DECL std::shared_ptr<framework::Cipher> MakeCipher(
const std::string& config_file);
} // namespace paddle } // namespace paddle
...@@ -22,9 +22,124 @@ limitations under the License. */ ...@@ -22,9 +22,124 @@ limitations under the License. */
#pragma once #pragma once
#include <cassert> #include <cassert>
#include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <utility>
#include <vector> #include <vector>
#include "paddle_analysis_config.h" // NOLINT #include "paddle_analysis_config.h" // NOLINT
#include "paddle_api.h" // NOLINT #include "paddle_api.h" // NOLINT
namespace paddle_infer {
using DataType = paddle::PaddleDType;
using PlaceType = paddle::PaddlePlace;
using PrecisionType = paddle::AnalysisConfig::Precision;
using Config = paddle::AnalysisConfig;
class PD_INFER_DECL Tensor {
public:
  // Can only be created by predictor->GetInputHandle(const std::string& name)
  // or predictor->GetOutputHandle(const std::string& name)
Tensor() = delete;
explicit Tensor(std::unique_ptr<paddle::ZeroCopyTensor>&& tensor)
: tensor_(std::move(tensor)) {}
void Reshape(const std::vector<int>& shape);
template <typename T>
void CopyFromCpu(const T* data);
// should add the place
template <typename T>
T* mutable_data(PlaceType place);
template <typename T>
void CopyToCpu(T* data);
template <typename T>
T* data(PlaceType* place, int* size) const;
void SetLoD(const std::vector<std::vector<size_t>>& x);
std::vector<std::vector<size_t>> lod() const;
DataType type() const;
std::vector<int> shape() const;
const std::string& name() const;
private:
std::unique_ptr<paddle::ZeroCopyTensor> tensor_;
};
class PD_INFER_DECL Predictor {
public:
Predictor() = default;
~Predictor() {}
// Use for clone
explicit Predictor(std::unique_ptr<paddle::PaddlePredictor>&& pred)
: predictor_(std::move(pred)) {}
explicit Predictor(const Config& config);
std::vector<std::string> GetInputNames();
std::unique_ptr<Tensor> GetInputHandle(const std::string& name);
bool Run();
std::vector<std::string> GetOutputNames();
std::unique_ptr<Tensor> GetOutputHandle(const std::string& name);
std::unique_ptr<Predictor> Clone();
void ClearIntermediateTensor();
private:
std::unique_ptr<paddle::PaddlePredictor> predictor_;
};
PD_INFER_DECL std::shared_ptr<Predictor> CreatePredictor(
const Config& config); // NOLINT
PD_INFER_DECL int GetNumBytesOfDataType(DataType dtype);
PD_INFER_DECL std::string GetVersion();
PD_INFER_DECL std::string UpdateDllFlag(const char* name, const char* value);
template <typename T>
void Tensor::CopyFromCpu(const T* data) {
tensor_->copy_from_cpu<T>(data);
}
template <typename T>
void Tensor::CopyToCpu(T* data) {
return tensor_->copy_to_cpu<T>(data);
}
template <typename T>
T* Tensor::mutable_data(PlaceType place) {
return tensor_->mutable_data<T>(place);
}
template <typename T>
T* Tensor::data(PlaceType* place, int* size) const {
return tensor_->data<T>(place, size);
}
} // namespace paddle_infer
namespace paddle_infer {
namespace services {
class PD_INFER_DECL PredictorPool {
public:
PredictorPool() = delete;
PredictorPool(const PredictorPool&) = delete;
PredictorPool& operator=(const PredictorPool&) = delete;
explicit PredictorPool(const Config& config, size_t size = 1);
Predictor* Retrive(size_t idx);
private:
std::shared_ptr<Predictor> main_pred_;
std::vector<std::unique_ptr<Predictor>> preds_;
};
} // namespace services
} // namespace paddle_infer
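As a reading aid, the end-to-end flow through the new paddle_infer wrappers declared above mirrors the GPU tests further below; in this sketch the model files and the 1x3x318x318 input shape are placeholders, not values mandated by the API.

    #include <functional>
    #include <numeric>
    #include <vector>
    #include "paddle_inference_api.h"  // pulls in the declarations above

    int main() {
      paddle_infer::Config config;
      config.SetModel("./model/model", "./model/params");  // placeholder files
      auto predictor = paddle_infer::CreatePredictor(config);

      std::vector<float> input(1 * 3 * 318 * 318, 1.0f);
      auto input_t = predictor->GetInputHandle(predictor->GetInputNames()[0]);
      input_t->Reshape({1, 3, 318, 318});
      input_t->CopyFromCpu(input.data());

      predictor->Run();

      auto output_t = predictor->GetOutputHandle(predictor->GetOutputNames()[0]);
      auto shape = output_t->shape();
      int out_num = std::accumulate(shape.begin(), shape.end(), 1,
                                    std::multiplies<int>());
      std::vector<float> out_data(out_num);
      output_t->CopyToCpu(out_data.data());
      return 0;
    }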
...@@ -143,6 +143,10 @@ void GpuPassStrategy::EnableMkldnnQuantizer() { ...@@ -143,6 +143,10 @@ void GpuPassStrategy::EnableMkldnnQuantizer() {
LOG(ERROR) << "GPU not support MKL-DNN quantization"; LOG(ERROR) << "GPU not support MKL-DNN quantization";
} }
void GpuPassStrategy::EnableMkldnnBfloat16() {
LOG(ERROR) << "GPU not support MKL-DNN bfloat16";
}
CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
// NOTE the large fusions should be located in the front, so that they will // NOTE the large fusions should be located in the front, so that they will
// not be damaged by smaller ones. // not be damaged by smaller ones.
...@@ -181,12 +185,14 @@ void CpuPassStrategy::EnableMKLDNN() { ...@@ -181,12 +185,14 @@ void CpuPassStrategy::EnableMKLDNN() {
passes_.insert(passes_.begin(), "mkldnn_placement_pass"); passes_.insert(passes_.begin(), "mkldnn_placement_pass");
for (auto &pass : std::vector<std::string>({ for (auto &pass : std::vector<std::string>({
"depthwise_conv_mkldnn_pass", // "depthwise_conv_mkldnn_pass", //
"conv_bn_fuse_pass", // Execute BN passes again to "conv_bn_fuse_pass", // Execute BN passes again to
"conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order
"conv_transpose_bn_fuse_pass", // "conv_affine_channel_fuse_pass", //
"conv_transpose_eltwiseadd_bn_fuse_pass", // "conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_bias_mkldnn_fuse_pass", // "conv_transpose_bn_fuse_pass", //
"conv_transpose_eltwiseadd_bn_fuse_pass", //
"conv_bias_mkldnn_fuse_pass", //
"conv_transpose_bias_mkldnn_fuse_pass", "conv_transpose_bias_mkldnn_fuse_pass",
"conv3d_bias_mkldnn_fuse_pass", // "conv3d_bias_mkldnn_fuse_pass", //
"conv_elementwise_add_mkldnn_fuse_pass", "conv_elementwise_add_mkldnn_fuse_pass",
...@@ -223,4 +229,12 @@ void CpuPassStrategy::EnableMkldnnQuantizer() { ...@@ -223,4 +229,12 @@ void CpuPassStrategy::EnableMkldnnQuantizer() {
#endif #endif
} }
void CpuPassStrategy::EnableMkldnnBfloat16() {
#ifdef PADDLE_WITH_MKLDNN
use_mkldnn_bfloat16_ = true;
#else
use_mkldnn_bfloat16_ = false;
#endif
}
} // namespace paddle } // namespace paddle
...@@ -132,6 +132,9 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { ...@@ -132,6 +132,9 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// \brief Enable MKLDNN quantize optimization. /// \brief Enable MKLDNN quantize optimization.
virtual void EnableMkldnnQuantizer() {} virtual void EnableMkldnnQuantizer() {}
/// \brief Enable MKLDNN bfloat16.
virtual void EnableMkldnnBfloat16() {}
/// \brief Check if we are using gpu. /// \brief Check if we are using gpu.
/// \return A bool variable implying whether we are in gpu mode. /// \return A bool variable implying whether we are in gpu mode.
bool use_gpu() const { return use_gpu_; } bool use_gpu() const { return use_gpu_; }
...@@ -161,6 +164,7 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy { ...@@ -161,6 +164,7 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy {
use_gpu_ = other.use_gpu_; use_gpu_ = other.use_gpu_;
use_mkldnn_ = other.use_mkldnn_; use_mkldnn_ = other.use_mkldnn_;
use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_; use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_;
use_mkldnn_bfloat16_ = other.use_mkldnn_bfloat16_;
} }
/// \brief Default destructor. /// \brief Default destructor.
virtual ~CpuPassStrategy() = default; virtual ~CpuPassStrategy() = default;
...@@ -174,9 +178,13 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy { ...@@ -174,9 +178,13 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy {
/// \brief Enable MKLDNN quantize optimization. /// \brief Enable MKLDNN quantize optimization.
void EnableMkldnnQuantizer() override; void EnableMkldnnQuantizer() override;
/// \brief Enable MKLDNN bfloat16.
void EnableMkldnnBfloat16() override;
protected: protected:
/// \cond Protected /// \cond Protected
bool use_mkldnn_quantizer_{false}; bool use_mkldnn_quantizer_{false};
bool use_mkldnn_bfloat16_{false};
/// \endcond /// \endcond
}; };
...@@ -205,6 +213,9 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { ...@@ -205,6 +213,9 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
/// \brief Not supported in GPU mode yet. /// \brief Not supported in GPU mode yet.
void EnableMkldnnQuantizer() override; void EnableMkldnnQuantizer() override;
/// \brief Not supported in GPU mode yet.
void EnableMkldnnBfloat16() override;
/// \brief Default destructor. /// \brief Default destructor.
virtual ~GpuPassStrategy() = default; virtual ~GpuPassStrategy() = default;
......
...@@ -235,6 +235,12 @@ PADDLE_CAPI_EXPORT extern void PD_EnableMkldnnQuantizer( ...@@ -235,6 +235,12 @@ PADDLE_CAPI_EXPORT extern void PD_EnableMkldnnQuantizer(
PADDLE_CAPI_EXPORT extern bool PD_MkldnnQuantizerEnabled( PADDLE_CAPI_EXPORT extern bool PD_MkldnnQuantizerEnabled(
const PD_AnalysisConfig* config); const PD_AnalysisConfig* config);
PADDLE_CAPI_EXPORT extern void PD_EnableMkldnnBfloat16(
PD_AnalysisConfig* config);
PADDLE_CAPI_EXPORT extern bool PD_MkldnnBfloat16Enabled(
const PD_AnalysisConfig* config);
PADDLE_CAPI_EXPORT extern void PD_SetModelBuffer(PD_AnalysisConfig* config, PADDLE_CAPI_EXPORT extern void PD_SetModelBuffer(PD_AnalysisConfig* config,
const char* prog_buffer, const char* prog_buffer,
size_t prog_buffer_size, size_t prog_buffer_size,
......
...@@ -207,6 +207,18 @@ bool PD_MkldnnQuantizerEnabled(const PD_AnalysisConfig* config) { ...@@ -207,6 +207,18 @@ bool PD_MkldnnQuantizerEnabled(const PD_AnalysisConfig* config) {
return config->config.mkldnn_quantizer_enabled(); return config->config.mkldnn_quantizer_enabled();
} }
void PD_EnableMkldnnBfloat16(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound(
"PD_AnalysisConfig should not be null"));
config->config.EnableMkldnnBfloat16();
}
bool PD_MkldnnBfloat16Enabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound(
"PD_AnalysisConfig should not be null"));
return config->config.mkldnn_bfloat16_enabled();
}
void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer, void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer,
size_t prog_buffer_size, const char* params_buffer, size_t prog_buffer_size, const char* params_buffer,
size_t params_buffer_size) { size_t params_buffer_size) {
......
...@@ -51,7 +51,13 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, ...@@ -51,7 +51,13 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
if (enable_int8) { if (enable_int8) {
#if IS_TRT_VERSION_GE(5000) #if IS_TRT_VERSION_GE(5000)
CHECK(op_desc.HasAttr("Input_scale")); if (op_desc.Type() != "conv2d_transpose") {
PADDLE_ENFORCE_EQ(
op_desc.HasAttr("Input_scale"), true,
platform::errors::InvalidArgument("Input scale not found. TRT int8"
" requires conv/deconv to have "
"input quantization scales."));
}
float in_scale = float in_scale =
BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127; BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127;
auto weight_scale = auto weight_scale =
......
...@@ -186,6 +186,14 @@ void TensorRTEngine::FreezeNetwork() { ...@@ -186,6 +186,14 @@ void TensorRTEngine::FreezeNetwork() {
Vec2TRT_Dims(optim_input_shape_[input.first], input.first, true)); Vec2TRT_Dims(optim_input_shape_[input.first], input.first, true));
} }
infer_builder_config_->addOptimizationProfile(optim_profile_); infer_builder_config_->addOptimizationProfile(optim_profile_);
infer_builder_config_->setMaxWorkspaceSize(max_workspace_);
if (enable_int8) {
    // Due to a TRT bug, we must set the precision BuilderFlag to kFP16 before
// kINT8 here to perform INT8 inference.
infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16);
infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kINT8);
infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kSTRICT_TYPES);
}
if (WithFp16()) { if (WithFp16()) {
infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16);
if (disable_trt_plugin_fp16()) { if (disable_trt_plugin_fp16()) {
......
...@@ -51,6 +51,7 @@ struct SimpleOpTypeSetTeller : public Teller { ...@@ -51,6 +51,7 @@ struct SimpleOpTypeSetTeller : public Teller {
"relu", "relu",
"depthwise_conv2d", "depthwise_conv2d",
"softmax", "softmax",
"sigmoid",
"batch_norm", "batch_norm",
"elementwise_add", "elementwise_add",
"leaky_relu", "leaky_relu",
......
...@@ -54,7 +54,7 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { ...@@ -54,7 +54,7 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
auto ptr = new SkipLayerNormPluginDynamic( auto ptr = new SkipLayerNormPluginDynamic(
bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_); bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_);
ptr->bias_gpu_ = bias_gpu_; ptr->bias_gpu_ = bias_gpu_;
ptr->scale_gpu_ = bias_gpu_; ptr->scale_gpu_ = scale_gpu_;
return ptr; return ptr;
} }
......
...@@ -192,7 +192,8 @@ download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz") ...@@ -192,7 +192,8 @@ download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz")
inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true) ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true)
set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150)
set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS "RUN_TYPE=NIGHTLY")
# text_classification # text_classification
set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
...@@ -430,9 +431,9 @@ if(WITH_GPU AND TENSORRT_FOUND) ...@@ -430,9 +431,9 @@ if(WITH_GPU AND TENSORRT_FOUND)
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c
ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/quant_small_model") set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model")
if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR}) if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR})
inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "quant_small_model.tar.gz") inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz")
endif() endif()
inference_analysis_test(trt_quant_int8_test SRCS trt_quant_int8_test.cc inference_analysis_test(trt_quant_int8_test SRCS trt_quant_int8_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
...@@ -514,3 +515,9 @@ if(WITH_MKLDNN) ...@@ -514,3 +515,9 @@ if(WITH_MKLDNN)
inference_analysis_test(test_analyzer_capi_ner SRCS analyzer_capi_ner_tester.cc inference_analysis_test(test_analyzer_capi_ner SRCS analyzer_capi_ner_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c
ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model) ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model)
if(WITH_GPU)
inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${RESNET50_MODEL_DIR})
endif()
...@@ -54,6 +54,9 @@ TEST(PD_AnalysisConfig, use_gpu) { ...@@ -54,6 +54,9 @@ TEST(PD_AnalysisConfig, use_gpu) {
PD_SwitchIrOptim(config, true); PD_SwitchIrOptim(config, true);
bool ir_optim = PD_IrOptim(config); bool ir_optim = PD_IrOptim(config);
CHECK(ir_optim) << "NO"; CHECK(ir_optim) << "NO";
PD_EnableMkldnnBfloat16(config);
bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config);
CHECK(!bfloat16_enable) << "NO";
PD_EnableTensorRtEngine(config, 1 << 20, 1, 3, Precision::kFloat32, false, PD_EnableTensorRtEngine(config, 1 << 20, 1, 3, Precision::kFloat32, false,
false); false);
bool trt_enable = PD_TensorrtEngineEnabled(config); bool trt_enable = PD_TensorrtEngineEnabled(config);
......
...@@ -88,6 +88,9 @@ TEST(PD_AnalysisConfig, profile_mkldnn) { ...@@ -88,6 +88,9 @@ TEST(PD_AnalysisConfig, profile_mkldnn) {
PD_EnableMkldnnQuantizer(config); PD_EnableMkldnnQuantizer(config);
bool quantizer_enable = PD_MkldnnQuantizerEnabled(config); bool quantizer_enable = PD_MkldnnQuantizerEnabled(config);
CHECK(quantizer_enable) << "NO"; CHECK(quantizer_enable) << "NO";
PD_EnableMkldnnBfloat16(config);
bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config);
CHECK(bfloat16_enable) << "NO";
PD_SetMkldnnCacheCapacity(config, 0); PD_SetMkldnnCacheCapacity(config, 0);
PD_SetModel(config, prog_file.c_str(), params_file.c_str()); PD_SetModel(config, prog_file.c_str(), params_file.c_str());
PD_DeleteAnalysisConfig(config); PD_DeleteAnalysisConfig(config);
......
...@@ -72,3 +72,59 @@ TEST(AnalysisPredictor, use_gpu) { ...@@ -72,3 +72,59 @@ TEST(AnalysisPredictor, use_gpu) {
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
namespace paddle_infer {
TEST(Predictor, use_gpu) {
std::string model_dir = FLAGS_infer_model + "/" + "model";
Config config;
config.EnableUseGpu(100, 0);
config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableLiteEngine(PrecisionType::kFloat32);
auto predictor = CreatePredictor(config);
const int batch = 1;
const int channel = 3;
const int height = 318;
const int width = 318;
const int input_num = batch * channel * height * width;
std::vector<float> input(input_num, 1);
auto input_names = predictor->GetInputNames();
auto input_t = predictor->GetInputHandle(input_names[0]);
input_t->Reshape({1, 3, 318, 318});
input_t->CopyFromCpu(input.data());
predictor->Run();
auto output_names = predictor->GetOutputNames();
auto output_t = predictor->GetOutputHandle(output_names[0]);
std::vector<int> output_shape = output_t->shape();
size_t out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
std::multiplies<int>());
std::vector<float> out_data;
out_data.resize(out_num);
output_t->CopyToCpu(out_data.data());
const std::vector<float> truth_values = {
127.780396f, 738.16656f, 1013.2264f, -438.17206f, 366.4022f,
927.66187f, 736.2241f, -633.68567f, -329.92737f, -430.15637f,
-633.0639f, -146.54858f, -1324.2804f, -1349.3661f, -242.67671f,
117.44864f, -801.7251f, -391.51495f, -404.8202f, 454.16132f,
515.48206f, -133.03114f, 69.293076f, 590.09753f, -1434.6917f,
-1070.8903f, 307.0744f, 400.52573f, -316.12177f, -587.1265f,
-161.05742f, 800.3663f, -96.47157f, 748.708f, 868.17645f,
-447.9403f, 112.73656f, 1127.1992f, 47.43518f, 677.7219f,
593.1881f, -336.4011f, 551.3634f, 397.82474f, 78.39835f,
-715.4006f, 405.96988f, 404.25684f, 246.01978f, -8.430191f,
131.36617f, -648.0528f};
float* data_o = out_data.data();
for (size_t j = 0; j < out_num; j += 10) {
EXPECT_NEAR((data_o[j] - truth_values[j / 10]) / truth_values[j / 10], 0.,
10e-5);
}
}
} // namespace paddle_infer
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cuda_runtime.h>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <cstring>
#include <numeric>
#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
namespace paddle_infer {
TEST(Predictor, use_gpu) {
LOG(INFO) << GetVersion();
UpdateDllFlag("conv_workspace_size_limit", "4000");
std::string model_dir = FLAGS_infer_model + "/model";
Config config;
config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableUseGpu(100, 0);
auto predictor = CreatePredictor(config);
auto pred_clone = predictor->Clone();
std::vector<int> in_shape = {1, 3, 318, 318};
int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1,
[](int &a, int &b) { return a * b; });
std::vector<float> input(in_num, 0);
auto input_names = predictor->GetInputNames();
auto input_t = predictor->GetInputHandle(input_names[0]);
input_t->Reshape(in_shape);
input_t->CopyFromCpu(input.data());
predictor->Run();
auto output_names = predictor->GetOutputNames();
auto output_t = predictor->GetOutputHandle(output_names[0]);
std::vector<int> output_shape = output_t->shape();
int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
std::multiplies<int>());
std::vector<float> out_data;
out_data.resize(out_num);
output_t->CopyToCpu(out_data.data());
predictor->ClearIntermediateTensor();
}
TEST(PredictorPool, basic) {
LOG(INFO) << GetVersion();
UpdateDllFlag("conv_workspace_size_limit", "4000");
std::string model_dir = FLAGS_infer_model + "/model";
Config config;
config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableUseGpu(100, 0);
services::PredictorPool pred_pool(config, 4);
auto pred = pred_pool.Retrive(2);
std::vector<int> in_shape = {1, 3, 318, 318};
int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1,
[](int &a, int &b) { return a * b; });
std::vector<float> input(in_num, 0);
auto in_names = pred->GetInputNames();
auto input_t = pred->GetInputHandle(in_names[0]);
input_t->name();
input_t->Reshape(in_shape);
input_t->CopyFromCpu(input.data());
pred->Run();
auto out_names = pred->GetOutputNames();
auto output_t = pred->GetOutputHandle(out_names[0]);
auto out_type = output_t->type();
LOG(INFO) << GetNumBytesOfDataType(out_type);
if (out_type == DataType::FLOAT32) {
PlaceType place;
int size;
output_t->data<float>(&place, &size);
}
}
} // namespace paddle_infer
...@@ -41,7 +41,7 @@ TEST(AnalysisPredictor, use_gpu) { ...@@ -41,7 +41,7 @@ TEST(AnalysisPredictor, use_gpu) {
SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
for (auto& input : inputs_all) { for (auto &input : inputs_all) {
ASSERT_TRUE(predictor->Run(input, &outputs)); ASSERT_TRUE(predictor->Run(input, &outputs));
predictor->ClearIntermediateTensor(); predictor->ClearIntermediateTensor();
} }
...@@ -49,3 +49,27 @@ TEST(AnalysisPredictor, use_gpu) { ...@@ -49,3 +49,27 @@ TEST(AnalysisPredictor, use_gpu) {
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
namespace paddle_infer {
TEST(PredictorPool, use_gpu) {
std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
Config config;
config.EnableUseGpu(100, 0);
config.SetModel(model_dir);
config.EnableTensorRtEngine();
services::PredictorPool pred_pool(config, 1);
auto predictor = pred_pool.Retrive(0);
auto input_names = predictor->GetInputNames();
auto input_t = predictor->GetInputHandle(input_names[0]);
std::vector<int> in_shape = {1, 3, 224, 224};
int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1,
[](int &a, int &b) { return a * b; });
std::vector<float> input(in_num, 0);
input_t->Reshape(in_shape);
input_t->CopyFromCpu(input.data());
predictor->Run();
}
} // namespace paddle_infer
...@@ -25,12 +25,20 @@ namespace inference { ...@@ -25,12 +25,20 @@ namespace inference {
TEST(quant_int8, resnet50) { TEST(quant_int8, resnet50) {
std::string model_dir = FLAGS_infer_model; std::string model_dir = FLAGS_infer_model;
AnalysisConfig config; AnalysisConfig config;
config.EnableUseGpu(100, 0); config.EnableUseGpu(1000, 0);
config.SetModel(model_dir); config.SetModel(model_dir);
config.SwitchUseFeedFetchOps(false); config.SwitchUseFeedFetchOps(false);
config.EnableTensorRtEngine(1 << 30, 1, 1, AnalysisConfig::Precision::kInt8, config.EnableTensorRtEngine(1 << 30, 1, 1, AnalysisConfig::Precision::kInt8,
false, false); false, false);
std::map<std::string, std::vector<int>> min_input_shape = {
{"image", {1, 1, 3, 3}}};
std::map<std::string, std::vector<int>> max_input_shape = {
{"image", {1, 1, 10, 10}}};
std::map<std::string, std::vector<int>> opt_input_shape = {
{"image", {1, 1, 3, 3}}};
config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
opt_input_shape);
auto predictor = CreatePaddlePredictor(config); auto predictor = CreatePaddlePredictor(config);
auto input_names = predictor->GetInputNames(); auto input_names = predictor->GetInputNames();
int channels = 1; int channels = 1;
......
...@@ -20,6 +20,7 @@ limitations under the License. */ ...@@ -20,6 +20,7 @@ limitations under the License. */
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h" #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
...@@ -243,7 +244,6 @@ UNUSED constexpr char CosDoc[] = R"DOC( ...@@ -243,7 +244,6 @@ UNUSED constexpr char CosDoc[] = R"DOC(
Cosine Operator. Computes cosine of x element-wise. Cosine Operator. Computes cosine of x element-wise.
Input range is `(-inf, inf)` and output range is `[-1,1]`. Input range is `(-inf, inf)` and output range is `[-1,1]`.
Return `nan` if input is out of boundary.
$$out = cos(x)$$ $$out = cos(x)$$
...@@ -341,7 +341,9 @@ $$out = \cos^{-1}(x)$$ ...@@ -341,7 +341,9 @@ $$out = \cos^{-1}(x)$$
class AsinOpMaker : public framework::OpProtoAndCheckerMaker { class AsinOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("X", "Input of asin operator"); AddInput("X",
"Input of asin operator, an N-D Tensor, with data type float32, "
"float64 or float16.");
AddOutput("Out", "Output of asin operator"); AddOutput("Out", "Output of asin operator");
AddComment(R"DOC( AddComment(R"DOC(
Arcsine Operator. Arcsine Operator.
...@@ -355,7 +357,9 @@ $$out = \sin^{-1}(x)$$ ...@@ -355,7 +357,9 @@ $$out = \sin^{-1}(x)$$
class AtanOpMaker : public framework::OpProtoAndCheckerMaker { class AtanOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("X", "Input of atan operator"); AddInput("X",
"Input of atan operator, an N-D Tensor, with data type float32, "
"float64 or float16.");
AddOutput("Out", "Output of atan operator"); AddOutput("Out", "Output of atan operator");
AddComment(R"DOC( AddComment(R"DOC(
Arctangent Operator. Arctangent Operator.
...@@ -1231,3 +1235,34 @@ REGISTER_OP_CPU_KERNEL( ...@@ -1231,3 +1235,34 @@ REGISTER_OP_CPU_KERNEL(
ops::ActivationGradKernel<paddle::platform::CPUDeviceContext, ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
ops::AbsGradFunctor<int64_t>>); ops::AbsGradFunctor<int64_t>>);
/* ========================================================================== */ /* ========================================================================== */
/* ========================== register checkpoint ===========================*/
REGISTER_OP_VERSION(leaky_relu)
.AddCheckpoint(
R"ROC(fix leaky_relu, bahavior changed when alpha < 0 or alpha > 1)ROC",
paddle::framework::compatible::OpVersionDesc()
.BugfixWithBehaviorChanged(
"leaky_relu calculate formula before checkponit: out = max(x, "
"alpha * x); after checkpoint: out = x if x > 0 else alpha * "
"x"));
REGISTER_OP_VERSION(hard_shrink)
.AddCheckpoint(
R"ROC(fix hard_shrink, bahavior changed when threshold<0)ROC",
paddle::framework::compatible::OpVersionDesc()
.BugfixWithBehaviorChanged(
"hard_shrink calculate formula before checkponit: out = x * "
"((x < -threshold) + (x > threshold)); after checkpoint: out = "
"x * (((x < -threshold) + (x > threshold)) > 0)"));
REGISTER_OP_VERSION(softplus)
.AddCheckpoint(
R"ROC(add new attributes [beta] and [threshold], and the formula is changed to "
" softplus(x) = \\frac{1}{beta} * \\log(1 + e^{beta * x}) \\\\ \\text{For numerical"
" stability, the implementation reverts to the linear function when: beta * x > threshold.})ROC",
paddle::framework::compatible::OpVersionDesc()
.NewAttr("beta", "The beta value of the new formula", 1.0f)
.NewAttr("threshold", "The threshold value of the new formula",
20.0f));
/* ========================================================================== */
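To make the softplus checkpoint above concrete (illustration only, not code from this change), the registered formula with its numerical-stability fallback evaluates roughly as follows; the defaults mirror the NewAttr values.

    #include <cmath>

    // softplus(x) = (1 / beta) * log(1 + exp(beta * x)); revert to the linear
    // function when beta * x > threshold to avoid overflow in exp.
    float Softplus(float x, float beta = 1.0f, float threshold = 20.0f) {
      if (beta * x > threshold) return x;
      return std::log1p(std::exp(beta * x)) / beta;
    }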
...@@ -64,11 +64,11 @@ class BernoulliOpKernel<platform::CPUDeviceContext, T> ...@@ -64,11 +64,11 @@ class BernoulliOpKernel<platform::CPUDeviceContext, T>
int64_t size = x->numel(); int64_t size = x->numel();
std::uniform_real_distribution<T> dist(0.0, 1.0); std::uniform_real_distribution<T> dist(0.0, 1.0);
auto gen_ptr = framework::Generator::GetInstance(); auto gen_ptr = framework::DefaultCPUGenerator();
std::mt19937_64 &gen_engine = gen_ptr->GetCPUEngine(); auto engine = gen_ptr->GetCPUEngine();
for (int64_t i = 0; i < size; ++i) { for (int64_t i = 0; i < size; ++i) {
out_data[i] = BernoulliFunctor(in_data[i], dist(gen_engine)); out_data[i] = BernoulliFunctor(in_data[i], dist(*engine));
} }
} }
}; // namespace operators }; // namespace operators
......
...@@ -66,7 +66,7 @@ template <typename DeviceContext, typename T> ...@@ -66,7 +66,7 @@ template <typename DeviceContext, typename T>
class ClipKernel : public framework::OpKernel<T> { class ClipKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto max = static_cast<T>(context.Attr<float>("max")); auto max = context.Attr<T>("max");
Tensor max_cpu; Tensor max_cpu;
if (context.HasInput("Max")) { if (context.HasInput("Max")) {
auto* max_t = context.Input<Tensor>("Max"); auto* max_t = context.Input<Tensor>("Max");
...@@ -77,9 +77,8 @@ class ClipKernel : public framework::OpKernel<T> { ...@@ -77,9 +77,8 @@ class ClipKernel : public framework::OpKernel<T> {
} }
max = max_data[0]; max = max_data[0];
} }
max = static_cast<T>(max);
auto min = context.Attr<float>("min"); auto min = context.Attr<T>("min");
Tensor min_cpu; Tensor min_cpu;
if (context.HasInput("Min")) { if (context.HasInput("Min")) {
auto* min_t = context.Input<Tensor>("Min"); auto* min_t = context.Input<Tensor>("Min");
...@@ -90,11 +89,12 @@ class ClipKernel : public framework::OpKernel<T> { ...@@ -90,11 +89,12 @@ class ClipKernel : public framework::OpKernel<T> {
} }
min = min_data[0]; min = min_data[0];
} }
min = static_cast<T>(min);
PADDLE_ENFORCE_LT(min, max, platform::errors::InvalidArgument( PADDLE_ENFORCE_LE(min, max,
"max should be greater than min. " platform::errors::InvalidArgument(
"But received min = %f, max = %f", "max should be greater than or equal to min. "
min, max)); "But received min = %f, max = %f",
min, max));
auto* x_var = context.InputVar("X"); auto* x_var = context.InputVar("X");
if (x_var->IsType<framework::LoDTensor>()) { if (x_var->IsType<framework::LoDTensor>()) {
...@@ -141,7 +141,7 @@ template <typename DeviceContext, typename T> ...@@ -141,7 +141,7 @@ template <typename DeviceContext, typename T>
class ClipGradKernel : public framework::OpKernel<T> { class ClipGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto max = static_cast<T>(context.Attr<float>("max")); auto max = context.Attr<T>("max");
Tensor max_cpu; Tensor max_cpu;
if (context.HasInput("Max")) { if (context.HasInput("Max")) {
auto* max_t = context.Input<Tensor>("Max"); auto* max_t = context.Input<Tensor>("Max");
...@@ -152,9 +152,8 @@ class ClipGradKernel : public framework::OpKernel<T> { ...@@ -152,9 +152,8 @@ class ClipGradKernel : public framework::OpKernel<T> {
} }
max = max_data[0]; max = max_data[0];
} }
max = static_cast<T>(max);
auto min = context.Attr<float>("min"); auto min = context.Attr<T>("min");
Tensor min_cpu; Tensor min_cpu;
if (context.HasInput("Min")) { if (context.HasInput("Min")) {
auto* min_t = context.Input<Tensor>("Min"); auto* min_t = context.Input<Tensor>("Min");
...@@ -165,7 +164,6 @@ class ClipGradKernel : public framework::OpKernel<T> { ...@@ -165,7 +164,6 @@ class ClipGradKernel : public framework::OpKernel<T> {
} }
min = min_data[0]; min = min_data[0];
} }
min = static_cast<T>(min);
auto* d_out = auto* d_out =
context.Input<framework::LoDTensor>(framework::GradVarName("Out")); context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
......
...@@ -35,5 +35,9 @@ if(WITH_NCCL) ...@@ -35,5 +35,9 @@ if(WITH_NCCL)
op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} nccl_common) op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} nccl_common)
endif() endif()
if(WITH_GLOO)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper)
endif()
set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE) set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE)
set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency") set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency")
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/barrier_op.h"
#include <memory>
namespace paddle {
namespace operators {
class BarrierOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {}
};
class BarrierOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "(Tensor) Input data (only used in CUDAKernel).");
AddOutput("Out", "(Tensor) Output data (only used in CUDAKernel).");
AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0);
AddComment(R"DOC(
Barrier Operator - Barrier among all participants.)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_WITHOUT_GRADIENT(barrier, ops::BarrierOp, ops::BarrierOpMaker);
REGISTER_OP_CPU_KERNEL(barrier, ops::BarrierOpCPUKernel<int>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/barrier_op.h"
#include <memory>
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class BarrierOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_NCCL)
auto in = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
auto place = ctx.GetPlace();
ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
int64_t numel = in->numel();
const void* sendbuff = in->data<void>();
void* recvbuff = out->mutable_data<T>(place);
int rid = ctx.Attr<int>("ring_id");
auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
auto stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
ncclRedOp_t nccl_red_type = ncclSum;
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream));
auto comm_stream =
platform::NCCLCommContext::Instance().Get(rid, place)->stream();
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(comm_stream));
#else
PADDLE_THROW(platform::errors::Unavailable(
"PaddlePaddle should compile with NCCL."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(barrier, ops::BarrierOpCUDAKernel<int>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#if defined(PADDLE_WITH_GLOO)
#include <gloo/barrier.h>
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class BarrierOpCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_GLOO)
auto gloo = paddle::framework::GlooWrapper::GetInstance();
PADDLE_ENFORCE_EQ(
gloo->IsInitialized(), true,
platform::errors::PreconditionNotMet(
"You must initialize the gloo environment first to use it."));
gloo::BarrierOptions opts(gloo->GetContext());
gloo::barrier(opts);
#else
PADDLE_THROW(platform::errors::Unavailable(
"PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON"));
#endif
}
};
} // namespace operators
} // namespace paddle
...@@ -23,6 +23,11 @@ limitations under the License. */ ...@@ -23,6 +23,11 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#if defined(PADDLE_WITH_GLOO)
#include <gloo/allgather.h>
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -30,7 +35,31 @@ template <typename T> ...@@ -30,7 +35,31 @@ template <typename T>
class CAllGatherOpCPUKernel : public framework::OpKernel<T> { class CAllGatherOpCPUKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_THROW("unimplemented cpu kernel for CAllGatherOp."); #if defined(PADDLE_WITH_GLOO)
auto in = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
framework::DDim out_dims = in->dims();
auto place = ctx.GetPlace();
auto gloo = paddle::framework::GlooWrapper::GetInstance();
auto nranks = gloo->Size();
out_dims[0] *= nranks;
int64_t send_numel = in->numel();
const T* send_buff = in->data<T>();
T* recv_buff = out->mutable_data<T>(out_dims, place);
PADDLE_ENFORCE_EQ(
gloo->IsInitialized(), true,
platform::errors::PreconditionNotMet(
"You must initialize the gloo environment first to use it."));
gloo::AllgatherOptions opts(gloo->GetContext());
opts.setInput(const_cast<T*>(send_buff), send_numel);
opts.setOutput(recv_buff, send_numel * nranks);
gloo::allgather(opts);
#else
PADDLE_THROW(platform::errors::Unavailable(
"PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON"));
#endif
} }
}; };
......
...@@ -25,6 +25,11 @@ limitations under the License. */ ...@@ -25,6 +25,11 @@ limitations under the License. */
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#endif #endif
#if defined(PADDLE_WITH_GLOO)
#include <gloo/allreduce.h>
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -50,7 +55,53 @@ template <ReduceType red_type, typename T> ...@@ -50,7 +55,53 @@ template <ReduceType red_type, typename T>
class CAllReduceOpCPUKernel : public framework::OpKernel<T> { class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_THROW("CAllReduce op do not support CPUKernel for now."); #if defined(PADDLE_WITH_GLOO)
auto in = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
auto place = ctx.GetPlace();
int64_t send_numel = in->numel();
const T* send_buff = in->data<T>();
T* recv_buff = out->mutable_data<T>(in->dims(), place);
auto gloo = paddle::framework::GlooWrapper::GetInstance();
PADDLE_ENFORCE_EQ(
gloo->IsInitialized(), true,
platform::errors::PreconditionNotMet(
"You must initialize the gloo environment first to use it."));
gloo::AllreduceOptions opts(gloo->GetContext());
opts.setInput(const_cast<T*>(send_buff), send_numel);
opts.setOutput(recv_buff, send_numel);
switch (red_type) {
case kRedSum:
opts.setReduceFunction(
static_cast<void (*)(void*, const void*, const void*, size_t)>(
&gloo::sum<T>));
break;
case kRedMax:
opts.setReduceFunction(
static_cast<void (*)(void*, const void*, const void*, size_t)>(
&gloo::max<T>));
break;
case kRedMin:
opts.setReduceFunction(
static_cast<void (*)(void*, const void*, const void*, size_t)>(
&gloo::min<T>));
break;
case kRedProd:
opts.setReduceFunction(
static_cast<void (*)(void*, const void*, const void*, size_t)>(
&gloo::product<T>));
break;
default:
PADDLE_ENFORCE_EQ(true, false,
platform::errors::InvalidArgument(
"Invalid reduce type: %d.", red_type));
}
gloo::allreduce(opts);
#else
PADDLE_THROW(platform::errors::Unavailable(
"PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON"));
#endif
} }
}; };
......
...@@ -22,6 +22,11 @@ limitations under the License. */ ...@@ -22,6 +22,11 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#if defined(PADDLE_WITH_GLOO)
#include <gloo/broadcast.h>
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -29,7 +34,27 @@ template <typename T> ...@@ -29,7 +34,27 @@ template <typename T>
class CBroadcastOpCPUKernel : public framework::OpKernel<T> { class CBroadcastOpCPUKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_THROW("Unimplemented cpu kernel for CBroadcastOp."); #if defined(PADDLE_WITH_GLOO)
auto in = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
auto root = ctx.Attr<int>("root");
auto place = ctx.GetPlace();
int64_t send_numel = in->numel();
T* recv_buff = out->mutable_data<T>(in->dims(), place);
auto gloo = paddle::framework::GlooWrapper::GetInstance();
PADDLE_ENFORCE_EQ(
gloo->IsInitialized(), true,
platform::errors::PreconditionNotMet(
"You must initialize the gloo environment first to use it."));
gloo::BroadcastOptions opts(gloo->GetContext());
opts.setOutput(recv_buff, send_numel);
opts.setRoot(root);
gloo::broadcast(opts);
#else
PADDLE_THROW(platform::errors::Unavailable(
"PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON"));
#endif
} }
}; };
......
...@@ -28,6 +28,10 @@ limitations under the License. */ ...@@ -28,6 +28,10 @@ limitations under the License. */
#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#endif #endif
#if defined(PADDLE_WITH_GLOO)
#include <gloo/reduce.h>
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -54,9 +58,55 @@ template <ReduceType red_type, typename T> ...@@ -54,9 +58,55 @@ template <ReduceType red_type, typename T>
class CReduceOpCPUKernel : public framework::OpKernel<T> { class CReduceOpCPUKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_GLOO)
auto in = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
auto root_id = ctx.Attr<int>("root_id");
auto place = ctx.GetPlace();
int64_t send_numel = in->numel();
const T* send_buff = in->data<T>();
T* recv_buff = out->mutable_data<T>(in->dims(), place);
auto gloo = paddle::framework::GlooWrapper::GetInstance();
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
true, false, gloo->IsInitialized(), true,
platform::errors::Unavailable("Unimplemented CReduceOpCPUKernel now.")); platform::errors::PreconditionNotMet(
"You must initialize the gloo environment first to use it."));
gloo::ReduceOptions opts(gloo->GetContext());
opts.setInput(const_cast<T*>(send_buff), send_numel);
opts.setOutput(recv_buff, send_numel);
opts.setRoot(root_id);
switch (red_type) {
case kRedSum:
opts.setReduceFunction(
static_cast<void (*)(void*, const void*, const void*, size_t)>(
&gloo::sum<T>));
break;
case kRedMax:
opts.setReduceFunction(
static_cast<void (*)(void*, const void*, const void*, size_t)>(
&gloo::max<T>));
break;
case kRedMin:
opts.setReduceFunction(
static_cast<void (*)(void*, const void*, const void*, size_t)>(
&gloo::min<T>));
break;
case kRedProd:
opts.setReduceFunction(
static_cast<void (*)(void*, const void*, const void*, size_t)>(
&gloo::product<T>));
break;
default:
PADDLE_ENFORCE_EQ(true, false,
platform::errors::InvalidArgument(
"Invalid reduce type: %d.", red_type));
}
gloo::reduce(opts);
#else
PADDLE_THROW(platform::errors::Unavailable(
"PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON"));
#endif
} }
}; };
......
...@@ -22,6 +22,11 @@ limitations under the License. */ ...@@ -22,6 +22,11 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#if defined(PADDLE_WITH_GLOO)
#include <gloo/scatter.h>
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -29,9 +34,39 @@ template <typename T> ...@@ -29,9 +34,39 @@ template <typename T>
class CScatterOpCPUKernel : public framework::OpKernel<T> { class CScatterOpCPUKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE_EQ(true, false, #if defined(PADDLE_WITH_GLOO)
platform::errors::Unavailable( auto in = ctx.Input<framework::Tensor>("X");
"Unimplemented cpu kernel for CScatterOp.")); auto out = ctx.Output<framework::Tensor>("Out");
auto root_id = ctx.Attr<int>("root");
auto gloo = paddle::framework::GlooWrapper::GetInstance();
PADDLE_ENFORCE_EQ(
gloo->IsInitialized(), true,
platform::errors::PreconditionNotMet(
"You must initialize the gloo environment first to use it."));
int64_t send_numel = out->numel();
auto nranks = gloo->Size();
auto rank = gloo->Rank();
T* recv_buff = out->data<T>();
gloo::ScatterOptions opts(gloo->GetContext());
if (root_id == rank) {
T* send_buff = const_cast<T*>(in->data<T>());
std::vector<T*> ptrs(nranks);
for (int i = 0; i < nranks; ++i) {
ptrs[i] = send_buff;
send_buff += send_numel;
}
opts.setInputs(ptrs, send_numel);
}
opts.setOutput(recv_buff, send_numel);
opts.setRoot(root_id);
gloo::scatter(opts);
#else
PADDLE_THROW(platform::errors::Unavailable(
"PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON"));
#endif
} }
}; };
......
...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/controlflow/logical_op.h" #include "paddle/fluid/operators/controlflow/logical_op.h"
#include <algorithm>
#include <string> #include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
...@@ -97,19 +99,19 @@ class BinaryLogicalOp : public LogicalOp { ...@@ -97,19 +99,19 @@ class BinaryLogicalOp : public LogicalOp {
OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", comment.type); OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", comment.type);
auto dim_x = context->GetInputDim("X"); auto dim_x = context->GetInputDim("X");
auto dim_y = context->GetInputDim("Y"); auto dim_y = context->GetInputDim("Y");
if (dim_x == dim_y) {
int product_x = framework::product(dim_x); context->SetOutputDim("Out", dim_x);
int product_y = framework::product(dim_y); } else {
bool check = context->IsRuntime() || (product_x >= 0 && product_y >= 0); int max_dim = std::max(dim_x.size(), dim_y.size());
if (check) { int axis = std::abs(dim_x.size() - dim_y.size());
PADDLE_ENFORCE_EQ(product_x, product_y, std::vector<int> x_dims_array(max_dim);
platform::errors::InvalidArgument( std::vector<int> y_dims_array(max_dim);
"The number of elements in X and Y should be same, " std::vector<int> out_dims_array(max_dim);
"but received %d != %d", GetBroadcastDimsArrays(dim_x, dim_y, x_dims_array.data(),
product_x, product_y)); y_dims_array.data(), out_dims_array.data(),
max_dim, axis);
context->SetOutputDim("Out", framework::make_ddim(out_dims_array));
} }
context->SetOutputDim("Out", context->GetInputDim("X"));
context->ShareLoD("X", "Out"); context->ShareLoD("X", "Out");
} }
}; };
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <math.h> #include <math.h>
#include <type_traits> #include <type_traits>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/platform/transform.h" #include "paddle/fluid/platform/transform.h"
namespace paddle { namespace paddle {
...@@ -57,10 +58,8 @@ class BinaryLogicalOpKernel ...@@ -57,10 +58,8 @@ class BinaryLogicalOpKernel
auto* y = context.Input<framework::Tensor>("Y"); auto* y = context.Input<framework::Tensor>("Y");
auto* out = context.Output<framework::Tensor>("Out"); auto* out = context.Output<framework::Tensor>("Out");
Functor binary_func; Functor binary_func;
platform::Transform<DeviceContext> trans; ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, -1,
trans(context.template device_context<DeviceContext>(), x->data<T>(), binary_func, out);
x->data<T>() + x->numel(), y->data<T>(),
out->mutable_data<bool>(context.GetPlace()), binary_func);
} }
}; };
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/cudnn_workspace_helper.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
...@@ -567,3 +568,14 @@ REGISTER_OP_CPU_KERNEL( ...@@ -567,3 +568,14 @@ REGISTER_OP_CPU_KERNEL(
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>, ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
double>); double>);
REGISTER_OP_VERSION(conv_transpose)
.AddCheckpoint(
R"ROC(
      Upgrade conv_transpose, add a new attribute [output_padding].
)ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"output_padding",
"In order to add additional size to one side of each dimension "
"in the output",
{}));
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/cum_op.h" #include "paddle/fluid/operators/cum_op.h"
namespace paddle { namespace paddle {
...@@ -95,3 +96,14 @@ REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel<CPU, ops::CumsumFunctor<float>>, ...@@ -95,3 +96,14 @@ REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel<CPU, ops::CumsumFunctor<float>>,
ops::CumKernel<CPU, ops::CumsumFunctor<double>>, ops::CumKernel<CPU, ops::CumsumFunctor<double>>,
ops::CumKernel<CPU, ops::CumsumFunctor<int>>, ops::CumKernel<CPU, ops::CumsumFunctor<int>>,
ops::CumKernel<CPU, ops::CumsumFunctor<int64_t>>); ops::CumKernel<CPU, ops::CumsumFunctor<int64_t>>);
REGISTER_OP_VERSION(cumsum)
.AddCheckpoint(
R"ROC(
      Upgrade cumsum, add a new attribute [flatten].
)ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"flatten",
"In order to compute the cumsum over the flattened array when the "
"argument `axis` in python API is None.",
false));
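With flatten = true, cumsum accumulates over the tensor viewed as a single flat array, which is what the Python API does when axis is None. A small standalone sketch of that semantics, not the operator kernel itself:

#include <cassert>
#include <vector>

// Cumulative sum over the flattened data, i.e. the flatten = true behavior.
std::vector<int> CumsumFlattened(const std::vector<int>& x) {
  std::vector<int> out(x.size());
  int running = 0;
  for (size_t i = 0; i < x.size(); ++i) {
    running += x[i];
    out[i] = running;
  }
  return out;
}

int main() {
  // A 2x3 tensor {{1, 2, 3}, {4, 5, 6}} stored row-major.
  std::vector<int> x = {1, 2, 3, 4, 5, 6};
  std::vector<int> y = CumsumFlattened(x);
  assert(y == (std::vector<int>{1, 3, 6, 10, 15, 21}));
  return 0;
}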
...@@ -56,7 +56,7 @@ endif() ...@@ -56,7 +56,7 @@ endif()
cc_test(rpc_server_test SRCS rpc_server_test.cc cc_test(rpc_server_test SRCS rpc_server_test.cc
DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op) DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op scale_op)
cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope)
cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory)
......
...@@ -132,6 +132,15 @@ void ProcGetResponse(const VarHandle& var_h, ...@@ -132,6 +132,15 @@ void ProcGetResponse(const VarHandle& var_h,
&trainer_id); &trainer_id);
} }
void ProcGetRecvResponse(const VarHandle& var_h,
const ::grpc::ByteBuffer& ret_msg) {
VLOG(4) << "ProcGetRecvResponse";
framework::Variable* outvar = nullptr;
int trainer_id;
DeserializeRecvFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar,
&trainer_id);
}
template <typename T> template <typename T>
void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
::grpc::Slice slice(proto.ByteSizeLong()); ::grpc::Slice slice(proto.ByteSizeLong());
...@@ -482,6 +491,79 @@ VarHandlePtr GRPCClient::AsyncDistributeNotify( ...@@ -482,6 +491,79 @@ VarHandlePtr GRPCClient::AsyncDistributeNotify(
return h; return h;
} }
VarHandlePtr GRPCClient::AsyncSendAndRecv(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& send_var_name,
const std::string& recv_var_name,
const std::string& table_name,
int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep;
const std::string send_var_name_val = send_var_name;
const std::string recv_var_name_val = recv_var_name;
const std::string table_name_val = table_name;
const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val);
const std::string method = kSendAndRecvRPC;
VLOG(4) << "GRPCClient::SendAndRecv Begin ,Send_var_name: "
<< send_var_name_val << " Recv_var_name: " << recv_var_name_val;
int retry_times_ = 0;
while (true) {
SendAndRecvProcessor* s = new SendAndRecvProcessor(ch);
VarHandlePtr h(
new VarHandle(ep, method, send_var_name_val, p_ctx, p_scope));
VarHandlePtr h_recv(
new VarHandle(ep, method, recv_var_name_val, p_ctx, p_scope));
s->Prepare(h, time_out);
s->RecvPrepare(h_recv);
framework::AsyncIO([send_var_name_val, recv_var_name_val, table_name_val,
p_scope, p_ctx, s, method, h, this] {
auto* send_var = p_scope->FindVar(send_var_name_val);
send_var->GetMutable<framework::LoDTensor>()->set_lod({});
::grpc::ByteBuffer buf;
VLOG(4) << "SerializeToByteBuffer: send_var_name_val: "
<< send_var_name_val
<< " recv_var_name_val: " << recv_var_name_val;
SerializeToByteBuffer(send_var_name_val, send_var, *p_ctx, &buf,
recv_var_name_val, trainer_id_, table_name_val);
VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
// stub context
s->response_call_back_ = ProcGetRecvResponse;
platform::RecordRPCEvent record_event(method);
auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/SendAndRecvVariable",
buf, &cq_);
call->StartCall();
call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
if (UNLIKELY(platform::IsProfileEnabled())) {
h->Wait();
}
});
req_count_++;
if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) {
h->Wait();
if (h->should_retry) {
VLOG(3) << "rpc call failed, retry times " << retry_times_;
retry_times_++;
std::random_device rd;
std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5));
continue;
}
}
return h;
}
}
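A caller-side sketch of the new RPC, mirroring the unit test added later in this commit. This is a fragment, not a standalone program; ep, ctx, and scope are assumed to be an existing endpoint string, platform::DeviceContext, and framework::Scope, as in that test:

// Send local variable "x", run the prepared block on the server,
// and wait until the reply has been deserialized into "res".
distributed::RPCClient* client =
    distributed::RPCClient::GetInstance<RPCCLIENT_T>(0 /* trainer_id */);
auto handle = client->AsyncSendAndRecv(ep, ctx, scope,
                                       "x" /* send_var_name */,
                                       "res" /* recv_var_name */);
handle->Wait();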
bool GRPCClient::Wait() { bool GRPCClient::Wait() {
std::unique_lock<std::mutex> lk(sync_mutex_); std::unique_lock<std::mutex> lk(sync_mutex_);
sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); }); sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); });
......
...@@ -53,6 +53,8 @@ namespace distributed { ...@@ -53,6 +53,8 @@ namespace distributed {
void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
void ProcGetRecvResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
class BaseProcessor { class BaseProcessor {
public: public:
BaseProcessor() { context_ = nullptr; } BaseProcessor() { context_ = nullptr; }
...@@ -131,6 +133,28 @@ class GetProcessor : public BaseProcessor { ...@@ -131,6 +133,28 @@ class GetProcessor : public BaseProcessor {
RequestGetCallBack response_call_back_ = ProcGetResponse; RequestGetCallBack response_call_back_ = ProcGetResponse;
}; };
class SendAndRecvProcessor : public BaseProcessor {
public:
explicit SendAndRecvProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(), stub_g_(ch) {}
virtual ~SendAndRecvProcessor() {}
void ProcessImpl() override {
if (response_call_back_) {
response_call_back_(*var_h_recv_.get(), reply_);
var_h_recv_->Finish(true);
}
}
void RecvPrepare(VarHandlePtr h_recv) { var_h_recv_ = h_recv; }
::grpc::ByteBuffer reply_;
::grpc::GenericStub stub_g_;
RequestGetCallBack response_call_back_ = ProcGetResponse;
VarHandlePtr var_h_recv_;
};
class BatchBarrierProcessor : public BaseProcessor { class BatchBarrierProcessor : public BaseProcessor {
public: public:
explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch) explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
...@@ -231,6 +255,14 @@ class GRPCClient : public RPCClient { ...@@ -231,6 +255,14 @@ class GRPCClient : public RPCClient {
const framework::Scope& scope, const std::string& var_name, const framework::Scope& scope, const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) override; int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncSendAndRecv(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& send_var_name,
const std::string& recv_var_name,
const std::string& table_name = "",
int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncSendComplete( VarHandlePtr AsyncSendComplete(
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
......
...@@ -76,7 +76,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -76,7 +76,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
PADDLE_THROW("Serialize does not support type: %s", PADDLE_THROW("Serialize does not support type: %s",
typeid(var->Type()).name()); typeid(var->Type()).name());
} }
std::string header; std::string header;
request.AppendToString(&header); request.AppendToString(&header);
auto buffer = std::unique_ptr<char[]>(new char[1024]); auto buffer = std::unique_ptr<char[]>(new char[1024]);
...@@ -101,7 +100,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -101,7 +100,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
} }
#endif #endif
PADDLE_ENFORCE_NOT_NULL(payload); PADDLE_ENFORCE_NOT_NULL(payload);
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
payload->memory_size()); payload->memory_size());
if (payload->memory_size() >= std::numeric_limits<int>::max()) { if (payload->memory_size() >= std::numeric_limits<int>::max()) {
...@@ -140,7 +138,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -140,7 +138,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
::grpc::Slice::STEAL_REF); ::grpc::Slice::STEAL_REF);
num_slices = 4; num_slices = 4;
} }
::grpc::ByteBuffer tmp(&slices[0], num_slices); ::grpc::ByteBuffer tmp(&slices[0], num_slices);
msg->Swap(&tmp); msg->Swap(&tmp);
} }
...@@ -156,6 +153,19 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, ...@@ -156,6 +153,19 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
*trainer_id = resp.GetTrainerId(); *trainer_id = resp.GetTrainerId();
} }
void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
const framework::Scope* scope,
framework::Variable** var, int* trainer_id) {
platform::RecordRPCEvent record_event("deserial");
operators::distributed::GRPCVariableResponse resp(scope, &ctx);
PADDLE_ENFORCE_EQ(
resp.Parse(msg), 0,
platform::errors::InvalidArgument("parse bytebuffer to tensor error!"));
*var = resp.GetRecvVar();
*trainer_id = resp.GetTrainerId();
}
} // namespace distributed } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -47,6 +47,11 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, ...@@ -47,6 +47,11 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const framework::Scope* scope, const framework::Scope* scope,
framework::Variable** var, int* trainer_id); framework::Variable** var, int* trainer_id);
void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
const framework::Scope* scope,
framework::Variable** var, int* trainer_id);
} // namespace distributed } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -28,6 +28,7 @@ DECLARE_int32(rpc_retry_bind_port); ...@@ -28,6 +28,7 @@ DECLARE_int32(rpc_retry_bind_port);
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace distributed { namespace distributed {
enum CallStatus { PROCESS = 0, FINISH }; enum CallStatus { PROCESS = 0, FINISH };
// reference: // reference:
...@@ -433,6 +434,51 @@ class RequestNotify final : public RequestBase { ...@@ -433,6 +434,51 @@ class RequestNotify final : public RequestBase {
ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_; ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
}; };
class RequestSendAndRecv final : public RequestBase {
public:
explicit RequestSendAndRecv(GrpcService::AsyncService* service,
::grpc::ServerCompletionQueue* cq,
RequestHandler* request_handler, int req_id)
: RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
request_.reset(new GRPCVariableResponse(
request_handler->scope(), request_handler->dev_ctx(),
request_handler->distributed_mode()));
int method_id =
static_cast<int>(distributed::GrpcMethod::kRequestSendAndRecv);
service_->RequestAsyncUnary(
method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
}
virtual ~RequestSendAndRecv() {}
std::string GetReqName() override { return request_->Varname(); }
void Process() override {
std::string in_var_name = request_->Varname();
std::string out_var_name = request_->OutVarname();
std::string table_name = request_->TableName();
int trainer_id = request_->GetTrainerId();
VLOG(4) << "RequestSendAndRecv, in_var_name: " << in_var_name
<< " out_var_name: " << out_var_name << " trainer: " << trainer_id;
auto scope = request_->GetMutableLocalScope();
auto invar = scope->FindVar(in_var_name);
framework::Variable* outvar = nullptr;
request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id,
out_var_name, table_name);
SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(),
&reply_);
Finish(reply_, &responder_);
}
protected:
std::shared_ptr<GRPCVariableResponse> request_;
::grpc::ByteBuffer reply_;
ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
};
void AsyncGRPCServer::WaitServerReady() { void AsyncGRPCServer::WaitServerReady() {
VLOG(4) << "AsyncGRPCServer is waiting server ready"; VLOG(4) << "AsyncGRPCServer is waiting server ready";
std::unique_lock<std::mutex> lock(this->mutex_ready_); std::unique_lock<std::mutex> lock(this->mutex_ready_);
...@@ -586,6 +632,8 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, ...@@ -586,6 +632,8 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
b = new RequestCheckpointNotify(service_.get(), cq.get(), handler, req_id); b = new RequestCheckpointNotify(service_.get(), cq.get(), handler, req_id);
} else if (rpc_name == kRequestNotify) { } else if (rpc_name == kRequestNotify) {
b = new RequestNotify(service_.get(), cq.get(), handler, req_id); b = new RequestNotify(service_.get(), cq.get(), handler, req_id);
} else if (rpc_name == kRequestSendAndRecv) {
b = new RequestSendAndRecv(service_.get(), cq.get(), handler, req_id);
} else { } else {
PADDLE_ENFORCE(false, "not supported rpc"); PADDLE_ENFORCE(false, "not supported rpc");
} }
......
...@@ -85,10 +85,12 @@ enum class GrpcMethod { ...@@ -85,10 +85,12 @@ enum class GrpcMethod {
kGetMonomerVariable, kGetMonomerVariable,
kGetMonomerBarrier, kGetMonomerBarrier,
kRequestNotify, kRequestNotify,
kRequestSendAndRecv,
// when you add new handler, change kGrpcNumMethods at the same time!
}; };
static const int kGrpcNumMethods = static const int kGrpcNumMethods =
static_cast<int>(GrpcMethod::kRequestNotify) + 1; static_cast<int>(GrpcMethod::kRequestSendAndRecv) + 1;
inline const char* GrpcMethodName(GrpcMethod id) { inline const char* GrpcMethodName(GrpcMethod id) {
switch (id) { switch (id) {
...@@ -108,6 +110,8 @@ inline const char* GrpcMethodName(GrpcMethod id) { ...@@ -108,6 +110,8 @@ inline const char* GrpcMethodName(GrpcMethod id) {
return "/sendrecv.SendRecvService/CheckpointNotify"; return "/sendrecv.SendRecvService/CheckpointNotify";
case GrpcMethod::kRequestNotify: case GrpcMethod::kRequestNotify:
return "/sendrecv.SendRecvService/DistributeNotify"; return "/sendrecv.SendRecvService/DistributeNotify";
case GrpcMethod::kRequestSendAndRecv:
return "/sendrecv.SendRecvService/SendAndRecvVariable";
} }
// Shouldn't be reached. // Shouldn't be reached.
......
...@@ -14,20 +14,19 @@ ...@@ -14,20 +14,19 @@
#pragma once #pragma once
#include <ThreadPool.h>
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <functional> #include <functional>
#include <future> // NOLINT #include <future> // NOLINT
#include <memory> #include <memory>
#include <string> #include <string>
#include <thread> // NOLINT
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <thread> // NOLINT
#include <ThreadPool.h>
#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/rw_lock.h"
...@@ -89,26 +88,17 @@ class UniformInitializer : public Initializer { ...@@ -89,26 +88,17 @@ class UniformInitializer : public Initializer {
min_ = std::stof(attrs[2]); min_ = std::stof(attrs[2]);
max_ = std::stof(attrs[3]); max_ = std::stof(attrs[3]);
if (seed_ == 0) {
seed_ = std::random_device()();
}
random_engine_.seed(seed_);
dist_ = std::uniform_real_distribution<float>(min_, max_); dist_ = std::uniform_real_distribution<float>(min_, max_);
random_engine_ = framework::GetCPURandomEngine(seed_);
} }
float GetValue() override { float GetValue() override { return dist_(*random_engine_); }
return framework::Generator::GetInstance()->is_init_py
? dist_(framework::Generator::GetInstance()->GetCPUEngine())
: dist_(random_engine_);
// return dist_(random_engine_);
}
private: private:
float min_; float min_;
float max_; float max_;
std::minstd_rand random_engine_; std::shared_ptr<std::mt19937_64> random_engine_;
std::uniform_real_distribution<float> dist_; std::uniform_real_distribution<float> dist_;
}; };
...@@ -139,26 +129,18 @@ class GaussianInitializer : public Initializer { ...@@ -139,26 +129,18 @@ class GaussianInitializer : public Initializer {
mean_ = std::stof(attrs[2]); mean_ = std::stof(attrs[2]);
std_ = std::stof(attrs[3]); std_ = std::stof(attrs[3]);
if (seed_ == 0) { random_engine_ = framework::GetCPURandomEngine(seed_);
seed_ = std::random_device()();
}
random_engine_.seed(seed_);
dist_ = std::normal_distribution<float>(mean_, std_); dist_ = std::normal_distribution<float>(mean_, std_);
} }
float GetValue() override { float GetValue() override { return dist_(*random_engine_); }
return framework::Generator::GetInstance()->is_init_py
? dist_(framework::Generator::GetInstance()->GetCPUEngine())
: dist_(random_engine_);
// return dist_(random_engine_);
}
private: private:
float std_; float std_;
float mean_; float mean_;
std::minstd_rand random_engine_; std::shared_ptr<std::mt19937_64> random_engine_;
std::normal_distribution<float> dist_; std::normal_distribution<float> dist_;
}; };
......
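Both initializers now draw from the framework-level CPU engine instead of a locally seeded std::minstd_rand. A standalone sketch of the same pattern, assuming (as the diff suggests) that GetCPURandomEngine(0) hands back a globally seeded std::mt19937_64 while a non-zero seed yields deterministic draws; the helper below is a stand-in, not the Paddle implementation:

#include <cstdint>
#include <memory>
#include <random>

// Hypothetical stand-in for framework::GetCPURandomEngine: seed == 0 means
// "pick a random seed", any other value gives reproducible draws.
std::shared_ptr<std::mt19937_64> GetCPURandomEngineSketch(uint64_t seed) {
  if (seed == 0) seed = std::random_device()();
  return std::make_shared<std::mt19937_64>(seed);
}

struct UniformInitializerSketch {
  UniformInitializerSketch(float lo, float hi, uint64_t seed)
      : dist_(lo, hi), engine_(GetCPURandomEngineSketch(seed)) {}
  float GetValue() { return dist_(*engine_); }
  std::uniform_real_distribution<float> dist_;
  std::shared_ptr<std::mt19937_64> engine_;
};

int main() {
  UniformInitializerSketch init(0.f, 1.f, /*seed=*/42);
  float v = init.GetValue();  // deterministic for a fixed non-zero seed
  return (v >= 0.f && v < 1.f) ? 0 : 1;
}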
...@@ -46,6 +46,7 @@ constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; ...@@ -46,6 +46,7 @@ constexpr char kRequestCheckpoint[] = "RequestCheckpoint";
constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier"; constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier";
constexpr char kRequestNotify[] = "RequestNotify"; constexpr char kRequestNotify[] = "RequestNotify";
constexpr char kRequestSendAndRecv[] = "RequestSendAndRecv";
constexpr char kSendRPC[] = "SendRPC"; constexpr char kSendRPC[] = "SendRPC";
constexpr char kGetRPC[] = "GetRPC"; constexpr char kGetRPC[] = "GetRPC";
...@@ -57,6 +58,7 @@ constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC"; ...@@ -57,6 +58,7 @@ constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC";
constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC"; constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC";
constexpr char kSendCompleteRPC[] = "SendCompleteRPC"; constexpr char kSendCompleteRPC[] = "SendCompleteRPC";
constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC"; constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC";
constexpr char kSendAndRecvRPC[] = "SendAndRecvRPC";
constexpr int64_t kPrefetchTimeout = 60000; constexpr int64_t kPrefetchTimeout = 60000;
#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
......
...@@ -325,6 +325,22 @@ bool RequestNotifyHandler::Handle(const std::string &varname, ...@@ -325,6 +325,22 @@ bool RequestNotifyHandler::Handle(const std::string &varname,
return true; return true;
} }
bool RequestSendAndRecvHandler::Handle(const std::string &varname,
framework::Scope *Scope,
framework::Variable *var,
framework::Variable **outvar,
const int trainer_id,
const std::string &out_var_name,
const std::string &table_name) {
VLOG(3) << "SendAndRecvHandle: " << varname
<< " out_var_name: " << out_var_name
<< " , trainer_id: " << trainer_id;
executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), Scope);
*outvar = Scope->FindVar(out_var_name);
return true;
}
} // namespace distributed } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -176,6 +176,17 @@ class RequestNotifyHandler final : public RequestHandler { ...@@ -176,6 +176,17 @@ class RequestNotifyHandler final : public RequestHandler {
std::unordered_map<int, int64_t> decay_counters; std::unordered_map<int, int64_t> decay_counters;
}; };
class RequestSendAndRecvHandler final : public RequestHandler {
public:
explicit RequestSendAndRecvHandler(int distributed_mode)
: RequestHandler(distributed_mode) {}
virtual ~RequestSendAndRecvHandler() {}
bool Handle(const std::string& varname, framework::Scope* Scope,
framework::Variable* var, framework::Variable** outvar,
const int trainer_id, const std::string& out_var_name = "",
const std::string& table_name = "") override;
};
} // namespace distributed } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -85,6 +85,12 @@ class RPCClient { ...@@ -85,6 +85,12 @@ class RPCClient {
const framework::Scope& scope, const std::string& var_name, const framework::Scope& scope, const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) = 0; int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual VarHandlePtr AsyncSendAndRecv(
const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& send_var_name,
const std::string& recv_var_name, const std::string& table_name = "",
int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual VarHandlePtr AsyncSendComplete( virtual VarHandlePtr AsyncSendComplete(
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;
......
...@@ -35,27 +35,24 @@ namespace platform = paddle::platform; ...@@ -35,27 +35,24 @@ namespace platform = paddle::platform;
namespace distributed = paddle::operators::distributed; namespace distributed = paddle::operators::distributed;
USE_NO_KERNEL_OP(lookup_sparse_table_read); USE_NO_KERNEL_OP(lookup_sparse_table_read);
USE_OP(scale);
std::unique_ptr<distributed::RPCServer> g_rpc_service; std::unique_ptr<distributed::RPCServer> g_rpc_service;
std::unique_ptr<distributed::RequestHandler> g_req_handler; std::unique_ptr<distributed::RequestHandler> g_req_handler;
framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) {
auto root_block = program->MutableBlock(0); auto root_block = program->MutableBlock(0);
auto* block = program->AppendBlock(*root_block); auto* block = program->AppendBlock(*root_block);
framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}}); framework::OpDesc* op = block->AppendOp();
framework::VariableNameMap output({{"Output", {"out"}}}); op->SetType("scale");
auto op = block->AppendOp(); op->SetInput("X", {"x"});
op->SetType("lookup_sparse_table_read"); op->SetOutput("Out", {"res"});
op->SetInput("W", {"w"}); op->SetAttr("scale", 0.5f);
op->SetInput("Ids", {"ids"});
op->SetOutput("Out", {"out"}); auto& out = *root_block->Var("res");
op->SetAttr("tablename", {"w"});
op->SetAttr("value_names", {"Param"});
auto& out = *root_block->Var("out");
out.SetType(framework::proto::VarType::LOD_TENSOR); out.SetType(framework::proto::VarType::LOD_TENSOR);
out.SetShape({10, 10}); out.SetShape({1, 10});
return block; return block;
} }
...@@ -69,6 +66,12 @@ void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { ...@@ -69,6 +66,12 @@ void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
auto ids_var = scope->Var("ids"); auto ids_var = scope->Var("ids");
ids_var->GetMutable<framework::LoDTensor>(); ids_var->GetMutable<framework::LoDTensor>();
auto x_var = scope->Var("x");
x_var->GetMutable<framework::LoDTensor>();
auto res_var = scope->Var("res");
res_var->GetMutable<framework::LoDTensor>();
} }
void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
...@@ -78,6 +81,11 @@ void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, ...@@ -78,6 +81,11 @@ void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
int64_t* ids_ptr = int64_t* ids_ptr =
ids_var->mutable_data<int64_t>(framework::DDim({rows_numel, 1}), *place); ids_var->mutable_data<int64_t>(framework::DDim({rows_numel, 1}), *place);
for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2; for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2;
auto x_var = scope->Var("x")->GetMutable<framework::LoDTensor>();
float* x_ptr =
x_var->mutable_data<float>(framework::DDim({1, rows_numel}), *place);
for (int64_t i = 0; i < rows_numel; ++i) x_ptr[i] = 1.0;
} }
void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
...@@ -124,6 +132,38 @@ void StartServer(const std::string& rpc_name) { ...@@ -124,6 +132,38 @@ void StartServer(const std::string& rpc_name) {
server_thread.join(); server_thread.join();
} }
void StartSendAndRecvServer(const std::string& rpc_name) {
framework::ProgramDesc program;
framework::Scope scope;
platform::CPUPlace place;
framework::Executor exe(place);
platform::CPUDeviceContext ctx(place);
auto block = AppendSendAndRecvBlock(&program);
std::string in_var_name("x");
std::vector<int> prefetch_block_ids{block->ID()};
auto prepared = exe.Prepare(program, prefetch_block_ids);
InitTensorsOnServer(&scope, &place, 10);
std::unordered_map<std::string,
std::shared_ptr<framework::ExecutorPrepareContext>>
grad_to_prepared_ctx;
grad_to_prepared_ctx[in_var_name] = prepared[0];
g_req_handler->SetProgram(&program);
g_req_handler->SetGradToPreparedCtx(&grad_to_prepared_ctx);
g_req_handler->SetDevCtx(&ctx);
g_req_handler->SetScope(&scope);
g_req_handler->SetExecutor(&exe);
g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get());
g_req_handler->SetRPCServer(g_rpc_service.get());
std::thread server_thread(
std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get()));
server_thread.join();
}
TEST(COMPLETE, CPU) { TEST(COMPLETE, CPU) {
setenv("http_proxy", "", 1); setenv("http_proxy", "", 1);
setenv("https_proxy", "", 1); setenv("https_proxy", "", 1);
...@@ -147,3 +187,46 @@ TEST(COMPLETE, CPU) { ...@@ -147,3 +187,46 @@ TEST(COMPLETE, CPU) {
g_rpc_service.reset(nullptr); g_rpc_service.reset(nullptr);
g_req_handler.reset(nullptr); g_req_handler.reset(nullptr);
} }
TEST(SENDANDRECV, CPU) {
setenv("http_proxy", "", 1);
setenv("https_proxy", "", 1);
g_req_handler.reset(new distributed::RequestSendAndRecvHandler(
distributed::DistributedMode::kAsync));
g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
distributed::RPCClient* client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
PADDLE_ENFORCE_NE(client, nullptr,
platform::errors::InvalidArgument(
"Client Start Fail, Check Your Code & Env"));
std::thread server_thread(StartSendAndRecvServer,
distributed::kRequestSendAndRecv);
g_rpc_service->WaitServerReady();
int port = g_rpc_service->GetSelectedPort();
std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port);
framework::Scope scope;
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
// create var on local scope
int64_t rows_numel = 10;
InitTensorsOnClient(&scope, &place, rows_numel);
std::string in_var_name("x");
std::string out_var_name("res");
client->AsyncSendAndRecv(ep, ctx, scope, in_var_name, out_var_name);
client->Wait();
auto var = scope.Var(out_var_name);
auto value = var->GetMutable<framework::LoDTensor>();
auto ptr = value->mutable_data<float>(place);
for (int64_t i = 0; i < rows_numel; ++i) {
EXPECT_EQ(ptr[i], 0.5);
}
g_rpc_service->ShutDown();
server_thread.join();
LOG(INFO) << "begin reset";
g_rpc_service.reset(nullptr);
g_req_handler.reset(nullptr);
}
...@@ -29,7 +29,7 @@ service SendRecvService { ...@@ -29,7 +29,7 @@ service SendRecvService {
rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
rpc DistributeNotify(VariableMessage) returns (VoidMessage) {} rpc DistributeNotify(VariableMessage) returns (VoidMessage) {}
rpc SendAndRecvVariable(VariableMessage) returns (VariableMessage) {}
rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {} rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {}
rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {}
} }
......
...@@ -96,6 +96,13 @@ class VariableResponse { ...@@ -96,6 +96,13 @@ class VariableResponse {
return scope_->FindVar(meta_.varname()); return scope_->FindVar(meta_.varname());
} }
framework::Variable* GetRecvVar() {
if (create_scope_) {
return local_scope_->Var(meta_.out_varname());
}
return scope_->FindVar(meta_.out_varname());
}
int GetTrainerId() { return static_cast<int>(meta_.trainer_id()); } int GetTrainerId() { return static_cast<int>(meta_.trainer_id()); }
protected: protected:
......
...@@ -268,7 +268,6 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, ...@@ -268,7 +268,6 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
size_t num_blocks = program->Size(); size_t num_blocks = program->Size();
PADDLE_ENFORCE_GE(num_blocks, 2, PADDLE_ENFORCE_GE(num_blocks, 2,
"server program should have at least 2 blocks"); "server program should have at least 2 blocks");
std::vector<int> block_list; std::vector<int> block_list;
for (size_t blkid = 1; blkid < num_blocks; ++blkid) { for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
block_list.push_back(blkid); block_list.push_back(blkid);
...@@ -295,6 +294,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, ...@@ -295,6 +294,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
request_send_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx); request_send_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx); request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx); request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
request_send_and_recv_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
while (true) { while (true) {
if (rpc_service_->IsExit()) { if (rpc_service_->IsExit()) {
...@@ -394,6 +394,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -394,6 +394,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
new distributed::RequestGetNoBarrierHandler()); new distributed::RequestGetNoBarrierHandler());
request_notify_handler_.reset( request_notify_handler_.reset(
new distributed::RequestNotifyHandler(distributed_mode, fan_in)); new distributed::RequestNotifyHandler(distributed_mode, fan_in));
request_send_and_recv_handler_.reset(
new distributed::RequestSendAndRecvHandler(distributed_mode));
rpc_service_->RegisterRPC(distributed::kRequestSend, rpc_service_->RegisterRPC(distributed::kRequestSend,
request_send_handler_.get(), rpc_send_thread_num); request_send_handler_.get(), rpc_send_thread_num);
...@@ -408,6 +410,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -408,6 +410,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
request_get_no_barrier_handler_.get()); request_get_no_barrier_handler_.get());
rpc_service_->RegisterRPC(distributed::kRequestNotify, rpc_service_->RegisterRPC(distributed::kRequestNotify,
request_notify_handler_.get(), rpc_send_thread_num); request_notify_handler_.get(), rpc_send_thread_num);
rpc_service_->RegisterRPC(distributed::kRequestSendAndRecv,
request_send_and_recv_handler_.get(),
rpc_get_thread_num);
auto optimize_blocks = auto optimize_blocks =
Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks); Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
...@@ -416,6 +421,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -416,6 +421,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
"optimize blocks is less than 1. Optimize blocks " "optimize blocks is less than 1. Optimize blocks "
"should be 1 at least on the pserver side.")); "should be 1 at least on the pserver side."));
auto *program = optimize_blocks[0]->Program(); auto *program = optimize_blocks[0]->Program();
framework::Executor executor(dev_place); framework::Executor executor(dev_place);
std::shared_ptr<framework::ExecutorPrepareContext> ckpt_pre_context = nullptr; std::shared_ptr<framework::ExecutorPrepareContext> ckpt_pre_context = nullptr;
...@@ -488,6 +494,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -488,6 +494,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
f(request_checkpoint_handler_.get()); f(request_checkpoint_handler_.get());
f(request_get_no_barrier_handler_.get()); f(request_get_no_barrier_handler_.get());
f(request_notify_handler_.get()); f(request_notify_handler_.get());
f(request_send_and_recv_handler_.get());
// register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers // register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers
signal(SIGINT, SignalHandler::StopAndExit); signal(SIGINT, SignalHandler::StopAndExit);
......
...@@ -99,6 +99,8 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -99,6 +99,8 @@ class ListenAndServOp : public framework::OperatorBase {
mutable std::shared_ptr<distributed::RequestHandler> mutable std::shared_ptr<distributed::RequestHandler>
request_checkpoint_handler_; request_checkpoint_handler_;
mutable std::shared_ptr<distributed::RequestHandler> request_notify_handler_; mutable std::shared_ptr<distributed::RequestHandler> request_notify_handler_;
mutable std::shared_ptr<distributed::RequestHandler>
request_send_and_recv_handler_;
mutable std::shared_ptr<std::thread> server_thread_; mutable std::shared_ptr<std::thread> server_thread_;
mutable std::vector<std::string> sparse_vars_; mutable std::vector<std::string> sparse_vars_;
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future> // NOLINT
#include <ostream>
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/distributed/communicator.h"
#include "paddle/fluid/operators/distributed/communicator_common.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/parameter_send.h"
#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class SendAndRecvKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& scope = ctx.scope();
const auto& place = ctx.GetPlace();
auto send_var_name = ctx.Attr<std::string>("send_var_name");
auto recv_var_name = ctx.Attr<std::string>("recv_var_name");
auto epmap = ctx.Attr<std::string>("endpoint");
auto trainer_id = ctx.Attr<int>("trainer_id");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& context = *pool.Get(place);
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(trainer_id);
VLOG(3) << "SendAndRecvOp Send_var_name: " << send_var_name
<< " Recv_var_name: " << recv_var_name;
distributed::VarHandlePtr rets = rpc_client->AsyncSendAndRecv(
epmap, context, scope, send_var_name, recv_var_name);
rets->Wait();
}
};
class SendAndRecvOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
return framework::OpKernelType(data_type, platform::CPUPlace());
}
};
class SendAndRecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "Tensor Input variable to be sent").AsDuplicable();
    AddOutput("Out", "Tensor Output variable to be received").AsDuplicable();
AddAttr<std::string>("send_var_name", "Send Tensor's name")
.SetDefault(std::string(""));
AddAttr<std::string>("recv_var_name", "Recv Tensor's name")
.SetDefault(std::string(""));
AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
AddAttr<std::string>("endpoint", "Server endpoint")
.SetDefault({"127.0.0.1:6164"});
AddComment(R"DOC(
SendAndRecv operator
    This operator sends the input variables to the listen_and_serve op at the parameter server,
    and receives the result variable from the parameter server back into the sending variable's scope.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(send_and_recv, ops::SendAndRecvOp, ops::SendAndRecvOpMaker);
REGISTER_OP_CPU_KERNEL(
send_and_recv,
ops::SendAndRecvKernel<paddle::platform::CPUDeviceContext, float>)
...@@ -55,30 +55,22 @@ class CPUDropoutKernel : public framework::OpKernel<T> { ...@@ -55,30 +55,22 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
std::memset(mask_data, 0, size * sizeof(*mask_data)); // NOLINT std::memset(mask_data, 0, size * sizeof(*mask_data)); // NOLINT
return; return;
} }
// std::minstd_rand engine;
bool init_generator_py = framework::Generator::GetInstance()->is_init_py;
// NOTE: fixed seed should only be used in unittest or for debug. // NOTE: fixed seed should only be used in unittest or for debug.
// Guarantee to use random seed in training. // Guarantee to use random seed in training.
std::random_device rnd; int seed_data = 0;
std::minstd_rand engine;
int seed_data;
if (seed) { if (seed) {
seed_data = *(seed->data<int>()); seed_data = *(seed->data<int>());
} else { } else {
seed_data = seed_data =
context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd(); context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : 0;
} }
engine.seed(seed_data); auto engine = framework::GetCPURandomEngine(seed_data);
std::uniform_real_distribution<float> dist(0, 1); std::uniform_real_distribution<float> dist(0, 1);
for (size_t i = 0; i < size; ++i) { for (size_t i = 0; i < size; ++i) {
float cur_random = if (dist(*engine) < dropout_prob) {
init_generator_py
? dist(framework::Generator::GetInstance()->GetCPUEngine())
: dist(engine);
if (cur_random < dropout_prob) {
mask_data[i] = 0; mask_data[i] = 0;
y_data[i] = 0; y_data[i] = 0;
} else { } else {
......
...@@ -26,14 +26,34 @@ namespace operators { ...@@ -26,14 +26,34 @@ namespace operators {
template <typename T> template <typename T>
struct FloorDivFunctor { struct FloorDivFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { inline HOSTDEVICE T operator()(T a, T b) const {
return static_cast<T>(floor(a / b)); #ifdef __CUDA_ARCH__
if (b == 0) {
      printf("Error: Divide by zero encountered in floor_divide\n");
asm("trap;");
}
#else
if (b == 0)
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Divide by zero encountered in floor_divide"));
#endif
return static_cast<T>(std::trunc(a / b));
} }
}; };
template <typename T> template <typename T>
struct InverseFloorDivFunctor { struct InverseFloorDivFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { inline HOSTDEVICE T operator()(T a, T b) const {
return static_cast<T>(floor(b / a)); #ifdef __CUDA_ARCH__
if (a == 0) {
      printf("Error: Divide by zero encountered in floor_divide\n");
asm("trap;");
}
#else
if (a == 0)
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Divide by zero encountered in floor_divide"));
#endif
return static_cast<T>(std::trunc(b / a));
} }
}; };
......
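The reworked functor guards against division by zero and truncates the quotient. A host-only sketch of the same behavior for integers, assuming a plain C++ exception in place of PADDLE_THROW:

#include <cassert>
#include <cmath>
#include <stdexcept>

// Truncating division with an explicit divide-by-zero check, mirroring the
// CPU branch of the updated FloorDivFunctor.
template <typename T>
T FloorDivSketch(T a, T b) {
  if (b == 0) {
    throw std::invalid_argument("Divide by zero encountered in floor_divide");
  }
  return static_cast<T>(std::trunc(static_cast<double>(a) / b));
}

int main() {
  assert(FloorDivSketch(7, 2) == 3);
  assert(FloorDivSketch(-7, 2) == -3);  // trunc rounds toward zero
  bool threw = false;
  try { FloorDivSketch(1, 0); } catch (const std::invalid_argument&) { threw = true; }
  assert(threw);
  return 0;
}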
...@@ -24,13 +24,19 @@ namespace operators { ...@@ -24,13 +24,19 @@ namespace operators {
template <typename T> template <typename T>
struct ModFunctor { struct ModFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a % b; } inline HOSTDEVICE T operator()(T a, T b) const {
T res = a % b;
if ((res != 0) && ((res < 0) != (b < 0))) res += b;
return res;
}
}; };
template <typename T> template <typename T>
struct ModFunctorFP { struct ModFunctorFP {
inline HOSTDEVICE T operator()(T a, T b) const { inline HOSTDEVICE T operator()(T a, T b) const {
return fmod(b + fmod(a, b), b); T res = fmod(a, b);
if ((res != 0) && ((b < 0) != (res < 0))) res += b;
return res;
} }
}; };
......
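The updated functors make the remainder take the sign of the divisor (Python-style modulo) instead of the C/C++ sign-of-dividend behavior. A small standalone check of that rule:

#include <cassert>
#include <cmath>

// Integer modulo whose result sign follows the divisor, as in Python.
int ModSketch(int a, int b) {
  int res = a % b;
  if (res != 0 && (res < 0) != (b < 0)) res += b;
  return res;
}

// Floating-point counterpart built on std::fmod.
double FModSketch(double a, double b) {
  double res = std::fmod(a, b);
  if (res != 0 && (b < 0) != (res < 0)) res += b;
  return res;
}

int main() {
  assert(ModSketch(-7, 3) == 2);    // plain C++ % would give -1
  assert(ModSketch(7, -3) == -2);   // plain C++ % would give 1
  assert(FModSketch(-7.0, 3.0) == 2.0);
  return 0;
}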
...@@ -22,15 +22,20 @@ namespace operators { ...@@ -22,15 +22,20 @@ namespace operators {
template <typename T> template <typename T>
struct PowFunctor { struct PowFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { inline HOSTDEVICE T operator()(T a, T b) const {
#ifdef __CUDA_ARCH__ // TODO(wujionghao): A potential speed improvement is supporting different
// On CUDAPlace, std::pow(3, 1) calls pow(float, float), and // types in C++.
// it will return a float number like 2.99... , which floor to 2 // #ifdef __CUDA_ARCH__
// when cast to int by default and it is wrong. // // On CUDAPlace, std::pow(3, 1) calls pow(float, float), and
// Use llrint to cast it to the nearest integer, which is 3. // // it will return a float number like 2.99... , which floor to 2
// // when cast to int by default and it is wrong.
// // Use llrint to cast it to the nearest integer, which is 3.
// if (std::is_integral<T>::value) {
// return std::llrint(std::pow(a, b));
// }
// #endif
if (std::is_integral<T>::value) { if (std::is_integral<T>::value) {
return std::llrint(std::pow(a, b)); return std::llrint(std::pow(a, b));
} }
#endif
return std::pow(a, b); return std::pow(a, b);
} }
}; };
......
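The rounding workaround that used to be CUDA-only is generalized here: for integral types the result of std::pow is snapped to the nearest integer with llrint, because pow may return something like 2.999... for an exact integer power. A host-side illustration of the hazard and the fix, not the Paddle functor itself:

#include <cassert>
#include <cmath>
#include <type_traits>

// Pow that rounds to the nearest integer for integral T.
template <typename T>
T PowSketch(T a, T b) {
  if (std::is_integral<T>::value) {
    return static_cast<T>(std::llrint(std::pow(a, b)));
  }
  return static_cast<T>(std::pow(a, b));
}

int main() {
  // A plain cast would floor a slightly-too-small pow result (e.g. 2.999...)
  // down to 2; llrint keeps the exact integer answer.
  assert(PowSketch(3, 1) == 3);
  assert(PowSketch(2, 10) == 1024);
  assert(PowSketch(2.0, 0.5) > 1.41 && PowSketch(2.0, 0.5) < 1.42);
  return 0;
}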
...@@ -17,7 +17,7 @@ limitations under the License. */ ...@@ -17,7 +17,7 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -152,3 +152,7 @@ REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>, ...@@ -152,3 +152,7 @@ REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>,
ops::GatherGradientOpKernel<int>, ops::GatherGradientOpKernel<int>,
ops::GatherGradientOpKernel<uint8_t>, ops::GatherGradientOpKernel<uint8_t>,
ops::GatherGradientOpKernel<int64_t>); ops::GatherGradientOpKernel<int64_t>);
REGISTER_OP_VERSION(gather)
    .AddCheckpoint(R"ROC(Upgrade gather, add attribute [axis])ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"axis", "Specify the axis of gather operation.", {}));
...@@ -39,26 +39,14 @@ class CPUGaussianRandomKernel : public framework::OpKernel<T> { ...@@ -39,26 +39,14 @@ class CPUGaussianRandomKernel : public framework::OpKernel<T> {
tensor->Resize(shape); tensor->Resize(shape);
int64_t size = tensor->numel(); int64_t size = tensor->numel();
T* data = tensor->mutable_data<T>(context.GetPlace()); T* data = tensor->mutable_data<T>(context.GetPlace());
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
auto engine = framework::GetCPURandomEngine(seed);
if (framework::Generator::GetInstance()->is_init_py) { for (int64_t i = 0; i < size; ++i) {
std::mt19937_64& gen_engine = data[i] = dist(*engine);
framework::Generator::GetInstance()->GetCPUEngine();
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(gen_engine);
}
} else {
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
std::minstd_rand engine;
if (seed == 0) {
seed = std::random_device()();
}
engine.seed(seed);
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(engine);
}
} }
} }
}; }; // namespace operators
template <typename T> template <typename T>
class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> { class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
......
This diff has been collapsed.
This diff has been collapsed.
This diff has been collapsed.
...@@ -111,12 +111,11 @@ __global__ void KernelPool2DGrad( ...@@ -111,12 +111,11 @@ __global__ void KernelPool2DGrad(
int phstart, phend; int phstart, phend;
int pwstart, pwend; int pwstart, pwend;
if (adaptive) { if (adaptive) {
phstart = h_offset * output_height / input_height; phstart = AdaptStartIndex(h_offset, output_height, input_height);
phend = phend = AdaptEndIndex(h_offset, output_height, input_height);
min((h_offset + 1) * output_height / input_height + 1, output_height);
pwstart = w_offset * output_width / input_width; pwstart = AdaptStartIndex(w_offset, output_width, input_width);
pwend = pwend = AdaptEndIndex(w_offset, output_width, input_width);
min((w_offset + 1) * output_width / input_width + 1, output_width);
} else { } else {
phstart = (h_offset < ksize_height) phstart = (h_offset < ksize_height)
? 0 ? 0
...@@ -159,6 +158,7 @@ __global__ void KernelPool2DGrad( ...@@ -159,6 +158,7 @@ __global__ void KernelPool2DGrad(
pool_size = exclusive ? (hend - hstart) * (wend - wstart) pool_size = exclusive ? (hend - hstart) * (wend - wstart)
: ksize_height * ksize_width; : ksize_height * ksize_width;
} }
int output_sub_idx = channel_last int output_sub_idx = channel_last
? (ph * output_width + pw) * channels + offsetC ? (ph * output_width + pw) * channels + offsetC
: ph * output_width + pw; : ph * output_width + pw;
...@@ -689,15 +689,14 @@ __global__ void KernelPool3DGrad( ...@@ -689,15 +689,14 @@ __global__ void KernelPool3DGrad(
int phstart, phend; int phstart, phend;
int pwstart, pwend; int pwstart, pwend;
if (adaptive) { if (adaptive) {
pdstart = d_offset * output_depth / input_depth; pdstart = AdaptStartIndex(d_offset, output_depth, input_depth);
pdend = pdend = AdaptEndIndex(d_offset, output_depth, input_depth);
min((d_offset + 1) * output_depth / input_depth + 1, output_depth);
phstart = h_offset * output_height / input_height; phstart = AdaptStartIndex(h_offset, output_height, input_height);
phend = phend = AdaptEndIndex(h_offset, output_height, input_height);
min((h_offset + 1) * output_height / input_height + 1, output_height);
pwstart = w_offset * output_width / input_width; pwstart = AdaptStartIndex(w_offset, output_width, input_width);
pwend = pwend = AdaptEndIndex(w_offset, output_width, input_width);
min((w_offset + 1) * output_width / input_width + 1, output_width);
} else { } else {
pdstart = (d_offset < ksize_depth) pdstart = (d_offset < ksize_depth)
? 0 ? 0
......
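AdaptStartIndex / AdaptEndIndex replace the inline index arithmetic that decides, for one input position, which adaptive-pooling output cells cover it. The exact Paddle definitions are not shown in this diff; the sketch below only illustrates the floor/ceil split of the interval that such helpers typically compute:

#include <cassert>
#include <cmath>

// Sketch: first output index and one-past-last output index whose adaptive
// window can touch position `idx` when in_size elements map to out_size windows.
int AdaptStartIndexSketch(int idx, int out_size, int in_size) {
  return static_cast<int>(std::floor(static_cast<double>(idx) * out_size / in_size));
}
int AdaptEndIndexSketch(int idx, int out_size, int in_size) {
  return static_cast<int>(std::ceil(static_cast<double>(idx + 1) * out_size / in_size));
}

int main() {
  // Splitting 10 inputs into 4 adaptive windows: input 5 is covered by
  // output cells in the half-open range [2, 3).
  assert(AdaptStartIndexSketch(5, 4, 10) == 2);
  assert(AdaptEndIndexSketch(5, 4, 10) == 3);
  return 0;
}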
...@@ -13,11 +13,14 @@ See the License for the specific language governing permissions and ...@@ -13,11 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/sampler.h" #include "paddle/fluid/operators/math/sampler.h"
#include <glog/logging.h> #include <glog/logging.h>
#include <iostream> #include <iostream>
#include <queue> #include <queue>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/generator.h"
namespace paddle { namespace paddle {
...@@ -28,22 +31,17 @@ Sampler::~Sampler() {} ...@@ -28,22 +31,17 @@ Sampler::~Sampler() {}
UniformSampler::UniformSampler(int64_t range, unsigned int seed) UniformSampler::UniformSampler(int64_t range, unsigned int seed)
: Sampler(range, seed), inv_range_(1.0 / (range + 1)) { : Sampler(range, seed), inv_range_(1.0 / (range + 1)) {
random_engine_ = std::make_shared<std::mt19937_64>(seed_); random_engine_ = framework::GetCPURandomEngine(seed_);
dist_ = std::make_shared<std::uniform_int_distribution<>>(0, range); dist_ = std::make_shared<std::uniform_int_distribution<>>(0, range);
} }
int64_t UniformSampler::Sample() const { int64_t UniformSampler::Sample() const { return (*dist_)(*random_engine_); }
return framework::Generator::GetInstance()->is_init_py
? (*dist_)(framework::Generator::GetInstance()->GetCPUEngine())
: (*dist_)(*random_engine_);
// return (*dist_)(*random_engine_);
}
float UniformSampler::Probability(int64_t value) const { return inv_range_; } float UniformSampler::Probability(int64_t value) const { return inv_range_; }
LogUniformSampler::LogUniformSampler(int64_t range, unsigned int seed) LogUniformSampler::LogUniformSampler(int64_t range, unsigned int seed)
: Sampler(range, seed), log_range_(log(range + 1)) { : Sampler(range, seed), log_range_(log(range + 1)) {
random_engine_ = std::make_shared<std::mt19937_64>(seed_); random_engine_ = framework::GetCPURandomEngine(seed_);
dist_ = std::make_shared<std::uniform_real_distribution<>>(0, 1); dist_ = std::make_shared<std::uniform_real_distribution<>>(0, 1);
} }
...@@ -52,10 +50,7 @@ int64_t LogUniformSampler::Sample() const { ...@@ -52,10 +50,7 @@ int64_t LogUniformSampler::Sample() const {
// inverse_transform_sampling method // inverse_transform_sampling method
// More details: // More details:
// https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler/ // https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler/
auto cur_random = auto cur_random = (*dist_)(*random_engine_);
framework::Generator::GetInstance()->is_init_py
? (*dist_)(framework::Generator::GetInstance()->GetCPUEngine())
: (*dist_)(*random_engine_);
const int64_t value = static_cast<int64_t>(exp(cur_random * log_range_)) - 1; const int64_t value = static_cast<int64_t>(exp(cur_random * log_range_)) - 1;
// Mathematically, value should be <= range_, but might not be due to some // Mathematically, value should be <= range_, but might not be due to some
// floating point roundoff, so we mod by range_. // floating point roundoff, so we mod by range_.
...@@ -74,7 +69,7 @@ CustomSampler::CustomSampler(int64_t range, const float *probabilities, ...@@ -74,7 +69,7 @@ CustomSampler::CustomSampler(int64_t range, const float *probabilities,
const int *alias, const float *alias_probabilities, const int *alias, const float *alias_probabilities,
unsigned int seed) unsigned int seed)
: Sampler(range, seed) { : Sampler(range, seed) {
random_engine_ = std::make_shared<std::mt19937>(seed_); random_engine_ = framework::GetCPURandomEngine(seed_);
real_dist_ = std::make_shared<std::uniform_real_distribution<>>(0, 1); real_dist_ = std::make_shared<std::uniform_real_distribution<>>(0, 1);
int_dist_ = std::make_shared<std::uniform_int_distribution<>>(0, range); int_dist_ = std::make_shared<std::uniform_int_distribution<>>(0, range);
...@@ -84,14 +79,8 @@ CustomSampler::CustomSampler(int64_t range, const float *probabilities, ...@@ -84,14 +79,8 @@ CustomSampler::CustomSampler(int64_t range, const float *probabilities,
} }
int64_t CustomSampler::Sample() const { int64_t CustomSampler::Sample() const {
auto index = auto index = (*int_dist_)(*random_engine_);
framework::Generator::GetInstance()->is_init_py auto p = (*real_dist_)(*random_engine_);
? (*int_dist_)(framework::Generator::GetInstance()->GetCPUEngine())
: (*int_dist_)(*random_engine_);
auto p =
framework::Generator::GetInstance()->is_init_py
? (*real_dist_)(framework::Generator::GetInstance()->GetCPUEngine())
: (*real_dist_)(*random_engine_);
if (p > alias_probs_[index]) { if (p > alias_probs_[index]) {
int alias = alias_[index]; int alias = alias_[index];
......
...@@ -26,8 +26,8 @@ namespace math { ...@@ -26,8 +26,8 @@ namespace math {
// TODO(wanghaoshuang): Support for GPU // TODO(wanghaoshuang): Support for GPU
/** /**
* Sample integers from [0, range). * Sample integers from [0, range).
*/ */
class Sampler { class Sampler {
public: public:
explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) { explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) {
...@@ -117,7 +117,7 @@ class CustomSampler : public Sampler { ...@@ -117,7 +117,7 @@ class CustomSampler : public Sampler {
const int* alias_; const int* alias_;
const float* probs_; const float* probs_;
const int exceptional_val = -1; const int exceptional_val = -1;
std::shared_ptr<std::mt19937> random_engine_; std::shared_ptr<std::mt19937_64> random_engine_;
std::shared_ptr<std::uniform_real_distribution<>> real_dist_; std::shared_ptr<std::uniform_real_distribution<>> real_dist_;
std::shared_ptr<std::uniform_int_distribution<>> int_dist_; std::shared_ptr<std::uniform_int_distribution<>> int_dist_;
}; };
......
...@@ -72,7 +72,7 @@ static mkldnn::memory::data_type GetDstType(bool is_int8, ...@@ -72,7 +72,7 @@ static mkldnn::memory::data_type GetDstType(bool is_int8,
return dst_dt; return dst_dt;
} }
template <typename T> template <typename T, typename K, typename T_out>
class ConvMKLDNNHandlerT class ConvMKLDNNHandlerT
: public platform::MKLDNNHandlerT<T, mkldnn::convolution_forward> { : public platform::MKLDNNHandlerT<T, mkldnn::convolution_forward> {
public: public:
...@@ -227,7 +227,7 @@ class ConvMKLDNNHandlerT ...@@ -227,7 +227,7 @@ class ConvMKLDNNHandlerT
platform::MKLDNNMemDesc(weights_tz, platform::MKLDNNGetDataType<T>(), platform::MKLDNNMemDesc(weights_tz, platform::MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::any); MKLDNNMemoryFormat::any);
     const auto dst_md = platform::MKLDNNMemDesc(
-        dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+        dst_tz, platform::MKLDNNGetDataType<T_out>(), chosen_memory_format);
     const auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference
                                        : mkldnn::prop_kind::forward_training;
@@ -313,29 +313,29 @@ class ConvMKLDNNHandlerT
     if (is_test && weights_mem_p) {
       return weights_mem_p;
     } else {
-      const T* filter_data = filter->data<T>();
+      const K* filter_data = filter->data<K>();
       auto weights_tz = framework::vectorize(filter->dims());
       GetWeightsTz(weights_tz, groups);
       auto user_src_md = platform::MKLDNNMemDesc(
-          weights_tz, platform::MKLDNNGetDataType<T>(),
+          weights_tz, platform::MKLDNNGetDataType<K>(),
           GetWeightsFormat(filter->format(), groups, is_conv3d));
       return this->AcquireMemoryWithReorder(
           user_src_md, this->fwd_pd_->weights_desc(),
-          to_void_cast<T>(filter_data), "@weights_mem_p", is_test);
+          to_void_cast<K>(filter_data), "@weights_mem_p", is_test);
     }
   }
   std::shared_ptr<mkldnn::memory> AcquireBiasMemoryWithReorder(
       const framework::Tensor* bias, const bool is_test) {
-    const T* bias_data = bias->data<T>();
+    const K* bias_data = bias->data<K>();
     auto user_bias_md = platform::MKLDNNMemDesc(
-        framework::vectorize(bias->dims()), platform::MKLDNNGetDataType<T>(),
+        framework::vectorize(bias->dims()), platform::MKLDNNGetDataType<K>(),
         MKLDNNMemoryFormat::x);
     return this->AcquireMemoryWithReorder(
-        user_bias_md, this->fwd_pd_->bias_desc(), to_void_cast<T>(bias_data),
+        user_bias_md, this->fwd_pd_->bias_desc(), to_void_cast<K>(bias_data),
         "@bias_mem_p", is_test);
   }
@@ -358,14 +358,14 @@ class ConvMKLDNNHandlerT
     if (residual_param->format() !=
         platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc())) {
       auto residual_memory_p = this->AcquireResidualMemory(residual_param);
-      dst_memory_p = this->AcquireDstMemory(output);
+      dst_memory_p = this->template AcquireDstMemory<T_out>(output);
       this->AcquireReorder(residual_memory_p, dst_memory_p, "@residual_dst");
     } else {
       // Changing ShareDataWith to TensorCopy results in performance drop
       // on ResNet architectures
       // (https://github.com/PaddlePaddle/Paddle/issues/22964)
       output->ShareDataWith(*residual_param);
-      dst_memory_p = this->AcquireDstMemory(output);
+      dst_memory_p = this->template AcquireDstMemory<T_out>(output);
     }
     return dst_memory_p;
   }
@@ -381,7 +381,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     bool is_INT8 =
         std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value;
     if (!is_INT8) {
-      ComputeFP32(ctx);
+      ComputeFP32<float>(ctx);
     } else {
       std::string fuse_activation = ctx.Attr<std::string>("fuse_activation");
       bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
@@ -399,6 +399,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     }
   }
+  template <typename T_out>
   void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const {
     auto& dev_ctx =
         ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
@@ -414,7 +415,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
     auto* output = ctx.Output<Tensor>("Output");
-    ConvMKLDNNHandlerT<T> handler(
+    ConvMKLDNNHandlerT<T, K, T_out> handler(
         ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, filter, bias,
         output, ctx.InputName("Input") + ctx.InputName("Filter"));
@@ -429,7 +430,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       dst_memory_p =
           handler.AcquireDstMemoryWithResidual(output, residual_param);
     } else {
-      dst_memory_p = handler.AcquireDstMemory(output);
+      dst_memory_p = handler.template AcquireDstMemory<T_out>(output);
     }
     auto conv_p = handler.AcquireForwardPrimitive();
...
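The handler above is now parameterized on three element types: T for the source, K for the filter and bias, and T_out for the destination, so each MKL-DNN memory descriptor can carry its own data type (for example u8 activations with s8 weights and an fp32 destination). As a rough standalone sketch of the kind of type-to-data-type mapping a helper such as platform::MKLDNNGetDataType<T>() provides (the enum and trait below are illustrative stand-ins, not Paddle's real definitions):

// Illustrative sketch only: maps C++ element types to a data-type tag, in the
// spirit of how platform::MKLDNNGetDataType<T>() is used above. The names here
// are hypothetical, not Paddle's or MKL-DNN's actual definitions.
#include <cstdint>
#include <iostream>

enum class DataType { f32, s8, u8, undef };

template <typename T> constexpr DataType GetDataType() { return DataType::undef; }
template <> constexpr DataType GetDataType<float>() { return DataType::f32; }
template <> constexpr DataType GetDataType<int8_t>() { return DataType::s8; }
template <> constexpr DataType GetDataType<uint8_t>() { return DataType::u8; }

int main() {
  // With separate <T, K, T_out> parameters the source, weights and destination
  // descriptors can each pick their own data type.
  std::cout << static_cast<int>(GetDataType<float>()) << "\n";    // 0 (f32)
  std::cout << static_cast<int>(GetDataType<uint8_t>()) << "\n";  // 2 (u8)
}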
@@ -44,6 +44,7 @@ class FCPrimitiveFactory {
   void ExecuteFcPrimitive(const LoDTensor* input, const Tensor* weights,
                           const Tensor* bias, LoDTensor* output,
+                          const MKLDNNDeviceContext& dev_ctx,
                           const ExecutionContext& ctx) {
     RecomputeOutputDims(ctx, input, weights, output);
     // If primitive has already been created and cached, don't create new one,
@@ -74,8 +75,8 @@ class FCPrimitiveFactory {
               "input format is equal to ncw."));
     }
-    // Transform weights to default MKL-DNN format
-    weights_ = TransposeWeights(weights);
+    weights_ = CreateWeightsMemory(weights);
     // Since MKL-DNN has a lot of limitations on what the input/weights/output
     // dimensions should be, to simplify the code, the creation of primitive
     // descriptor has been divided into separate cases, based on the number
@@ -112,10 +113,13 @@ class FCPrimitiveFactory {
     // Quantize weights and reorder to format chosen by FC primitive descriptor.
     QuantizeWeights(ctx, fc_prim_desc->weights_desc());
-    bias_ = CreateMemory<float>(fc_prim_desc->bias_desc(), bias);
+    bias_ = CreateMemoryToBeCached<float>(fc_prim_desc->bias_desc(), bias);
     // If int8 is desired, quantize bias into 32-bit signed int
     QuantizeBias(*fc_prim_desc, ctx);
+    // Store weights and bias in the mkldnn cache
+    CacheWeightsAndBias(dev_ctx, ctx);
     // Based on format determined by inner_product, create output in desired
     // memory format
     output_ = CreateDstMemory(*fc_prim_desc, ctx, output);
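The new CacheWeightsAndBias call stores the reordered weights and bias in the device context's blob map, so later executions of the same FC layer can reuse them instead of reordering and quantizing again. A minimal sketch of such a string-keyed blob cache, assuming an interface in the spirit of MKLDNNDeviceContext::SetBlob/GetBlob (the BlobCache class below is hypothetical):

// Minimal sketch of a string-keyed blob cache; the class is a hypothetical
// stand-in for the SetBlob/GetBlob interface used above.
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>

class BlobCache {
 public:
  void SetBlob(const std::string& key, std::shared_ptr<void> blob) {
    blobs_[key] = std::move(blob);
  }
  std::shared_ptr<void> GetBlob(const std::string& key) const {
    auto it = blobs_.find(key);
    return it == blobs_.end() ? nullptr : it->second;
  }

 private:
  std::unordered_map<std::string, std::shared_ptr<void>> blobs_;
};

int main() {
  BlobCache cache;
  cache.SetBlob("fc_w", std::make_shared<int>(42));           // cache "weights"
  auto hit = std::static_pointer_cast<int>(cache.GetBlob("fc_w"));
  std::cout << (hit ? *hit : -1) << "\n";                     // prints 42
}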
@@ -262,14 +266,15 @@ class FCPrimitiveFactory {
   }
   // Convert data from one data format to another
-  mkldnn::memory Reorder(const memory::desc& src_desc,
-                         const memory::desc& dst_desc, void* src_data) {
+  std::shared_ptr<mkldnn::memory> Reorder(const memory::desc& src_desc,
+                                          const memory::desc& dst_desc,
+                                          void* src_data) {
     auto src_mem = memory(src_desc, engine_, src_data);
-    auto dst_mem = memory(dst_desc, engine_);
+    auto dst_mem = std::make_shared<memory>(dst_desc, engine_);
-    auto reorder = mkldnn::reorder(src_mem, dst_mem);
+    auto reorder = mkldnn::reorder(src_mem, *dst_mem);
     mkldnn::stream astream(engine_);
-    reorder.execute(astream, src_mem, dst_mem);
+    reorder.execute(astream, src_mem, *dst_mem);
     astream.wait();
     return dst_mem;
@@ -277,9 +282,10 @@ class FCPrimitiveFactory {
   // Convert data from one data format to another and rescale it.
   // If the desired data type is (un)signed int8, quantization occurs here.
-  mkldnn::memory Reorder(const memory& src_mem, const memory::desc& dst_md,
-                         const std::vector<float>& scale_data) {
-    mkldnn::memory dst_mem = mkldnn::memory(dst_md, engine_);
+  std::shared_ptr<mkldnn::memory> ReorderWithScale(
+      const std::shared_ptr<memory> src_mem, const memory::desc& dst_md,
+      const std::vector<float>& scale_data) {
+    auto dst_mem = std::make_shared<mkldnn::memory>(dst_md, engine_);
     mkldnn::primitive_attr attributes;
     // According to MKL-DNN's documentation mask determines along which
     // dimensions should the scale be applied.
@@ -289,11 +295,11 @@ class FCPrimitiveFactory {
     // becuase we perform per-output-channel quantization
     int mask = CreateMask(0, scale_data.size() > 1);
     attributes.set_output_scales(mask, scale_data);
-    auto reorder = mkldnn::reorder(src_mem, dst_mem, attributes);
+    auto reorder = mkldnn::reorder(*src_mem, *dst_mem, attributes);
     mkldnn::stream astream(engine_);
     reorder.execute(astream,
-                    {{MKLDNN_ARG_FROM, src_mem}, {MKLDNN_ARG_TO, dst_mem}});
+                    {{MKLDNN_ARG_FROM, *src_mem}, {MKLDNN_ARG_TO, *dst_mem}});
     astream.wait();
     return dst_mem;
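Returning std::shared_ptr<mkldnn::memory> from both reorder helpers is what makes their results storable in the cache shown earlier; the scaled variant additionally applies one scale per output channel whenever scale_data has more than one entry. Conceptually that scaled reorder is just a copy that multiplies each output channel by its own factor, as in this plain C++ stand-in (not the MKL-DNN primitive itself):

// Plain C++ illustration of a per-output-channel scaled copy, the conceptual
// effect of the output-scales reorder above; not MKL-DNN code.
#include <iostream>
#include <vector>

int main() {
  const int oc = 2, ic = 3;                     // 2 output channels, 3 inputs
  std::vector<float> src = {1, 2, 3, 4, 5, 6};  // row-major [oc][ic]
  std::vector<float> scales = {0.5f, 2.0f};     // one scale per output channel
  std::vector<float> dst(src.size());
  for (int o = 0; o < oc; ++o)
    for (int i = 0; i < ic; ++i)
      dst[o * ic + i] = src[o * ic + i] * scales[o];
  for (float v : dst) std::cout << v << " ";    // 0.5 1 1.5 8 10 12
  std::cout << "\n";
}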
@@ -323,16 +329,38 @@ class FCPrimitiveFactory {
     return memory(desc, engine_, data);
   }
-  // Transpose weights through MKL-DNN's reorder from io to oi format.
-  mkldnn::memory TransposeWeights(const Tensor* weights) {
+  template <typename T>
+  std::shared_ptr<mkldnn::memory> CreateMemoryToBeCached(
+      const mkldnn::memory::desc& desc, const Tensor* tensor) {
+    return CreateMemoryToBeCached(desc,
+                                  platform::to_void_cast<T>(tensor->data<T>()));
+  }
+
+  std::shared_ptr<mkldnn::memory> CreateMemoryToBeCached(
+      const mkldnn::memory::desc& desc, void* data) {
+    return std::make_shared<memory>(desc, engine_, data);
+  }
+
+  // Create weights memory and transform to default MKL-DNN format
+  std::shared_ptr<mkldnn::memory> CreateWeightsMemory(const Tensor* weights) {
     auto dims = framework::vectorize(weights->dims());
     std::swap(dims[0], dims[1]);  // Correct output dimensions
     auto src_desc = CreateMemDescriptor<float>(dims, MKLDNNMemoryFormat::io);
     auto dst_desc = CreateMemDescriptor<float>(dims, MKLDNNMemoryFormat::oi);
+    // Transpose weights through MKL-DNN's reorder from io to oi format.
     return Reorder(src_desc, dst_desc,
                    platform::to_void_cast<float>(weights->data<float>()));
   }
+
+  void CacheWeightsAndBias(const MKLDNNDeviceContext& dev_ctx,
+                           const ExecutionContext& ctx) {
+    const std::string key = platform::CreateKey(platform::ThreadIDasStr());
+    const std::string weights_key = key + ctx.InputName("W");
+    const std::string bias_key = key + ctx.InputName("Bias");
+    dev_ctx.SetBlob(weights_key, weights_);
+    dev_ctx.SetBlob(bias_key, bias_);
+  }
+
   // Compute the bias scales so that its values correspond to the
   // scale of data being an output of weights and input multiplication
   std::vector<float> ComputeBiasScales(const ExecutionContext& ctx) {
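CacheWeightsAndBias builds its cache keys from a thread-id prefix plus the input names, so each thread keeps its own cached copies of the reordered weights and bias. A small sketch of that key construction, assuming platform::CreateKey(platform::ThreadIDasStr()) behaves roughly like a thread-id string (the helper below is a stand-in, not Paddle's implementation):

// Sketch of thread-scoped cache-key construction in the spirit of
// CacheWeightsAndBias above; ThreadIdAsStr is a hypothetical stand-in.
#include <iostream>
#include <sstream>
#include <string>
#include <thread>

std::string ThreadIdAsStr() {
  std::ostringstream os;
  os << std::this_thread::get_id();  // unique per thread
  return os.str();
}

int main() {
  const std::string key = ThreadIdAsStr();
  const std::string weights_key = key + "W";     // e.g. "140512345678W"
  const std::string bias_key = key + "Bias";
  std::cout << weights_key << "\n" << bias_key << "\n";
}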
@@ -388,14 +416,14 @@ class FCPrimitiveFactory {
   }
   void QuantizeWeights(const ExecutionContext& ctx, memory::desc dst) {
-    weights_ =
-        Reorder(*weights_, dst, ctx.Attr<std::vector<float>>("Scale_weights"));
+    weights_ = ReorderWithScale(weights_, dst,
+                                ctx.Attr<std::vector<float>>("Scale_weights"));
   }
   void QuantizeBias(const inner_product_forward::primitive_desc& fc_prim_desc,
                     const ExecutionContext& ctx) {
     auto bias_scales = ComputeBiasScales(ctx);
-    bias_ = Reorder(*bias_, fc_prim_desc.bias_desc(), bias_scales);
+    bias_ = ReorderWithScale(bias_, fc_prim_desc.bias_desc(), bias_scales);
   }
   // Fuse relu into FC with activation type attribute has been set to 'relu'
@@ -463,10 +491,10 @@ class FCPrimitiveFactory {
  private:
   const mkldnn::engine& engine_;
-  boost::optional<memory> bias_;
   boost::optional<memory> input_;
   boost::optional<memory> output_;
-  boost::optional<memory> weights_;
+  std::shared_ptr<memory> bias_;
+  std::shared_ptr<memory> weights_;
   boost::optional<inner_product_forward> fc_;
 };
@@ -476,19 +504,13 @@ class FCPrimitiveFactory {
 template <typename T_in, typename T_w, typename T_out>
 static std::shared_ptr<FCPrimitiveFactory<T_in, T_w, T_out>>
 GetPrimitiveFactory(const MKLDNNDeviceContext& dev_ctx,
-                    const ExecutionContext& ctx, const Tensor* input,
-                    const Tensor* weights,
-                    const mkldnn::engine& mkldnn_engine) {
-  const std::string key = platform::CreateKey(
-      platform::ThreadIDasStr(), input->format(), input->dims()[0],
-      framework::vectorize<int>(weights->dims()), ctx.OutputName("Out"));
+                    const std::string& key) {
   auto prim_creator =
       std::static_pointer_cast<FCPrimitiveFactory<T_in, T_w, T_out>>(
          dev_ctx.GetBlob(key));
   if (prim_creator == nullptr) {
-    prim_creator =
-        std::make_shared<FCPrimitiveFactory<T_in, T_w, T_out>>(mkldnn_engine);
+    prim_creator = std::make_shared<FCPrimitiveFactory<T_in, T_w, T_out>>(
+        dev_ctx.GetEngine());
     dev_ctx.SetBlob(key, prim_creator);
   }
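QuantizeBias rescales the fp32 bias using the scales computed in ComputeBiasScales. A common convention, and a plausible reading of the code above, is bias_scale[i] = scale_in * scale_weights[i]; the sketch below assumes that formula, which may differ in detail from Paddle's actual implementation:

// Hedged sketch of per-channel bias scale derivation for an int8 FC, under the
// assumption bias_scale[i] = scale_in * scale_weights[i]; not Paddle's code.
#include <iostream>
#include <vector>

std::vector<float> ComputeBiasScales(float scale_in,
                                     const std::vector<float>& scale_weights) {
  std::vector<float> bias_scales(scale_weights.size());
  for (size_t i = 0; i < scale_weights.size(); ++i)
    bias_scales[i] = scale_in * scale_weights[i];
  return bias_scales;
}

int main() {
  auto scales = ComputeBiasScales(127.0f, {0.5f, 1.0f, 2.0f});
  for (float s : scales) std::cout << s << " ";  // 63.5 127 254
  std::cout << "\n";
}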
@@ -498,24 +520,24 @@ GetPrimitiveFactory(const MKLDNNDeviceContext& dev_ctx,
 // Choose appropriate primitive factory implementation based on inferred
 // output type (uint8, int8 or float).
 template <typename T_in, typename T_w>
-static void ExecuteFc(const MKLDNNDeviceContext& dev_ctx,
-                      const ExecutionContext& ctx, const LoDTensor* input,
+static void ExecuteFc(const ExecutionContext& ctx, const LoDTensor* input,
                       const Tensor* w, const Tensor* bias, LoDTensor* output,
-                      const mkldnn::engine& mkldnn_engine, bool fuse_relu,
-                      bool force_fp32_output) {
+                      bool fuse_relu, bool force_fp32_output) {
+  auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+  const std::string prim_key = platform::CreateKey(
+      platform::ThreadIDasStr(), input->format(), input->dims()[0],
+      framework::vectorize<int>(w->dims()), ctx.OutputName("Out"));
   constexpr bool is_int8 =
       std::is_same<T_in, int8_t>::value || std::is_same<T_in, uint8_t>::value;
   if (!is_int8 || force_fp32_output) {
-    GetPrimitiveFactory<T_in, T_w, float>(dev_ctx, ctx, input, w, mkldnn_engine)
-        ->ExecuteFcPrimitive(input, w, bias, output, ctx);
+    GetPrimitiveFactory<T_in, T_w, float>(dev_ctx, prim_key)
+        ->ExecuteFcPrimitive(input, w, bias, output, dev_ctx, ctx);
   } else if (fuse_relu) {
-    GetPrimitiveFactory<T_in, T_w, uint8_t>(dev_ctx, ctx, input, w,
-                                            mkldnn_engine)
-        ->ExecuteFcPrimitive(input, w, bias, output, ctx);
+    GetPrimitiveFactory<T_in, T_w, uint8_t>(dev_ctx, prim_key)
+        ->ExecuteFcPrimitive(input, w, bias, output, dev_ctx, ctx);
   } else {
-    GetPrimitiveFactory<T_in, T_w, int8_t>(dev_ctx, ctx, input, w,
-                                           mkldnn_engine)
-        ->ExecuteFcPrimitive(input, w, bias, output, ctx);
+    GetPrimitiveFactory<T_in, T_w, int8_t>(dev_ctx, prim_key)
+        ->ExecuteFcPrimitive(input, w, bias, output, dev_ctx, ctx);
   }
 }
@@ -526,9 +548,6 @@ class FCMKLDNNOpKernel : public framework::OpKernel<T_in> {
     PADDLE_ENFORCE_EQ(
         platform::is_cpu_place(ctx.GetPlace()), true,
         platform::errors::PreconditionNotMet("FC MKL-DNN must use CPUPlace."));
-    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-    const auto& mkldnn_engine = dev_ctx.GetEngine();
     auto input = ctx.Input<LoDTensor>("Input");
     auto w = ctx.Input<Tensor>("W");
     auto bias = ctx.Input<Tensor>("Bias");
@@ -537,8 +556,8 @@ class FCMKLDNNOpKernel : public framework::OpKernel<T_in> {
     bool fuse_relu = ctx.Attr<std::string>("activation_type") == "relu";
     bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
-    ExecuteFc<T_in, T_w>(dev_ctx, ctx, input, w, bias, output, mkldnn_engine,
-                         fuse_relu, force_fp32_output);
+    ExecuteFc<T_in, T_w>(ctx, input, w, bias, output, fuse_relu,
+                         force_fp32_output);
     output->set_layout(DataLayout::kMKLDNN);
   }
...
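ExecuteFc picks the factory's output type from the input type and the fuse_relu / force_fp32_output attributes: fp32 unless the input is int8/uint8 and fp32 output is not forced, then uint8 when relu is fused, int8 otherwise. A standalone illustration of that selection rule:

// Standalone illustration of the output-type selection rule used by ExecuteFc
// above; prints the type name instead of instantiating a primitive factory.
#include <cstdint>
#include <iostream>
#include <type_traits>

template <typename T_in>
const char* ChooseOutputType(bool fuse_relu, bool force_fp32_output) {
  constexpr bool is_int8 =
      std::is_same<T_in, int8_t>::value || std::is_same<T_in, uint8_t>::value;
  if (!is_int8 || force_fp32_output) return "float";
  if (fuse_relu) return "uint8";
  return "int8";
}

int main() {
  std::cout << ChooseOutputType<float>(false, false) << "\n";   // float
  std::cout << ChooseOutputType<uint8_t>(true, false) << "\n";  // uint8
  std::cout << ChooseOutputType<int8_t>(false, false) << "\n";  // int8
}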
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <string>
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/operators/fill_constant_op.h"
 #include "paddle/fluid/operators/mean_op.h"
@@ -35,23 +36,11 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
     T* data = tensor->mutable_data<T>(context.GetPlace());
     int64_t size = tensor->numel();
     std::normal_distribution<T> dist(mean, std);
+    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+    auto engine = framework::GetCPURandomEngine(seed);
-    if (framework::Generator::GetInstance()->is_init_py) {
-      std::mt19937_64& gen_engine =
-          framework::Generator::GetInstance()->GetCPUEngine();
-      for (int64_t i = 0; i < size; ++i) {
-        data[i] = dist(gen_engine);
-      }
-    } else {
-      unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-      std::minstd_rand engine;
-      if (seed == 0) {
-        seed = std::random_device()();
-      }
-      engine.seed(seed);
-      for (int64_t i = 0; i < size; ++i) {
-        data[i] = dist(engine);
-      }
+    for (int64_t i = 0; i < size; ++i) {
+      data[i] = dist(*engine);
     }
     tensor->set_layout(DataLayout::kMKLDNN);
...
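The replaced branch seeded a local engine on every call, falling back to std::random_device when the seed attribute was 0, while the new code obtains a shared engine from framework::GetCPURandomEngine(seed). For reference, the old-style fallback seeding looks roughly like this standalone sketch:

// Standalone illustration of the seeding behaviour being replaced: honour a
// fixed "seed" attribute when it is non-zero, otherwise seed randomly.
#include <iostream>
#include <random>

int main() {
  unsigned int seed = 0;                        // 0 means "not fixed by the user"
  if (seed == 0) seed = std::random_device()();
  std::mt19937_64 engine(seed);
  std::normal_distribution<float> dist(0.0f, 1.0f);
  for (int i = 0; i < 4; ++i) std::cout << dist(engine) << " ";
  std::cout << "\n";
}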
@@ -24,49 +24,69 @@ class AdadeltaOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("AvgSquaredGrad"),
-                   "Input(AvgSquaredGrad) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"),
-                   "Input(AvgSquaredUpdate) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(Param) of AdadeltaOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(Grad) of AdadeltaOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("AvgSquaredGrad"), true,
+        platform::errors::InvalidArgument(
+            "Input(AvgSquaredGrad) of AdadeltaOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("AvgSquaredUpdate"), true,
+        platform::errors::InvalidArgument(
+            "Input(AvgSquaredUpdate) of AdadeltaOp should not be null."));
+    PADDLE_ENFORCE_EQ(
         ctx->GetInputsVarType("Param").front() ==
             framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
-    PADDLE_ENFORCE(
+        true,
+        platform::errors::InvalidArgument(
+            "The input var's type should be LoDTensor, but the received is %s",
+            ctx->Inputs("Param").front(),
+            ctx->GetInputsVarType("Param").front()));
+    PADDLE_ENFORCE_EQ(
         ctx->GetInputsVarType("Grad").front() ==
             framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
+        true,
+        platform::errors::InvalidArgument(
+            "The input var's type should be LoDTensor, but the received is %s",
+            ctx->Inputs("Grad").front(),
+            ctx->GetInputsVarType("Grad").front()));
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("AvgSquaredGradOut"),
-        "Output(AvgSquaredGradOut) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("AvgSquaredUpdateOut"),
-        "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null.");
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("ParamOut"), true,
+        platform::errors::InvalidArgument(
+            "Output(ParamOut) of AdadeltaOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("AvgSquaredGradOut"), true,
+        platform::errors::InvalidArgument(
+            "Output(AvgSquaredGradOut) of AdadeltaOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("AvgSquaredUpdateOut"), true,
+        platform::errors::InvalidArgument(
+            "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null."));
     auto param_dim = ctx->GetInputDim("Param");
     PADDLE_ENFORCE_EQ(
         param_dim, ctx->GetInputDim("Grad"),
         "param and grad input of AdadeltaOp should have same dimension");
-    PADDLE_ENFORCE_NE(framework::product(ctx->GetInputDim("AvgSquaredGrad")), 0,
-                      "Maybe the Input variable AvgSquaredGrad has not "
-                      "been initialized. You may need to confirm if you put "
-                      "exe.run(startup_program) after optimizer.minimize "
-                      "function.");
+    PADDLE_ENFORCE_NE(
+        framework::product(ctx->GetInputDim("AvgSquaredGrad")), 0,
+        platform::errors::InvalidArgument(
+            "Maybe the Input variable AvgSquaredGrad has not "
+            "been initialized. You may need to confirm if you put "
+            "exe.run(startup_program) after optimizer.minimize "
+            "function."));
     PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"),
-                      "Param and AvgSquaredGrad input of AdadeltaOp "
-                      "should have same dimension");
+                      platform::errors::InvalidArgument(
+                          "Param and AvgSquaredGrad input of AdadeltaOp "
+                          "should have same dimension"));
     PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"),
-                      "Param and AvgSquaredUpdate input of AdadeltaOp "
-                      "should have same dimension");
+                      platform::errors::InvalidArgument(
+                          "Param and AvgSquaredUpdate input of AdadeltaOp "
+                          "should have same dimension"));
     ctx->SetOutputDim("ParamOut", param_dim);
     ctx->SetOutputDim("AvgSquaredGradOut", param_dim);
...
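For reference, the shape checks above exist because every tensor named in them participates element-wise in the standard Adadelta update, with g as Grad, E[g^2] as AvgSquaredGrad, E[\Delta x^2] as AvgSquaredUpdate and x as Param, so all of them must share the parameter's shape:

E[g^2]_t = \rho \, E[g^2]_{t-1} + (1 - \rho) \, g_t^2
\Delta x_t = -\frac{\sqrt{E[\Delta x^2]_{t-1} + \epsilon}}{\sqrt{E[g^2]_t + \epsilon}} \, g_t
E[\Delta x^2]_t = \rho \, E[\Delta x^2]_{t-1} + (1 - \rho) \, \Delta x_t^2
x_{t+1} = x_t + \Delta x_t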
[Diffs for the remaining files in this commit are collapsed in this view; two of those files also had their mode changed from 100644 to 100755.]