diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc index b8ccd8e744dab1b2dcb31551893aa0df0180fbbc..f86b4b706b3e246629ec944e06857b88d3cfaad8 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc @@ -414,6 +414,16 @@ std::future BrpcPsClient::load(uint32_t table_id, return send_cmd(table_id, PS_LOAD_ONE_TABLE, {epoch, mode}); } +std::future BrpcPsClient::Load(const LoadSaveContext &load_context) { + if (load_context.table_id < 0) { + return send_cmd(-1, PS_LOAD_ALL_TABLE, + {load_context.epoch, load_context.mode}); + } else { + return send_cmd(load_context.table_id, PS_LOAD_ONE_TABLE, + {load_context.epoch, load_context.mode}); + } +} + std::future BrpcPsClient::save(const std::string &epoch, const std::string &mode) { VLOG(1) << "BrpcPsClient::save path " << epoch; @@ -427,6 +437,19 @@ std::future BrpcPsClient::save(uint32_t table_id, return send_save_cmd(table_id, PS_SAVE_ONE_TABLE, {epoch, mode}); } +std::future BrpcPsClient::Save(const LoadSaveContext &save_context) { + if (save_context.table_id < 0) { + VLOG(1) << "BrpcPsClient::save path " << save_context.epoch; + return send_save_cmd(-1, PS_SAVE_ALL_TABLE, + {save_context.epoch, save_context.mode}); + } else { + VLOG(1) << "BrpcPsClient::save one table path " << save_context.epoch + << " table_id " << save_context.table_id; + return send_save_cmd(save_context.table_id, PS_SAVE_ONE_TABLE, + {save_context.epoch, save_context.mode}); + } +} + std::future BrpcPsClient::clear() { return send_cmd(-1, PS_CLEAR_ALL_TABLE, {}); } @@ -505,6 +528,44 @@ std::future BrpcPsClient::barrier(size_t table_id, return send_cmd(table_id, PS_BARRIER, {std::to_string(barrier_type)}); } +std::future BrpcPsClient::Pull(RequestContext &pull_context) { + if (pull_context.value_type == Dense) { // pull dense + Region *dense_region = + reinterpret_cast(pull_context.dense_values); + pull_dense(dense_region, pull_context.num, pull_context.table); + } else { // pull sparse + uint64_t *keys = reinterpret_cast(pull_context.keys); + float **select_values = + reinterpret_cast(pull_context.sparse_values); + size_t table_id = pull_context.table; + size_t num = pull_context.num; + bool is_training = pull_context.is_training; + if (pull_context.training_mode == Geo) { // for geo + pull_sparse_param(select_values, table_id, keys, num, is_training); + } else if (pull_context.training_mode == Async) { // for async + pull_sparse(select_values, table_id, keys, num, is_training); + } + } +} + +std::future BrpcPsClient::Push(RequestContext &push_context) { + if (push_context.value_type == Dense) { // push dense + const Region *dense_region = push_context.push_context.push_dense_values; + push_dense(dense_region, push_context.num, push_context.table); + } else { // push sparse + size_t table_id = push_context.table; + size_t num = push_context.num; + bool is_training = push_context.is_training; + if (push_context.training_mode == Geo) { // for geo + // TODO(zhaocaibei) + } else if (push_context.training_mode == Async) { // for async + const uint64_t *keys = push_context.push_context.keys; + const float **update_values = push_context.push_context.push_values; + push_sparse(table_id, keys, update_values, num); + } + } +} + std::future BrpcPsClient::pull_geo_param(size_t table_id, std::vector *values, std::vector *keys, diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.h b/paddle/fluid/distributed/ps/service/brpc_ps_client.h 
index 59ed59933db868ae4c05b69529a2c12fd0f689e2..8b0cb0741b4004fbad444a9919ec540289067f55 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.h +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.h @@ -163,12 +163,17 @@ class BrpcPsClient : public PSClient { std::future load(uint32_t table_id, const std::string &epoch, const std::string &mode) override; + std::future Load(const LoadSaveContext &load_context) override; + std::future save(const std::string &epoch, const std::string &mode) override; std::future save(uint32_t table_id, const std::string &epoch, const std::string &mode) override; + virtual std::future Save( + const LoadSaveContext &save_context) override; + std::future clear() override; std::future clear(uint32_t table_id) override; @@ -199,6 +204,10 @@ class BrpcPsClient : public PSClient { const uint64_t *keys, size_t num, bool is_training); + virtual std::future Pull(RequestContext &pull_context) override; + + virtual std::future Push(RequestContext &push_context) override; + virtual std::future print_table_stat(uint32_t table_id); virtual std::future barrier(size_t table_id, uint32_t barrier_type); diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_server.h b/paddle/fluid/distributed/ps/service/brpc_ps_server.h index 4310c247438ceb9bff541fdd21e00ff70ff7b4fd..d81a3a5df07f1de534cd646138fecc4dc2c970e1 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_server.h +++ b/paddle/fluid/distributed/ps/service/brpc_ps_server.h @@ -51,7 +51,7 @@ class BrpcPsServer : public PSServer { _server.Join(); return 0; } - virtual int32_t port(); + int32_t port(); private: virtual int32_t initialize(); diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.h b/paddle/fluid/distributed/ps/service/graph_brpc_server.h index aee0190850753786ce0f083257458caf50a63d26..a978d97b296b0a529a121fcfb9723639421d1e5e 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_server.h +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.h @@ -43,7 +43,7 @@ class GraphBrpcServer : public PSServer { _server.Join(); return 0; } - virtual int32_t port(); + int32_t port(); std::condition_variable *export_cv() { return &cv_; } diff --git a/paddle/fluid/distributed/ps/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h index 21719fbdbf1d64ad26ae0053b73812440ed08b66..8a2bfbe31602be299366fdcbeb264e45a5c4f703 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.h +++ b/paddle/fluid/distributed/ps/service/ps_client.h @@ -26,6 +26,7 @@ #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/distributed/ps/table/table.h" #include "paddle/fluid/platform/timer.h" namespace paddle { @@ -59,6 +60,41 @@ class PSClientClosure : public google::protobuf::Closure { std::vector>> _promises; }; +struct LoadSaveContext { + int table_id; + std::string epoch; + std::string mode; +}; + +enum TrainingMode { Async = 0, Sync = 1, Geo = 3 }; + +enum TrainingPhase { Init = 0, Train = 1, Save = 2 }; + +// enum ValueType { +// Sparse = 0, +// Dense = 1 +// }; + +struct PushContext { + const uint64_t *keys; + const float **push_values; + const Region *push_dense_values; +}; + +struct RequestContext { + int table; + TrainingMode training_mode; // 1 for async, 2 for geo, 3 for sync + TrainingPhase training_phase; // 1 for init, 2 for train + ValueType value_type; // 1 for sparse, 2 for dense + void *keys; + void **sparse_values; // 
for sparse values + Region *dense_values; // for dense values + PushContext push_context; + size_t num; + bool is_training; + void *callback; +}; + class PSClient { public: PSClient() {} @@ -86,6 +122,9 @@ class PSClient { // 指定table数据load virtual std::future load(uint32_t table_id, const std::string &epoch, const std::string &mode) = 0; + // context配置load选项 + virtual std::future Load(const LoadSaveContext &load_context) = 0; + // 全量table数据save value_accessor根据mode,可能有不同的save条件 virtual std::future save(const std::string &epoch, const std::string &mode) = 0; @@ -93,6 +132,8 @@ class PSClient { virtual std::future save(uint32_t table_id, const std::string &epoch, const std::string &mode) = 0; + virtual std::future Save(const LoadSaveContext &save_context) = 0; + // 清空table数据 virtual std::future clear() = 0; virtual std::future clear(uint32_t table_id) = 0; @@ -107,6 +148,8 @@ class PSClient { virtual std::future pull_dense(Region *regions, size_t region_num, size_t table_id) = 0; // 保留 + virtual std::future Push(RequestContext &push_context) = 0; + // firstly push dense param for parameter server // this is neccessary because dense weight initialized in trainer on cold // start @@ -117,6 +160,9 @@ class PSClient { virtual std::future push_dense(const Region *regions, size_t region_num, size_t table_id) = 0; + + virtual std::future Pull(RequestContext &pull_context) = 0; + // 使用keys进行pull请求,结果填充values // keys和values的个数均为num个,每个value占用select_size空间 // future结束前keys和values缓冲区不能再次使用 diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.cc b/paddle/fluid/distributed/ps/service/ps_local_client.cc index 972cce135f189bee6dbba9e0b89baa288816827b..9e364b6d3ed7aabe3cd3bc944e697e11ac808a33 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_local_client.cc @@ -56,6 +56,19 @@ int32_t PsLocalClient::initialize() { return done(); } +std::future PsLocalClient::Load(const LoadSaveContext& load_context) { + if (load_context.table_id < 0) { + for (auto& it : _table_map) { + load(it.first, load_context.epoch, load_context.mode); + } + return done(); + } else { + auto* table_ptr = table(load_context.table_id); + table_ptr->load(load_context.epoch, load_context.mode); + return done(); + } +} + ::std::future PsLocalClient::save(const std::string& epoch, const std::string& mode) { // TODO @@ -74,6 +87,21 @@ int32_t PsLocalClient::initialize() { return done(); } +::std::future PsLocalClient::Save( + const LoadSaveContext& save_context) { + if (save_context.table_id < 0) { + for (auto& it : _table_map) { + save(it.first, save_context.epoch, save_context.mode); + } + return done(); + } else { + auto* table_ptr = table(save_context.table_id); + table_ptr->flush(); + table_ptr->save(save_context.epoch, save_context.mode); + return done(); + } +} + ::std::future PsLocalClient::clear() { // TODO return done(); @@ -93,6 +121,51 @@ int32_t PsLocalClient::initialize() { return done(); } +::std::future PsLocalClient::Pull(RequestContext& pull_context) { + if (pull_context.value_type == Dense) { // pull dense + Region* dense_region = reinterpret_cast(pull_context.dense_values); + pull_dense(dense_region, pull_context.num, pull_context.table); + } else { // pull sparse + uint64_t* keys = reinterpret_cast(pull_context.keys); + char** select_values = reinterpret_cast(pull_context.sparse_values); + size_t table_id = pull_context.table; + size_t num = pull_context.num; + pull_sparse_ptr(select_values, table_id, keys, num); + } +} + +::std::future 
PsLocalClient::Push(RequestContext& push_context) { + if (push_context.value_type == Dense) { // push dense + if (push_context.training_phase == Init) { + const Region* regions = push_context.push_context.push_dense_values; + size_t region_num = push_context.num; + push_dense_param(regions, region_num, push_context.table); + } else { + if (push_context.training_mode == Geo) { // geo + float* total_send_data = + reinterpret_cast(push_context.dense_values); + size_t total_send_data_size = push_context.num; + push_dense_raw_gradient(push_context.table, total_send_data, + total_send_data_size, push_context.callback); + } else { // async and sync + const Region* regions = push_context.push_context.push_dense_values; + size_t region_num = push_context.num; + push_dense(regions, region_num, push_context.table); + } + } + } else { // push sparse + if (push_context.training_mode == Async) { + const uint64_t* keys = push_context.push_context.keys; + const float** update_values = push_context.push_context.push_values; + size_t table_id = push_context.table; + size_t num = push_context.num; + push_sparse(table_id, keys, update_values, num); + } else { + // TODO + } + } +} + ::std::future PsLocalClient::pull_dense(Region* regions, size_t region_num, size_t table_id) { diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.h b/paddle/fluid/distributed/ps/service/ps_local_client.h index e73974ac562861d86e679ddbc213335d10731281..83ca558e3d2cb1f62235cda06c221b0d9367b043 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_client.h +++ b/paddle/fluid/distributed/ps/service/ps_local_client.h @@ -39,12 +39,16 @@ class PsLocalClient : public PSClient { virtual ::std::future load(uint32_t table_id, const std::string& epoch, const std::string& mode) override; + virtual std::future Load( + const LoadSaveContext& load_context) override; virtual ::std::future save(const std::string& epoch, const std::string& mode) override; virtual ::std::future save(uint32_t table_id, const std::string& epoch, const std::string& mode) override; + virtual std::future Save( + const LoadSaveContext& save_context) override; virtual ::std::future clear() override; virtual ::std::future clear(uint32_t table_id) override; @@ -55,6 +59,10 @@ class PsLocalClient : public PSClient { virtual ::std::future pull_dense(Region* regions, size_t region_num, size_t table_id); + virtual ::std::future Pull(RequestContext& pull_context) override; + + virtual ::std::future Push(RequestContext& push_context) override; + virtual ::std::future push_dense(const Region* regions, size_t region_num, size_t table_id); diff --git a/paddle/fluid/distributed/ps/service/ps_local_server.h b/paddle/fluid/distributed/ps/service/ps_local_server.h index 91f8bc4c9127115c9b5595270973d011778c6262..31b52126fc5767b445dfb605ff46b3fbc63c620c 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_server.h +++ b/paddle/fluid/distributed/ps/service/ps_local_server.h @@ -28,7 +28,6 @@ class PsLocalServer : public PSServer { virtual uint64_t start() { return 0; } virtual uint64_t start(const std::string &ip, uint32_t port) { return 0; } virtual int32_t stop() { return 0; } - virtual int32_t port() { return 0; } virtual int32_t configure( const PSParameter &config, PSEnvironment &env, size_t server_rank, const std::vector &server_sub_program = {}) { diff --git a/paddle/fluid/distributed/ps/service/server.cc b/paddle/fluid/distributed/ps/service/server.cc index 5f1974e3e610c6772457514759bff83db944bf52..893f671359e40ce632185c78bade16404d23afc0 100644 --- 
a/paddle/fluid/distributed/ps/service/server.cc +++ b/paddle/fluid/distributed/ps/service/server.cc @@ -67,8 +67,6 @@ int32_t PSServer::configure( _config = config.server_param(); _rank = server_rank; _environment = &env; - _shuffled_ins = - paddle::framework::MakeChannel>(); size_t shard_num = env.get_ps_servers().size(); const auto &downpour_param = _config.downpour_server_param(); diff --git a/paddle/fluid/distributed/ps/service/server.h b/paddle/fluid/distributed/ps/service/server.h index 160d4a612829531d619c69a0cd5e9cd091f94868..d2804405b41989cbd9b5bed0afaf6d481d0658db 100644 --- a/paddle/fluid/distributed/ps/service/server.h +++ b/paddle/fluid/distributed/ps/service/server.h @@ -69,11 +69,6 @@ class PSServer { const PSParameter &config, PSEnvironment &env, size_t server_rank, const std::vector &server_sub_program = {}); - // return server_ip - virtual std::string ip() { return butil::my_ip_cstr(); } - // return server_port - virtual int32_t port() = 0; - virtual uint64_t start(const std::string &ip, uint32_t port) = 0; virtual int32_t stop() = 0; @@ -94,15 +89,6 @@ class PSServer { return &_table_map; } - typedef std::function MsgHandlerFunc; - virtual int registe_pserver2pserver_msg_handler(int msg_type, - MsgHandlerFunc handler) { - _msg_handler_map[msg_type] = handler; - return 0; - } - - paddle::framework::Channel> _shuffled_ins; - protected: virtual int32_t initialize() = 0; @@ -111,7 +97,6 @@ class PSServer { ServerParameter _config; PSEnvironment *_environment; std::unordered_map> _table_map; - std::unordered_map _msg_handler_map; protected: std::shared_ptr scope_; diff --git a/paddle/fluid/distributed/ps/table/accessor.h b/paddle/fluid/distributed/ps/table/accessor.h index 7c91a6086498037e56f9b89dc13243cfeb827c5c..07c211bb9c12866e3646a0dbdebfba189eb2507e 100644 --- a/paddle/fluid/distributed/ps/table/accessor.h +++ b/paddle/fluid/distributed/ps/table/accessor.h @@ -45,6 +45,17 @@ struct DataConverter { std::string deconverter; }; +struct AccessorInfo { + size_t dim; + size_t size; + size_t select_size; + size_t select_dim; + size_t update_size; + size_t update_dim; + size_t mf_size; + size_t fea_dim; +}; + class ValueAccessor { public: ValueAccessor() {} @@ -68,6 +79,8 @@ class ValueAccessor { } virtual int initialize() = 0; + virtual void GetTableInfo(AccessorInfo& info) = 0; + // value维度 virtual size_t dim() = 0; // value各个维度的size @@ -163,6 +176,7 @@ class ValueAccessor { TableAccessorParameter _config; std::unordered_map> _data_coverter_map; + AccessorInfo _accessor_info; }; REGISTER_PSCORE_REGISTERER(ValueAccessor); } // namespace distributed diff --git a/paddle/fluid/distributed/ps/table/common_dense_table.cc b/paddle/fluid/distributed/ps/table/common_dense_table.cc index 607469e2f7b0d5df79d4cb7477e0eaa3f4a8323a..cc0f5867a3d651bca9323452d1eb97355de4c160 100644 --- a/paddle/fluid/distributed/ps/table/common_dense_table.cc +++ b/paddle/fluid/distributed/ps/table/common_dense_table.cc @@ -128,6 +128,21 @@ int32_t CommonDenseTable::set_global_lr(float* lr) { return 0; } +int32_t CommonDenseTable::Pull(TableContext& context) { + CHECK(context.value_type == Dense); + float* pull_values = context.pull_context.values; + return pull_dense(pull_values, context.num); +} + +int32_t CommonDenseTable::Push(TableContext& context) { + CHECK(context.value_type == Dense); + if (context.pull_context.values != nullptr) { + const float* values = context.push_context.values; + return push_dense(values, context.num); + } + return 0; +} + int32_t CommonDenseTable::pull_dense(float* pull_values, 
size_t num) { std::copy(values_[param_idx_].begin(), values_[param_idx_].end(), pull_values); diff --git a/paddle/fluid/distributed/ps/table/common_dense_table.h b/paddle/fluid/distributed/ps/table/common_dense_table.h index a4c0f29ddb8770c8adc0d6885929aaac8a028e90..cad49a0a449c4735a74261574436a78789694d9b 100644 --- a/paddle/fluid/distributed/ps/table/common_dense_table.h +++ b/paddle/fluid/distributed/ps/table/common_dense_table.h @@ -40,6 +40,8 @@ class CommonDenseTable : public DenseTable { const std::string& name); virtual int32_t initialize_value(); virtual int32_t initialize_optimizer(); + virtual int32_t Pull(TableContext& context); + virtual int32_t Push(TableContext& context); int32_t pull_dense(float* pull_values, size_t num) override; int32_t push_dense_param(const float* values, size_t num) override; int32_t push_dense(const float* values, size_t num) override; diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index 7946569525cc4bb1351046632dfe5894611c4b67..f6f127621b947c41122f7803a90f39b640713b8e 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -454,6 +454,9 @@ class GraphTable : public SparseTable { int32_t get_server_index_by_id(int64_t id); Node *find_node(int64_t id); + virtual int32_t Pull(TableContext &context) { return 0; } + virtual int32_t Push(TableContext &context) { return 0; } + virtual int32_t pull_sparse(float *values, const PullSparseValue &pull_value) { return 0; diff --git a/paddle/fluid/distributed/ps/table/common_sparse_table.cc b/paddle/fluid/distributed/ps/table/common_sparse_table.cc index b44d08b937a96c806142f5d7f1ba2ae0bcdb0f5e..45be53335e1a181f7c1e2abb7326ac6b9800703f 100644 --- a/paddle/fluid/distributed/ps/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/common_sparse_table.cc @@ -355,6 +355,32 @@ int32_t CommonSparseTable::pour() { return 0; } +int32_t CommonSparseTable::Pull(TableContext& context) { + CHECK(context.value_type == Sparse); + if (context.use_ptr) { + char** pull_values = context.pull_context.ptr_values; + const uint64_t* keys = context.pull_context.keys; + return pull_sparse_ptr(pull_values, keys, context.num); + } else { + float* pull_values = context.pull_context.values; + const PullSparseValue& pull_value = context.pull_context.pull_value; + return pull_sparse(pull_values, pull_value); + } +} + +int32_t CommonSparseTable::Push(TableContext& context) { + CHECK(context.value_type == Sparse); + if (context.pull_context.values != nullptr) { + const float* values = context.push_context.values; + const uint64_t* keys = context.push_context.keys; + return push_sparse(keys, values, context.num); + } else { + const float** values = context.push_context.ptr_values; + const uint64_t* keys = context.push_context.keys; + return push_sparse(keys, values, context.num); + } +} + int32_t CommonSparseTable::pull_sparse(float* pull_values, const PullSparseValue& pull_value) { auto shard_num = task_pool_size_; diff --git a/paddle/fluid/distributed/ps/table/common_sparse_table.h b/paddle/fluid/distributed/ps/table/common_sparse_table.h index 82481dcd584e42b9b2bca1bcc5862b361e372b05..138c5447420663eae5ad94ea03a84360a46f8b3d 100644 --- a/paddle/fluid/distributed/ps/table/common_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/common_sparse_table.h @@ -121,6 +121,9 @@ class CommonSparseTable : public SparseTable { virtual int32_t push_dense(const float* values, size_t num) 
{ return 0; } // unused method end + virtual int32_t Pull(TableContext& context); + virtual int32_t Push(TableContext& context); + virtual int32_t initialize(); virtual int32_t initialize_shard() { return 0; } virtual int32_t initialize_value(); diff --git a/paddle/fluid/distributed/ps/table/common_table.h b/paddle/fluid/distributed/ps/table/common_table.h index bac826dfe0e20b42d5cc47467356bc5614383a44..3d291c0152246bffa748ea57cf1c96eff6f2f343 100644 --- a/paddle/fluid/distributed/ps/table/common_table.h +++ b/paddle/fluid/distributed/ps/table/common_table.h @@ -119,6 +119,9 @@ class BarrierTable : public Table { virtual void *get_shard(size_t shard_idx) { return 0; } + virtual int32_t Pull(TableContext &context) { return 0; } + virtual int32_t Push(TableContext &context) { return 0; } + int32_t pull_dense(float *values, size_t num) override { return 0; } int32_t push_dense(const float *values, size_t num) override { return 0; } diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_accessor.cc index 866bd8114ccea329123e16585c33366e759d5df8..43e143dca901bb8264f666a1e4fd89a52102d894 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.cc @@ -38,6 +38,16 @@ int CtrCommonAccessor::initialize() { return 0; } +void CtrCommonAccessor::GetTableInfo(AccessorInfo& info) { + info.dim = dim(); + info.size = size(); + info.select_dim = select_dim(); + info.select_size = select_size(); + info.update_dim = update_dim(); + info.update_size = update_size(); + info.fea_dim = fea_dim(); +} + size_t CtrCommonAccessor::dim() { return common_feature_value.dim(); } size_t CtrCommonAccessor::dim_size(size_t dim) { diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.h b/paddle/fluid/distributed/ps/table/ctr_accessor.h index 1e31fec04649b19882269fa9cce5f5d7fb4978c1..bc46217955a8a677a9e5e16f740e2636d633908f 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.h @@ -126,6 +126,7 @@ class CtrCommonAccessor : public ValueAccessor { virtual int initialize(); virtual ~CtrCommonAccessor() {} + virtual void GetTableInfo(AccessorInfo& info); // value维度 virtual size_t dim(); // value各个维度的size diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc index b07bcf70ad7af416fc66e036c0061f9556cc4eae..bccf1fdebafa03442047048825ef85207711b6b3 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc @@ -37,6 +37,16 @@ int DownpourCtrDoubleAccessor::initialize() { return 0; } +void DownpourCtrDoubleAccessor::GetTableInfo(AccessorInfo& info) { + info.dim = dim(); + info.size = size(); + info.select_dim = select_dim(); + info.select_size = select_size(); + info.update_dim = update_dim(); + info.update_size = update_size(); + info.fea_dim = fea_dim(); +} + size_t DownpourCtrDoubleAccessor::dim() { auto embedx_dim = _config.embedx_dim(); return DownpourCtrDoubleFeatureValue::dim(embedx_dim); diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h index d7c717ace098821c3434888c9ff0ad699c923867..d7942634e86003c484710aad1d969e4d6371cb7f 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h @@ -168,6 +168,7 @@ class DownpourCtrDoubleAccessor : public ValueAccessor { 
DownpourCtrDoubleAccessor() {} virtual ~DownpourCtrDoubleAccessor() {} virtual int initialize(); + virtual void GetTableInfo(AccessorInfo& info); // value维度 virtual size_t dim(); // value各个维度的size diff --git a/paddle/fluid/distributed/ps/table/depends/sparse_utils.h b/paddle/fluid/distributed/ps/table/depends/sparse_utils.h index 708f7786bf3b0975791fcc74dddf62d2eb01e450..98e0250acc4d686dbde561ffb03edeb96444c406 100644 --- a/paddle/fluid/distributed/ps/table/depends/sparse_utils.h +++ b/paddle/fluid/distributed/ps/table/depends/sparse_utils.h @@ -58,7 +58,7 @@ struct PullSparseValue { std::vector* offset_shard) const { offset_shard->reserve(numel_ / shard_num + 1); for (int x = 0; x < numel_; ++x) { - if (feasigns_[x] % shard_num == shard_id) { + if (int(feasigns_[x] % shard_num) == shard_id) { offset_shard->push_back(x); } } diff --git a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc index 5f22c3a436f1f0b89e6289844a9c56fbe888625d..e8ca7430351de7cbdc1e98607d6d9b884b6a376a 100644 --- a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc +++ b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc @@ -37,6 +37,16 @@ int DownpourCtrAccessor::initialize() { return 0; } +void DownpourCtrAccessor::GetTableInfo(AccessorInfo& info) { + info.dim = dim(); + info.size = size(); + info.select_dim = select_dim(); + info.select_size = select_size(); + info.update_dim = update_dim(); + info.update_size = update_size(); + info.fea_dim = fea_dim(); +} + size_t DownpourCtrAccessor::dim() { auto embedx_dim = _config.embedx_dim(); return DownpourCtrFeatureValue::dim(embedx_dim); diff --git a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h index 5de7b12e01f0d6e619ee14b852b7aa308ec3b497..11991ad044ff63353c9a898469ec915163c2dea9 100644 --- a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h +++ b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h @@ -160,6 +160,7 @@ class DownpourCtrAccessor : public ValueAccessor { virtual ~DownpourCtrAccessor() {} virtual int initialize(); + virtual void GetTableInfo(AccessorInfo& info); // value维度 virtual size_t dim(); // value各个维度的size diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h index 89c4fc15ae27998da3a3c7c3092baa9eee9846a0..3b43f99543fddabfaa24fc7da562203fc3f0d633 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h +++ b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h @@ -48,6 +48,8 @@ class MemorySparseGeoTable : public SparseTable { virtual int32_t save(const std::string& path, const std::string& param) { return 0; } + virtual int32_t Pull(TableContext& context) { return 0; } + virtual int32_t Push(TableContext& context) { return 0; } virtual int32_t flush() { return 0; } virtual int32_t shrink(const std::string& param) { return 0; } virtual void clear() { return; } diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc index 7ce6e9005cf56ca295a6620a209551e303c112f3..98454ca747d314d76bb63706e853ded835df736a 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc @@ -390,6 +390,26 @@ std::pair MemorySparseTable::print_table_stat() { return {feasign_size, mf_size}; } +int32_t MemorySparseTable::Pull(TableContext& context) { + 
CHECK(context.value_type == Sparse); + if (context.use_ptr) { + char** pull_values = context.pull_context.ptr_values; + const uint64_t* keys = context.pull_context.keys; + return pull_sparse_ptr(pull_values, keys, context.num); + } else { + float* pull_values = context.pull_context.values; + const PullSparseValue& pull_value = context.pull_context.pull_value; + return pull_sparse(pull_values, pull_value); + } +} + +int32_t MemorySparseTable::Push(TableContext& context) { + CHECK(context.value_type == Sparse); + + const uint64_t* keys = context.push_context.keys; + return push_sparse(keys, context.push_context.ptr_values, context.num); +} + int32_t MemorySparseTable::pull_sparse(float* pull_values, const PullSparseValue& pull_value) { CostTimer timer("pserver_sparse_select_all"); diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_table.h index 5770f25f8f41dec286993d6b586959c8c0d3a0c0..d26c67319760da0496ae8a1c164adf0d5b63b1f2 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.h @@ -48,6 +48,9 @@ class MemorySparseTable : public SparseTable { virtual int32_t push_dense(const float* values, size_t num) { return 0; } // unused method end + virtual int32_t Pull(TableContext& context); + virtual int32_t Push(TableContext& context); + virtual int32_t initialize(); virtual int32_t initialize_shard() { return 0; } virtual int32_t initialize_value(); diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc index 60514b4e19ffaf63f285e25f1355660fabe58d48..5bc58bc5a1108b5f342036d9bd72c96287458401 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc @@ -61,6 +61,21 @@ int32_t SSDSparseTable::initialize() { return 0; } +int32_t SSDSparseTable::Pull(TableContext& context) { + CHECK(context.value_type == Sparse); + if (context.use_ptr) { + char** pull_values = context.pull_context.ptr_values; + const uint64_t* keys = context.pull_context.keys; + return pull_sparse_ptr(pull_values, keys, context.num); + } else { + float* pull_values = context.pull_context.values; + const PullSparseValue& pull_value = context.pull_context.pull_value; + return pull_sparse(pull_values, pull_value); + } +} + +int32_t SSDSparseTable::Push(TableContext& context) { return 0; } + int32_t SSDSparseTable::pull_sparse(float* pull_values, const PullSparseValue& pull_value) { auto shard_num = task_pool_size_; diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h index f5e8a7067e0e041f9913bef8e43ad8b35bdb2783..3a703d7d966d3e6026d13c0658f5979120cd2073 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h @@ -42,6 +42,9 @@ class SSDSparseTable : public CommonSparseTable { // exchange data virtual int32_t update_table(); + virtual int32_t Pull(TableContext& context); + virtual int32_t Push(TableContext& context); + virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value); virtual int32_t pull_sparse_ptr(char** pull_values, const uint64_t* keys, diff --git a/paddle/fluid/distributed/ps/table/table.h b/paddle/fluid/distributed/ps/table/table.h index da1bb668ccfa3c5f1a4f876a396847b6b3853772..2bd2a42b6c58f0753de86aa4e60ac7e0611bd7f7 100644 --- a/paddle/fluid/distributed/ps/table/table.h +++ 
b/paddle/fluid/distributed/ps/table/table.h @@ -32,6 +32,30 @@ namespace paddle { namespace distributed { + +enum ValueType { Sparse = 0, Dense = 1 }; + +struct PullContext { + const uint64_t *keys; + const PullSparseValue pull_value; + float *values; + char **ptr_values; +}; + +struct TablePushContext { + const uint64_t *keys; + const float *values; + const float **ptr_values; +}; + +struct TableContext { + ValueType value_type; + PullContext pull_context; + TablePushContext push_context; + size_t num; + bool use_ptr; +}; + class Table { public: Table() {} @@ -39,6 +63,8 @@ class Table { virtual int32_t initialize(const TableParameter &config, const FsClientParameter &fs_config); + virtual int32_t Pull(TableContext &context) = 0; + virtual int32_t Push(TableContext &context) = 0; virtual int32_t pull_dense(float *values, size_t num) = 0; virtual int32_t push_dense(const float *values, size_t num) = 0; // for push global_step diff --git a/paddle/fluid/distributed/ps/table/tensor_accessor.cc b/paddle/fluid/distributed/ps/table/tensor_accessor.cc index 70a580c1e53a931dc2affd29db01b72691c68a39..8c5349bff832caaa0a1b411723df8b3e9bcdcd4f 100644 --- a/paddle/fluid/distributed/ps/table/tensor_accessor.cc +++ b/paddle/fluid/distributed/ps/table/tensor_accessor.cc @@ -20,6 +20,16 @@ namespace distributed { int CommMergeAccessor::initialize() { return 0; } +void CommMergeAccessor::GetTableInfo(AccessorInfo &info) { + info.dim = dim(); + info.size = size(); + info.select_dim = select_dim(); + info.select_size = select_size(); + info.update_dim = update_dim(); + info.update_size = update_size(); + info.fea_dim = fea_dim(); +} + // value 维度 size_t CommMergeAccessor::dim() { return 0; } diff --git a/paddle/fluid/distributed/ps/table/tensor_accessor.h b/paddle/fluid/distributed/ps/table/tensor_accessor.h index 5041b8fdf8733eff676b5fce1a972e39182df48e..1873b743b44ec736f0470c3eff1f5b0280c235bf 100644 --- a/paddle/fluid/distributed/ps/table/tensor_accessor.h +++ b/paddle/fluid/distributed/ps/table/tensor_accessor.h @@ -30,6 +30,7 @@ class CommMergeAccessor : public ValueAccessor { CommMergeAccessor() {} virtual ~CommMergeAccessor() {} virtual int initialize(); + virtual void GetTableInfo(AccessorInfo &info); // value维度 virtual size_t dim(); // value各个维度的size diff --git a/paddle/fluid/distributed/ps/table/tensor_table.h b/paddle/fluid/distributed/ps/table/tensor_table.h index 64d81327acc55ba0655bfc33efaa0d9d9f59649e..23a62365c0f5a374f3820e2e790e6085cfda1c06 100644 --- a/paddle/fluid/distributed/ps/table/tensor_table.h +++ b/paddle/fluid/distributed/ps/table/tensor_table.h @@ -48,6 +48,8 @@ class TensorTable : public Table { TensorTable() {} virtual ~TensorTable() {} + virtual int32_t Pull(TableContext &context) { return 0; } + virtual int32_t Push(TableContext &context) { return 0; } int32_t pull_dense(float *values, size_t num) override { return 0; } int32_t push_dense(const float *values, size_t num) override { return 0; } diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc index 0588dbdf0fc61298d33eeb6db5b3de91a6de8256..c887cfeb71eef1c8b861b0d5958dca983e9feaaf 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.cc +++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc @@ -30,6 +30,32 @@ bool FleetWrapper::is_initialized_ = false; std::shared_ptr FleetWrapper::pserver_ptr_ = NULL; +void FleetWrapper::Stop() { StopServer(); } + +void FleetWrapper::Load(WrapperContext& context) { + auto table_id = context.table_id; + if (table_id >= 0 && context.meta != "") 
{ + LoadSparseOnServer(context.path, context.meta, context.table_id); + return; + } + if (table_id < 0) { // laod all + LoadModel(context.path, context.mode); + } else { // load one table + LoadModelOneTable(table_id, context.path, context.mode); + } + return; +} + +void FleetWrapper::Save(WrapperContext& context) { + auto table_id = context.table_id; + if (table_id < 0) { + SaveModel(context.path, context.mode); + } else { + SaveModelOneTable(table_id, context.path, context.mode); + } + return; +} + void FleetWrapper::SetClient2ClientConfig(int request_timeout_ms, int connect_timeout_ms, int max_retry) { diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.h b/paddle/fluid/distributed/ps/wrapper/fleet.h index a535b8c5bf8f9bf72a2fa895b8a0fd82ffb2e0a3..d68c453c6d51b04131ce562cafddbbdb06ac0356 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.h +++ b/paddle/fluid/distributed/ps/wrapper/fleet.h @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h" #include "paddle/fluid/distributed/ps/service/ps_service/service.h" +#include "paddle/fluid/distributed/ps/wrapper/ps_wrapper.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/framework/io/fs.h" #include "paddle/fluid/framework/io/shell.h" @@ -54,7 +55,7 @@ using framework::Variable; using RpcCtxMap = std::unordered_map; -class FleetWrapper { +class FleetWrapper : public PSWrapper { public: virtual ~FleetWrapper() {} FleetWrapper() { @@ -68,7 +69,13 @@ class FleetWrapper { // pserver request max retry client2client_max_retry_ = 3; } + virtual int32_t Initialize(InitContext& context) { return 0; } + virtual void Stop() override; + + virtual void Load(WrapperContext& context) override; + + virtual void Save(WrapperContext& context) override; // set client to client communication config void SetClient2ClientConfig(int request_timeout_ms, int connect_timeout_ms, int max_retry); diff --git a/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h b/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h index c92835aa995adfd3158fc344b490efb2c3133ec0..ca02ad31195ef2cdee649f5348d3f735c38097b8 100755 --- a/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h +++ b/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h @@ -1,18 +1,84 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_WRAPPER_H_ -#define PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_WRAPPER_H_ - -#endif // PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_WRAPPER_H_ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h" +#include "paddle/fluid/distributed/ps/service/ps_service/service.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/framework/io/shell.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN + +namespace paddle { +namespace framework { +class Scope; +class SelectedRows; +class Variable; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace distributed { + +class PSCore; + +using framework::LoDTensor; +using framework::Scope; +using phi::SelectedRows; +using framework::Variable; + +using RpcCtxMap = std::unordered_map; + +struct WrapperContext { + uint32_t table_id; + const std::string path; + const int mode; + const std::string meta; +}; + +struct InitContext { + const std::vector dev_ids; // for gpu +}; + +class PSWrapper { + public: + virtual ~PSWrapper() {} + PSWrapper() {} + // init server + + virtual int32_t Initialize(InitContext& context) = 0; + + virtual void Stop() = 0; + + virtual void Load(WrapperContext& context) = 0; + + virtual void Save(WrapperContext& context) = 0; +}; + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index b8d59e8dd8b4c60e28323955effd232eb2b51945..df2cdc35626a8aa27899f7340fa14285299a11d1 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -2032,7 +2032,15 @@ static std::string GenerateSingleOpBase( const char* ATTRS_TEMPLATE = " auto& %s = this->attr_map_;\n"; std::string grad_attrs_str = paddle::string::Sprintf(ATTRS_TEMPLATE, attrs_name); - + if (fwd_op_type == "cast") { + // swtich in out dtype + const char* CAST_GRAD = + " auto temp_type = %s[\"in_dtype\"];\n" + " %s[\"in_dtype\"] = %s[\"out_dtype\"];\n" + " %s[\"out_dtype\"] = temp_type;\n"; + grad_attrs_str += paddle::string::Sprintf(CAST_GRAD, attrs_name, attrs_name, + attrs_name, attrs_name); + } // Handle dynamic grad attributes grad_attrs_str += HandleDynamicGradAttributes(fwd_op_type, attrs_name); generated_grad_function_body += grad_attrs_str; diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 17bc2441488aa3c4fc62a37e825eeb94cafea9bb..ebd3333c5265990a8ae2fb6840113bd0ea4d4766 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -29,132 +29,330 @@ namespace egr { -std::unordered_map getInDegreeMap( - const std::queue& init_queue) { - // Calculate in_degree for each node - // We can completely remove this pass, if in_degree were set during forward - // pass - std::unordered_map node_in_degree_map; +/* +* GeneralGrad is Helpper class to 
implement custom grad operation between +* outputs and inputs. +* +* **/ +class GeneralGrad { + public: + static GeneralGrad& Instance() { return *general_grad_; } + + // Get inputs's / no_grad_vars's GradNodes and InputMeta Info + void GetTargetNodesInfo( + const std::vector& inputs, + bool is_no_grad_vars) { + std::string msg = is_no_grad_vars ? "no_grad_vars" : "inputs"; + VLOG(6) << "Running in GetTargetNodesInfo."; + if (!inputs.empty()) { + VLOG(6) << msg << " are not empty."; + size_t num_inputs = inputs.size(); + for (size_t i = 0; i < num_inputs; i++) { + AutogradMeta* auto_grad_meta = + EagerUtils::unsafe_autograd_meta(inputs[i]); + auto target_node = auto_grad_meta->GetMutableGradNode().get(); + PADDLE_ENFORCE_NOT_NULL(target_node, + paddle::platform::errors::Fatal( + "There is no grad op for %s:[%d] or it's" + "stop_gradient=True.", + msg, i)); + if (is_no_grad_vars) { + (no_grad_var_nodes_inputmeta_map)[target_node] = auto_grad_meta; + } else { // normal input + (input_target_nodes_inputmeta_map)[target_node] = auto_grad_meta; + } + } + } + } - // Copy nodes - std::queue queue = init_queue; - std::unordered_set visited; - size_t potential_startup_ops_cnt = queue.size(); - size_t cnt = 0; + // Purify potential_startup_nodes, remove nodes those are the same as + // input_target_nodes + void PurifyPotentialStartUpNodes() { + VLOG(6) << "Running in PurifyPotentialStartUpNodes"; + if (input_target_nodes_inputmeta_map.empty()) return; + std::unordered_set potential_startup_nodes_to_be_erased; + for (auto startup_op : potential_startup_nodes) { + auto iter = input_target_nodes_inputmeta_map.find(startup_op); + if (iter != input_target_nodes_inputmeta_map.end()) { + potential_startup_nodes_to_be_erased.emplace(iter->first); + } + } + if (!potential_startup_nodes_to_be_erased.empty()) { + for (auto nodes : potential_startup_nodes_to_be_erased) { + potential_startup_nodes.erase(nodes); + } + } + } - // Visit each node exactly once in any order - while (!queue.empty()) { - GradNodeBase* node = queue.front(); - queue.pop(); + // Remove some nodes those doesn't need to be + // stored in potential_stop_nodes、potential_startup_nodes + void UpdateGraphInfo() { + // Updated potential_sotp_nodes by depending_nodes, + // make sure the path from root to target_node is ok + std::unordered_set _startup_ops; + VLOG(6) << "Running in UpdateGraphInfo"; + std::queue queue; + for (auto& target_nodes_inputmeta_pair : input_target_nodes_inputmeta_map) { + queue.emplace(target_nodes_inputmeta_pair.first); + } - if (cnt < potential_startup_ops_cnt) { - if (!node_in_degree_map.count(node)) { - node_in_degree_map[node] = 0; + while (!queue.empty()) { + auto* target_node = queue.front(); + queue.pop(); + if (!(depending_nodes)[target_node].empty()) { + auto precedding_nodes = (depending_nodes)[target_node]; + for (auto pre_nodes : precedding_nodes) { + queue.emplace(pre_nodes); + if (potential_stop_nodes.find(pre_nodes) != + potential_stop_nodes.end()) { + potential_stop_nodes.erase(pre_nodes); + } + } + } else { // startup_ops have no precedding nodes + VLOG(6) << "Emplace _startup_ops"; + _startup_ops.emplace(target_node); } - cnt += 1; } - - if (visited.count(node)) { - continue; + // Purify potential_startup_nodes again, remove some + // potential startup_nodes that unreach to input target nodes + if (!_startup_ops.empty()) { + std::unordered_set potential_startup_nodes_to_be_erased; + for (auto node : potential_startup_nodes) { + if (_startup_ops.count(node) == 0) { + VLOG(6) << "Set up 
potential_startup_nodes_to_be_erased"; + potential_startup_nodes_to_be_erased.emplace(node); + } + } + if (!potential_startup_nodes_to_be_erased.empty()) { + for (auto node : potential_startup_nodes_to_be_erased) { + VLOG(6) << "Erase nodes in potential_startup_nodes_to_be_erased"; + potential_startup_nodes.erase(node); + } + } } - visited.insert(node); + } - PADDLE_ENFORCE_NOT_NULL( - node, - paddle::platform::errors::Fatal( - "We got null node when we traverse the backward graph, and this " - "should not happened please check your code and contact us.")); - // Find and append next nodes - const std::vector>& edges = node->GetEdges(); - for (const auto& edge_list : edges) { - for (const Edge& edge : edge_list) { - GradNodeBase* next_node = edge.GetMutableGradNode().get(); - // Next node could be nullptr if it is leaf tensor with no - // AccumulationNode attached - // Or it could also originated from dispensable inputs - if (!next_node) continue; + // Get Graph Info Betweent input target GradNode and outputs, + // record depending_nodes、potential_stop_nodes、potential_startup_nodes + void GetGraphInfoBetweenTargets(const std::queue& init_queue) { + VLOG(6) << "Runing In GetGraphInfoBetweenTargets"; - // Update in_degree - if (!node_in_degree_map.count(next_node)) - node_in_degree_map[next_node] = 0; - node_in_degree_map[next_node]++; - queue.push(next_node); + // Calculate in_degree for each node + std::unordered_map node_in_degree_map; + + // Copy nodes + std::queue queue = init_queue; + std::unordered_set visited; + + // Visit each node exactly once in any order + while (!queue.empty()) { + GradNodeBase* node = queue.front(); + queue.pop(); + + if (visited.count(node)) { + continue; + } + visited.insert(node); + + // Check node is target_nodes or not, if node is not target_node, + // all the next_node will be marked in potential_stop_nodes + bool is_potential_stop_nodes = + input_target_nodes_inputmeta_map.count(node); + + // Find and append next nodes + const std::vector>& edges = node->GetEdges(); + for (const auto& edge_list : edges) { + for (const Edge& edge : edge_list) { + GradNodeBase* next_node = edge.GetMutableGradNode().get(); + + // Next node could be nullptr if it is leaf tensor with no + // AccumulationNode attached + // Or it could also originated from dispensable inputs + if (!next_node) continue; + + // if node not in input_target_nodes, + // all the next_nodes of current node will be inserted to + // potential_stop_node + if (is_potential_stop_nodes) { + potential_stop_nodes.emplace(next_node); + } + + // Update in_degree + if (!node_in_degree_map.count(next_node)) + node_in_degree_map[next_node] = 0; + node_in_degree_map[next_node]++; + + // Record depending relationship + (depending_nodes)[next_node].emplace(node); + queue.push(next_node); + } } } + // Update Graph Info, remove some nodes in + // potential_stop_nodes、potential_startup_nodes、 + UpdateGraphInfo(); } - return node_in_degree_map; -} -// Remove some nodes those doesn't need to be -// stored in potential_stop_nodes、potential_startup_nodes -void UpdateGraphInfo( - std::unordered_map* - target_nodes_inputmeta_map, - std::unordered_map>* - depending_nodes, - std::unordered_set* potential_stop_nodes, - std::unordered_set* potential_startup_nodes) { - // Updated potential_sotp_nodes by depending_nodes, - // make sure the path from root to target_node is ok - std::unordered_set _startup_ops; - VLOG(6) << "Running in UpdateGraphInfo"; - std::queue queue; - for (auto& target_nodes_inputmeta_pair : 
*target_nodes_inputmeta_map) { - queue.emplace(target_nodes_inputmeta_pair.first); + void ModifyReadyQueue(std::queue* queue) { + std::queue tmp_queue; + for (auto nodes : potential_startup_nodes) { + tmp_queue.emplace(nodes); + } + tmp_queue.swap(*queue); } - while (!queue.empty()) { - auto* target_node = queue.front(); - queue.pop(); - if (!(*depending_nodes)[target_node].empty()) { - auto precedding_nodes = (*depending_nodes)[target_node]; - for (auto pre_nodes : precedding_nodes) { - queue.emplace(pre_nodes); - if (potential_stop_nodes->find(pre_nodes) != - potential_stop_nodes->end()) { - potential_stop_nodes->erase(pre_nodes); + // Set result for input target grad_var when potential_startup_nodes is empty + void SetResultForInputTargetVar( + const std::unordered_map>& + node_input_buffers_dict) { + if (potential_startup_nodes.size() == 0) { + for (auto input_target_node : *GetInPutTargetNodesInputMetaMap()) { + // out rank_info of forward op + auto rank_info = input_target_node.second->OutRankInfo(); + auto iter = node_input_buffers_dict.find(input_target_node.first); + if (iter != node_input_buffers_dict.end()) { + auto& target_result = + (iter->second)->Buffers()[rank_info.first][rank_info.second]; + // save the target result + results_map[input_target_node.first] = target_result; } } - } else { // startup_ops have no precedding nodes - VLOG(6) << "Emplace _startup_ops"; - _startup_ops.emplace(target_node); } } - // Purify potential_startup_nodes again, remove some - // potential startup_nodes that unreach to input target nodes - if (!_startup_ops.empty()) { - std::unordered_set potential_startup_nodes_to_be_erased; - for (auto node : *potential_startup_nodes) { - if (_startup_ops.count(node) == 0) { - VLOG(6) << "Set up potential_startup_nodes_to_be_erased"; - potential_startup_nodes_to_be_erased.emplace(node); - } + + // Set input target grad_var from node_input_buffer by inputmeta + void SetResultForInputTargetVar(GradTensorHolder input_buffers, + GradNodeBase* node) { + auto iter = GetInPutTargetNodesInputMetaMap()->find(node); + if (iter != GetInPutTargetNodesInputMetaMap()->end()) { + VLOG(6) << "Get target result by by inputmeta"; + // out rank_info of forward op + auto rank_info = (iter->second)->OutRankInfo(); + // rank_info is a pair, first means slot_id, second means rank. 
+ auto& target_result = + input_buffers.Buffers()[rank_info.first][rank_info.second]; + // save the target result + results_map[node] = target_result; } - if (!potential_startup_nodes_to_be_erased.empty()) { - for (auto node : potential_startup_nodes_to_be_erased) { - VLOG(6) << "Erase nodes in potential_startup_nodes_to_be_erased"; - potential_startup_nodes->erase(node); + } + + std::vector GetResults( + const std::vector& inputs, + bool allow_unused, bool create_graph) { + VLOG(6) << "Running in GetResults"; + if (inputs.empty()) return {}; + + std::vector results; + results.reserve(inputs.size()); + + for (size_t i = 0; i < inputs.size(); ++i) { + auto& input = inputs[i]; + AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(input); + auto target_node = auto_grad_meta->GetMutableGradNode().get(); + + auto iter = results_map.find(target_node); + if (iter != results_map.end()) { + // set StopGradient = !create_graph + AutogradMeta* tensor_auto_grad_meta = + EagerUtils::autograd_meta(&(iter->second)); + tensor_auto_grad_meta->SetStopGradient(!create_graph); + results.emplace_back(iter->second); + } else { + PADDLE_ENFORCE_EQ(allow_unused, true, + paddle::platform::errors::InvalidArgument( + "The %d-th input does not appear in the backward " + "graph. Please check the input tensor or set " + "allow_unused=True to get None result.", + i)); + results.emplace_back(); } } + Clear(); + return results; + } + + void PreparedForGeneralGrad( + const std::vector& inputs, + const std::vector& no_grad_vars, + std::queue* queue, + const std::unordered_map>& + node_input_buffers_dict) { + // Get no_grad_vars's GradNodes and InputMeta Info + GetTargetNodesInfo(no_grad_vars, true /* is_no_grad_vars */); + // Get inputs's GradNodes and InputMeta Info + GetTargetNodesInfo(inputs, false /* is_no_grad_vars */); + // Purify potential_startup_ops, remove those nodes that are the same as + // input_target_nodes + PurifyPotentialStartUpNodes(); + // Get Graph Info Betweent input target gradnode and outputs + // Record the depending_nodes and + // potential_stop_nodes、potential_startup_nodes + GetGraphInfoBetweenTargets(*queue); + // Reset queue. Queue is empty only when + // 1.input equals to output. 2.input can not reach to output. 
+ ModifyReadyQueue(queue); + // Set result for input target grad_var when queue is empty + if (queue->empty()) SetResultForInputTargetVar(node_input_buffers_dict); + } + + bool IsPotentialStopNodes(GradNodeBase* node) { + return potential_stop_nodes.count(node); + } + + std::unordered_map* + GetNoGradVarNodesInputMetaMap() { + return &no_grad_var_nodes_inputmeta_map; + } + + std::unordered_map* + GetInPutTargetNodesInputMetaMap() { + return &input_target_nodes_inputmeta_map; + } + + std::unordered_set* GetPotentialStopNodes() { + return &potential_stop_nodes; + } + + std::unordered_set* GetPotentialStartupNodes() { + return &potential_startup_nodes; + } + + void Clear() { + no_grad_var_nodes_inputmeta_map.clear(); + input_target_nodes_inputmeta_map.clear(); + potential_startup_nodes.clear(); + potential_stop_nodes.clear(); + depending_nodes.clear(); + results_map.clear(); } -} -// Get Graph Info Betweent input target gradnode and outputs, -// record depending_nodes、 potential_stop_nodes、potential_startup_nodes -void GetGraphInfoBetweenTargets( - const std::queue& init_queue, - std::unordered_map* - input_target_nodes_inputmeta_map, - std::unordered_map>* - depending_nodes, - std::unordered_set* potential_stop_nodes, - std::unordered_set* potential_startup_nodes) { - if (input_target_nodes_inputmeta_map->empty()) return; - - VLOG(6) << "Runing In GetGraphInfoBetweenTargets"; + private: + GeneralGrad() = default; + static GeneralGrad* general_grad_; + // no_grad_vars's GradNode and GradNode's InputMeta. + std::unordered_map + no_grad_var_nodes_inputmeta_map; + // inputs's GradNode and GradNode's InputMeta. + std::unordered_map + input_target_nodes_inputmeta_map; + // Record all the potential startup_nodes, will be changed. + std::unordered_set potential_startup_nodes; + // Record all the potential stop nodes, will be changed. 
+ std::unordered_set potential_stop_nodes; + std::unordered_map /* pre nodes */> + depending_nodes; + std::unordered_map results_map; + DISABLE_COPY_AND_ASSIGN(GeneralGrad); +}; +std::unordered_map getInDegreeMap( + const std::queue& init_queue) { // Calculate in_degree for each node + // We can completely remove this pass, if in_degree were set during forward + // pass std::unordered_map node_in_degree_map; // Copy nodes @@ -171,101 +369,30 @@ void GetGraphInfoBetweenTargets( } visited.insert(node); - // Check node is target_nodes or not, if node is not target_node, - // all the next_node will be marked in potential_stop_nodes - bool is_potential_stop_nodes = - input_target_nodes_inputmeta_map->count(node); - + PADDLE_ENFORCE_NOT_NULL( + node, + paddle::platform::errors::Fatal( + "We got null node when we traverse the backward graph, and this " + "should not happened please check your code and contact us.")); // Find and append next nodes const std::vector>& edges = node->GetEdges(); for (const auto& edge_list : edges) { for (const Edge& edge : edge_list) { GradNodeBase* next_node = edge.GetMutableGradNode().get(); - // Next node could be nullptr if it is leaf tensor with no // AccumulationNode attached // Or it could also originated from dispensable inputs if (!next_node) continue; - // if node not in input_target_nodes, - // all the next_nodes of current node will be inserted to - // potential_stop_node - if (is_potential_stop_nodes) { - potential_stop_nodes->emplace(next_node); - } - // Update in_degree if (!node_in_degree_map.count(next_node)) node_in_degree_map[next_node] = 0; node_in_degree_map[next_node]++; - - // Record depending relationship - (*depending_nodes)[next_node].emplace(node); queue.push(next_node); } } } - // Update Graph Info, remove some stop_node in potential_stop_nodes - UpdateGraphInfo(input_target_nodes_inputmeta_map, depending_nodes, - potential_stop_nodes, potential_startup_nodes); -} - -void GetTargetNodesInfo(const std::vector& inputs, - std::unordered_map* - target_nodes_inputmeta_map) { - VLOG(6) << "Running in GetTargetNodesInfo"; - if (!inputs.empty()) { - VLOG(6) << "Inputs are not empty"; - size_t num_inputs = inputs.size(); - for (size_t i = 0; i < num_inputs; i++) { - AutogradMeta* auto_grad_meta = - EagerUtils::unsafe_autograd_meta(inputs[i]); - auto target_node = auto_grad_meta->GetMutableGradNode().get(); - - PADDLE_ENFORCE_NOT_NULL(target_node, - paddle::platform::errors::Fatal( - "There is no grad op for input:%d or it's" - "stop_gradient=True", - i)); - (*target_nodes_inputmeta_map)[target_node] = auto_grad_meta; - } - } -} - -std::vector GetResults( - const std::vector& inputs, - std::unordered_map* - results_map, - bool allow_unused, bool create_graph) { - VLOG(6) << "Running in GetResults"; - if (inputs.empty()) return {}; - - std::vector results; - results.reserve(inputs.size()); - - for (size_t i = 0; i < inputs.size(); ++i) { - auto& input = inputs[i]; - AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(input); - auto target_node = auto_grad_meta->GetMutableGradNode().get(); - - auto iter = results_map->find(target_node); - if (iter != results_map->end()) { - // set StopGradient = !create_graph - AutogradMeta* tensor_auto_grad_meta = - EagerUtils::autograd_meta(&(iter->second)); - tensor_auto_grad_meta->SetStopGradient(!create_graph); - results.emplace_back(iter->second); - } else { - PADDLE_ENFORCE_EQ(allow_unused, true, - paddle::platform::errors::InvalidArgument( - "The %d-th input does not appear in the backward " - 
"graph. Please check the input variable or set " - "allow_unused=True to get None result.", - i)); - results.emplace_back(); - } - } - return results; + return node_in_degree_map; } // Enforce GradNode has TensorWrappers as Input @@ -281,28 +408,23 @@ void EnforceGradNodeHasInput(GradNodeBase* node) { node->name())); } -// Purify potential_startup_nodes, remove nodes those are the same as -// input_target_nodes -void PurifyPotentialStartUpNodes( - std::unordered_set* potential_startup_nodes, - std::unordered_map* - input_target_nodes_inputmeta_map) { - VLOG(6) << "Running in PurifyPotentialStartUpNodes"; - if (input_target_nodes_inputmeta_map->empty()) return; - std::unordered_set potential_startup_nodes_to_be_erased; - for (auto startup_op : *potential_startup_nodes) { - auto iter = input_target_nodes_inputmeta_map->find(startup_op); - if (iter != input_target_nodes_inputmeta_map->end()) { - potential_startup_nodes_to_be_erased.emplace(iter->first); - } - } - if (!potential_startup_nodes_to_be_erased.empty()) { - for (auto nodes : potential_startup_nodes_to_be_erased) { - potential_startup_nodes->erase(nodes); - } +void DuplicateCheck(const std::vector& inputs, + bool is_input) { + std::unordered_set visisted_ins; + std::string msg = is_input ? "inputs" : "outputs"; + for (auto in : inputs) { + AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(in); + PADDLE_ENFORCE_EQ( + visisted_ins.count(auto_grad_meta), 0, + paddle::platform::errors::AlreadyExists( + "%s contain duplicate tensor %s, please check %s carefully.", msg, + in.name(), msg)); + visisted_ins.insert(auto_grad_meta); } } +GeneralGrad* GeneralGrad::general_grad_ = new GeneralGrad(); + std::vector RunBackward( const std::vector& tensors, // output const std::vector& grad_tensors, @@ -315,10 +437,8 @@ std::vector RunBackward( // *Inplace version check should perform at node-level // *Cross-batch accumulation happens at forward pass - std::unordered_map - no_grad_var_nodes_inputmeta_map; - // Get no_grad_vars's GradNodes and InputMeta Info - GetTargetNodesInfo(no_grad_vars, &no_grad_var_nodes_inputmeta_map); + // GeneralGrad + bool is_general_grad = !inputs.empty(); /* --- Initialization --- */ // 1. 
Init queue with starting nodes @@ -326,7 +446,6 @@ std::vector RunBackward( std::queue queue; std::unordered_map> node_input_buffers_dict; - std::unordered_set potential_startup_nodes; for (size_t i = 0; i < tensors.size(); i++) { const paddle::experimental::Tensor& tensor = tensors[i]; @@ -363,7 +482,7 @@ std::vector RunBackward( paddle::platform::errors::Fatal( "Detected size mismatch between tensors and grad_tensors" "grad_tensors should either have " - "size = 0 or same size as tensors")); + "size = 0 or same size as tensors.")); // Feed given tensor if it's provided VLOG(6) << "Fill grad input tensor " << i << "with give grad tensor"; @@ -391,7 +510,9 @@ std::vector RunBackward( // Prepare queue, potential startup_nodes queue.push(grad_node); - potential_startup_nodes.emplace(grad_node); + if (is_general_grad) { + GeneralGrad::Instance().GetPotentialStartupNodes()->emplace(grad_node); + } } VLOG(6) << "Update In degree Map for backward"; @@ -399,56 +520,13 @@ std::vector RunBackward( std::unordered_map node_in_degree_map = getInDegreeMap(queue); - // Get input's GradNodes and InputMeta Info - std::unordered_map - input_target_nodes_inputmeta_map; - GetTargetNodesInfo(inputs, &input_target_nodes_inputmeta_map); - - // Purify potential_startup_ops, remove those nodes that are the same as - // input_target_nodes - PurifyPotentialStartUpNodes(&potential_startup_nodes, - &input_target_nodes_inputmeta_map); - - // Get Graph Info Betweent input target gradnode and outputs - // Record the depending_nodes and potential_stop_nodes - std::unordered_map /* father node */> - depending_nodes; - std::unordered_set potential_stop_nodes; - // std::unordered_set startup_ops; - - GetGraphInfoBetweenTargets(queue, &input_target_nodes_inputmeta_map, - &depending_nodes, &potential_stop_nodes, - &potential_startup_nodes); - - // ready_queue store all startup nodes - std::queue ready_queue; - // startup op's indegree should be 0 - for (auto node : potential_startup_nodes) { - if (node_in_degree_map[node] == 0) { - ready_queue.emplace(node); - } + if (is_general_grad) { + // Prepare several vital preprocess for GeneralGrad + GeneralGrad::Instance().PreparedForGeneralGrad(inputs, no_grad_vars, &queue, + node_input_buffers_dict); } - VLOG(1) << " startup_ops' size is :" << ready_queue.size(); - - std::unordered_map results_map; - - // read_queue is empty only when 1.input equals to output. 2.input can not - // reach to output. - if (ready_queue.size() == 0) { - for (auto input_target_node : input_target_nodes_inputmeta_map) { - // out rank_info of forward op - auto rank_info = input_target_node.second->OutRankInfo(); - if (node_input_buffers_dict[input_target_node.first]) { - auto& target_result = - node_input_buffers_dict[input_target_node.first] - ->Buffers()[rank_info.first][rank_info.second]; - // save the target result - results_map[input_target_node.first] = target_result; - } - } - } + VLOG(6) << " startup_ops' size is :" << queue.size(); /* --- Topological Visit --- */ // 1. Pop queue @@ -458,53 +536,55 @@ std::vector RunBackward( // |- Prepare for next node // 3. 
Update queue VLOG(6) << "Run Backward"; - while (!ready_queue.empty()) { - GradNodeBase* node = ready_queue.front(); + while (!queue.empty()) { + GradNodeBase* node = queue.front(); VLOG(6) << "Running GradNode:" << node->name(); - ready_queue.pop(); paddle::platform::RecordEvent node_record_event( std::string(typeid(*node).name()) + " grad_node", paddle::platform::TracerEventType::Operator, 1); + if (queue.size() > 1 && node_in_degree_map[node] != 0) { + queue.pop(); + continue; + } + queue.pop(); + // Run node: This is where Hook happens PADDLE_ENFORCE( node_input_buffers_dict.count(node), paddle::platform::errors::Fatal( "Unable to find next node in the GradTensorHolder \n" - "Trying to run Node without configuring its GradTensorHolder")); + "Trying to run Node without configuring its GradTensorHolder.")); std::unique_ptr node_input_buffer = std::move(node_input_buffers_dict[node]); - // get target grad_var from node_input_buffer by inputmeta - if (input_target_nodes_inputmeta_map.find(node) != - input_target_nodes_inputmeta_map.end()) { - VLOG(6) << "Get target result by by inputmeta"; - // out rank_info of forward op - auto rank_info = input_target_nodes_inputmeta_map[node]->OutRankInfo(); - // rank_info is a pair, first means slot_id, second means rank. - auto& target_result = - node_input_buffer->Buffers()[rank_info.first][rank_info.second]; - // save the target result - results_map[node] = target_result; + // Set input target grad_var from node_input_buffer by inputmeta + if (!inputs.empty() && is_general_grad) { + GeneralGrad::Instance().SetResultForInputTargetVar(*node_input_buffer, + node); } // no_grad_vars - if (no_grad_var_nodes_inputmeta_map.find(node) != - no_grad_var_nodes_inputmeta_map.end()) { - VLOG(6) << "Change the input buffer[slot][rank] by Zeros"; - auto rank_info = no_grad_var_nodes_inputmeta_map[node]->OutRankInfo(); - node_input_buffer->SetBufferSlotRankZeros(rank_info.first, - rank_info.second); + if (!no_grad_vars.empty() && is_general_grad) { + auto iter = + GeneralGrad::Instance().GetNoGradVarNodesInputMetaMap()->find(node); + if (iter != + GeneralGrad::Instance().GetNoGradVarNodesInputMetaMap()->end()) { + VLOG(6) << "Change the input buffer[slot][rank] by Zeros"; + auto rank_info = (iter->second)->OutRankInfo(); + node_input_buffer->SetBufferSlotRankZeros(rank_info.first, + rank_info.second); + } } VLOG(6) << "Running GradNode:" << node->name(); - // check input + // Check input EnforceGradNodeHasInput(node); - VLOG(6) << "Run Backward Kernel with GradTensorHolder"; + VLOG(6) << "Run Backward Kernel with GradTensorHolder."; // Run Pre Backward Node and get outputs std::vector> grad_output_tensors = (*node)(node_input_buffer->Buffers(), create_graph); @@ -587,23 +667,29 @@ std::vector RunBackward( node_in_degree_map[next_node] >= 0, paddle::platform::errors::Fatal( "Detected in-degree value smaller than zero. 
For Node: %s" - "Node's in-degree cannot be negative", + "Node's in-degree cannot be negative.", next_node->name())); - bool is_potential_stop_node = potential_stop_nodes.count(next_node); - - if (node_in_degree_map[next_node] == 0 && !is_potential_stop_node) { - ready_queue.emplace(std::move(next_node)); + if (is_general_grad) { + bool is_potential_stop_node = + GeneralGrad::Instance().GetPotentialStopNodes()->count(next_node); + if (node_in_degree_map[next_node] == 0 && !is_potential_stop_node) { + queue.emplace(std::move(next_node)); + } + } else { + if (node_in_degree_map[next_node] == 0) { + queue.emplace(std::move(next_node)); + } } } } } - - return GetResults(inputs, &results_map, allow_unused, create_graph); + if (!is_general_grad) return {}; + return GeneralGrad::Instance().GetResults(inputs, allow_unused, create_graph); } void Backward( - const std::vector& tensors, // output + const std::vector& tensors, // outputs const std::vector& grad_tensors, bool retain_graph) { VLOG(6) << "Run in Backward"; @@ -613,12 +699,16 @@ void Backward( } std::vector Grad( - const std::vector& tensors, // output + const std::vector& tensors, // outputs const std::vector& inputs, const std::vector& grad_tensors, bool retain_graph, bool create_graph, bool only_inputs, bool allow_unused, const std::vector& no_grad_vars) { VLOG(6) << "Run in Grad"; + + DuplicateCheck(inputs, true /* is_input */); + DuplicateCheck(tensors, false /* is_input */); + return RunBackward(tensors, grad_tensors, retain_graph, create_graph, inputs, allow_unused, no_grad_vars); } diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 163d25e85ce8c085087331c6e3273075aed5e5f4..038ad09aa4d8bef1282c024559b60d0eed7e48d1 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -93,7 +93,7 @@ void GradTensorHolder::add(size_t slot_id, size_t rank, // Create new tensor->impl and fill it with 1.0 if (t.defined()) { // Fill 1.0 - buffer_[slot_id][rank] = paddle::experimental::ones_like(t); + buffer_[slot_id][rank] = paddle::experimental::ones_like(t, t.dtype()); } } } diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index b8f9f0bfec9b2a0bf6b6fb1e122e40b3eaa90fa8..3d1599a76e8ebcf8d379e6d44d6cc475ab4b0b33 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once +#include #include "heter_comm.h" #include "paddle/fluid/distributed/ps/table/common_graph_table.h" #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" @@ -40,11 +41,13 @@ class GpuPsGraphTable : public HeterComm { int sample_size, int len); NodeQueryResult *query_node_list(int gpu_id, int start, int query_size); void clear_graph_info(); - void move_neighbor_sample_result_to_source_gpu(int gpu_id, int gpu_num, - int sample_size, int *h_left, - int *h_right, - int64_t *src_sample_res, - int *actual_sample_size); + void move_neighbor_sample_result_to_source_gpu( + int gpu_id, int gpu_num, int *h_left, int *h_right, + int64_t *src_sample_res, thrust::host_vector &total_sample_size); + void move_neighbor_sample_size_to_source_gpu(int gpu_id, int gpu_num, + int *h_left, int *h_right, + int *actual_sample_size, + int *total_sample_size); int init_cpu_table(const paddle::distributed::GraphParameter &graph); int load(const std::string &path, const std::string ¶m); virtual int32_t end_graph_sampling() { diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h index 16a6857ae96eecaaa06b92b9912387f22612f53e..acd3f0a290d0b1b40ef71dd11b2741452f41e773 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h @@ -13,10 +13,23 @@ // limitations under the License. #pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + #ifdef PADDLE_WITH_HETERPS //#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" namespace paddle { namespace framework { + +constexpr int WARP_SIZE = 32; + /* comment 0 this kernel just serves as an example of how to sample nodes' neighbors. @@ -29,20 +42,79 @@ sample_size; */ -__global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index, - int* actual_size, - int64_t* sample_result, int sample_size, - int len) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { +struct MaxFunctor { + int sample_size; + HOSTDEVICE explicit inline MaxFunctor(int sample_size) { + this->sample_size = sample_size; + } + HOSTDEVICE inline int operator()(int x) const { + if (x > sample_size) { + return sample_size; + } + return x; + } +}; + +struct DegreeFunctor { + GpuPsCommGraph graph; + HOSTDEVICE explicit inline DegreeFunctor(GpuPsCommGraph graph) { + this->graph = graph; + } + HOSTDEVICE inline int operator()(int i) const { + return graph.node_list[i].neighbor_size; + } +}; + +template +__global__ void neighbor_sample(const uint64_t rand_seed, GpuPsCommGraph graph, + int sample_size, int* index, int len, + int64_t* sample_result, int* output_idx, + int* output_offset) { + assert(blockDim.x == WARP_SIZE); + assert(blockDim.y == BLOCK_WARPS); + + int i = blockIdx.x * TILE_SIZE + threadIdx.y; + const int last_idx = min(static_cast(blockIdx.x + 1) * TILE_SIZE, len); + curandState rng; + curand_init(rand_seed * gridDim.x + blockIdx.x, + threadIdx.y * WARP_SIZE + threadIdx.x, 0, &rng); + + while (i < last_idx) { auto node_index = index[i]; - actual_size[i] = graph.node_list[node_index].neighbor_size < sample_size - ? 
graph.node_list[node_index].neighbor_size - : sample_size; - int offset = graph.node_list[node_index].neighbor_offset; - for (int j = 0; j < actual_size[i]; j++) { - sample_result[sample_size * i + j] = graph.neighbor_list[offset + j]; + int degree = graph.node_list[node_index].neighbor_size; + const int offset = graph.node_list[node_index].neighbor_offset; + int output_start = output_offset[i]; + + if (degree <= sample_size) { + // Just copy + for (int j = threadIdx.x; j < degree; j += WARP_SIZE) { + sample_result[output_start + j] = graph.neighbor_list[offset + j]; + } + } else { + for (int j = threadIdx.x; j < degree; j += WARP_SIZE) { + output_idx[output_start + j] = j; + } + + __syncwarp(); + + for (int j = sample_size + threadIdx.x; j < degree; j += WARP_SIZE) { + const int num = curand(&rng) % (j + 1); + if (num < sample_size) { + atomicMax( + reinterpret_cast(output_idx + output_start + num), + static_cast(j)); + } + } + + __syncwarp(); + + for (int j = threadIdx.x; j < sample_size; j += WARP_SIZE) { + const int perm_idx = output_idx[output_start + j] + offset; + sample_result[output_start + j] = graph.neighbor_list[perm_idx]; + } } + + i += BLOCK_WARPS; } } @@ -79,7 +151,7 @@ int GpuPsGraphTable::load(const std::string& path, const std::string& param) { gpu i triggers a neighbor_sample task, when this task is done, this function is called to move the sample result on other gpu back - to gup i and aggragate the result. + to gpu i and aggragate the result. the sample_result is saved on src_sample_res and the actual sample size for each node is saved on actual_sample_size. the number of actual sample_result for @@ -96,10 +168,50 @@ int GpuPsGraphTable::load(const std::string& path, const std::string& param) { that's what fill_dvals does. */ +void GpuPsGraphTable::move_neighbor_sample_size_to_source_gpu( + int gpu_id, int gpu_num, int* h_left, int* h_right, int* actual_sample_size, + int* total_sample_size) { + // This function copyed actual_sample_size to source_gpu, + // and calculate total_sample_size of each gpu sample number. 
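+ // Step 1: for every remote gpu shard, async-copy the per-node counts
+ // (stored at int offset shard_len inside node.val_storage) back into
+ // actual_sample_size on the source gpu.
+ // Step 2: synchronize each out_stream, then reduce the shard's counts with
+ // thrust::reduce to fill total_sample_size[i].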
+ for (int i = 0; i < gpu_num; i++) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + auto shard_len = h_right[i] - h_left[i] + 1; + auto& node = path_[gpu_id][i].nodes_.front(); + cudaMemcpyAsync(reinterpret_cast(actual_sample_size + h_left[i]), + node.val_storage + sizeof(int) * shard_len, + sizeof(int) * shard_len, cudaMemcpyDefault, + node.out_stream); + } + for (int i = 0; i < gpu_num; ++i) { + if (h_left[i] == -1 || h_right[i] == -1) { + total_sample_size[i] = 0; + continue; + } + auto& node = path_[gpu_id][i].nodes_.front(); + cudaStreamSynchronize(node.out_stream); + + auto shard_len = h_right[i] - h_left[i] + 1; + thrust::device_vector t_actual_sample_size(shard_len); + thrust::copy(actual_sample_size + h_left[i], + actual_sample_size + h_left[i] + shard_len, + t_actual_sample_size.begin()); + total_sample_size[i] = thrust::reduce(t_actual_sample_size.begin(), + t_actual_sample_size.end()); + } +} void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( - int gpu_id, int gpu_num, int sample_size, int* h_left, int* h_right, - int64_t* src_sample_res, int* actual_sample_size) { + int gpu_id, int gpu_num, int* h_left, int* h_right, int64_t* src_sample_res, + thrust::host_vector& total_sample_size) { + /* + if total_sample_size is [4, 5, 1, 6], + then cumsum_total_sample_size is [0, 4, 9, 10]; + */ + thrust::host_vector cumsum_total_sample_size(gpu_num, 0); + thrust::exclusive_scan(total_sample_size.begin(), total_sample_size.end(), + cumsum_total_sample_size.begin(), 0); for (int i = 0; i < gpu_num; i++) { if (h_left[i] == -1 || h_right[i] == -1) { continue; @@ -109,14 +221,10 @@ void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( // auto& node = path_[gpu_id][i].nodes_[cur_step]; auto& node = path_[gpu_id][i].nodes_.front(); cudaMemcpyAsync( - reinterpret_cast(src_sample_res + h_left[i] * sample_size), + reinterpret_cast(src_sample_res + cumsum_total_sample_size[i]), node.val_storage + sizeof(int64_t) * shard_len, - node.val_bytes_len - sizeof(int64_t) * shard_len, cudaMemcpyDefault, + sizeof(int64_t) * total_sample_size[i], cudaMemcpyDefault, node.out_stream); - cudaMemcpyAsync(reinterpret_cast(actual_sample_size + h_left[i]), - node.val_storage + sizeof(int) * shard_len, - sizeof(int) * shard_len, cudaMemcpyDefault, - node.out_stream); } for (int i = 0; i < gpu_num; ++i) { if (h_left[i] == -1 || h_right[i] == -1) { @@ -131,17 +239,35 @@ void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( TODO: how to optimize it to eliminate the for loop */ -__global__ void fill_dvalues(int64_t* d_shard_vals, int64_t* d_vals, - int* d_shard_actual_sample_size, - int* d_actual_sample_size, int* idx, - int sample_size, int len) { +__global__ void fill_dvalues_actual_sample_size(int* d_shard_actual_sample_size, + int* d_actual_sample_size, + int* idx, int len) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { d_actual_sample_size[idx[i]] = d_shard_actual_sample_size[i]; - // d_vals[idx[i]] = d_shard_vals[i]; - for (int j = 0; j < sample_size; j++) { - d_vals[idx[i] * sample_size + j] = d_shard_vals[i * sample_size + j]; + } +} + +template +__global__ void fill_dvalues_sample_result(int64_t* d_shard_vals, + int64_t* d_vals, + int* d_actual_sample_size, int* idx, + int* offset, int* d_offset, + int len) { + assert(blockDim.x == WARP_SIZE); + assert(blockDim.y == BLOCK_WARPS); + + int i = blockIdx.x * TILE_SIZE + threadIdx.y; + const int last_idx = min(static_cast(blockIdx.x + 1) * TILE_SIZE, len); + while (i < last_idx) { + const 
int sample_size = d_actual_sample_size[idx[i]]; + for (int j = threadIdx.x; j < sample_size; j += WARP_SIZE) { + d_vals[offset[idx[i]] + j] = d_shard_vals[d_offset[i] + j]; } +#ifdef PADDLE_WITH_CUDA + __syncwarp(); +#endif + i += BLOCK_WARPS; } } @@ -255,14 +381,12 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, h_left = [0,5],h_right = [4,8] */ + NeighborSampleResult* result = new NeighborSampleResult(sample_size, len); if (len == 0) { return result; } - cudaMalloc((void**)&result->val, len * sample_size * sizeof(int64_t)); - cudaMalloc((void**)&result->actual_sample_size, len * sizeof(int)); - int* actual_sample_size = result->actual_sample_size; - int64_t* val = result->val; + int total_gpu = resource_->total_gpu(); int dev_id = resource_->dev_id(gpu_id); platform::CUDAPlace place = platform::CUDAPlace(dev_id); @@ -287,11 +411,6 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t)); int64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - auto d_shard_vals = memory::Alloc(place, sample_size * len * sizeof(int64_t)); - int64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); - auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); - int* d_shard_actual_sample_size_ptr = - reinterpret_cast(d_shard_actual_sample_size->ptr()); split_input_to_shard(key, d_idx_ptr, len, d_left_ptr, d_right_ptr, gpu_id); @@ -331,6 +450,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, of alloc_mem_i, actual_sample_size_of_x equals ((int *)alloc_mem_i)[shard_len + x] */ + create_storage(gpu_id, i, shard_len * sizeof(int64_t), shard_len * (1 + sample_size) * sizeof(int64_t)); } @@ -351,6 +471,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, h_right[i] - h_left[i] + 1, resource_->remote_stream(i, gpu_id)); } + for (int i = 0; i < total_gpu; ++i) { if (h_left[i] == -1) { continue; @@ -364,10 +485,42 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, int* res_array = reinterpret_cast(node.val_storage); int* actual_size_array = res_array + shard_len; int64_t* sample_array = (int64_t*)(res_array + shard_len * 2); - neighbor_sample_example<<remote_stream(i, gpu_id)>>>( - graph, res_array, actual_size_array, sample_array, sample_size, - shard_len); + + // 1. get actual_size_array. + // 2. get sum of actual_size. + // 3. 
get offset ptr + thrust::device_vector t_res_array(shard_len); + thrust::copy(res_array, res_array + shard_len, t_res_array.begin()); + thrust::device_vector t_actual_size_array(shard_len); + thrust::transform(t_res_array.begin(), t_res_array.end(), + t_actual_size_array.begin(), DegreeFunctor(graph)); + + if (sample_size >= 0) { + thrust::transform(t_actual_size_array.begin(), t_actual_size_array.end(), + t_actual_size_array.begin(), MaxFunctor(sample_size)); + } + + thrust::copy(t_actual_size_array.begin(), t_actual_size_array.end(), + actual_size_array); + + int total_sample_sum = + thrust::reduce(t_actual_size_array.begin(), t_actual_size_array.end()); + + thrust::device_vector output_idx(total_sample_sum); + thrust::device_vector output_offset(shard_len); + thrust::exclusive_scan(t_actual_size_array.begin(), + t_actual_size_array.end(), output_offset.begin(), 0); + + constexpr int BLOCK_WARPS = 128 / WARP_SIZE; + constexpr int TILE_SIZE = BLOCK_WARPS * 16; + const dim3 block_(WARP_SIZE, BLOCK_WARPS); + const dim3 grid_((shard_len + TILE_SIZE - 1) / TILE_SIZE); + neighbor_sample< + BLOCK_WARPS, + TILE_SIZE><<remote_stream(i, gpu_id)>>>( + 0, graph, sample_size, res_array, shard_len, sample_array, + thrust::raw_pointer_cast(output_idx.data()), + thrust::raw_pointer_cast(output_offset.data())); } for (int i = 0; i < total_gpu; ++i) { @@ -378,13 +531,56 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, tables_[i]->rwlock_->UNLock(); } // walk_to_src(num, total_gpu, h_left, h_right, d_shard_vals_ptr); - move_neighbor_sample_result_to_source_gpu(gpu_id, total_gpu, sample_size, - h_left, h_right, d_shard_vals_ptr, - d_shard_actual_sample_size_ptr); - fill_dvalues<<>>( - d_shard_vals_ptr, val, d_shard_actual_sample_size_ptr, actual_sample_size, - d_idx_ptr, sample_size, len); + auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); + int* d_shard_actual_sample_size_ptr = + reinterpret_cast(d_shard_actual_sample_size->ptr()); + // Store total sample number of each gpu. 
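+ // One entry per gpu, kept on the host; it is filled by
+ // move_neighbor_sample_size_to_source_gpu below and summed into
+ // allocate_sample_num to size the flat sample buffer.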
+ thrust::host_vector d_shard_total_sample_size(total_gpu, 0); + move_neighbor_sample_size_to_source_gpu( + gpu_id, total_gpu, h_left, h_right, d_shard_actual_sample_size_ptr, + thrust::raw_pointer_cast(d_shard_total_sample_size.data())); + int allocate_sample_num = 0; + for (int i = 0; i < total_gpu; ++i) { + allocate_sample_num += d_shard_total_sample_size[i]; + } + auto d_shard_vals = + memory::Alloc(place, allocate_sample_num * sizeof(int64_t)); + int64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); + move_neighbor_sample_result_to_source_gpu(gpu_id, total_gpu, h_left, h_right, + d_shard_vals_ptr, + d_shard_total_sample_size); + + cudaMalloc((void**)&result->val, allocate_sample_num * sizeof(int64_t)); + cudaMalloc((void**)&result->actual_sample_size, len * sizeof(int)); + cudaMalloc((void**)&result->offset, len * sizeof(int)); + int64_t* val = result->val; + int* actual_sample_size = result->actual_sample_size; + int* offset = result->offset; + + fill_dvalues_actual_sample_size<<>>( + d_shard_actual_sample_size_ptr, actual_sample_size, d_idx_ptr, len); + thrust::device_vector t_actual_sample_size(len); + thrust::copy(actual_sample_size, actual_sample_size + len, + t_actual_sample_size.begin()); + thrust::exclusive_scan(t_actual_sample_size.begin(), + t_actual_sample_size.end(), offset, 0); + int* d_offset; + cudaMalloc(&d_offset, len * sizeof(int)); + thrust::copy(d_shard_actual_sample_size_ptr, + d_shard_actual_sample_size_ptr + len, + t_actual_sample_size.begin()); + thrust::exclusive_scan(t_actual_sample_size.begin(), + t_actual_sample_size.end(), d_offset, 0); + constexpr int BLOCK_WARPS_ = 128 / WARP_SIZE; + constexpr int TILE_SIZE_ = BLOCK_WARPS_ * 16; + const dim3 block__(WARP_SIZE, BLOCK_WARPS_); + const dim3 grid__((len + TILE_SIZE_ - 1) / TILE_SIZE_); + fill_dvalues_sample_result<<>>( + d_shard_vals_ptr, val, actual_sample_size, d_idx_ptr, offset, d_offset, + len); + cudaStreamSynchronize(stream); for (int i = 0; i < total_gpu; ++i) { int shard_len = h_left[i] == -1 ? 
0 : h_right[i] - h_left[i] + 1; @@ -393,6 +589,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, } destroy_storage(gpu_id, i); } + cudaFree(d_offset); return result; } diff --git a/paddle/fluid/framework/fleet/heter_ps/test_graph.cu b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu index 697e0ba2cdf3475d1e7ad48105bc55959461900f..06c7026eb51ca8ed808d528391ab6723fd83831c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_graph.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu @@ -94,19 +94,44 @@ TEST(TEST_FLEET, graph_comm) { 0 --index--->0 7 --index-->2 */ + int64_t cpu_key[3] = {7, 0, 6}; void *key; cudaMalloc((void **)&key, 3 * sizeof(int64_t)); cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3); - res = new int64_t[9]; - cudaMemcpy(res, neighbor_sample_res->val, 72, cudaMemcpyDeviceToHost); - int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23}; - for (int i = 0; i < 9; i++) { - if (expected_sample_val[i] != -1) { - ASSERT_EQ(res[i], expected_sample_val[i]); + res = new int64_t[7]; + cudaMemcpy(res, neighbor_sample_res->val, 56, cudaMemcpyDeviceToHost); + int *actual_sample_size = new int[3]; + cudaMemcpy(actual_sample_size, neighbor_sample_res->actual_sample_size, 12, + cudaMemcpyDeviceToHost); // 3, 1, 3 + int *cumsum_sample_size = new int[3]; + cudaMemcpy(cumsum_sample_size, neighbor_sample_res->offset, 12, + cudaMemcpyDeviceToHost); // 0, 3, 4 + + std::vector> neighbors_; + std::vector neighbors_7 = {28, 29, 30, 31, 32, 33, 34, 35}; + std::vector neighbors_0 = {0}; + std::vector neighbors_6 = {21, 22, 23, 24, 25, 26, 27}; + neighbors_.push_back(neighbors_7); + neighbors_.push_back(neighbors_0); + neighbors_.push_back(neighbors_6); + for (int i = 0; i < 3; i++) { + for (int j = cumsum_sample_size[i]; + j < cumsum_sample_size[i] + actual_sample_size[i]; j++) { + bool flag = false; + for (int k = 0; k < neighbors_[i].size(); k++) { + if (res[j] == neighbors_[i][k]) { + flag = true; + break; + } + } + ASSERT_EQ(flag, true); } } + delete[] res; + delete[] actual_sample_size; + delete[] cumsum_sample_size; delete neighbor_sample_res; } diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc index 9fe50deaf2d72679bc5c41038936d01cad9de498..7cdb7a8854aad01d6589b0c5d555e07a23377a61 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -25,14 +25,14 @@ std::set ignored_ops = { "sum", "clip", "clip_by_norm", - "square", "reduce_sum", "sqrt", "elementwise_max", "elementwise_div", "elementwise_mul", - "scale", // adamax - "assign", // adamw + "scale", // adamax + "assign", // adamw + "squared_l2_norm" // gradient_clip_norm }; const bool startswith(const std::string& str, const std::string& pre) { @@ -62,6 +62,10 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { new_op.SetAttr("with_lr_sched", false); std::set set_ops{}; + // save the weight decay tensor_name and weight_decay_value for Lamb + std::vector weight_decay_vars{}; + std::vector weight_decay_values{}; + // use map store ? 
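+ // weight_decay_vars[i] pairs with weight_decay_values[i]; both vectors are
+ // attached to the extracted optimizer op as attributes later in this pass.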
for (auto* node : graph->Nodes()) { if (!node->IsOp()) { @@ -75,6 +79,15 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { auto op_role = static_cast(op_role_); if (op_role == OpRole::kOptimize) { + // save weight decay value from every lamb optimizer op + if (op_type == "lamb" && op->HasAttr("weight_decay")) { + auto weight_decay_value = + BOOST_GET_CONST(float, op->GetAttr("weight_decay")); + auto params = op->Output("ParamOut"); + weight_decay_vars.push_back(params[0]); + weight_decay_values.push_back(weight_decay_value); + } + if (set_ops.count(op_type)) { continue; } @@ -270,7 +283,10 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { // seems with_lr_sched is always true new_op.SetAttr("with_lr_sched", true); - // setup weight deacy + // setup weight decay for Lamb + new_op.SetAttr("weight_decay_vars", weight_decay_vars); + new_op.SetAttr("weight_decay_values", weight_decay_values); + // weight_decay/coeff is "scale" attr of scale_op if (set_ops.count("scale") && set_ops.count("sum")) { if (set_ops.count("sign")) { diff --git a/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc b/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc index e754ba72ad857b8a0d366d5e97fbdcf20fa882d3..5cd8358dc083eb3f76eddcba79dae2d9352c11e2 100644 --- a/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc +++ b/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc @@ -30,7 +30,8 @@ void TransferCastOpPass::ApplyImpl(ir::Graph* graph) const { auto ipu_backend = platform::ipu::IpuBackend::GetInstance(); auto enable_fp16 = ipu_backend->GetIpuStrategy()->enable_fp16; - if (enable_fp16) { + auto transfer_cast_op = ipu_backend->GetIpuStrategy()->transfer_cast_op; + if (enable_fp16 && transfer_cast_op) { for (auto* node : graph->Nodes()) { if (node->IsOp() && node->Op()->Type() == "popart_cast") { if (BOOST_GET_CONST(std::string, node->Op()->GetAttr("to")) == diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index bf2cf58f970addf1dac9f4871ba4abe09c3c7b38..17663ecf6baa35f698aca35e451de34c647d2214 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -28,7 +28,7 @@ USE_OP_ITSELF(batch_norm); USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN); -USE_OP(conv2d_transpose); +USE_OP_ITSELF(conv2d_transpose); USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 5de861235461ff6670503f6372961bdcf0be5ec2..e8cd84248ea852ba2defff19cfe0d2bd2813c435 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -79,18 +79,6 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#ifdef PADDLE_WITH_IPU - else if (platform::is_ipu_place(src_place) && // NOLINT - platform::is_cpu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (platform::is_cpu_place(src_place) && - platform::is_ipu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (platform::is_ipu_place(src_place) && - platform::is_ipu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, 
src_place, src_ptr, size); - } -#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE else if (platform::is_custom_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { @@ -390,6 +378,29 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, "Copying from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_IPU + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_ipu_place(dst_place)) { + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_ipu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data sync from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copying from %s to %s is not supported.", src_place, dst_place)); + } +#endif } template @@ -447,27 +458,15 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#ifdef PADDLE_WITH_IPU - else if (platform::is_ipu_place(src_place) && // NOLINT - platform::is_cpu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (platform::is_cpu_place(src_place) && // NOLINT - platform::is_ipu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else { // NOLINT - PADDLE_THROW(platform::errors::Unimplemented( - "Copy from %s to %s is not supported.", src_place, dst_place)); - } -#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE else if (platform::is_custom_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { /* custom_device -> cpu*/ memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); - } + } // NOLINT else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_custom_place(dst_place)) { /* cpu -> custom_device*/ memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); - } + } // NOLINT else if (platform::is_custom_place(src_place) && // NOLINT platform::is_custom_place( dst_place)) { /* custom_device -> custom_device*/ @@ -483,11 +482,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } + } // NOLINT else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_xpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } + } // NOLINT else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_xpu_place(dst_place)) { if (src_ptr == dst_ptr) { @@ -502,7 +501,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto xpu_ctx = platform::DeviceContextPool::Instance().Get(xpu_dst_place); xpu_ctx->Wait(); } - } + } // NOLINT else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( "Copy from %s to %s is not supported.", src_place, dst_place)); @@ -601,6 +600,29 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_IPU + else if 
(platform::is_ipu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_ipu_place(dst_place)) { + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_ipu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data sync from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif } template diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index fec9afbf3b403ca2fd45633326c7f7dec46e1243..03fa46eab53678e5464f478a2dc0e42f0ea5210b 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -1109,8 +1109,9 @@ void Reducer::FinalizeBackward() { if (find_unused_vars_each_step_) { // TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \ + defined(PADDLE_WITH_CNCL) ProcessUnusedDenseVars(); #endif // Initialize local used vars diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc index b96992ef8514abe0f71dbf23d38abb626f6c4a5b..a856d1414446914909a1801d4175431896ee8de1 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" USE_OP_ITSELF(conv2d); -USE_OP(conv2d_transpose); +USE_OP_ITSELF(conv2d_transpose); namespace paddle { namespace inference { diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index bc29c92b0942620daa65d12ca4a68a36f1f17ea9..8a190c1a1e0911ab10c0bcbfa025df57f4eadcdf 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -40,6 +40,13 @@ class FeedVariableVisitor : public boost::static_visitor { out_var_->GetMutable(); if (platform::is_same_place(in_tensor.place(), place_)) { out_tensor->ShareDataWith(in_tensor); +#ifdef PADDLE_WITH_IPU + } else if (platform::is_ipu_place(place_)) { + // For ipu, both in_tensor and out_tensor are allocated on cpu, + // PopART will copy tensor from host automatically, + // no TensorCopy() is required here. 
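+ // ShareDataWith only aliases the host allocation; the actual
+ // host-to-device transfer is done later by PopART.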
+ out_tensor->ShareDataWith(in_tensor); +#endif } else { platform::DeviceContext *context = platform::DeviceContextPool::Instance().Get(place_); diff --git a/paddle/fluid/operators/conv_op_xpu.cc b/paddle/fluid/operators/conv_op_xpu.cc index ddfc6fe862c27a61c81e2bce694377ea1348a8b5..e4751f1f26008c3d443fc0126d3e6d68995a44e0 100644 --- a/paddle/fluid/operators/conv_op_xpu.cc +++ b/paddle/fluid/operators/conv_op_xpu.cc @@ -19,14 +19,16 @@ namespace operators { template class GemmConvXPUKernel : public framework::OpKernel { + using XPUT = typename XPUTypeTrait::Type; + public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); + void Compute(const framework::ExecutionContext &context) const override { + const Tensor *input = context.Input("Input"); // The filter will be reshaped in the calculations, // so here use an assignment operation, // that avoids modifying the variable in the Scope. Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); + Tensor *output = context.Output("Output"); output->mutable_data(context.GetPlace()); int groups = context.Attr("groups"); std::vector strides = context.Attr>("strides"); @@ -53,11 +55,16 @@ class GemmConvXPUKernel : public framework::OpKernel { const int img_h = static_cast(input->dims()[2]); const int img_w = static_cast(input->dims()[3]); const int f = static_cast(filter.dims()[0]); - auto& dev_ctx = context.template device_context(); - int r = xpu::conv2d( - dev_ctx.x_context(), input->data(), filter.data(), - output->data(), batch_size, img_c, img_h, img_w, f, ksize, - strides, paddings, dilations, groups, nullptr, nullptr, nullptr, true); + + const XPUT *input_data = reinterpret_cast(input->data()); + const XPUT *filter_data = reinterpret_cast(filter.data()); + XPUT *output_data = reinterpret_cast(output->data()); + + auto &dev_ctx = context.template device_context(); + int r = xpu::conv2d( + dev_ctx.x_context(), input_data, filter_data, output_data, batch_size, + img_c, img_h, img_w, f, ksize, strides, paddings, dilations, groups, + nullptr, nullptr, nullptr, true); PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, platform::errors::External("XPU conv kernel return wrong value[%d %s]", @@ -67,14 +74,16 @@ class GemmConvXPUKernel : public framework::OpKernel { template class GemmConvGradXPUKernel : public framework::OpKernel { + using XPUT = typename XPUTypeTrait::Type; + public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = + void Compute(const framework::ExecutionContext &context) const override { + const Tensor *input = context.Input("Input"); + const Tensor *output_grad = context.Input(framework::GradVarName("Output")); - Tensor* input_grad = + Tensor *input_grad = context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = + Tensor *filter_grad = context.Output(framework::GradVarName("Filter")); // The filter and filter_grad will be reshaped in the calculations, // so here use an assignment operation, @@ -107,19 +116,27 @@ class GemmConvGradXPUKernel : public framework::OpKernel { const int img_h = static_cast(input->dims()[2]); const int img_w = static_cast(input->dims()[3]); const int f = static_cast(filter.dims()[0]); + + const XPUT *input_data = reinterpret_cast(input->data()); + const XPUT *filter_data = reinterpret_cast(filter.data()); + const XPUT *output_grad_data = + reinterpret_cast(output_grad->data()); + XPUT 
*input_grad_data = nullptr; if (input_grad) { input_grad->mutable_data(context.GetPlace()); + input_grad_data = reinterpret_cast(input_grad->data()); } + XPUT *filter_grad_data = nullptr; if (filter_grad) { filter_grad->mutable_data(context.GetPlace()); + filter_grad_data = reinterpret_cast(filter_grad->data()); } - auto& dev_ctx = context.template device_context(); - int r = xpu::conv2d_grad( - dev_ctx.x_context(), input->data(), filter.data(), - output_grad->data(), input_grad ? input_grad->data() : nullptr, - filter_grad ? filter_grad->data() : nullptr, batch_size, img_c, - img_h, img_w, f, ksize, strides, paddings, dilations, groups, nullptr, - nullptr, nullptr, nullptr, nullptr, true); + auto &dev_ctx = context.template device_context(); + int r = xpu::conv2d_grad( + dev_ctx.x_context(), input_data, filter_data, output_grad_data, + input_grad_data, filter_grad_data, batch_size, img_c, img_h, img_w, f, + ksize, strides, paddings, dilations, groups, nullptr, nullptr, nullptr, + nullptr, nullptr, true); PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, platform::errors::External("XPU conv kernel return wrong value[%d %s]", @@ -130,14 +147,22 @@ class GemmConvGradXPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( - depthwise_conv2d, - ops::GemmConvXPUKernel); -REGISTER_OP_XPU_KERNEL( - conv2d, ops::GemmConvXPUKernel); + conv2d, ops::GemmConvXPUKernel, + ops::GemmConvXPUKernel); REGISTER_OP_XPU_KERNEL( conv2d_grad, - ops::GemmConvGradXPUKernel); + ops::GemmConvGradXPUKernel, + ops::GemmConvGradXPUKernel); +REGISTER_OP_XPU_KERNEL( + depthwise_conv2d, + ops::GemmConvXPUKernel, + ops::GemmConvXPUKernel); REGISTER_OP_XPU_KERNEL( depthwise_conv2d_grad, - ops::GemmConvGradXPUKernel); + ops::GemmConvGradXPUKernel, + ops::GemmConvGradXPUKernel); #endif diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu deleted file mode 100644 index 1841b78af32dd95d6884d5eb78ad30322ba7723e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ /dev/null @@ -1,1286 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memory.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/operators/conv_miopen_helper.h" -#else -#include "paddle/fluid/operators/conv_cudnn_helper.h" -#endif -#include "paddle/fluid/operators/conv_transpose_op.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/padding.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -static void DataTranspose(const framework::ExecutionContext& ctx, - const Tensor* input, Tensor* output, - const std::vector& axis, int flag = 0) { - auto& dev_ctx = ctx.template device_context(); - phi::funcs::Transpose transpose; - auto in_dims = input->dims(); - std::vector input_transpose_vec; - for (size_t i = 0; i < axis.size(); ++i) { - if (flag == 0) - input_transpose_vec.push_back(in_dims[axis[i]]); - else - input_transpose_vec.push_back(in_dims[i]); - } - framework::DDim input_transpose_dims(phi::make_ddim(input_transpose_vec)); - output->mutable_data(input_transpose_dims, ctx.GetPlace()); - transpose(dev_ctx, *input, output, axis); -} - -template -class CUDNNConvTransposeOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* output = ctx.Output("Output"); - - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - - // cudnn v5 does not support dilations - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - const T* filter_data = filter->data(); - const std::string data_layout_str = ctx.Attr("data_format"); - const paddle::platform::DataLayout data_layout = - (data_layout_str != "NHWC" ? 
platform::DataLayout::kNCHW - : platform::DataLayout::kNHWC); - - // if channel_last, transpose to channel_first - Tensor input_transpose; - std::vector input_vec = phi::vectorize(input->dims()); - std::vector output_vec = phi::vectorize(output->dims()); - if (data_layout == platform::DataLayout::kNHWC) { - if (strides.size() == 2U) { - std::vector axis = {0, 3, 1, 2}; - for (size_t i = 0; i < axis.size(); ++i) { - input_vec[i] = input->dims()[axis[i]]; - output_vec[i] = output->dims()[axis[i]]; - } - DataTranspose(ctx, input, &input_transpose, axis); - } else if (strides.size() == 3U) { - std::vector axis = {0, 4, 1, 2, 3}; - for (size_t i = 0; i < axis.size(); ++i) { - input_vec[i] = input->dims()[axis[i]]; - output_vec[i] = output->dims()[axis[i]]; - } - DataTranspose(ctx, input, &input_transpose, axis); - } - } else { - input_transpose = *input; - } - - // update padding and dilation - auto in_dims = input_transpose.dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); - - std::vector input_pad(input_transpose.dims().size() * 2, 0); - Tensor transformed_input; - std::vector padding_common(data_dim, 0); - if (!is_sys_pad) { - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = input_transpose.dims()[0]; - new_input_shape_vec[1] = input_transpose.dims()[1]; - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - new_input_shape_vec[i + 2] = - input_transpose.dims()[i + 2] + padding_diff[i]; - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_input.Resize(new_input_shape); - auto& dev_ctx = - ctx.template device_context(); - - transformed_input = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - const int rank = input_transpose.dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - phi::funcs::PadFunction( - dev_ctx, input_pad, input_transpose, pad_value, - &transformed_input); - } break; - case 5: { - phi::funcs::PadFunction( - dev_ctx, input_pad, input_transpose, pad_value, - &transformed_input); - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Op(ConvTranspose) only supports 4-D or 5-D input Tensor.")); - } - } else { - transformed_input = input_transpose; - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - std::vector starts(data_dim, 0); - std::vector ends(data_dim, 0); - std::vector axes(data_dim, 0); - for (size_t i = 0; i < data_dim; ++i) { - starts[i] = input_pad[2 * i + 4] * (strides[i] + 1); - ends[i] = starts[i] + output_vec[i + 2]; - axes[i] = i + 2; - } - - const T* input_data = transformed_input.data(); - input_vec = phi::vectorize(transformed_input.dims()); - - 
std::vector transformed_output_vec = output_vec; - for (size_t i = 0; i < data_dim; ++i) { - transformed_output_vec[i + 2] = - output_vec[i + 2] + - (input_pad[2 * i + 4] + input_pad[2 * i + 5]) * strides[i] - - 2 * padding_common[i] + paddings[2 * i] + paddings[2 * i + 1]; - } - - Tensor transformed_output; - if (!is_sys_pad) { - DDim transformed_output_shape(phi::make_ddim(transformed_output_vec)); - transformed_output.mutable_data(transformed_output_shape, - ctx.GetPlace()); - } else { - output->mutable_data(ctx.GetPlace()); - transformed_output.ShareDataWith(*output); - transformed_output.Resize(phi::make_ddim(transformed_output_vec)); - } - T* transformed_output_data = transformed_output.data(); - - platform::DataLayout layout; - - int iwo_groups = groups; - int c_groups = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_groups = 1; - c_groups = groups; - groups = 1; -#endif - - if (strides.size() == 2U) { - layout = platform::DataLayout::kNCHW; - } else { - layout = platform::DataLayout::kNCDHW; - } - - size_t workspace_size = 0; -#ifdef PADDLE_WITH_HIP - miopenConvBwdDataAlgorithm_t algo{}; -#else - cudnnConvolutionBwdDataAlgo_t algo{}; -#endif - // ------------------- cudnn conv algorithm --------------------- - auto& dev_ctx = ctx.template device_context(); - auto handle = dev_ctx.cudnn_handle(); - auto layout_tensor = GetCudnnTensorFormat(layout); - bool deterministic = FLAGS_cudnn_deterministic; - - auto dtype = platform::CudnnDataType::type; - // ------------------- cudnn descriptors --------------------- - ConvArgs args{&transformed_output, - filter, - &transformed_input, - strides, - padding_common, - dilations, - dtype}; - args.handle = handle; - args.idesc.set(transformed_output, iwo_groups); - args.wdesc.set(*filter, layout_tensor, iwo_groups); - args.odesc.set(transformed_input, iwo_groups); - args.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_groups); - -#ifdef PADDLE_WITH_HIP - using search = SearchAlgorithm; - workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); - algo = search::Find( - args, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search = SearchAlgorithm; - algo = search::Find( - args, false, deterministic, - ctx.template device_context()); - workspace_size = - std::max(workspace_size, search::GetWorkspaceSize(args, algo)); -#endif - - // ------------------- cudnn conv transpose forward --------------------- - int input_offset = - transformed_input.numel() / transformed_input.dims()[0] / groups; - int output_offset = - transformed_output.numel() / transformed_output.dims()[0] / groups; - int filter_offset = filter->numel() / groups; - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - for (int g = 0; g < groups; g++) { -#ifdef PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args.odesc.desc(), - input_data + input_offset * g, args.wdesc.desc(), - filter_data + filter_offset * g, args.cdesc.desc(), algo, &beta, - args.idesc.desc(), transformed_output_data + output_offset * g, - cudnn_workspace, workspace_size)); - }; -#else // PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args.wdesc.desc(), - filter_data + filter_offset * g, 
args.odesc.desc(), - input_data + input_offset * g, args.cdesc.desc(), algo, - cudnn_workspace, workspace_size, &beta, args.idesc.desc(), - transformed_output_data + output_offset * g)); - }; -#endif // PADDLE_WITH_HIP - workspace_handle.RunFunc(cudnn_func, workspace_size); - } - if (!is_sys_pad && strides.size() == 2U) { - Slice( - ctx, &transformed_output, output, starts, ends, axes); - } else if (!is_sys_pad && strides.size() == 3U) { - Slice( - ctx, &transformed_output, output, starts, ends, axes); - } - - if (data_layout == platform::DataLayout::kNHWC) { - Tensor output_transpose; - Tensor output_nchw; - output_nchw.ShareDataWith(*output); - output_nchw.Resize(phi::make_ddim(output_vec)); - if (strides.size() == 2U) { - std::vector axis = {0, 2, 3, 1}; - DataTranspose(ctx, &output_nchw, &output_transpose, axis); - *output = output_transpose; - } else if (strides.size() == 3U) { - std::vector axis = {0, 2, 3, 4, 1}; - DataTranspose(ctx, &output_nchw, &output_transpose, axis); - *output = output_transpose; - } - } - } -}; - -template -class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto input = ctx.Input("Input"); - auto filter = ctx.Input("Filter"); - auto output_grad = ctx.Input(framework::GradVarName("Output")); - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto filter_grad = ctx.Output(framework::GradVarName("Filter")); - const T* filter_data = filter->data(); - - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - // cudnn v5 does not support dilations - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - int user_workspace_size = ctx.Attr("workspace_size_MB"); - const std::string data_layout_str = ctx.Attr("data_format"); - const paddle::platform::DataLayout data_layout = - (data_layout_str != "NHWC" ? 
platform::DataLayout::kNCHW - : platform::DataLayout::kNHWC); - - // if channel_last, transpose to channel_first - Tensor input_transpose; - Tensor output_grad_transpose; - std::vector input_vec = phi::vectorize(input->dims()); - std::vector output_vec = phi::vectorize(output_grad->dims()); - if (data_layout == platform::DataLayout::kNHWC) { - if (strides.size() == 2U) { - std::vector axis = {0, 3, 1, 2}; - for (size_t i = 0; i < axis.size(); ++i) { - input_vec[i] = input->dims()[axis[i]]; - output_vec[i] = output_grad->dims()[axis[i]]; - } - DataTranspose(ctx, input, &input_transpose, axis); - DataTranspose(ctx, output_grad, &output_grad_transpose, axis); - } else if (strides.size() == 3U) { - std::vector axis = {0, 4, 1, 2, 3}; - for (size_t i = 0; i < axis.size(); ++i) { - input_vec[i] = input->dims()[axis[i]]; - output_vec[i] = output_grad->dims()[axis[i]]; - } - DataTranspose(ctx, input, &input_transpose, axis); - DataTranspose(ctx, output_grad, &output_grad_transpose, axis); - } - } else { - input_transpose = *input; - output_grad_transpose = *output_grad; - } - - // update padding and dilation - auto in_dims = input_transpose.dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); - - std::vector input_pad(input_transpose.dims().size() * 2, 0); - Tensor transformed_output_grad; - std::vector padding_common(data_dim, 0); - if (!is_sys_pad) { - std::vector padding_diff(data_dim); - std::vector new_output_grad_shape_vec(data_dim + 2); - new_output_grad_shape_vec[0] = output_grad_transpose.dims()[0]; - new_output_grad_shape_vec[1] = output_grad_transpose.dims()[1]; - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - new_output_grad_shape_vec[i + 2] = - output_grad_transpose.dims()[i + 2] + padding_diff[i]; - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - framework::DDim new_output_grad_shape( - phi::make_ddim(new_output_grad_shape_vec)); - transformed_output_grad.Resize(new_output_grad_shape); - auto& dev_ctx = - ctx.template device_context(); - - transformed_output_grad = - ctx.AllocateTmpTensor( - new_output_grad_shape, dev_ctx); - const int rank = input_transpose.dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - phi::funcs::PadFunction( - dev_ctx, input_pad, output_grad_transpose, pad_value, - &transformed_output_grad); - } break; - case 5: { - phi::funcs::PadFunction( - dev_ctx, input_pad, output_grad_transpose, pad_value, - &transformed_output_grad); - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Op(ConvTranspose) only supports 4-D or 5-D input Tensor.")); - } - } else { - transformed_output_grad = output_grad_transpose; - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* input_data = 
input_transpose.data(); - const T* output_grad_data = transformed_output_grad.data(); - output_vec = phi::vectorize(transformed_output_grad.dims()); - - // ------------------- cudnn descriptors --------------------- - platform::DataLayout layout; - - if (strides.size() == 2U) { - layout = platform::DataLayout::kNCHW; - } else { - layout = platform::DataLayout::kNCDHW; - } - - int iwo_groups = groups; - int c_groups = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_groups = 1; - c_groups = groups; - groups = 1; -#endif - - auto dtype = platform::CudnnDataType::type; - - ConvArgs args1{&transformed_output_grad, - filter, - &input_transpose, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args2{&transformed_output_grad, - filter, - &input_transpose, - strides, - padding_common, - dilations, - dtype}; - -#ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t data_algo{}; - miopenConvBwdWeightsAlgorithm_t filter_algo{}; -#else - cudnnConvolutionFwdAlgo_t data_algo{}; - cudnnConvolutionBwdFilterAlgo_t filter_algo{}; -#endif - - auto layout_tensor = GetCudnnTensorFormat(layout); - size_t workspace_size = 0; - auto& dev_ctx = ctx.template device_context(); - auto handle = dev_ctx.cudnn_handle(); - bool deterministic = FLAGS_cudnn_deterministic; - T* input_grad_data = nullptr; - T* filter_grad_data = nullptr; - - if (input_grad) { - input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - args1.handle = handle; - args1.idesc.set(transformed_output_grad, iwo_groups); - args1.wdesc.set(*filter, layout_tensor, iwo_groups); - args1.odesc.set(input_transpose, iwo_groups); - args1.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_groups); -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search1::GetWorkspaceSize(args1)); - data_algo = search1::Find( - args1, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search1 = SearchAlgorithm; - data_algo = search1::Find( - args1, false, deterministic, - ctx.template device_context()); - workspace_size = - std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); -#endif - } - - if (filter_grad) { - filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); - args2.handle = handle; - args2.idesc.set(transformed_output_grad, iwo_groups); - args2.wdesc.set(*filter_grad, layout_tensor, iwo_groups); - args2.odesc.set(input_transpose, iwo_groups); - args2.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_groups); -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2)); - filter_algo = search2::Find( - args2, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search2 = SearchAlgorithm; - filter_algo = search2::Find( - args2, false, deterministic, - ctx.template device_context()); - workspace_size = std::max(workspace_size, - search2::GetWorkspaceSize(args2, filter_algo)); -#endif - } - - // ------------------- cudnn conv backward data --------------------- - // FIXME(typhoonzero): template type T may not be the same as cudnn call. 
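The grouped cuDNN calls that follow step through the packed NCHW buffers with per-group offsets computed as numel / dims[0] / groups; only the g * offset part is added to the data pointers, since the batch stride is carried by the cuDNN tensor descriptors. A minimal standalone sketch of that offset arithmetic, with made-up shapes (not Paddle code):

#include <cassert>
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical shapes: batch N, channels C, spatial H x W, split into groups.
  const int N = 2, C = 8, H = 4, W = 4, groups = 2;
  std::vector<float> buf(static_cast<size_t>(N) * C * H * W);

  const int per_sample = C * H * W;              // numel / dims[0]
  const int group_offset = per_sample / groups;  // elements per group slice
  assert(group_offset == (C / groups) * H * W);

  // In the kernel only "base + g * group_offset" is used; the batch stride is
  // described to cuDNN via the tensor descriptor. Here we print both for clarity.
  for (int n = 0; n < N; ++n) {
    for (int g = 0; g < groups; ++g) {
      const float* slice = buf.data() + n * per_sample + g * group_offset;
      std::printf("sample %d, group %d starts at element %ld\n", n, g,
                  static_cast<long>(slice - buf.data()));
    }
  }
  return 0;
}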
- int input_offset = input->numel() / input->dims()[0] / groups; - int output_grad_offset = transformed_output_grad.numel() / - transformed_output_grad.dims()[0] / groups; - int filter_offset = filter->numel() / groups; - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - if (input_grad) { - // Because beta is zero, it is unnecessary to reset input_grad. - for (int g = 0; g < groups; g++) { -#ifdef PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args1.idesc.desc(), - output_grad_data + output_grad_offset * g, args1.wdesc.desc(), - filter_data + filter_offset * g, args1.cdesc.desc(), - data_algo, &beta, args1.odesc.desc(), - input_grad_data + input_offset * g, cudnn_workspace, - workspace_size)); - }; -#else // PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnConvolutionForward( - handle, &alpha, args1.idesc.desc(), - output_grad_data + output_grad_offset * g, args1.wdesc.desc(), - filter_data + filter_offset * g, args1.cdesc.desc(), data_algo, - cudnn_workspace, workspace_size, &beta, args1.odesc.desc(), - input_grad_data + input_offset * g)); - }; -#endif // PADDLE_WITH_HIP - workspace_handle.RunFunc(cudnn_func, workspace_size); - } - - if (data_layout == platform::DataLayout::kNHWC) { - Tensor input_grad_transpose; - Tensor input_grad_nchw; - input_grad_nchw.ShareDataWith(*input_grad); - input_grad_nchw.Resize(phi::make_ddim(input_vec)); - if (strides.size() == 2U) { - std::vector axis = {0, 2, 3, 1}; - DataTranspose(ctx, &input_grad_nchw, &input_grad_transpose, - axis); - *input_grad = input_grad_transpose; - } else if (strides.size() == 3U) { - std::vector axis = {0, 2, 3, 4, 1}; - DataTranspose(ctx, &input_grad_nchw, &input_grad_transpose, - axis); - *input_grad = input_grad_transpose; - } - } - } - - // ------------------- cudnn conv backward filter --------------------- - if (filter_grad) { - // Because beta is zero, it is unnecessary to reset filter_grad. 
- // Gradient with respect to the filter - for (int g = 0; g < groups; g++) { -#ifdef PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args2.odesc.desc(), - input_data + input_offset * g, args2.idesc.desc(), - output_grad_data + output_grad_offset * g, args2.cdesc.desc(), - filter_algo, &beta, args2.wdesc.desc(), - filter_grad_data + filter_offset * g, cudnn_workspace, - workspace_size)); - }; -#else // PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, args2.idesc.desc(), - output_grad_data + output_grad_offset * g, args2.odesc.desc(), - input_data + input_offset * g, args2.cdesc.desc(), - filter_algo, cudnn_workspace, workspace_size, &beta, - args2.wdesc.desc(), filter_grad_data + filter_offset * g)); - }; -#endif // PADDLE_WITH_HIP - workspace_handle.RunFunc(cudnn_func, workspace_size); - } - } - } -}; - -/* - * Inputs: I, W, dO, ddI, ddW - * Outputs: ddO, dW, dI - * ddo = conv_bp_data(W, ddI) + conv_bp_data(ddW, I) - * dW = conv_bp_filter(dO, ddI) - * dI = conv(dO, ddW) - */ -template -class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto X = ctx.Input("Input"); - auto W = ctx.Input("Filter"); - auto dO = ctx.Input("DOutput"); - auto ddX = ctx.Input("DDInput"); - auto ddW = ctx.Input("DDFilter"); - - auto ddO = ctx.Output("DDOutput"); - auto dW = ctx.Output("DFilter"); - auto dX = ctx.Output("DInput"); - - if (ddO) { - ddO->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, ddO, static_cast(0)); - } - if (dW) { - dW->mutable_data(ctx.GetPlace()); - } - if (dX) { - dX->mutable_data(ctx.GetPlace()); - } - - const T* dy = dO->data(); - const T* w = W->data(); - - const T* ddx = nullptr; - const T* ddw = nullptr; - T *dw, *dx, *ddy; - dw = dx = ddy = nullptr; - T* transformed_dx = nullptr; - const std::vector& strides = ctx.Attr>("strides"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - - bool deterministic = FLAGS_cudnn_deterministic; - - std::vector paddings = ctx.Attr>("paddings"); - - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensors to channel first----------- - Tensor transformed_X_channel(X->type()); - Tensor transformed_dO_channel(dO->type()); - Tensor transformed_ddX_channel(X->type()); - - Tensor transformed_ddO_channel(dO->type()); - Tensor transformed_dX_channel(X->type()); - - if (channel_last) { - ResizeToChannelFirst( - ctx, X, &transformed_X_channel); - TransToChannelFirst( - ctx, X, &transformed_X_channel); - - ResizeToChannelFirst( - ctx, dO, &transformed_dO_channel); - TransToChannelFirst( - ctx, dO, &transformed_dO_channel); - - if (ddX) { - ResizeToChannelFirst( - ctx, ddX, &transformed_ddX_channel); - TransToChannelFirst( - ctx, ddX, &transformed_ddX_channel); - } - - if (ddO) { - ResizeToChannelFirst( - ctx, ddO, &transformed_ddO_channel); - } - if (dX) { - ResizeToChannelFirst( - ctx, dX, 
&transformed_dX_channel); - transformed_dX_channel.mutable_data(ctx.GetPlace()); - } - - } else { - transformed_X_channel = *X; - transformed_dO_channel = *dO; - if (ddX) { - transformed_ddX_channel = *ddX; - } - if (dX) { - transformed_dX_channel = *dX; - } - } - std::vector output_vec = - phi::vectorize(transformed_dO_channel.dims()); - - auto in_dims = transformed_X_channel.dims(); - auto filter_dims = W->dims(); - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); - Tensor transformed_X(X->type()); - Tensor transformed_ddX(X->type()); - - Tensor transformed_dO(dO->type()); - - std::vector padding_common(data_dim, 0); - std::vector input_pad(X->dims().size() * 2, 0); - - if (!is_sys_pad) { - // get pad - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - std::vector new_output_grad_shape_vec(data_dim + 2); - - new_input_shape_vec[0] = transformed_X_channel.dims()[0]; - new_input_shape_vec[1] = transformed_X_channel.dims()[1]; - - new_output_grad_shape_vec[0] = transformed_dO_channel.dims()[0]; - new_output_grad_shape_vec[1] = transformed_dO_channel.dims()[1]; - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - new_input_shape_vec[i + 2] = - transformed_X_channel.dims()[i + 2] + padding_diff[i]; - - new_output_grad_shape_vec[i + 2] = - transformed_dO_channel.dims()[i + 2] + padding_diff[i]; - - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_X.Resize(new_input_shape); - transformed_ddX.Resize(new_input_shape); - - framework::DDim new_output_grad_shape( - phi::make_ddim(new_output_grad_shape_vec)); - transformed_dO.Resize(new_output_grad_shape); - - transformed_dO = - ctx.AllocateTmpTensor( - new_output_grad_shape, dev_ctx); - - transformed_X = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - if (ddX) { - transformed_ddX = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - - // pad for input - const int rank = X->dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_X_channel, pad_value, - &transformed_X); - if (dO) { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_dO_channel, pad_value, - &transformed_dO); - } - - if (ddX) { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_ddX_channel, pad_value, - &transformed_ddX); - } - } break; - case 5: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_X_channel, pad_value, - &transformed_X); - if (ddX) { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_ddX_channel, pad_value, - &transformed_ddX); - } - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - - } else { - transformed_X = transformed_X_channel; - transformed_dO = transformed_dO_channel; - if (ddX) { - transformed_ddX = transformed_ddX_channel; - } - - 
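The non-symmetric padding branch above reduces each dimension's (begin, end) padding to a symmetric padding_common (passed to cuDNN) plus an explicit pre-pad applied to the tensor itself. A standalone sketch of that split, with made-up padding values (not Paddle code):

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <vector>

int main() {
  // Hypothetical paddings laid out as {h_begin, h_end, w_begin, w_end}.
  std::vector<int> paddings = {2, 1, 0, 3};
  const int data_dim = 2;

  std::vector<int> padding_common(data_dim), pre_pad_begin(data_dim),
      pre_pad_end(data_dim);
  for (int i = 0; i < data_dim; ++i) {
    const int begin = paddings[2 * i], end = paddings[2 * i + 1];
    padding_common[i] = std::min(begin, end);      // symmetric part for cuDNN
    pre_pad_begin[i] = begin - padding_common[i];  // explicit pad on the tensor
    pre_pad_end[i] = end - padding_common[i];
    // The original begin/end padding is recovered exactly from the two parts.
    if (padding_common[i] + pre_pad_begin[i] != begin ||
        padding_common[i] + pre_pad_end[i] != end) {
      std::abort();
    }
    std::printf("dim %d: common=%d pre_begin=%d pre_end=%d\n", i,
                padding_common[i], pre_pad_begin[i], pre_pad_end[i]);
  }
  return 0;
}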
if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - std::vector starts(data_dim, 0); - std::vector ends(data_dim, 0); - std::vector axes(data_dim, 0); - for (size_t i = 0; i < data_dim; ++i) { - starts[i] = input_pad[2 * i + 4] * (strides[i] + 1); - ends[i] = starts[i] + output_vec[i + 2]; - axes[i] = i + 2; - } - - std::vector transformed_output_vec = output_vec; - for (size_t i = 0; i < data_dim; ++i) { - transformed_output_vec[i + 2] = - output_vec[i + 2] + - (input_pad[2 * i + 4] + input_pad[2 * i + 5]) * strides[i] - - 2 * padding_common[i] + paddings[2 * i] + paddings[2 * i + 1]; - } - - if (!is_sys_pad) { - DDim transformed_output_shape(phi::make_ddim(transformed_output_vec)); - transformed_ddO_channel.mutable_data(transformed_output_shape, - ctx.GetPlace()); - } else { - ddO->mutable_data(ctx.GetPlace()); - transformed_ddO_channel = *ddO; - transformed_ddO_channel.Resize(phi::make_ddim(transformed_output_vec)); - } - - const T* x = transformed_X.data(); - - int iwo_group = groups; - int c_group = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_group = 1; - c_group = groups; - groups = 1; -#endif - auto dtype = platform::CudnnDataType::type; - - auto handle = dev_ctx.cudnn_handle(); - - ConvArgs args1{&transformed_ddO_channel, - W, - &transformed_ddX, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args2{&transformed_ddO_channel, ddW, &transformed_X, strides, - padding_common, dilations, dtype}; - - ConvArgs args3{&transformed_dO, - dW, - &transformed_ddX_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args4{ - &transformed_dO, ddW, &transformed_dX_channel, strides, padding_common, - dilations, dtype}; -#ifdef PADDLE_WITH_HIP - miopenConvBwdDataAlgorithm_t bwd_algo1 = - static_cast(0); - miopenConvBwdDataAlgorithm_t bwd_algo2 = - static_cast(0); - miopenConvFwdAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); -#else - cudnnConvolutionBwdDataAlgo_t bwd_algo1 = - static_cast(0); - cudnnConvolutionBwdDataAlgo_t bwd_algo2 = - static_cast(0); - cudnnConvolutionFwdAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); -#endif - - auto layout = GetCudnnTensorFormat(platform::DataLayout::kNCHW); - - // ddo = conv(ddI, W) + conv(I, ddW) - size_t workspace_size = 0; - - T* transformed_ddy_channel = nullptr; - - if (ddO) { - ddy = ddO->data(); - transformed_ddy_channel = transformed_ddO_channel.data(); - if (ddX) { - args1.handle = handle; - args1.idesc.set(transformed_ddO_channel, iwo_group); - args1.wdesc.set(*W, layout, iwo_group); - args1.odesc.set(transformed_ddX, iwo_group); - args1.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size = search1::GetWorkspaceSize(args1); - bwd_algo1 = search1::Find( - args1, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search1 = SearchAlgorithm; - bwd_algo1 = search1::Find( - args1, false, deterministic, - ctx.template device_context()); - workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1); -#endif - } - - if (ddW) { - ddw = ddW->data(); - args2.handle = handle; - args2.idesc.set(transformed_ddO_channel, iwo_group); - args2.wdesc.set(*ddW, layout, iwo_group); 
- args2.odesc.set(transformed_X, iwo_group); - args2.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2)); - bwd_algo2 = search2::Find( - args2, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search2 = SearchAlgorithm; - bwd_algo2 = search2::Find( - args2, false, deterministic, - ctx.template device_context()); - workspace_size = std::max(workspace_size, - search2::GetWorkspaceSize(args2, bwd_algo2)); -#endif - } - } - - if (dW && ddX) { - dw = dW->data(); - args3.handle = handle; - args3.idesc.set(transformed_dO, iwo_group); - args3.wdesc.set(*dW, layout, iwo_group); - - args3.odesc.set(transformed_ddX_channel, iwo_group); - - args3.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); -#ifdef PADDLE_WITH_HIP - using search3 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_algo = search3::Find( - args3, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search3 = SearchAlgorithm; - filter_algo = search3::Find( - args3, false, deterministic, - ctx.template device_context()); - workspace_size = std::max(workspace_size, - search3::GetWorkspaceSize(args3, filter_algo)); -#endif - } - - if (ddW && dX) { - transformed_dx = transformed_dX_channel.data(); - - args4.handle = handle; - args4.idesc.set(transformed_dO, iwo_group); - args4.wdesc.set(*ddW, layout, iwo_group); - args4.odesc.set(transformed_dX_channel, iwo_group); - args4.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); -#ifdef PADDLE_WITH_HIP - using search4 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_algo = search4::Find( - args4, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search4 = SearchAlgorithm; - data_algo = search4::Find( - args4, false, deterministic, - ctx.template device_context()); - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); -#endif - } - - int i_n, i_c, i_d, i_h, i_w; - GetNCDHW(transformed_X.dims(), platform::DataLayout::kNCHW, &i_n, &i_c, - &i_d, &i_h, &i_w); - - int o_n, o_c, o_d, o_h, o_w; - GetNCDHW(transformed_dO.dims(), platform::DataLayout::kNCHW, &o_n, &o_c, - &o_d, &o_h, &o_w); - - int group_offset_in = - transformed_X.numel() / transformed_X.dims()[0] / groups; - int group_offset_out = - transformed_dO.numel() / transformed_dO.dims()[0] / groups; - int group_offset_filter = W->numel() / groups; - - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - - auto wkspace_handle = dev_ctx.cudnn_workspace_handle(); - - if (ddO) { - if (ddX) { - ddx = transformed_ddX.data(); - for (int i = 0; i < groups; i++) { -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), - ddx + i * group_offset_in, args1.wdesc.desc(), - w + i * group_offset_filter, args1.cdesc.desc(), - bwd_algo1, &beta, args1.idesc.desc(), - transformed_ddy_channel + i * group_offset_out, - workspace_ptr, workspace_size)); - }, - workspace_size); -#else // PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - 
platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args1.wdesc.desc(), - w + i * group_offset_filter, args1.odesc.desc(), - ddx + i * group_offset_in, args1.cdesc.desc(), - bwd_algo1, workspace_ptr, workspace_size, &beta, - args1.idesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); -#endif // PADDLE_WITH_HIP - } - } - if (ddW) { - for (int i = 0; i < groups; i++) { -#ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - Tensor conv_x_ddw(dO->type()); - conv_x_ddw.Resize(transformed_ddO_channel.dims()); - T* conv_x_ddw_data = conv_x_ddw.mutable_data(ctx.GetPlace()); - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args2.odesc.desc(), - x + i * group_offset_in, args2.wdesc.desc(), - ddw + i * group_offset_filter, args2.cdesc.desc(), - bwd_algo2, &beta, args2.idesc.desc(), - conv_x_ddw_data + i * group_offset_out, workspace_ptr, - workspace_size)); - }, - workspace_size); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor( - handle, miopenTensorOpAdd, &alpha, args2.idesc.desc(), - transformed_ddy_channel + i * group_offset_out, &alpha, - args2.idesc.desc(), conv_x_ddw_data + i * group_offset_out, &beta, - args2.idesc.desc(), - transformed_ddy_channel + i * group_offset_out)); -#else // PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args2.wdesc.desc(), - ddw + i * group_offset_filter, args2.odesc.desc(), - x + i * group_offset_in, args2.cdesc.desc(), bwd_algo2, - workspace_ptr, workspace_size, &alpha, - args2.idesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); -#endif // PADDLE_WITH_HIP - } - } - if ((!is_sys_pad) && (!channel_last)) { - if (strides.size() == 2U) { - Slice( - ctx, &transformed_ddO_channel, ddO, starts, ends, axes); - } else if (!is_sys_pad && strides.size() == 3U) { - Slice( - ctx, &transformed_ddO_channel, ddO, starts, ends, axes); - } - } else if ((!is_sys_pad) && (channel_last)) { - if (strides.size() == 2U) { - Slice( - ctx, &transformed_ddO_channel, &transformed_ddO_channel, starts, - ends, axes); - } else if (!is_sys_pad && strides.size() == 3U) { - Slice( - ctx, &transformed_ddO_channel, &transformed_ddO_channel, starts, - ends, axes); - } - - TransToChannelLast( - ctx, &transformed_ddO_channel, ddO); - } - } - - T* transformed_dy_channel = transformed_dO.data(); - if (dW && ddX) { - ddx = transformed_ddX_channel.data(); - for (int i = 0; i < groups; i++) { -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args3.odesc.desc(), - ddx + i * group_offset_in, args3.idesc.desc(), - transformed_dy_channel + i * group_offset_out, - args3.cdesc.desc(), filter_algo, &beta, - args3.wdesc.desc(), dw + i * group_offset_filter, - workspace_ptr, workspace_size)); - }, - workspace_size); -#else // PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, args3.idesc.desc(), - transformed_dy_channel + i * group_offset_out, - args3.odesc.desc(), ddx + i * group_offset_in, - args3.cdesc.desc(), filter_algo, workspace_ptr, - workspace_size, &beta, args3.wdesc.desc(), - dw + i * group_offset_filter)); - }, - 
workspace_size); -#endif // PADDLE_WITH_HIP - } - } - - if (dX && ddW) { - ddw = ddW->data(); - for (int i = 0; i < groups; i++) { -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args4.idesc.desc(), - transformed_dy_channel + i * group_offset_out, - args4.wdesc.desc(), ddw + i * group_offset_filter, - args4.cdesc.desc(), data_algo, &beta, args4.odesc.desc(), - transformed_dx + i * group_offset_in, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else // PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args4.idesc.desc(), - transformed_dy_channel + i * group_offset_out, - args4.wdesc.desc(), ddw + i * group_offset_filter, - args4.cdesc.desc(), data_algo, workspace_ptr, - workspace_size, &beta, args4.odesc.desc(), - transformed_dx + i * group_offset_in)); - }, - workspace_size); -#endif // PADDLE_WITH_HIP - } - if (channel_last) { - TransToChannelLast( - ctx, &transformed_dX_channel, dX); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_KERNEL(conv2d_transpose, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeOpKernel, - ops::CUDNNConvTransposeOpKernel); -REGISTER_OP_KERNEL(conv2d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeGradOpKernel, - ops::CUDNNConvTransposeGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_transpose_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvTransposeDoubleGradOpKernel, - paddle::operators::CUDNNConvTransposeDoubleGradOpKernel); - -REGISTER_OP_KERNEL(conv3d_transpose, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeOpKernel, - ops::CUDNNConvTransposeOpKernel); -REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeGradOpKernel, - ops::CUDNNConvTransposeGradOpKernel); -#else -REGISTER_OP_KERNEL(conv2d_transpose, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeOpKernel, - ops::CUDNNConvTransposeOpKernel, - ops::CUDNNConvTransposeOpKernel); -REGISTER_OP_KERNEL(conv2d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeGradOpKernel, - ops::CUDNNConvTransposeGradOpKernel, - ops::CUDNNConvTransposeGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_transpose_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvTransposeDoubleGradOpKernel, - paddle::operators::CUDNNConvTransposeDoubleGradOpKernel, - paddle::operators::CUDNNConvTransposeDoubleGradOpKernel); - -REGISTER_OP_KERNEL(conv3d_transpose, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeOpKernel, - ops::CUDNNConvTransposeOpKernel, - ops::CUDNNConvTransposeOpKernel); -REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeGradOpKernel, - ops::CUDNNConvTransposeGradOpKernel, - ops::CUDNNConvTransposeGradOpKernel); -#endif diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 86532664985b4f985099c44d36c2409e8d955132..fe76fc3aebbc173e4d916d2d2217a8d2922d169e 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -13,13 +13,17 @@ See the License for the specific 
language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/conv_transpose_op.h" -#include + #include #include #include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h" - +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -29,165 +33,6 @@ namespace operators { using DataLayout = framework::DataLayout; -void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ConvTranspose"); - OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "ConvTranspose"); - OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "ConvTranspose"); - - auto in_dims = ctx->GetInputDim("Input"); - auto filter_dims = ctx->GetInputDim("Filter"); - std::vector output_size = - ctx->Attrs().Get>("output_size"); - std::vector output_padding = - ctx->Attrs().Get>("output_padding"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - std::vector dilations = ctx->Attrs().Get>("dilations"); - int groups = ctx->Attrs().Get("groups"); - std::string padding_algorithm = - ctx->Attrs().Get("padding_algorithm"); - const std::string data_layout_str = - ctx->Attrs().Get("data_format"); - const DataLayout data_layout = - ctx->IsRunMKLDNNKernel() ? DataLayout::kNCHW - : framework::StringToDataLayout(data_layout_str); - - PADDLE_ENFORCE_EQ(in_dims.size() == 4 || in_dims.size() == 5, true, - platform::errors::InvalidArgument( - "Input of Op(conv_transpose) should be 4-D or " - "5-D Tensor. But received: %u-D Tensor, " - "the shape of input is [%s]", - in_dims.size(), in_dims)); - PADDLE_ENFORCE_EQ( - in_dims.size(), filter_dims.size(), - platform::errors::InvalidArgument( - "The input's dimension size and filter's dimension size of " - "Op (conv_transpose) should be equal. But received: the shape of " - "input is [%s], the dimension size of input is [%d], the shape " - "of filter is [%s], the dimension size of filter is [%d]. ", - in_dims, in_dims.size(), filter_dims, filter_dims.size())); - - int stride_size = strides.size(); - for (int i = 0; i < stride_size; ++i) { - PADDLE_ENFORCE_GT( - strides[i], 0, - platform::errors::InvalidArgument( - "The stride of Op(Conv) should be larget than 0, but received " - "stride is %d.", - strides[i])); - } - - int in_sub_stride_size = in_dims.size() - stride_size; - - PADDLE_ENFORCE_EQ( - in_dims.size() - strides.size(), 2U, - platform::errors::InvalidArgument( - "The input's dimension size minus Attr(stride)'s size must " - "be euqal to 2 for Op(conv_transpose). 
But received: [%d], the " - "input's dimension size is [%d], the shape of input " - "is [%s], the Attr(stride)'s size is [%d].", - in_sub_stride_size, in_dims.size(), in_dims, strides.size())); - if (output_size.size()) - PADDLE_ENFORCE_EQ( - output_size.size(), strides.size(), - platform::errors::InvalidArgument( - "The Attr(output_size) and Attr(stride) of Op(conv_transpose) " - "should be the same.")); - if (output_padding.size()) - PADDLE_ENFORCE_EQ( - output_padding.size(), strides.size(), - platform::errors::InvalidArgument( - "The Attr(output_padding) and Attr(stride) of Op(conv_transpose) " - "should be the same.")); - - const int64_t C = - (data_layout != DataLayout::kNHWC ? in_dims[1] - : in_dims[in_dims.size() - 1]); - PADDLE_ENFORCE_EQ( - C, filter_dims[0], - platform::errors::InvalidArgument( - "The number of input channels should be equal to filter channels " - "for Op(conv_transpose). But received: the input's channels is " - "[%d], the shape of input is [%s], the filter's channels is [%d], " - "the shape of filter is [%s]. The data_format is %s." - "The error may come from wrong data_format setting.", - C, in_dims, filter_dims[0], filter_dims, data_layout_str)); - - framework::DDim in_data_dims; - if (data_layout != DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - std::vector output_shape({in_dims[0]}); - if (data_layout != DataLayout::kNHWC) { - output_shape.push_back(filter_dims[1] * groups); - } - const int offset = (data_layout != DataLayout::kNHWC ? 2 : 1); - for (size_t i = 0; i < strides.size(); ++i) { - auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; - auto infer_shape = (ctx->IsRuntime() || in_dims[i + offset] > 0) - ? (in_dims[i + offset] - 1) * strides[i] - - paddings[2 * i] - paddings[2 * i + 1] + - filter_extent - : -1; - if (output_size.size()) { - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GE( - output_size[i], infer_shape, - platform::errors::InvalidArgument( - "output_size of Op(ConvTransposeOp) should not be " - "less than the infered output size. But received output_size = " - "[%s], whose dim %d is less than the infered output size [%s]", - phi::make_ddim(output_size).to_str(), i, infer_shape)); - PADDLE_ENFORCE_LT( - output_size[i], infer_shape + strides[i], - platform::errors::InvalidArgument( - "output_size of Op(ConvTransposeOp) should be less " - "than infered size + stride. But received output_size = [%s], " - "whose dim %d is not less than the infered output size (%d) + " - "stride (%d) = %d", - phi::make_ddim(output_size).to_str(), i, infer_shape, - strides[i], infer_shape + strides[i])); - } - output_shape.push_back(output_size[i]); - } else if (output_padding.size()) { - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GE( - output_padding[i], 0, - platform::errors::InvalidArgument( - "output_padding of Op(ConvTransposeOp) should not be " - "less than the 0. 
But received output_padding = " - "[%s], whose dim %d is less than 0", - phi::make_ddim(output_padding).to_str(), i)); - PADDLE_ENFORCE_LT( - output_padding[i], std::max(strides[i], dilations[i]), - platform::errors::InvalidArgument( - "output_padding of Op(ConvTransposeOp) should be less " - "than either stride or dilation. But received output_size = " - "[%s], " - "whose dim %d is not less than either stride (%d) or " - "dilation (%d)", - phi::make_ddim(output_size).to_str(), i, strides[i], - dilations[i])); - } - output_shape.push_back((infer_shape + output_padding[i])); - } else { - output_shape.push_back(infer_shape); - } - } - if (data_layout == DataLayout::kNHWC) { - output_shape.push_back(filter_dims[1] * groups); - } - ctx->SetOutputDim("Output", phi::make_ddim(output_shape)); -} - framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; @@ -217,7 +62,7 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( } framework::OpKernelType ConvTransposeOp::GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, + const std::string& var_name, const framework::Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const { #ifdef PADDLE_WITH_MKLDNN // Only input require reshaping, weights and @@ -493,17 +338,6 @@ Example: )DOC"); } -void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const { - auto in_dims = ctx->GetInputDim("Input"); - auto filter_dims = ctx->GetInputDim("Filter"); - if (ctx->HasOutput(framework::GradVarName("Input"))) { - ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); - } - if (ctx->HasOutput(framework::GradVarName("Filter"))) { - ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims); - } -} - framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { bool use_cudnn = @@ -587,24 +421,6 @@ class ConvTransposeDoubleGradMaker : public framework::SingleGradOpMaker { } }; -void ConvTransposeOpDoubleGrad::InferShape( - framework::InferShapeContext* ctx) const { - auto x_dims = ctx->GetInputDim("Input"); - auto w_dims = ctx->GetInputDim("Filter"); - auto do_dims = ctx->GetInputDim("DOutput"); - - if (ctx->HasOutput("DDOutput") && - (ctx->HasInput("DDInput") || (ctx->HasInput("DDFilter")))) { - ctx->SetOutputDim("DDOutput", do_dims); - } - if (ctx->HasOutput("DFilter") && ctx->HasInput("DDInput")) { - ctx->SetOutputDim("DFilter", w_dims); - } - if (ctx->HasOutput("DInput") && ctx->HasInput("DDFilter")) { - ctx->SetOutputDim("DInput", x_dims); - } -} - framework::OpKernelType ConvTransposeOpDoubleGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { bool use_cudnn = @@ -635,59 +451,57 @@ framework::OpKernelType ConvTransposeOpDoubleGrad::GetExpectedKernelType( namespace ops = paddle::operators; // conv2d_transpose +DECLARE_INFER_SHAPE_FUNCTOR(conv2d_transpose, Conv2dTranposeInferShapeFunctor, + PD_INFER_META(phi::ConvTransposeInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(conv2d_transpose_grad, + Conv2dTranposeGradInferShapeFunctor, + PD_INFER_META(phi::ConvTransposeGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR( + conv2d_transpose_grad_grad, Conv2dTranposeDoubleGradInferShapeFunctor, + PD_INFER_META(phi::Conv2dTransposeDoubleGradInferMeta)); + REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, ops::ConvTransposeGradOpMaker, - ops::ConvTransposeGradOpMaker); 
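The removed InferShape derives each spatial extent as (in - 1) * stride - pad_begin - pad_end + dilation * (k - 1) + 1, optionally plus output_padding (which must stay below that value plus stride); the phi::ConvTransposeInferMeta that replaces it is expected to produce the same shape. A small sketch of the formula with one worked value (hypothetical helper name, not Paddle code):

#include <cstdio>

int ConvTransposeOutDim(int in, int k, int stride, int pad_begin, int pad_end,
                        int dilation, int output_padding = 0) {
  const int filter_extent = dilation * (k - 1) + 1;
  return (in - 1) * stride - pad_begin - pad_end + filter_extent +
         output_padding;
}

int main() {
  // e.g. a 5-wide input, 3-wide kernel, stride 2, padding 1 on both sides:
  // (5 - 1) * 2 - 1 - 1 + 3 = 9
  std::printf("%d\n", ConvTransposeOutDim(5, 3, 2, 1, 1, 1));  // prints 9
  return 0;
}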
-REGISTER_OPERATOR( - conv2d_transpose_grad, ops::ConvTransposeOpGrad, - ops::ConvTransposeDoubleGradMaker, - ops::ConvTransposeDoubleGradMaker); -REGISTER_OPERATOR(conv2d_transpose_grad_grad, ops::ConvTransposeOpDoubleGrad); - -REGISTER_OP_CPU_KERNEL( - conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CPU_KERNEL( - conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::ConvTransposeGradOpMaker, + Conv2dTranposeInferShapeFunctor); +REGISTER_OPERATOR(conv2d_transpose_grad, ops::ConvTransposeOpGrad, + ops::ConvTransposeDoubleGradMaker, + ops::ConvTransposeDoubleGradMaker, + Conv2dTranposeGradInferShapeFunctor); +REGISTER_OPERATOR(conv2d_transpose_grad_grad, ops::ConvTransposeOpDoubleGrad, + Conv2dTranposeDoubleGradInferShapeFunctor); // conv3d_transpose +DECLARE_INFER_SHAPE_FUNCTOR(conv3d_transpose, Conv3dTranposeInferShapeFunctor, + PD_INFER_META(phi::ConvTransposeInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(conv3d_transpose_grad, + Conv3dTranposeGradInferShapeFunctor, + PD_INFER_META(phi::ConvTransposeGradInferMeta)); + REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker, ops::ConvTransposeGradOpMaker, - ops::ConvTransposeGradOpMaker); -REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad); - -REGISTER_OP_CPU_KERNEL( - conv3d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CPU_KERNEL( - conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::ConvTransposeGradOpMaker, + Conv3dTranposeInferShapeFunctor); +REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad, + Conv3dTranposeGradInferShapeFunctor); // depthwise conv2d_transpose +DECLARE_INFER_SHAPE_FUNCTOR(depthwise_conv2d_transpose, + DepthWiseConv2dTranposeInferShapeFunctor, + PD_INFER_META(phi::ConvTransposeInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(depthwise_conv2d_transpose_grad, + DepthWiseConv2dTranposeGradInferShapeFunctor, + PD_INFER_META(phi::ConvTransposeGradInferMeta)); + REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, ops::ConvTransposeGradOpMaker, - ops::ConvTransposeGradOpMaker); -REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad); - -REGISTER_OP_CPU_KERNEL( - depthwise_conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CPU_KERNEL( - depthwise_conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::ConvTransposeGradOpMaker, + DepthWiseConv2dTranposeInferShapeFunctor); +REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad, + DepthWiseConv2dTranposeGradInferShapeFunctor); REGISTER_OP_VERSION(conv_transpose) .AddCheckpoint( diff --git a/paddle/fluid/operators/conv_transpose_op.cu b/paddle/fluid/operators/conv_transpose_op.cu deleted file mode 100644 index 054cb4b33895b02a816cc2bff82b1c9052bc645d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/conv_transpose_op.cu +++ /dev/null @@ -1,185 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/conv_transpose_op.h" -#include "paddle/phi/kernels/gpu/depthwise_conv.h" - -namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; -using DDim = framework::DDim; - -template -class DepthwiseConvTransposeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - output->mutable_data(context.GetPlace()); - - int groups = context.Attr("groups"); - PADDLE_ENFORCE_EQ( - groups, filter.dims()[0], - platform::errors::InvalidArgument( - "groups should be error to the 1st dimension of filter. But " - "received groups is %d and filter dimension[0] is %d", - groups, filter.dims()[0])); - - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - for (auto v : dilations) { - PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument( - "dilations should be 1 in depthwise conv. 
" - "But received dilations is %d", - v)); - } - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - output->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, output, static_cast(0)); - - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad( - static_cast::TYPE&>(dev_ctx), - *output, filter, *input, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, output, data_layout); - } -}; - -template -class DepthwiseConvTransposeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - auto& dev_ctx = context.template device_context(); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - if (input_grad) { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv( - static_cast::TYPE&>(dev_ctx), - *output_grad, filter, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, input_grad, data_layout); - } - - if (filter_grad) { - phi::funcs::SetConstant set_zero; - filter_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, filter_grad, static_cast(0)); - - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad( - static_cast::TYPE&>(dev_ctx), - *output_grad, *input, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, filter_grad, data_layout); - } - } -}; - -} // namespace operators -} // namespace paddle -// conv2d -REGISTER_OP_CUDA_KERNEL(conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CUDA_KERNEL(conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - 
ops::GemmConvTransposeGradKernel); -REGISTER_OP_CUDA_KERNEL(conv2d_transpose_grad_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); - -// conv3d -REGISTER_OP_CUDA_KERNEL(conv3d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CUDA_KERNEL(conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); - -// depthwise conv2d -REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose, - ops::DepthwiseConvTransposeKernel, - ops::DepthwiseConvTransposeKernel); -REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose_grad, - ops::DepthwiseConvTransposeGradKernel, - ops::DepthwiseConvTransposeGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index ee0fb7ab3683364f6db3cffd7ddef67c61f19433..ac95dceb8280cdee6d2fcafa686d951ad8866efc 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -13,72 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/conv_op.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/operators/math/vol2col.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" + +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using DDim = framework::DDim; - -template -static void Slice(const framework::ExecutionContext& context, - const Tensor* input, Tensor* out, - const std::vector& begin_vec, - const std::vector& end_vec, - const std::vector& axes_vec) { - auto& place = - *context.template device_context().eigen_device(); - auto in_dims = input->dims(); - auto offsets = Eigen::DSizes(); - auto extents = Eigen::DSizes(); - for (size_t i = 0; i < D; ++i) { - offsets[i] = 0; - extents[i] = in_dims[i]; - } - - std::vector out_shape_vec = phi::vectorize(in_dims); - for (size_t i = 0; i < axes_vec.size(); ++i) { - offsets[axes_vec[i]] = begin_vec[i]; - extents[axes_vec[i]] = end_vec[i] - begin_vec[i]; - out_shape_vec[axes_vec[i]] = end_vec[i] - begin_vec[i]; - } - - framework::DDim out_dims(phi::make_ddim(out_shape_vec)); - out->mutable_data(out_dims, context.GetPlace()); - - auto in_t = - framework::EigenTensor::From( - *input); - auto out_t = - framework::EigenTensor::From( - *out, out_dims); - - EigenSlice, T, D>::Eval(place, out_t, in_t, - offsets, extents); - out->Resize(out_dims); -} - -template -static void Slice(const framework::ExecutionContext& context, - const Tensor* input, Tensor* out, int64_t begin_idx, - int64_t end_idx, int64_t axes) { - std::vector begin_vec = {begin_idx}; - std::vector end_vec = {end_idx}; - std::vector axes_vec = {axes}; - Slice(context, input, out, begin_vec, end_vec, axes_vec); -} - // Define Op classes in .h file so that other conv transpose // operator implementations can reuse the code. 
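The Slice helpers removed from conv_transpose_op.h extract the sub-block [begin, end) along selected axes via Eigen. For reference, a CPU-only sketch of the same indexing on a flat row-major buffer (illustrative only, all names are made up; not the removed Eigen implementation):

#include <cstdio>
#include <vector>

std::vector<float> SliceCpu(const std::vector<float>& in,
                            const std::vector<int>& dims,
                            const std::vector<int>& axes,
                            const std::vector<int>& begin,
                            const std::vector<int>& end) {
  const int rank = static_cast<int>(dims.size());
  std::vector<int> offset(rank, 0), out_dims(dims);
  for (size_t i = 0; i < axes.size(); ++i) {
    offset[axes[i]] = begin[i];
    out_dims[axes[i]] = end[i] - begin[i];
  }
  // Row-major strides of the input.
  std::vector<long> stride(rank, 1);
  for (int d = rank - 2; d >= 0; --d) stride[d] = stride[d + 1] * dims[d + 1];

  long out_numel = 1;
  for (int d : out_dims) out_numel *= d;
  std::vector<float> out(out_numel);

  std::vector<int> idx(rank, 0);  // odometer over output coordinates
  for (long o = 0; o < out_numel; ++o) {
    long src = 0;
    for (int d = 0; d < rank; ++d) src += (idx[d] + offset[d]) * stride[d];
    out[o] = in[src];
    for (int d = rank - 1; d >= 0; --d) {  // advance odometer
      if (++idx[d] < out_dims[d]) break;
      idx[d] = 0;
    }
  }
  return out;
}

int main() {
  // 1 x 4 "image": slice columns [1, 3) on axis 1 -> {1, 2}.
  std::vector<float> x = {0, 1, 2, 3};
  auto y = SliceCpu(x, {1, 4}, {1}, {1}, {3});
  std::printf("%g %g\n", y[0], y[1]);
  return 0;
}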
class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { @@ -94,21 +36,19 @@ class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { class ConvTransposeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override; framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, + const std::string& var_name, const framework::Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const override; }; class ConvTransposeOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( @@ -118,464 +58,11 @@ class ConvTransposeOpGrad : public framework::OperatorWithKernel { class ConvTransposeOpDoubleGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override; }; -template -class GemmConvTransposeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - // The filter will be reshaped, so it should not be constant pointer - Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - int groups = context.Attr("groups"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - auto out_dims = output->dims(); - const int batch_size = static_cast(input->dims()[0]); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - // input_shape_vec: {n, c, h, w} or {n, c, d, h, w} for channel_first - // input_shape_vec: {n, h, w, c} or {n, d, h, w, c} for channel_last - std::vector input_shape_vec = phi::vectorize(input->dims()); - // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} - std::vector filter_shape_vec = phi::vectorize(filter.dims()); - - // use col_shape in the im2col and col2im (or vol2col and col2vol) - // calculation - // col_shape_vec: {o_c/g, k_h, k_w, h, w} or {o_c/g, k_d, k_h, k_w, d, h, w} - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - if (data_layout != framework::DataLayout::kNHWC) { - col_shape_vec[0] = out_dims[1] / groups; - for (size_t j = 0; j < 
data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2]; - } - } else { - col_shape_vec[0] = out_dims[out_dims.size() - 1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 1]; - } - } - DDim col_shape(phi::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: (o_c/g * k_h * k_w, h * w) or (o_c/g * k_d * k_h * k_w, d * h * w) - DDim col_matrix_shape = phi::flatten_to_2d(col_shape, data_dim + 1); - - Tensor col; - col.mutable_data(col_shape, context.GetPlace()); - // col_matrix shares the same piece of data with col, - // but will be reshaped into a two-dimensional matrix shape - // to call the matrix multiplication interface. - Tensor col_matrix; - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - - // output size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first - // output size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last - DDim output_shape = - phi::slice_ddim(output->dims(), 1, output->dims().size()); - - // input matrix size: (i_c, h * w) or (i_c, d * h * w) for channel_first - // input matrix size: (h * w, i_c) or (d * h * w, i_c) for channel_last - DDim input_matrix_shape; - if (data_layout != framework::DataLayout::kNHWC) { - input_matrix_shape = {in_dims[1], col_matrix_shape[1]}; - } else { - input_matrix_shape = {col_matrix_shape[1], in_dims[in_dims.size() - 1]}; - } - - // filter size: (i_c, o_c/g * k_h * k_w) or (i_c, o_c/g * k_d * k_h * k_w) - DDim filter_matrix_shape; - if (data_layout != framework::DataLayout::kNHWC) { - filter_matrix_shape = {in_dims[1], col_matrix_shape[0]}; - } else { - filter_matrix_shape = {in_dims[in_dims.size() - 1], col_matrix_shape[0]}; - } - filter.Resize(filter_matrix_shape); - - output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - set_zero(dev_ctx, output, static_cast(0)); - - int in_step = - (data_layout != framework::DataLayout::kNHWC - ? static_cast(in_dims[1]) / groups - : static_cast(in_dims[in_dims.size() - 1]) / groups); - - int out_step = - (data_layout != framework::DataLayout::kNHWC - ? static_cast(out_dims[1]) / groups - : static_cast(out_dims[out_dims.size() - 1]) / groups); - math::Col2ImFunctor col2im; - math::Col2VolFunctor col2vol; - math::ConcatFunctor concat_functor; - - // convolution transpose: gemm + col2im or col2vol (similar to conv-backward - // on input) - size_t D = input->dims().size(); - for (int i = 0; i < batch_size; i++) { - // batch with size (i_c, h * w) or (i_c, d * h * w) for channel_first - // batch with size (h * w, i_c) or (d * h * w, i_c) for channel_last - Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); - - // output size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first - // output size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last - Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape); - - std::vector output_batch_vec; - for (int g = 0; g < groups; g++) { - int64_t start = g * in_step; - int64_t end = (g + 1) * in_step; - int axes = (data_layout != framework::DataLayout::kNHWC ? 
0 : 1); - Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step); - Tensor in_slice, out_slice; - - // col_matrix = filter_slice * input_slice - // of shape (o_c/g * k_h * k_w, h * w) - // or (o_c/g * k_d * k_h * k_w, d * h * w) - if (data_layout != framework::DataLayout::kNHWC) { - in_slice = input_batch.Slice(g * in_step, (g + 1) * in_step); - out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(filter_slice, true, in_slice, false, static_cast(1.0), - &col_matrix, static_cast(0.0)); - } else { - Slice(context, &input_batch, &in_slice, start, - end, axes); - start = g * out_step; - end = (g + 1) * out_step; - axes = D - 2; - if (D == 4U) { - Slice(context, &output_batch, &out_slice, - start, end, axes); - } else if (D == 5U) { - Slice(context, &output_batch, &out_slice, - start, end, axes); - } - blas.MatMul(filter_slice, true, in_slice, true, static_cast(1.0), - &col_matrix, static_cast(0.0)); - } - - if (data_dim == 2U) { - // col2im: col_matrix -> dy - // from (o_c/g * k_h * k_w, h * w) to (o_c/g, o_h, o_w) or (o_h, o_w, - // o_c/g) - col2im(dev_ctx, col, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &out_slice, data_layout); - } else if (data_dim == 3U) { - // col2vol: col_matrix -> dy - // from (o_c/g * k_d * k_h * k_w, d * h * w) to (o_c/g, o_d, o_h, o_w) - // or (o_d, o_h, o_w, o_c/g) - col2vol(dev_ctx, col, dilations, strides, paddings, &out_slice, - data_layout); - } - if (data_layout == framework::DataLayout::kNHWC) { - output_batch_vec.push_back(out_slice); - } - } - if (data_layout == framework::DataLayout::kNHWC) { - concat_functor(dev_ctx, output_batch_vec, static_cast(D - 2), - &output_batch); - } - } - } -}; - -template -class GemmConvTransposeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - // For filter, we do not use const pointer b/c we will do reshape, - // but we should avoid modifying its value. 
- Tensor filter = *context.Input("Filter"); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - - if ((!input_grad) && (!filter_grad)) return; - - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - int groups = context.Attr("groups"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - auto out_grad_dims = output_grad->dims(); - const int batch_size = static_cast(input->dims()[0]); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - // input_shape_vec: {n, c, h, w} or {n, c, d, h, w} for channel_first - // input_shape_vec: {n, h, w, c} or {n, d, h, w, c} for channel_last - std::vector input_shape_vec = phi::vectorize(input->dims()); - // filter_shape_vec: {i_c, o_c, k_h, k_w} or {i_c, o_c, k_d, k_h, k_w} - std::vector filter_shape_vec = phi::vectorize(filter.dims()); - - // use col_shape in the im2col and col2im (or vol2col and col2vol) - // calculation - // col_shape_vec: {o_c, k_h, k_w, h, w} or {o_c, k_d, k_h, k_w, d, h, w} for - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - if (data_layout != framework::DataLayout::kNHWC) { - col_shape_vec[0] = out_grad_dims[1]; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2]; - } - } else { - col_shape_vec[0] = out_grad_dims[out_grad_dims.size() - 1]; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 1]; - } - } - DDim col_shape(phi::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: (o_c * k_h * k_w, h * w) or (o_c * k_d * k_h * k_w, d * h * w) - DDim col_matrix_shape = phi::flatten_to_2d(col_shape, data_dim + 1); - - // output size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first - // output size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last - DDim output_shape = - phi::slice_ddim(output_grad->dims(), 1, output_grad->dims().size()); - - // input matrix size: (i_c, h * w) or (i_c, d * h * w) for channel_first - // input matrix size: (h * w, i_c) or (d * h * w, i_c) for channel_last - DDim input_matrix_shape; - if (data_layout != framework::DataLayout::kNHWC) { - input_matrix_shape = {in_dims[1], col_matrix_shape[1]}; - } else { - input_matrix_shape = {col_matrix_shape[1], in_dims[in_dims.size() - 1]}; - } - - // filter size: (i_c, o_c/g * k_h * k_w) or (i_c, o_c/g * k_d * k_h * k_w) - DDim filter_matrix_shape; - if (data_layout != framework::DataLayout::kNHWC) { - filter_matrix_shape = {in_dims[1], col_matrix_shape[0] / groups}; - } else { - filter_matrix_shape = {in_dims[in_dims.size() - 1], - col_matrix_shape[0] / groups}; - } - filter.Resize(filter_matrix_shape); - - int in_step = - (data_layout != framework::DataLayout::kNHWC 
- ? static_cast(in_dims[1]) / groups - : static_cast(in_dims[in_dims.size() - 1]) / groups); - int col_step = static_cast(col_matrix_shape[0]) / groups; - - // convolution transpose grad on input: - // im2col + gemm (similar to conv-forward) - // input need to compute gradient - auto& dev_ctx = context.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - if (input_grad || filter_grad) { - Tensor col; - col.mutable_data(col_shape, context.GetPlace()); - // col_matrix shares the same piece of data with col, - // but will be reshaped into a two-dimensional matrix shape - // to call the matrix multiplication interface. - Tensor col_matrix; - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - - Tensor filter_grad_; - phi::funcs::SetConstant set_zero; - - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - math::ConcatFunctor concat_functor; - - if (input_grad) { - input_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, input_grad, static_cast(0)); - } - if (filter_grad) { // filter_grad_ size (i_c, o_c/g, k_h, k_w) - filter_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, filter_grad, static_cast(0)); - filter_grad_ = *filter_grad; - filter_grad_.Resize(filter_matrix_shape); - } - - size_t D = input->dims().size(); - for (int i = 0; i < batch_size; i++) { - // batch with size (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for - // channel_first - // batch with size (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for - // channel_last - Tensor output_grad_batch = - output_grad->Slice(i, i + 1).Resize(output_shape); - - if (data_dim == 2U) { - // im2col: dy -> col matrix - // from (o_c, o_h, o_w) to (o_c * k_h * k_w, i_h * i_w) for - // channel_first - // from (o_h, o_w, o_c) to (o_c * k_h * k_w, i_h * i_w) for - // channel_last - im2col(dev_ctx, output_grad_batch, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col, data_layout); - } else if (data_dim == 3U) { - // vol2col: dy -> col_matrix - // from (o_c, o_d, o_h, o_w) to (o_c * k_d * k_h * k_w, i_d * i_h * - // i_w) for channel_first - // from (o_d, o_h, o_w, o_c) to (i_d * i_h * i_w, o_c * k_d * k_h * - // k_w) for channel_last - vol2col(dev_ctx, output_grad_batch, dilations, strides, paddings, - &col, data_layout); - } - - if (input_grad) { - // batch with size (i_c, i_h, i_w) or (i_h, i_w, i_c) - Tensor input_grad_batch = - input_grad->Slice(i, i + 1).Resize(input_matrix_shape); - - // gemm: dx = filter * dy - // (i_c, o_c * k_h * k_w) * (o_c * k_h * k_w, i_h * i_w) -> (i_c, i_h - // * i_w) - // or - // (i_c, o_c * k_d * k_h * k_w) * (o_c * k_d * k_h * k_w, i_d * i_h * - // i_w) -> (i_c, - // i_d, i_h, i_w) - // gemm: dx = dy^T * filter^T for channel_last - - std::vector input_grad_batch_vec; - for (int g = 0; g < groups; g++) { - // input_grad_slice: (i_c/g, i_h * i_w) or (i_c/g, i_d * i_h * i_w) - // for channel_first - // input_grad_slice: (i_h * i_w, i_c/g) or (i_d * i_h * i_w, i_c/g) - // for channel_last - // filter_slice: (i_c/g, o_c/g * k_h * k_w) - Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step); - // col_matrix_slice: (o_c/g * k_h * k_w, h * w) or (o_c/g * k_d * - // k_h * k_w, d * h * w) - Tensor col_matrix_slice = - col_matrix.Slice(g * col_step, (g + 1) * col_step); - if (data_layout != framework::DataLayout::kNHWC) { - Tensor input_grad_slice = - input_grad_batch.Slice(g * in_step, (g + 1) * in_step); - blas.MatMul(filter_slice, false, col_matrix_slice, false, - static_cast(1.0), &input_grad_slice, - 
static_cast(0.0)); - } else { - Tensor input_grad_slice; - Slice(context, &input_grad_batch, - &input_grad_slice, g * in_step, - (g + 1) * in_step, 1); - blas.MatMul(col_matrix_slice, true, filter_slice, true, - static_cast(1.0), &input_grad_slice, - static_cast(0.0)); - DDim input_grad_slice_shape; - if (data_dim == 2U) { - input_grad_slice_shape = {in_dims[1], in_dims[2], in_step}; - } else { - input_grad_slice_shape = {in_dims[1], in_dims[2], in_dims[3], - in_step}; - } - input_grad_slice = - input_grad_slice.Resize(input_grad_slice_shape); - input_grad_batch_vec.push_back(input_grad_slice); - } - } - if (data_layout == framework::DataLayout::kNHWC) { - concat_functor(dev_ctx, input_grad_batch_vec, - static_cast(D - 2), &input_grad_batch); - } - } - if (filter_grad) { - // input batch: (i_c, i_h * i_w) or (i_h, i_w * i_c) - Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); - // gemm: d_filter = x * dy^T - // (i_c, i_h * i_w) * (i_h * i_w, o_c * k_h * k_w) -> (i_c, o_c * k_h - // * k_w) - // or - // (i_c, i_d * i_h * i_w) * (i_d * i_h * i_w, o_c * k_d * k_h * k_w) - // -> (i_c, o_c * k_d * - // k_h * k_w) - // gemm: d_filter = x^T * dy^T for channel_last - - for (int g = 0; g < groups; g++) { - Tensor filter_grad_slice = - filter_grad_.Slice(g * in_step, (g + 1) * in_step); - Tensor col_matrix_slice = - col_matrix.Slice(g * col_step, (g + 1) * col_step); - if (data_layout != framework::DataLayout::kNHWC) { - Tensor in_batch_slice = - in_batch.Slice(g * in_step, (g + 1) * in_step); - blas.MatMul(in_batch_slice, false, col_matrix_slice, true, - static_cast(1.0), &filter_grad_slice, - static_cast(1.0)); - } else { - Tensor in_batch_slice; - Slice(context, &in_batch, &in_batch_slice, - g * in_step, (g + 1) * in_step, 1); - blas.MatMul(in_batch_slice, true, col_matrix_slice, true, - static_cast(1.0), &filter_grad_slice, - static_cast(1.0)); - } - } - } - } - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/conv_transpose_op_npu.cc b/paddle/fluid/operators/conv_transpose_op_npu.cc index 7d0ebf21829c21d06889c291aec2a53f4badc5d4..050ede78f72cfea7c7e20829d530167885181798 100644 --- a/paddle/fluid/operators/conv_transpose_op_npu.cc +++ b/paddle/fluid/operators/conv_transpose_op_npu.cc @@ -13,11 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. 
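The kernels removed above compute conv-transpose as a per-group GEMM followed by col2im (2-D) or col2vol (3-D): for each batch item and group, col_matrix = filter_slice^T * input_slice, and col2im/col2vol scatters the columns onto the output slice. A minimal sketch of the shape bookkeeping they perform for the channel-first 2-D case; the struct and function names below are illustrative only, not Paddle APIs.

#include <cstdint>
#include <vector>

struct ConvTransposeShapes {
  std::vector<int64_t> col_shape;            // {o_c/g, k_h, k_w, h, w}
  std::vector<int64_t> col_matrix_shape;     // {o_c/g * k_h * k_w, h * w}
  std::vector<int64_t> filter_matrix_shape;  // {i_c, o_c/g * k_h * k_w}
};

// Mirrors the comments in the removed kernel: the column buffer is later
// multiplied as (i_c, o_c/g*k_h*k_w)^T x (i_c, h*w) -> (o_c/g*k_h*k_w, h*w).
ConvTransposeShapes MakeConvTransposeShapes(int64_t i_c, int64_t o_c,
                                            int64_t groups, int64_t k_h,
                                            int64_t k_w, int64_t h,
                                            int64_t w) {
  ConvTransposeShapes s;
  s.col_shape = {o_c / groups, k_h, k_w, h, w};
  s.col_matrix_shape = {(o_c / groups) * k_h * k_w, h * w};
  s.filter_matrix_shape = {i_c, (o_c / groups) * k_h * k_w};
  return s;
}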
*/ #include "paddle/fluid/operators/conv_transpose_op.h" + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/cpu/conv_util.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; using NPUDeviceContext = platform::NPUDeviceContext; template @@ -55,8 +59,8 @@ class Conv2DTransposeNPUKernel : public framework::OpKernel { filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&padding, &dilation, padding_algorithm, - in_data_dims, stride, ksize); + phi::UpdatePaddingAndDilation(&padding, &dilation, padding_algorithm, + in_data_dims, stride, ksize); // construct NPU attr std::vector strides(4, 1); @@ -137,8 +141,8 @@ class Conv2DTransposeGradNPUKernel : public framework::OpKernel { framework::DDim filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); + phi::UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); std::vector strides_vec(4, 1); std::vector dilations_vec(4, 1); diff --git a/paddle/fluid/operators/conv_transpose_op_xpu.cc b/paddle/fluid/operators/conv_transpose_op_xpu.cc index 12e1739f2a267582602f300e0f4ea8593b8c870a..b8bd3c4f006087273e1ae139d42d86891aabad1c 100644 --- a/paddle/fluid/operators/conv_transpose_op_xpu.cc +++ b/paddle/fluid/operators/conv_transpose_op_xpu.cc @@ -8,15 +8,22 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
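The NPU kernels above (and the XPU kernels below) now call phi::UpdatePaddingAndDilation from paddle/phi/kernels/cpu/conv_util.h instead of the fluid helper. A minimal sketch of what such a helper computes for the "SAME" and "VALID" padding algorithms, assuming paddings has already been expanded to two entries per spatial dimension; the real helper also accepts the shorter one-entry-per-dimension form and leaves explicit paddings untouched.

#include <algorithm>
#include <string>
#include <vector>

void UpdatePaddingAndDilationSketch(std::vector<int>* paddings,
                                    std::vector<int>* dilations,
                                    const std::string& padding_algorithm,
                                    const std::vector<int>& in_data_dims,
                                    const std::vector<int>& strides,
                                    const std::vector<int>& ksize) {
  if (padding_algorithm == "SAME") {
    // Keep ceil(in/stride) output elements per dimension and force dilation 1.
    for (size_t i = 0; i < in_data_dims.size(); ++i) {
      int out_size = (in_data_dims[i] + strides[i] - 1) / strides[i];
      int pad_sum = std::max(
          (out_size - 1) * strides[i] + ksize[i] - in_data_dims[i], 0);
      (*paddings)[2 * i] = pad_sum / 2;
      (*paddings)[2 * i + 1] = pad_sum - pad_sum / 2;
      (*dilations)[i] = 1;
    }
  } else if (padding_algorithm == "VALID") {
    std::fill(paddings->begin(), paddings->end(), 0);
  }
}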
*/ + #include "paddle/fluid/operators/conv_transpose_op.h" + #include #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/phi/kernels/cpu/conv_util.h" + #ifdef PADDLE_WITH_XPU namespace paddle { namespace operators { +using Tensor = framework::Tensor; + // target_len == 2 || target_len == 4 inline std::vector vector_extend(const std::vector& src, int target_len) { @@ -61,8 +68,8 @@ class Conv2DTransposeXPUKernel : public framework::OpKernel { framework::DDim filter_data_dims = phi::slice_ddim(filter.dims(), 2, filter.dims().size()); std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); + phi::UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); const int batch_size = static_cast(input->dims()[0]); const int img_yc = static_cast(input->dims()[1]); @@ -135,8 +142,8 @@ class Conv2DTransposeGradXPUKernel : public framework::OpKernel { framework::DDim filter_data_dims = phi::slice_ddim(filter.dims(), 2, filter.dims().size()); std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); + phi::UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); const int batch_size = static_cast(input->dims()[0]); const int img_yc = static_cast(input->dims()[1]); diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index c9308d27c0a3490d9c0094f45a1a9c2d894bbf57..e1861b2f7c5eade41025dd25cd603619759e581f 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -243,8 +243,6 @@ class ConcatFunctor { const int axis_t = axis; const int ins_size_t = ins_size; - auto place = context.GetPlace(); - output->mutable_data(place); // mlu should do sth // init ins tensors @@ -295,7 +293,6 @@ class SplitFunctor { std::vector desc_vector; for (size_t i = 0; i < out_size; i++) { (*outputs)[i]->Resize(outs_dims[i]); - (*outputs)[i]->mutable_data(context.GetPlace()); output_descs.emplace_back( MLUCnnlTensorDesc(*(*outputs)[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType((*outputs)[i]->dtype()))); diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc index 7b9a4ab1557bf0ce0ed2bd348298373f0ba672cf..e4952a243262bedc5477908cd8aedeb158e344b8 100644 --- a/paddle/fluid/operators/pad3d_op.cc +++ b/paddle/fluid/operators/pad3d_op.cc @@ -16,7 +16,9 @@ limitations under the License. 
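The CPU pad3d functors deleted below differ only in how an output coordinate is mapped back to an input coordinate along each of d/h/w. A minimal standalone sketch of the three non-constant mappings, with worked examples in the comments; the helper names are illustrative.

#include <algorithm>

// Reflect without repeating the border (size = 4): -2 -> 2, -1 -> 1, 4 -> 2.
int ReflectIndex(int in, int size) {
  in = std::max(in, -in);                 // reflect across 0
  return std::min(in, 2 * size - in - 2); // reflect across size - 1
}

// Replicate: clamp to [0, size - 1] (size = 4): -3 -> 0, 9 -> 3.
int ReplicateIndex(int in, int size) {
  return std::min(size - 1, std::max(in, 0));
}

// Circular: non-negative (Euclidean) modulo (size = 4): -1 -> 3, 5 -> 1.
int CircularIndex(int in, int size) {
  return ((in % size) + size) % size;
}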
*/ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -24,734 +26,10 @@ namespace operators { using framework::Tensor; -template -void ConstPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, const int out_d, - const int out_h, const int out_w, const T value) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - out_data[out_d * out_height * out_width + out_h * out_width + out_w] = - (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width) - ? value - : in_data[in_d * in_height * in_width + in_h * in_width + in_w]; -} - -template -void ConstPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, const int out_h, - const int out_w, const T value) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - if (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width) { - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = value; - } - } else { - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = in_data[in_index + c]; - } - } -} - -template -void ReflectPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, const int out_w, - const T value) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = std::max(in_d, -in_d); // reflect by 0 - in_d = std::min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth - in_h = std::max(in_h, -in_h); // reflect by 0 - in_h = std::min(in_h, 2 * in_height - in_h - 2); // reflect by in_height - in_w = std::max(in_w, -in_w); // reflect by 0 - in_w = std::min(in_w, 2 * in_width - in_w - 2); // reflect by in_width - - out_data[out_d * out_height * out_width + out_h * out_width + out_w] = - in_data[in_d * in_height * in_width + in_h * in_width + in_w]; -} - -template -void ReflectPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, const int out_h, - const int out_w, const T value) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = std::max(in_d, -in_d); - in_d = std::min(in_d, 2 * in_depth - in_d - 2); - in_h = std::max(in_h, -in_h); - in_h = std::min(in_h, 2 * in_height - in_h - 2); - in_w = std::max(in_w, -in_w); - in_w = std::min(in_w, 2 * in_width - in_w - 
2); - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = in_data[in_index + c]; - } -} - -template -void ReplicatePad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, const int out_w, - const T value) { - int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); - int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); - int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); - - out_data[out_d * out_height * out_width + out_h * out_width + out_w] = - in_data[in_d * in_height * in_width + in_h * in_width + in_w]; -} - -template -void ReplicatePad3DFuncNDHWC(const T* in_data, T* out_data, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, - const int out_h, const int out_w, const T value) { - int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); - int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); - int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = in_data[in_index + c]; - } -} - -template -void CircularPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, const int out_w, - const T value) { - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - out_data[out_d * out_height * out_width + out_h * out_width + out_w] = - in_data[in_d * in_height * in_width + in_h * in_width + in_w]; -} - -template -void CircularPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, - const int out_h, const int out_w, const T value) { - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = in_data[in_index + c]; - } -} - -template -void Pad3DNCDHW(const T* in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int 
out_height, const int out_width, - const int pad_front, const int pad_top, const int pad_left, - T value, T* out_data, - void (*pad_func)(const T*, T*, const int, const int, const int, - const int, const int, const int, const int, - const int, const int, const int, const int, - const int, const T)) { - for (int n = 0; n < num; ++n) { - for (int c = 0; c < channels; ++c) { - for (int out_d = 0; out_d < out_depth; ++out_d) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - pad_func(in_data, out_data, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, - pad_left, out_d, out_h, out_w, value); - } - } - } - in_data += in_depth * in_height * in_width; - out_data += out_depth * out_height * out_width; - } - } -} - -template -void Pad3DNDHWC(const T* in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, const int out_width, - const int pad_front, const int pad_top, const int pad_left, - T value, T* out_data, - void (*pad_func)(const T*, T*, const int, const int, const int, - const int, const int, const int, const int, - const int, const int, const int, const int, - const int, const int, const T)) { - for (int n = 0; n < num; ++n) { - for (int out_d = 0; out_d < out_depth; ++out_d) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - pad_func(in_data, out_data, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, - pad_left, out_d, out_h, out_w, value); - } - } - } - in_data += in_depth * in_height * in_width * channels; - out_data += out_depth * out_height * out_width * channels; - } -} - -template -void ConstPad3DGradNCDHW(T* d_in_data, const T* d_out_data, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, const int out_d, - const int out_h, const int out_w) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width)) { - d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] = - d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; - } -} - -template -void ConstPad3DGradNDHWC(T* d_in_data, const T* d_out_data, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, const int out_h, - const int out_w) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width)) { - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - d_in_data[in_index + c] = d_out_data[out_index + c]; - } - } -} - -template -void ReflectPad3DGradNCDHW(T* d_in_data, const T* d_out_data, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int 
pad_top, - const int pad_left, const int out_d, const int out_h, - const int out_w) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = std::max(in_d, -in_d); // reflect by 0 - in_d = std::min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth - in_h = std::max(in_h, -in_h); // reflect by 0 - in_h = std::min(in_h, 2 * in_height - in_h - 2); // reflect by in_height - in_w = std::max(in_w, -in_w); // reflect by 0 - in_w = std::min(in_w, 2 * in_width - in_w - 2); // reflect by in_width - - d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += - d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; -} - -template -void ReflectPad3DGradNDHWC(T* d_in_data, const T* d_out_data, - const int channels, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, const int out_w) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = std::max(in_d, -in_d); - in_d = std::min(in_d, 2 * in_depth - in_d - 2); - in_h = std::max(in_h, -in_h); - in_h = std::min(in_h, 2 * in_height - in_h - 2); - in_w = std::max(in_w, -in_w); - in_w = std::min(in_w, 2 * in_width - in_w - 2); - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - d_in_data[in_index + c] += d_out_data[out_index + c]; - } -} - -template -void ReplicatePad3DGradNCDHW(T* d_in_data, const T* d_out_data, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, - const int out_h, const int out_w) { - int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); - int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); - int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); - - d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += - d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; -} - -template -void ReplicatePad3DGradNDHWC(T* d_in_data, const T* d_out_data, - const int channels, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, - const int out_w) { - int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); - int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); - int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - d_in_data[in_index + c] += d_out_data[out_index + c]; - } -} - -template -void CircularPad3DGradNCDHW(T* d_in_data, const T* d_out_data, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, - const int out_h, const int out_w) { - int in_d = ((out_d - 
pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += - d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; -} - -template -void CircularPad3DGradNDHWC(T* d_in_data, const T* d_out_data, - const int channels, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, const int out_w) { - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - d_in_data[in_index + c] += d_out_data[out_index + c]; - } -} - -template -void Pad3DGradNCDHW(T* d_in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data, - void (*pad_func)(T*, const T*, const int, const int, - const int, const int, const int, const int, - const int, const int, const int, const int, - const int, const int)) { - for (int n = 0; n < num; ++n) { - for (int c = 0; c < channels; ++c) { - for (int out_d = 0; out_d < out_depth; ++out_d) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - pad_func(d_in_data, d_out_data, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, - pad_left, out_d, out_h, out_w); - } - } - } - d_in_data += in_depth * in_height * in_width; - d_out_data += out_depth * out_height * out_width; - } - } -} - -template -void Pad3DGradNDHWC(T* d_in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data, - void (*pad_func)(T*, const T*, const int, const int, - const int, const int, const int, const int, - const int, const int, const int, const int, - const int, const int, const int)) { - for (int n = 0; n < num; ++n) { - for (int out_d = 0; out_d < out_depth; ++out_d) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - pad_func(d_in_data, d_out_data, channels, in_depth, in_height, - in_width, out_depth, out_height, out_width, pad_front, - pad_top, pad_left, out_d, out_h, out_w); - } - } - } - d_in_data += in_depth * in_height * in_width * channels; - d_out_data += out_depth * out_height * out_width * channels; - } -} - -static inline std::vector GetPaddings( - const framework::ExecutionContext& context) { - std::vector paddings(6); - auto* paddings_t = context.Input("Paddings"); - if (paddings_t) { - auto paddings_data = paddings_t->data(); - std::memcpy(paddings.data(), paddings_data, paddings.size() * sizeof(int)); - } else { - auto pads = context.Attr>("paddings"); - std::copy(pads.begin(), pads.end(), paddings.data()); - } - 
return paddings; -} - -template -class Pad3dCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - T value = static_cast(context.Attr("value")); - - auto* x = context.Input("X"); - auto in_dims = x->dims(); - const T* in_data = x->data(); - - auto* out = context.Output("Out"); - if (data_format == "NCDHW") { - out->Resize({in_dims[0], in_dims[1], in_dims[2] + pads[4] + pads[5], - in_dims[3] + pads[2] + pads[3], - in_dims[4] + pads[0] + pads[1]}); - } else { - out->Resize({in_dims[0], in_dims[1] + pads[4] + pads[5], - in_dims[2] + pads[2] + pads[3], - in_dims[3] + pads[0] + pads[1], in_dims[4]}); - } - auto out_dims = out->dims(); - T* out_data = out->mutable_data(context.GetPlace()); - - int channels = in_dims[1]; - int in_depth = in_dims[2]; - int in_height = in_dims[3]; - int in_width = in_dims[4]; - int out_depth = out_dims[2]; - int out_height = out_dims[3]; - int out_width = out_dims[4]; - if (data_format == "NDHWC") { - channels = in_dims[4]; - in_depth = in_dims[1]; - in_height = in_dims[2]; - in_width = in_dims[3]; - out_depth = out_dims[1]; - out_height = out_dims[2]; - out_width = out_dims[3]; - } - - if (mode == "reflect") { - PADDLE_ENFORCE_GT(in_depth, pads[4], - platform::errors::InvalidArgument( - "The depth of Input(X)'s dimension should be " - "greater than pad_front" - " in reflect mode" - ", but received depth(%d) and pad_front(%d).", - in_depth, pads[4])); - PADDLE_ENFORCE_GT(in_depth, pads[5], - platform::errors::InvalidArgument( - "The depth of Input(X)'s dimension should be " - "greater than pad_back" - " in reflect mode" - ", but received depth(%d) and pad_back(%d).", - in_depth, pads[5])); - - PADDLE_ENFORCE_GT(in_height, pads[2], - platform::errors::InvalidArgument( - "The height of Input(X)'s dimension should be " - "greater than pad_top" - " in reflect mode" - ", but received depth(%d) and pad_top(%d).", - in_height, pads[2])); - PADDLE_ENFORCE_GT(in_height, pads[3], - platform::errors::InvalidArgument( - "The height of Input(X)'s dimension should be " - "greater than pad_bottom" - " in reflect mode" - ", but received depth(%d) and pad_bottom(%d).", - in_height, pads[3])); - - PADDLE_ENFORCE_GT(in_width, pads[0], - platform::errors::InvalidArgument( - "The width of Input(X)'s dimension should be " - "greater than pad_left" - " in reflect mode" - ", but received depth(%d) and pad_left(%d).", - in_width, pads[0])); - PADDLE_ENFORCE_GT(in_width, pads[1], - platform::errors::InvalidArgument( - "The width of Input(X)'s dimension should be " - "greater than pad_right" - " in reflect mode" - ", but received depth(%d) and pad_right(%d).", - in_width, pads[1])); - } else if (mode == "circular" || mode == "replicate") { - PADDLE_ENFORCE_NE(in_depth * in_height * in_width, 0, - platform::errors::InvalidArgument( - "The input tensor size can not be 0 for circular " - "or replicate padding mode.")); - } - - const int pad_left = pads[0]; - const int pad_top = pads[2]; - const int pad_front = pads[4]; - const int num = in_dims[0]; - if (data_format == "NCDHW") { - std::map - func_map; - - func_map["reflect"] = ReflectPad3DFuncNCDHW; - func_map["replicate"] = ReplicatePad3DFuncNCDHW; - func_map["circular"] = CircularPad3DFuncNCDHW; - func_map["constant"] = ConstPad3DFuncNCDHW; - Pad3DNCDHW(in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, 
pad_front, pad_top, pad_left, - value, out_data, func_map[mode]); - } else { - std::map - func_map; - - func_map["reflect"] = ReflectPad3DFuncNDHWC; - func_map["replicate"] = ReplicatePad3DFuncNDHWC; - func_map["circular"] = CircularPad3DFuncNDHWC; - func_map["constant"] = ConstPad3DFuncNDHWC; - Pad3DNDHWC(in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - value, out_data, func_map[mode]); - } - } -}; - -template -class Pad3dGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_in = context.Output(framework::GradVarName("X")); - auto d_in_dims = d_in->dims(); - auto d_out_dims = d_out->dims(); - const T* d_out_data = d_out->data(); - T* d_in_data = d_in->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(context.template device_context(), - d_in, static_cast(0)); - const int pad_left = pads[0]; - const int pad_top = pads[2]; - const int pad_front = pads[4]; - const int num = d_in_dims[0]; - if (data_format == "NCDHW") { - const int channels = d_in_dims[1]; - const int in_depth = d_in_dims[2]; - const int in_height = d_in_dims[3]; - const int in_width = d_in_dims[4]; - const int out_depth = d_out_dims[2]; - const int out_height = d_out_dims[3]; - const int out_width = d_out_dims[4]; - - std::map - func_map; - - func_map["reflect"] = ReflectPad3DGradNCDHW; - func_map["replicate"] = ReplicatePad3DGradNCDHW; - func_map["circular"] = CircularPad3DGradNCDHW; - func_map["constant"] = ConstPad3DGradNCDHW; - - Pad3DGradNCDHW(d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, - pad_left, d_out_data, func_map[mode]); - } else { - const int channels = d_in_dims[4]; - const int in_depth = d_in_dims[1]; - const int in_height = d_in_dims[2]; - const int in_width = d_in_dims[3]; - const int out_depth = d_out_dims[1]; - const int out_height = d_out_dims[2]; - const int out_width = d_out_dims[3]; - - std::map - func_map; - - func_map["reflect"] = ReflectPad3DGradNDHWC; - func_map["replicate"] = ReplicatePad3DGradNDHWC; - func_map["circular"] = CircularPad3DGradNDHWC; - func_map["constant"] = ConstPad3DGradNDHWC; - - Pad3DGradNDHWC(d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, - pad_left, d_out_data, func_map[mode]); - } - } -}; - class Pad3dOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad3d"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Pad3d"); - - auto x_dim = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(x_dim.size(), 5, - platform::errors::InvalidArgument( - "The size of Input(X)'s dimension should be equal to " - "5, but received %d. 
", - x_dim.size())); - - std::vector out_dims(x_dim.size()); - auto data_format = ctx->Attrs().Get("data_format"); - out_dims[0] = x_dim[0]; - if (ctx->HasInput("Paddings")) { - auto paddings_dim = ctx->GetInputDim("Paddings"); - PADDLE_ENFORCE_EQ(paddings_dim.size(), 1, - platform::errors::InvalidArgument( - "Size of Input(Paddings)'s dimension should be " - "equal to 1, but received %d.", - paddings_dim.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(paddings_dim[0], 6, - platform::errors::InvalidArgument( - "Shape of Input(Paddings) should be equal to " - "[6], but received [%d].", - paddings_dim[0])); - } - out_dims[1] = x_dim[1]; - out_dims[2] = x_dim[2]; - out_dims[3] = x_dim[3]; - } else { - auto paddings = ctx->Attrs().Get>("paddings"); - PADDLE_ENFORCE_EQ( - paddings.size(), 6, - platform::errors::InvalidArgument( - "Size of paddings should be equal to 4, but received %d.", - static_cast(paddings.size()))); - if (data_format == "NCDHW") { - out_dims[1] = x_dim[1]; // channel - out_dims[2] = ((!ctx->IsRuntime()) && (x_dim[2] < 0)) - ? x_dim[2] - : (x_dim[2] + paddings[4] + paddings[5]); // depth - - out_dims[3] = ((!ctx->IsRuntime()) && (x_dim[3] < 0)) - ? x_dim[3] - : (x_dim[3] + paddings[2] + paddings[3]); // height - - out_dims[4] = ((!ctx->IsRuntime()) && (x_dim[4] < 0)) - ? x_dim[4] - : (x_dim[4] + paddings[0] + paddings[1]); // width - } else { // NDHWC - out_dims[4] = x_dim[4]; // channel - - out_dims[1] = ((!ctx->IsRuntime()) && (x_dim[1] < 0)) - ? x_dim[1] - : (x_dim[1] + paddings[4] + paddings[5]); // depth - out_dims[2] = ((!ctx->IsRuntime()) && (x_dim[2] < 0)) - ? x_dim[2] - : (x_dim[2] + paddings[2] + paddings[3]); // height - out_dims[3] = ((!ctx->IsRuntime()) && (x_dim[3] < 0)) - ? x_dim[3] - : (x_dim[3] + paddings[0] + paddings[1]); // width - } - } - - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -921,15 +199,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(Pad3dOpGradNoNeedBufferVarsInferer, "X"); namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(pad3d, Pad3dInferShapeFunctor, + PD_INFER_META(phi::Pad3dInferMeta)); + REGISTER_OPERATOR(pad3d, ops::Pad3dOp, ops::Pad3dOpMaker, ops::Pad3dOpGradMaker, - ops::Pad3dOpGradMaker); + ops::Pad3dOpGradMaker, + Pad3dInferShapeFunctor); REGISTER_OPERATOR(pad3d_grad, ops::Pad3dOpGrad, ops::Pad3dOpDoubleGradMaker, ops::Pad3dOpDoubleGradMaker, ops::Pad3dOpGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(pad3d, ops::Pad3dCPUKernel, - ops::Pad3dCPUKernel, ops::Pad3dCPUKernel, - ops::Pad3dCPUKernel); -REGISTER_OP_CPU_KERNEL(pad3d_grad, ops::Pad3dGradCPUKernel, - ops::Pad3dGradCPUKernel); diff --git a/paddle/fluid/operators/pad3d_op.cu b/paddle/fluid/operators/pad3d_op.cu deleted file mode 100644 index 9ab0eb9d445da9b1d0f64cf4f7a721026dab5476..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pad3d_op.cu +++ /dev/null @@ -1,793 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; - -using framework::Tensor; - -template -__global__ void Pad3DConstNCDHW(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T value, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int nc = index / out_width; - - const int out_w = index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - out_data[index] = - (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width) - ? value - : in_data[nc * in_depth * in_height * in_width + - in_d * in_height * in_width + in_h * in_width + in_w]; - } -} - -template -__global__ void Pad3DConstNDHWC(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T value, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int n = index / channels; - const int c = index % channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - const int in_d = out_d - pad_front; - const int in_h = out_h - pad_top; - const int in_w = out_w - pad_left; - - out_data[index] = - (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width) - ? 
value - : in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c]; - } -} - -template -__global__ void Pad3DReflectNCDHW(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int nc = index / out_width; - - const int out_w = index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = max(in_d, -in_d); // reflect by 0 - in_d = min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth - in_h = max(in_h, -in_h); // reflect by 0 - in_h = min(in_h, 2 * in_height - in_h - 2); // reflect by in_height - in_w = max(in_w, -in_w); // reflect by 0 - in_w = min(in_w, 2 * in_width - in_w - 2); // reflect by in_width - out_data[index] = - in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * - in_width + - in_w]; - } -} - -template -__global__ void Pad3DReflectNDHWC(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int n = index / channels; - const int c = index % channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = max(in_d, -in_d); - in_d = min(in_d, 2 * in_depth - in_d - 2); - in_h = max(in_h, -in_h); - in_h = min(in_h, 2 * in_height - in_h - 2); - in_w = max(in_w, -in_w); - in_w = min(in_w, 2 * in_width - in_w - 2); - - out_data[index] = in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c]; - } -} - -template -__global__ void Pad3DReplicateNCDHW(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int nc = index / out_width; - - const int out_w = index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); - int in_h = min(in_height - 1, max(out_h - pad_top, 0)); - int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - - out_data[index] = - in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * - in_width + - in_w]; - } -} - -template -__global__ void Pad3DReplicateNDHWC(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - 
CUDA_KERNEL_LOOP(index, nthreads) { - int n = index / channels; - const int c = index % channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - - int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); - int in_h = min(in_height - 1, max(out_h - pad_top, 0)); - int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - - out_data[index] = in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c]; - } -} - -template -__global__ void Pad3DCircularNCDHW(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int nc = index / out_width; - - const int out_w = index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - out_data[index] = - in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * - in_width + - in_w]; - } -} - -template -__global__ void Pad3DCircularNDHWC(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int n = index / channels; - const int c = index % channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - out_data[index] = in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c]; - } -} - -template -__global__ void Pad3DGradConstNCDHW(const int in_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data) { - CUDA_KERNEL_LOOP(in_index, in_size) { - const int in_w = in_index % in_width; - - int nc = in_index / in_width; - const int in_h = nc % in_height; - - nc /= in_height; - const int in_d = nc % in_depth; - - nc /= in_depth; - - const int out_d = in_d + pad_front; - const int out_h = in_h + pad_top; - const int out_w = in_w + pad_left; - d_in_data[in_index] = - d_out_data[nc * out_depth * out_height * out_width + - out_d * out_height * out_width + out_h * out_width + out_w]; - } -} - -template -__global__ void Pad3DGradConstNDHWC(const int in_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int 
out_width, - const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data) { - CUDA_KERNEL_LOOP(in_index, in_size) { - const int c = in_index % channels; - int n = in_index / channels; - - const int in_w = n % in_width; - n /= in_width; - - const int in_h = n % in_height; - n /= in_height; - - const int in_d = n % in_depth; - n /= in_depth; - - const int out_d = in_d + pad_front; - const int out_h = in_h + pad_top; - const int out_w = in_w + pad_left; - - d_in_data[in_index] = - d_out_data[n * out_depth * out_height * out_width * channels + - out_d * out_height * out_width * channels + - out_h * out_width * channels + out_w * channels + c]; - } -} - -template -__global__ void Pad3DGradReflectNCDHW(const int out_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - int nc = out_index / out_width; - const int out_w = out_index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = max(in_d, -in_d); - in_h = max(in_h, -in_h); - in_w = max(in_w, -in_w); - - in_d = min(in_d, 2 * in_depth - in_d - 2); - in_h = min(in_h, 2 * in_height - in_h - 2); - in_w = min(in_w, 2 * in_width - in_w - 2); - - platform::CudaAtomicAdd( - &d_in_data[nc * in_depth * in_height * in_width + - in_d * in_height * in_width + in_h * in_width + in_w], - d_out_data[out_index]); - } -} - -template -__global__ void Pad3DGradReflectNDHWC(const int out_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - const int c = out_index % channels; - int n = out_index / channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = max(in_d, -in_d); - in_h = max(in_h, -in_h); - in_w = max(in_w, -in_w); - - in_d = min(in_d, in_depth * 2 - in_d - 2); - in_h = min(in_h, in_height * 2 - in_h - 2); - in_w = min(in_w, in_width * 2 - in_w - 2); - platform::CudaAtomicAdd( - &d_in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c], - d_out_data[out_index]); - } -} - -template -__global__ void Pad3DGradReplicateNCDHW( - const int out_size, T* d_in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, const int out_width, - const int pad_front, const int pad_top, const int pad_left, - const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - int nc = out_index / out_width; - const int out_w = out_index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - const int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); - const int in_h = min(in_height 
- 1, max(out_h - pad_top, 0)); - const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - - platform::CudaAtomicAdd( - &d_in_data[nc * in_depth * in_height * in_width + - in_d * in_height * in_width + in_h * in_width + in_w], - d_out_data[out_index]); - } -} - -template -__global__ void Pad3DGradReplicateNDHWC( - const int out_size, T* d_in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, const int out_width, - const int pad_front, const int pad_top, const int pad_left, - const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - const int c = out_index % channels; - int n = out_index / channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - - const int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); - const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); - const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - - platform::CudaAtomicAdd( - &d_in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c], - d_out_data[out_index]); - } -} - -template -__global__ void Pad3DGradCircularNCDHW(const int out_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - int nc = out_index / out_width; - const int out_w = out_index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - platform::CudaAtomicAdd( - &d_in_data[nc * in_depth * in_height * in_width + - in_d * in_height * in_width + in_h * in_width + in_w], - d_out_data[out_index]); - } -} - -template -__global__ void Pad3DGradCircularNDHWC(const int out_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - const int c = out_index % channels; - int n = out_index / channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - platform::CudaAtomicAdd( - &d_in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c], - d_out_data[out_index]); - } -} - -static inline std::vector GetPaddings( - const framework::ExecutionContext& context) { - std::vector paddings(6); - auto* paddings_data = context.Input("Paddings"); - if (paddings_data) { - Tensor pads; - framework::TensorCopySync(*paddings_data, 
platform::CPUPlace(), &pads); - auto pads_data = pads.data(); - std::memcpy(paddings.data(), pads_data, paddings.size() * sizeof(int)); - } else { - auto pads = context.Attr>("paddings"); - std::copy(pads.begin(), pads.end(), paddings.data()); - } - return paddings; -} - -template -class Pad3dCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - T value = static_cast(context.Attr("value")); - - auto* x = context.Input("X"); - auto in_dims = x->dims(); - const T* in_data = x->data(); - auto* out = context.Output("Out"); - auto out_dims = out->dims(); - if (data_format == "NCDHW") { - out_dims[0] = in_dims[0]; - out_dims[1] = in_dims[1]; - out_dims[2] = in_dims[2] + pads[4] + pads[5]; - out_dims[3] = in_dims[3] + pads[2] + pads[3]; - out_dims[4] = in_dims[4] + pads[0] + pads[1]; - } else { - out_dims[0] = in_dims[0]; - out_dims[1] = in_dims[1] + pads[4] + pads[5]; - out_dims[2] = in_dims[2] + pads[2] + pads[3]; - out_dims[3] = in_dims[3] + pads[0] + pads[1]; - out_dims[4] = in_dims[4]; - } - T* out_data = out->mutable_data(out_dims, context.GetPlace()); - - int channels = in_dims[1]; - int in_depth = in_dims[2]; - int in_height = in_dims[3]; - int in_width = in_dims[4]; - int out_depth = out_dims[2]; - int out_height = out_dims[3]; - int out_width = out_dims[4]; - if (data_format == "NDHWC") { - channels = in_dims[4]; - in_depth = in_dims[1]; - in_height = in_dims[2]; - in_width = in_dims[3]; - out_depth = out_dims[1]; - out_height = out_dims[2]; - out_width = out_dims[3]; - } - - if (mode == "reflect") { - PADDLE_ENFORCE_GT(in_depth, pads[4], - platform::errors::InvalidArgument( - "The depth of Input(X)'s dimension should be " - "greater than pad_front" - " in reflect mode" - ", but received depth(%d) and pad_front(%d).", - in_depth, pads[4])); - PADDLE_ENFORCE_GT(in_depth, pads[5], - platform::errors::InvalidArgument( - "The depth of Input(X)'s dimension should be " - "greater than pad_back" - " in reflect mode" - ", but received depth(%d) and pad_back(%d).", - in_depth, pads[5])); - - PADDLE_ENFORCE_GT(in_height, pads[2], - platform::errors::InvalidArgument( - "The height of Input(X)'s dimension should be " - "greater than pad_top" - " in reflect mode" - ", but received depth(%d) and pad_top(%d).", - in_height, pads[2])); - PADDLE_ENFORCE_GT(in_height, pads[3], - platform::errors::InvalidArgument( - "The height of Input(X)'s dimension should be " - "greater than pad_bottom" - " in reflect mode" - ", but received depth(%d) and pad_bottom(%d).", - in_height, pads[3])); - - PADDLE_ENFORCE_GT(in_width, pads[0], - platform::errors::InvalidArgument( - "The width of Input(X)'s dimension should be " - "greater than pad_left" - " in reflect mode" - ", but received depth(%d) and pad_left(%d).", - in_width, pads[0])); - PADDLE_ENFORCE_GT(in_width, pads[1], - platform::errors::InvalidArgument( - "The width of Input(X)'s dimension should be " - "greater than pad_right" - " in reflect mode" - ", but received depth(%d) and pad_right(%d).", - in_width, pads[1])); - } else if (mode == "circular" || mode == "replicate") { - PADDLE_ENFORCE_NE(in_depth * in_height * in_width, 0, - platform::errors::InvalidArgument( - "The input tensor size can not be 0 for circular " - "or replicate padding mode.")); - } - - const int pad_left = pads[0]; - const int pad_top = pads[2]; - const int pad_front = pads[4]; - 
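// For reference: the Pad3D / Pad3DGrad* kernels above all share one output->input
// index mapping per padding mode. A minimal host-side sketch of that arithmetic,
// with illustrative names only (not a Paddle API):
#include <algorithm>

enum class PadMode { kReflect, kReplicate, kCircular };

// Maps a padded output coordinate back to an input coordinate along one axis.
inline int MapPadIndex(int out_idx, int pad, int in_size, PadMode mode) {
  int in_idx = out_idx - pad;
  switch (mode) {
    case PadMode::kReflect:
      in_idx = std::max(in_idx, -in_idx);                 // mirror across 0
      return std::min(in_idx, 2 * in_size - in_idx - 2);  // mirror across in_size - 1
    case PadMode::kReplicate:
      return std::min(in_size - 1, std::max(in_idx, 0));  // clamp to the border
    case PadMode::kCircular:
      return ((in_idx % in_size) + in_size) % in_size;    // wrap around
  }
  return in_idx;
}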
const int num = in_dims[0]; - - auto stream = context.cuda_device_context().stream(); - int block = PADDLE_CUDA_NUM_THREADS; - const int out_size = out->numel(); - int grid = (out_size + block - 1) / block; - - if (data_format == "NCDHW") { - if (mode == "reflect") { - Pad3DReflectNCDHW<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else if (mode == "replicate") { - Pad3DReplicateNCDHW<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else if (mode == "circular") { - Pad3DCircularNCDHW<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else { - Pad3DConstNCDHW<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - value, out_data); - } - } else { - if (mode == "reflect") { - Pad3DReflectNDHWC<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else if (mode == "replicate") { - Pad3DReplicateNDHWC<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else if (mode == "circular") { - Pad3DCircularNDHWC<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else { - Pad3DConstNDHWC<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - value, out_data); - } - } - } -}; - -template -class Pad3dGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_in = context.Output(framework::GradVarName("X")); - auto d_in_dims = d_in->dims(); - auto d_out_dims = d_out->dims(); - const T* d_out_data = d_out->data(); - T* d_in_data = d_in->mutable_data(context.GetPlace()); - - phi::funcs::SetConstant set_zero; - set_zero(context.template device_context(), - d_in, static_cast(0)); - - const int pad_left = pads[0]; - const int pad_top = pads[2]; - const int pad_front = pads[4]; - - const int num = d_in_dims[0]; - - auto stream = context.cuda_device_context().stream(); - int block = PADDLE_CUDA_NUM_THREADS; - const int out_size = d_out->numel(); - const int in_size = d_in->numel(); - int grid = (out_size + block - 1) / block; - - if (data_format == "NCDHW") { - const int channels = d_in_dims[1]; - const int in_depth = d_in_dims[2]; - const int in_height = d_in_dims[3]; - const int in_width = d_in_dims[4]; - const int out_depth = d_out_dims[2]; - const int out_height = d_out_dims[3]; - const int out_width = d_out_dims[4]; - - if (mode == "reflect") { - Pad3DGradReflectNCDHW<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else if (mode == "replicate") { - Pad3DGradReplicateNCDHW<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - 
out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else if (mode == "circular") { - Pad3DGradCircularNCDHW<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else { - grid = (in_size + block - 1) / block; - Pad3DGradConstNCDHW<<>>( - in_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } - } else { - const int channels = d_in_dims[4]; - const int in_depth = d_in_dims[1]; - const int in_height = d_in_dims[2]; - const int in_width = d_in_dims[3]; - const int out_depth = d_out_dims[1]; - const int out_height = d_out_dims[2]; - const int out_width = d_out_dims[3]; - if (mode == "reflect") { - Pad3DGradReflectNDHWC<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else if (mode == "replicate") { - Pad3DGradReplicateNDHWC<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else if (mode == "circular") { - Pad3DGradCircularNDHWC<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else { - grid = (in_size + block - 1) / block; - Pad3DGradConstNDHWC<<>>( - in_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(pad3d, ops::Pad3dCUDAKernel, - ops::Pad3dCUDAKernel, - ops::Pad3dCUDAKernel, ops::Pad3dCUDAKernel, - ops::Pad3dCUDAKernel); -REGISTER_OP_CUDA_KERNEL(pad3d_grad, ops::Pad3dGradCUDAKernel, - ops::Pad3dGradCUDAKernel, - ops::Pad3dGradCUDAKernel); diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc index 0a5d54e72c8454b46e63c2efc9bd79fad822f721..83a21a919dcaaf6341bc13c2503f0c772c9ec6f6 100644 --- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc +++ b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc @@ -12,9 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/reduce_ops/frobenius_norm_op.h" - #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { @@ -56,22 +59,12 @@ class FrobeniusNormOpMaker : public ops::ReduceOpMaker { virtual std::string GetOpType() const { return "Reduce frobenius_norm"; } }; +DECLARE_INFER_SHAPE_FUNCTOR(frobenius_norm, FrobeniusNormInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + REGISTER_OPERATOR(frobenius_norm, ops::ReduceOp, FrobeniusNormOpMaker, ops::FrobeniusNormOpGradMaker, - ops::FrobeniusNormOpGradMaker); + ops::FrobeniusNormOpGradMaker, + FrobeniusNormInferShapeFunctor); REGISTER_OPERATOR(frobenius_norm_grad, ops::ReduceGradOp); - -REGISTER_OP_CPU_KERNEL(frobenius_norm, - ops::ReduceKernel, - ops::ReduceKernel); - -template -using CPUFrobeniusNormGradKernel = - ops::FrobeniusNormGradKernel; - -REGISTER_OP_CPU_KERNEL(frobenius_norm_grad, CPUFrobeniusNormGradKernel, - CPUFrobeniusNormGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cu b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cu deleted file mode 100644 index b2cef09df94368d17171d5fb79fbc5e6ad332fe1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reduce_ops/frobenius_norm_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" - -template -using CUDAFrobeniusNormKernel = - ops::ReduceKernel; - -REGISTER_OP_CUDA_KERNEL(frobenius_norm, CUDAFrobeniusNormKernel, - CUDAFrobeniusNormKernel); - -template -using CUDAFrobeniusNormGradKernel = - ops::ReduceGradKernel; - -REGISTER_OP_CUDA_KERNEL(frobenius_norm_grad, CUDAFrobeniusNormGradKernel, - CUDAFrobeniusNormGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.h b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.h deleted file mode 100644 index 0b6b87d99ecd98e65c492fb96f3a1e886b7bfa4b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.h +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" - -namespace paddle { -namespace operators { - -// \partial \| X \|_F = \frac{X}{ \| X \|_F } -template -class FrobeniusNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // default use Eigen broadcast - ReduceGradKernel kernel; - kernel.Compute(context); - } -}; - -struct FrobeniusNormFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = ((x->square()).sum(dim)).sqrt(); - } -}; - -struct FrobeniusNormGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - dx->device(place) = y->broadcast(dim); - dx->device(place) = *dx + dx->constant(1e-12f); - dx->device(place) = (*x / *dx) * (dy->broadcast(dim)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 04c8a329e5e1a3cc7177a09d592d46ba3ac700ec..de09860fd26d54894a7917502c4ac569dd5fdd4a 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -117,7 +117,7 @@ endif() cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) # seperate init from device_context to avoid cycle dependencies -cc_library(init SRCS init.cc DEPS device_context custom_kernel) +cc_library(init SRCS init.cc DEPS device_context custom_kernel context_pool) # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt index acf914c5087d0ff11cda2d663a490e84a8c33216..42c949f7fe0f673e932c67313768e8e898992814 100644 --- a/paddle/fluid/platform/device/ipu/CMakeLists.txt +++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt @@ -13,7 +13,7 @@ IF(WITH_IPU) "ipu_device.cc" ) - cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart-only graph graph_helper) + cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart-only graph graph_helper popdist) cc_library(ipu_info SRCS ${IPU_INFO_SRC} DEPS popart-only enforce) add_library(paddle_ipu SHARED ${PADDLE_IPU_SRC}) add_dependencies(paddle_ipu ipu_backend) diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.cc b/paddle/fluid/platform/device/ipu/ipu_backend.cc index e0b3b08a2313d0ba80e807494eb74612caf81fd5..012294d0fff8565a2e7ea85fc8cfe6b170fd5e8d 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.cc +++ b/paddle/fluid/platform/device/ipu/ipu_backend.cc @@ -32,6 +32,7 @@ IpuBackend* IpuBackend::GetInstance() { IpuBackend::IpuBackend() { compiler_ = std::make_unique(); executor_ = std::make_unique(); + timer_ = std::make_unique(); } IpuBackend::~IpuBackend() { @@ -43,6 +44,7 @@ void IpuBackend::Compile(Graph* graph, const std::vector& feed_list, const std::vector& fetch_list) { VLOG(10) << "enter IpuBackend::Compile"; + is_compiled_ = false; compiler_->Prepare(graph); compiler_->InitInputs(feed_list); compiler_->LowerConstants(scope_); @@ -52,31 +54,25 @@ void IpuBackend::Compile(Graph* graph, if (ipu_strategy_->is_training) { compiler_->LowerOptimizer(scope_); } + if (!ipu_strategy_->onnx_dump_path.empty()) { + SaveModelProto(ipu_strategy_->onnx_dump_path); + } 
executor_->SetCompilerResources(compiler_->GetResources()); - + executor_->Prepare(compiler_->GetModelProto()); is_compiled_ = true; - // when call compile, means a new graph - is_prepared_ = false; VLOG(10) << "leave IpuBackend::Compile"; } void IpuBackend::Run(const std::vector& inputs, const std::vector& outputs, const framework::ExecutionContext& ctx) { - Prepare(); timer_->Start(); executor_->Run(inputs, outputs, ctx); timer_->Pause(); VLOG(10) << "[IPU Run]: " << timer_->ElapsedMS() << " (ms)"; } -void IpuBackend::Prepare() { - if (!is_prepared_) { - executor_->Prepare(compiler_->GetModelProto()); - timer_.reset(new platform::Timer()); - is_prepared_ = true; - } -} +void IpuBackend::WeightsToHost() { executor_->WeightsToHost(); } void IpuBackend::Detach() { executor_->Detach(); } @@ -101,12 +97,10 @@ void IpuBackend::SetIpuStrategy(const IpuStrategy& strategy) { } void IpuBackend::SaveModelProto(const std::string& path) { - if (ipu_strategy_->is_training && is_prepared_) { + if (ipu_strategy_->is_training && is_compiled_) { executor_->SaveModelToHost(path); - } else if (is_compiled_) { - compiler_->SaveModelProtoNoCheck(path); } else { - LOG(WARNING) << "Model is empty"; + compiler_->SaveModelProtoNoCheck(path); } } diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.h b/paddle/fluid/platform/device/ipu/ipu_backend.h index 1244192490c16c4cfb01ac1c5f195cc123c4ba16..0578d9face675ab2754eed213b1044ed8b2cd707 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.h +++ b/paddle/fluid/platform/device/ipu/ipu_backend.h @@ -60,6 +60,9 @@ class IpuBackend { const std::vector &outputs, const framework::ExecutionContext &ctx); + // Sync weights from IPU while training + void WeightsToHost(); + // detach IPU manually void Detach(); @@ -76,22 +79,17 @@ class IpuBackend { void SaveModelProto(const std::string &path); private: - void Prepare(); - - private: - std::unique_ptr compiler_; - std::unique_ptr executor_; - bool is_compiled_ = false; - bool is_prepared_ = false; - // not own const Scope *scope_ = nullptr; const IpuStrategy *ipu_strategy_ = nullptr; - private: - // time record for IpuBackend::Run + // own + std::unique_ptr compiler_; + std::unique_ptr executor_; std::unique_ptr timer_; + bool is_compiled_ = false; + DISABLE_COPY_AND_ASSIGN(IpuBackend); }; diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index cdb3f6f9b3e285728d5c372b51492e42027aadba..1a3e600058b3b1d55082544af79cdb945cc0f50e 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/platform/device/ipu/ipu_utils.h" @@ -25,13 +26,20 @@ namespace paddle { namespace platform { namespace ipu { -popart::AdamMode AdamModeFromStr(const std::string& str) { +popart::AdamMode AdamModeFromStr(const std::string& str, + const bool& use_no_bias_optimizer) { if (str == "adam") { - return popart::AdamMode::Adam; + if (!use_no_bias_optimizer) + return popart::AdamMode::Adam; + else + return popart::AdamMode::AdamNoBias; } else if (str == "adamax") { return popart::AdamMode::AdaMax; } else if (str == "lamb") { - return popart::AdamMode::Lamb; + if (!use_no_bias_optimizer) + return popart::AdamMode::Lamb; + else + return popart::AdamMode::LambNoBias; } else { PADDLE_THROW(platform::errors::InvalidArgument( "Uknown AdamMode: %s, AdamMode must be one of these values: adam, " @@ -70,6 +78,17 @@ 
popart::WeightDecayMode WeightDecayModeFromStr(const std::string& str) { } } +popart::DataType DataTypeFromStr(const std::string& str) { + if (str == "FLOAT") { + return popart::DataType::FLOAT; + } else if (str == "FLOAT16") { + return popart::DataType::FLOAT16; + } else { + PADDLE_THROW( + platform::errors::Unimplemented("Unsupported DataType: %s", str)); + } +} + template T GetAttrAllowNull(std::string attr, OpDesc* op_desc) { if (op_desc->HasAttr(attr)) { @@ -122,6 +141,17 @@ void Compiler::Prepare(const Graph* graph) { builder_ = popart::Builder::create(); resources_ = std::make_unique(); graph_helper_ = std::make_unique(graph); + // Set the flag of set_amp_for_all_ + for (auto* node : graph_helper_->sorted_ops) { + auto* op_desc = node->Op(); + auto op_type = op_desc->Type(); + if (op_type == "popart_matmul") { + if (op_desc->HasAttr(sAvailMemAttribute)) { + set_amp_for_all_ = false; + return; + } + } + } } void Compiler::RegisterOpFunc() { @@ -155,7 +185,9 @@ void Compiler::RegisterOpFunc() { auto debug_context = BuildDebugContext(op_desc); \ auto aiGraphcoreOpset = builder_->aiGraphcoreOpset1(); \ auto aiOnnxOpset = builder_->aiOnnxOpset11(); \ + PushNameScope(op_desc); \ auto output_ids = OnnxImpl(inputs Args, debug_context); \ + PopNameScope(op_desc); \ SetIpuIndexStage(output_ids, op_desc); \ SetAMPAttributes(output_ids, op_desc); \ SetSerializeAttributes(output_ids, op_desc); \ @@ -241,7 +273,9 @@ void Compiler::LowerConstants(const Scope* scope) { popart::TensorInfo tensor_info(PdDataType2PopartType(tensor->dtype()), shape); const_data.reset(new popart::ConstVoidData(tensor->data(), tensor_info)); + PushNameScope(op_desc); popart::TensorId result = builder_->aiOnnxOpset11().constant(*const_data); + PopNameScope(op_desc); SetIpuIndexStage(result, op_desc); resources_->tensors.emplace(tensor_name, result); } @@ -261,6 +295,10 @@ void Compiler::LowerWeights(const Scope* scope) { VLOG(10) << "found existed one, skip lowering Weight: " << var_name; continue; } + if (var_name.rfind("learning_rate", 0) == 0) { + VLOG(10) << "skip learning_rate_var: " << var_name; + continue; + } VLOG(10) << "lowering weight: " << var_name; auto var = scope->FindVar(var_name); @@ -273,10 +311,15 @@ void Compiler::LowerWeights(const Scope* scope) { } popart::TensorInfo tensor_info(dtype, shape); popart::ConstVoidData const_data{tensor.data(), tensor_info}; - popart::TensorId result = - builder_->addInitializedInputTensor(const_data, var_name); - resources_->tensors.emplace(var_name, result); - resources_->weights.push_back(result); + if (!node->outputs.empty()) { + auto op_node = node->outputs[0]; + PushNameScope(op_node->Op()); + popart::TensorId result = + builder_->addInitializedInputTensor(const_data, var_name); + PopNameScope(op_node->Op()); + resources_->tensors.emplace(var_name, result); + resources_->weights.push_back(var_name); + } } } } @@ -298,7 +341,10 @@ void Compiler::LowerBody() { } else if (op_type == "popart_checkpointoutput") { auto inputs = GetOpInputs(op_desc); auto outputs = GetOpOutputs(op_desc); + PushNameScope(op_desc); auto output_ids = builder_->checkpointOutput(inputs); + PopNameScope(op_desc); + SetIpuIndexStage(output_ids, op_desc); InsertTensors(outputs, output_ids); } else if (op_type == "popart_custom_op") { auto inputs = GetOpInputs(op_desc); @@ -313,9 +359,11 @@ void Compiler::LowerBody() { BOOST_GET_CONST(std::string, op_desc->GetAttr("__op_type")); VLOG(10) << "Build graph from custom op: " << __op_type; auto it = custom_ops_.find(__op_type); + 
PushNameScope(op_desc); auto output_ids = builder_->customOp(it->second.popart_op, it->second.popart_op.version, inputs, outputs.size(), attributes, debug_context); + PopNameScope(op_desc); SetIpuIndexStage(output_ids, op_desc); InsertTensors(outputs, output_ids); } else if (op_type == "popart_printtensor") { @@ -325,8 +373,10 @@ void Compiler::LowerBody() { auto print_gradient = BOOST_GET_CONST(int64_t, op_desc->GetAttr("print_gradient")); auto title = BOOST_GET_CONST(std::string, op_desc->GetAttr("title")); + PushNameScope(op_desc); auto output_ids = builder_->aiGraphcoreOpset1().printtensor( inputs, print_gradient, debug_context, title); + PopNameScope(op_desc); SetIpuIndexStage(output_ids, op_desc); InsertTensors(outputs, output_ids); } else { @@ -367,8 +417,31 @@ void Compiler::LowerOptimizer(const Scope* scope) { resources_->with_lr_sched = false; } VLOG(10) << "Set initial lr: " << resources_->lr; - auto loss_scaling = ipu_strategy_->loss_scaling; + + // Get the type of optimizer auto type = BOOST_GET_CONST(std::string, op_desc->GetAttr("type")); + // Set weight decay by tensor names for Lamb + auto weight_decay_vars = BOOST_GET_CONST( + std::vector, op_desc->GetAttr("weight_decay_vars")); + auto weight_decay_values = BOOST_GET_CONST( + std::vector, op_desc->GetAttr("weight_decay_values")); + // Get the maximum permissible value for gradient clipping + std::vector clip_norm_settings = {}; + if (op_desc->HasAttr("clip_norm")) { + auto clip_norm = BOOST_GET_CONST(float, op_desc->GetAttr("clip_norm")); + clip_norm_settings.push_back( + popart::ClipNormSettings::clipAllWeights(clip_norm)); + VLOG(10) << "Set the global gradient clipping with the maximum " + "permissible value: " + << clip_norm; + } + + // Values from ipu_strategy + auto loss_scaling = ipu_strategy_->loss_scaling; + auto accl1_type = DataTypeFromStr(ipu_strategy_->accl1_type); + auto accl2_type = DataTypeFromStr(ipu_strategy_->accl2_type); + auto accl3_type = DataTypeFromStr(ipu_strategy_->accl3_type); + if (type == "sgd") { auto weight_decay = BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay")); @@ -376,12 +449,18 @@ void Compiler::LowerOptimizer(const Scope* scope) { resources_->optimizer_fn = [=](float lr) { return std::make_unique( popart::OptimizerValue(lr, false), - popart::OptimizerValue(weight_decay, true), + popart::OptimizerValue(weight_decay, false), popart::OptimizerValue(momentum, true), popart::SGD::getUnsetDampening(), popart::SGD::getUnsetVelocityScaling(), - popart::OptimizerValue(loss_scaling, true)); + popart::OptimizerValue(loss_scaling, true), clip_norm_settings); }; + resources_->eval_optimizer = std::make_unique( + popart::OptimizerValue(0.0, false), + popart::OptimizerValue(0.0, false), + popart::OptimizerValue(0.0, true), popart::SGD::getUnsetDampening(), + popart::SGD::getUnsetVelocityScaling(), + popart::OptimizerValue(loss_scaling, true), clip_norm_settings); } else if (type == "adam") { auto weight_decay = BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay")); @@ -392,22 +471,79 @@ void Compiler::LowerOptimizer(const Scope* scope) { VLOG(10) << "set max_weight_norm: " << mwn; auto adam_mode_ = BOOST_GET_CONST(std::string, op_desc->GetAttr("adam_mode")); - auto adam_mode = AdamModeFromStr(adam_mode_); - auto weight_decay_mode_ = - BOOST_GET_CONST(std::string, op_desc->GetAttr("weight_decay_mode")); + auto adam_mode = + AdamModeFromStr(adam_mode_, ipu_strategy_->use_no_bias_optimizer); + auto weight_decay_mode_ = ipu_strategy_->weight_decay_mode; + if (weight_decay_mode_.empty()) { + 
weight_decay_mode_ = BOOST_GET_CONST( + std::string, op_desc->GetAttr("weight_decay_mode")); + } auto weight_decay_mode = WeightDecayModeFromStr(weight_decay_mode_); resources_->optimizer_fn = [=](float lr) { - return std::make_unique( - popart::OptimizerValue(lr, false), - popart::OptimizerValue(weight_decay, true), - popart::OptimizerValue(beta1, true), - popart::OptimizerValue(beta2, true), + if (adam_mode == popart::AdamMode::Lamb || + adam_mode == popart::AdamMode::LambNoBias) { + const std::map> + optimizer_value = {{"defaultLearningRate", {lr, false}}, + {"defaultBeta1", {beta1, false}}, + {"defaultBeta2", {beta2, false}}, + {"defaultEps", {eps, true}}, + {"lossScaling", {loss_scaling, true}}, + {"defaultMaxWeightNorm", {mwn, true}}}; + auto optimizer_instance = std::make_unique( + optimizer_value, adam_mode, weight_decay_mode, + popart::DataType::UNDEFINED, accl1_type, accl2_type, + clip_norm_settings); + for (int i = 0; i < weight_decay_vars.size(); i++) { + optimizer_instance->insertSpecific( + weight_decay_vars[i], + {{"weightDecay", {weight_decay_values[i], false}}}); + VLOG(10) << "Set Tensor " << weight_decay_vars[i] + << " weight decay as " << weight_decay_values[i]; + } + return optimizer_instance; + } else { + return std::make_unique( + popart::OptimizerValue(lr, false), + popart::OptimizerValue(weight_decay, false), + popart::OptimizerValue(beta1, false), + popart::OptimizerValue(beta2, false), + popart::OptimizerValue(eps, true), + popart::OptimizerValue(loss_scaling, true), + popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode, + popart::DataType::UNDEFINED, accl1_type, accl2_type, + clip_norm_settings); + } + }; + if (adam_mode == popart::AdamMode::Lamb || + adam_mode == popart::AdamMode::LambNoBias) { + const std::map> optimizer_value = + {{"defaultLearningRate", {0.0, false}}, + {"defaultBeta1", {beta1, false}}, + {"defaultBeta2", {beta2, false}}, + {"defaultEps", {eps, true}}, + {"lossScaling", {loss_scaling, true}}, + {"defaultMaxWeightNorm", {mwn, true}}}; + auto eval_optimizer = std::make_unique( + optimizer_value, adam_mode, weight_decay_mode, + popart::DataType::UNDEFINED, popart::DataType::FLOAT, + popart::DataType::FLOAT, clip_norm_settings); + for (int i = 0; i < weight_decay_vars.size(); i++) { + eval_optimizer->insertSpecific(weight_decay_vars[i], + {{"weightDecay", {0.0, false}}}); + } + resources_->eval_optimizer = std::move(eval_optimizer); + } else { + resources_->eval_optimizer = std::make_unique( + popart::OptimizerValue(0.0, false), + popart::OptimizerValue(0.0, false), + popart::OptimizerValue(beta1, false), + popart::OptimizerValue(beta2, false), popart::OptimizerValue(eps, true), popart::OptimizerValue(loss_scaling, true), popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode, popart::DataType::UNDEFINED, popart::DataType::FLOAT, - popart::DataType::FLOAT); - }; + popart::DataType::FLOAT, clip_norm_settings); + } } else if (type == "adaptive") { auto alpha = BOOST_GET_CONST(float, op_desc->GetAttr("alpha")); auto momentum = BOOST_GET_CONST(float, op_desc->GetAttr("momentum")); @@ -417,21 +553,33 @@ void Compiler::LowerOptimizer(const Scope* scope) { auto adaptive_mode_ = BOOST_GET_CONST(std::string, op_desc->GetAttr("adaptive_mode")); auto adaptive_mode = AdaptiveModeFromStr(adaptive_mode_); - auto weight_decay_mode_ = - BOOST_GET_CONST(std::string, op_desc->GetAttr("weight_decay_mode")); + auto weight_decay_mode_ = ipu_strategy_->weight_decay_mode; + if (weight_decay_mode_.empty()) { + weight_decay_mode_ = BOOST_GET_CONST( 
+ std::string, op_desc->GetAttr("weight_decay_mode")); + } auto weight_decay_mode = WeightDecayModeFromStr(weight_decay_mode_); resources_->optimizer_fn = [=](float lr) { return std::make_unique( popart::OptimizerValue(lr, false), - popart::OptimizerValue(weight_decay, true), + popart::OptimizerValue(weight_decay, false), popart::OptimizerValue(alpha, true), popart::OptimizerValue(momentum, true), popart::OptimizerValue(eps, true), popart::OptimizerValue(loss_scaling, true), adaptive_mode, - weight_decay_mode, popart::DataType::UNDEFINED, - popart::DataType::FLOAT, popart::DataType::FLOAT, - popart::DataType::FLOAT); + weight_decay_mode, popart::DataType::UNDEFINED, accl1_type, + accl2_type, accl3_type); }; + resources_->eval_optimizer = std::make_unique( + popart::OptimizerValue(0.0, false), + popart::OptimizerValue(0.0, false), + popart::OptimizerValue(alpha, true), + popart::OptimizerValue(momentum, true), + popart::OptimizerValue(eps, true), + popart::OptimizerValue(loss_scaling, true), adaptive_mode, + weight_decay_mode, popart::DataType::UNDEFINED, + popart::DataType::FLOAT, popart::DataType::FLOAT, + popart::DataType::UNDEFINED); } else { PADDLE_THROW(platform::errors::Unimplemented( "optimizer %s is not implemented", type)); @@ -510,9 +658,32 @@ void Compiler::SetAMPAttributes(const std::string& tensor_id, const OpDesc* op_desc) { VLOG(10) << "enter Compiler::SetAMPAttributes"; if (op_desc->Type() == "popart_matmul") { - auto amp = ipu_strategy_->available_memory_proportion; - if (amp > 0.0f && amp <= 1.0) { - builder_->setAvailableMemoryProportion(tensor_id, amp); + if (set_amp_for_all_) { + auto amp = ipu_strategy_->available_memory_proportion; + if (amp < 0.0f || amp > 1.0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "AvailableMemoryProportion %f is invalid, which should be set 0 <= " + "amp <= 1", + amp)); + } + if (amp > 0.0f) { + builder_->setAvailableMemoryProportion(tensor_id, amp); + } + } else { + if (op_desc->HasAttr(sAvailMemAttribute)) { + auto amp = BOOST_GET_CONST(float, op_desc->GetAttr(sAvailMemAttribute)); + if (amp < 0.0f || amp > 1.0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "AvailableMemoryProportion %f is invalid, which should be set 0 " + "<= amp <= 1", + amp)); + } + if (amp > 0.0f) { + builder_->setAvailableMemoryProportion(tensor_id, amp); + VLOG(10) << "set available_memory_proportion for tensor: " + << tensor_id << " as " << amp; + } + } } } VLOG(10) << "leave Compiler::SetAMPAttributes"; @@ -602,6 +773,29 @@ popart::DebugContext Compiler::BuildDebugContext(const OpDesc* op) { return popart::DebugContext(op_identify_id); } +void Compiler::PushNameScope(const OpDesc* op) { + auto op_namescope = BOOST_GET_CONST(std::string, op->GetAttr(sOpNamescope)); + if (op_namescope == "/") { + return; + } + if (!op_namescope.empty()) { + op_namescope.pop_back(); + } + if (!op_namescope.empty()) { + op_namescope.erase(op_namescope.begin()); + } + VLOG(10) << "name_scope is: " << op_namescope; + builder_->pushNameScope(op_namescope); +} + +void Compiler::PopNameScope(const OpDesc* op) { + auto op_namescope = BOOST_GET_CONST(std::string, op->GetAttr(sOpNamescope)); + if (op_namescope == "/") { + return; + } + builder_->popNameScope(); +} + } // namespace ipu } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.h b/paddle/fluid/platform/device/ipu/ipu_compiler.h index 5d1e8c2727d8f9ca36c9380584505dbfcabfb064..2d00970bf129750af34dcc9e2239409cd12d897e 100644 --- 
a/paddle/fluid/platform/device/ipu/ipu_compiler.h +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.h @@ -50,6 +50,8 @@ struct CompilerResources { using OptimizerFn = std::function(float lr)>; OptimizerFn optimizer_fn; + // The eval mode of optimizer in training + std::unique_ptr eval_optimizer; public: popart::Optimizer *Optimizer() { return optimizer.get(); } @@ -110,6 +112,7 @@ class Compiler { void RegisterOpFunc(); std::vector GetOpInputs(const OpDesc *op); const std::vector &GetOpOutputs(const OpDesc *op); + const std::string GetNameScope(const OpDesc *op); popart::DebugContext BuildDebugContext(const OpDesc *op); void InsertTensors(const std::vector &output_names, @@ -126,6 +129,8 @@ class Compiler { const OpDesc *op_desc); void SetSerializeAttributes(const std::string &tensor_id, const OpDesc *op_desc); + void PushNameScope(const OpDesc *op); + void PopNameScope(const OpDesc *op); private: std::unique_ptr builder_; @@ -137,6 +142,14 @@ class Compiler { const IpuStrategy *ipu_strategy_ = nullptr; std::map custom_ops_; + + // Used to choose the way to set amp for Ops + // If anyone op has the attr sAvailMemAttribute, the + // available_memory_proportion from ipu_strategy + // will be ignored and the Ops are set by their own sAvailMemAttribute. Else, + // all relevant Ops will be set by + // the available_memory_proportion from ipu_strategy. + bool set_amp_for_all_ = true; }; } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index c124d58957fe642365bd5bbf074bc15bfd74c6ba..649b291244110e69c364dc50d7840d47040e9ab0 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -64,15 +64,10 @@ void Executor::Prepare(const std::string &proto) { WeightsFromPaddle(); VLOG(10) << "Copy weights from paddle to popart...done"; - VLOG(10) << "Copy weights from host to device..."; - session_->weightsFromHost(); - VLOG(10) << "Copy weights from host to device...done"; - - if (ipu_strategy_->save_init_onnx) { - session_->modelToHost("test_init.onnx"); + if (ipu_strategy_->random_seed != std::numeric_limits::max()) { + VLOG(10) << "Setting random seed to: " << ipu_strategy_->random_seed; + session_->setRandomSeed(ipu_strategy_->random_seed); } - // init run step - step_ = 0; } void Executor::Run(const std::vector &inputs, @@ -120,11 +115,17 @@ void Executor::Run(const std::vector &inputs, VLOG(10) << "Prepared inputs/anchors"; if (ipu_strategy_->is_training && compiler_resources_->with_lr_sched) { - VLOG(10) << "Update learning_rate"; - auto new_lr = - GetSingleVarFromScope(scope_, compiler_resources_->lr_var); - VLOG(10) << "New Lr: " << new_lr; - auto *optimizer = compiler_resources_->UpdateOptimizer(new_lr); + popart::Optimizer *optimizer; + if (ipu_strategy_->runtime_options.enable_eval) { + VLOG(10) << "Switch optimizer to eval mode"; + optimizer = compiler_resources_->eval_optimizer.get(); + } else { + VLOG(10) << "Update learning_rate"; + auto new_lr = + GetSingleVarFromScope(scope_, compiler_resources_->lr_var); + VLOG(10) << "New Lr: " << new_lr; + optimizer = compiler_resources_->UpdateOptimizer(new_lr); + } auto *session = dynamic_cast(session_.get()); session->updateOptimizerFromHost(optimizer); } @@ -133,15 +134,13 @@ void Executor::Run(const std::vector &inputs, VLOG(10) << "Running..."; session_->run(stepio); VLOG(10) << "Running...done"; +} - step_++; - if (ipu_strategy_->is_training && - step_ % ipu_strategy_->save_per_n_step == 0) { - 
session_->weightsToHost(); +void Executor::WeightsToHost() { + if (ipu_strategy_->is_training && session_) { WeightsToPaddle(); - if (ipu_strategy_->save_onnx_checkpoint) { - session_->modelToHost("test_last" + std::to_string(step_) + ".onnx"); - } + } else { + LOG(WARNING) << "For a non-trainning graph, cannot sync weights from IPU."; } } @@ -153,6 +152,7 @@ void Executor::AcquireDevice() { } bool use_ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); + bool enable_distribution = ipu_strategy_->enable_distribution; if (use_ipu_model) { std::map deviceOpts{ { @@ -162,6 +162,16 @@ void Executor::AcquireDevice() { }; device_ = popart::DeviceManager::createDeviceManager().createIpuModelDevice( deviceOpts); + } else if (enable_distribution) { + auto ipus_per_replica = ipu_strategy_->num_ipus / + ipu_strategy_->popart_options.replicatedGraphCount; + auto device_id = popdist_get_device(ipus_per_replica); + device_ = popart::DeviceManager::createDeviceManager().acquireDeviceById( + device_id); + PADDLE_ENFORCE_NOT_NULL( + device_, platform::errors::Unavailable( + "Can't attach IPU in distribution, ipu_num = %d.", + RequestIpus(ipu_strategy_->num_ipus))); } else { device_ = popart::DeviceManager::createDeviceManager().acquireAvailableDevice( @@ -185,28 +195,29 @@ void Executor::SetWeightsIO() { auto opt_type = compiler_resources_->optimizer_type; VLOG(10) << "SetWeightsIO for " << opt_type; auto pre_post_fix = GetOptPrePostfix(opt_type); - for (const auto &weight_id : compiler_resources_->weights) { + for (const auto &weight_pd : compiler_resources_->weights) { for (const auto &pair : pre_post_fix) { // pair.first : popart prefix, pair.second : paddle postfix - auto popart_var_name = pair.first + weight_id; - auto paddle_var_name = weight_id + pair.second; + auto weight_pop = compiler_resources_->tensors[weight_pd]; + auto popart_var = pair.first + weight_pop; + auto paddle_var = weight_pd + pair.second; - if (scope_->FindVar(paddle_var_name) == nullptr) { + if (scope_->FindVar(paddle_var) == nullptr) { continue; } - - if (!session_->hasInfo(popart_var_name)) { + if (!session_->hasInfo(popart_var)) { continue; } - auto var = scope_->GetVar(paddle_var_name); + VLOG(10) << "Connect paddle weight: " << paddle_var + << " with popart weight: " << popart_var; + auto var = scope_->GetVar(paddle_var); auto data_ptr = var->GetMutable()->data(); - - auto tensor_info = session_->getInfo(popart_var_name); - executor_resources_->weights_io.insert(popart_var_name, + auto tensor_info = session_->getInfo(popart_var); + executor_resources_->weights_io.insert(popart_var, {data_ptr, tensor_info}); executor_resources_->weights_and_opt_state.emplace_back( - std::make_pair(popart_var_name, paddle_var_name)); + std::make_pair(popart_var, paddle_var)); } } } @@ -284,6 +295,7 @@ void Executor::ConvertWeights(bool align_to_popart) { void Executor::WeightsFromPaddle() { ConvertWeights(true); session_->writeWeights(executor_resources_->weights_io); + session_->weightsFromHost(); } // |-----------------------------------------------------| @@ -297,13 +309,13 @@ void Executor::WeightsFromPaddle() { // Paddle -> halfToFloat: cast then save to paddle // Popart -> Paddle: copy from paddle to popart void Executor::WeightsToPaddle() { + session_->weightsToHost(); session_->readWeights(executor_resources_->weights_io); ConvertWeights(false); } void Executor::SaveModelToHost(const std::string &path) { if (session_) { - session_->weightsToHost(); WeightsToPaddle(); session_->modelToHost(path); } else { diff --git 
a/paddle/fluid/platform/device/ipu/ipu_executor.h b/paddle/fluid/platform/device/ipu/ipu_executor.h index b08b94b45ff65d9e04da0447f55801859a59bb1b..c59e623ab20b0215f3ecb8f0f6811e4cb0ee2997 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.h +++ b/paddle/fluid/platform/device/ipu/ipu_executor.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #include #include +#include #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" @@ -36,8 +37,7 @@ struct ExecutorResources { // map popart::WeightsIO weights_io; // pairs, include weights and optimizer states - std::vector> - weights_and_opt_state; + std::vector> weights_and_opt_state; }; class Executor { @@ -53,14 +53,12 @@ class Executor { const std::vector &outputs, const framework::ExecutionContext &ctx); + // sync weights from popart to paddle + void WeightsToHost(); + // detach IPU void Detach(); - void SetWeightsIO(); - void ConvertWeights(bool align_to_popart); - void WeightsFromPaddle(); - void WeightsToPaddle(); - // Scope void SetScope(const Scope *scope) { scope_ = scope; } @@ -79,6 +77,10 @@ class Executor { private: void AcquireDevice(); + void SetWeightsIO(); + void ConvertWeights(bool); + void WeightsFromPaddle(); + void WeightsToPaddle(); private: // not own @@ -92,8 +94,6 @@ class Executor { std::unique_ptr session_; // one OneSession means a graph std::unique_ptr executor_resources_; - - int step_ = 0; }; } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/ipu_names.h b/paddle/fluid/platform/device/ipu/ipu_names.h index a809a8c6e5bcc30fca2adde99c87581feb8a79a4..b8a6ceffb5c153c9fd9c1f8ffa2b849553da38af 100644 --- a/paddle/fluid/platform/device/ipu/ipu_names.h +++ b/paddle/fluid/platform/device/ipu/ipu_names.h @@ -24,6 +24,8 @@ static constexpr const char *sIpuIndexAttr = "ipu_index"; static constexpr const char *sIpuStageAttr = "ipu_stage"; static constexpr const char *sMatmulSerializeFactor = "serialize_factor"; static constexpr const char *sMatmulSerializeMode = "serialize_mode"; +static constexpr const char *sAvailMemAttribute = "__available_memory"; +static constexpr const char *sOpNamescope = "op_namescope"; static constexpr const char *sOpIdentifyIdAttr = "op_identify_id"; static constexpr const char *sDebugInfoId = "__debug_info_id"; diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index e806b0b30e4e03759847cc2e1838171020a064b1..6172d4d7dc6800d3f48d88357ab8feb36c00e463 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -62,23 +62,40 @@ IpuStrategy::IpuStrategy() { [&]() { return name; }) ADD_BOOL_OPTION(is_training); - ADD_BOOL_OPTION(save_init_onnx); - ADD_BOOL_OPTION(save_onnx_checkpoint); ADD_BOOL_OPTION(need_avg_shard); ADD_BOOL_OPTION(enable_fp16); + ADD_BOOL_OPTION(transfer_cast_op); + ADD_BOOL_OPTION(use_no_bias_optimizer); + ADD_BOOL_OPTION(enable_distribution); ADD_UINT64_OPTION(num_ipus); ADD_UINT64_OPTION(batches_per_step); ADD_UINT64_OPTION(micro_batch_size); - ADD_UINT64_OPTION(save_per_n_step); + ADD_UINT64_OPTION(random_seed); ADD_DOUBLE_OPTION(available_memory_proportion); ADD_DOUBLE_OPTION(loss_scaling); ADD_DOUBLE_OPTION(max_weight_norm); + ADD_STRING_OPTION(accl1_type); + ADD_STRING_OPTION(accl2_type); + ADD_STRING_OPTION(accl3_type); + ADD_STRING_OPTION(onnx_dump_path); + ADD_STRING_OPTION(weight_decay_mode); #undef ADD_STRING_OPTION #undef ADD_DOUBLE_OPTION #undef ADD_UINT64_OPTION #undef ADD_BOOL_OPTION 
+#define ADD_RUNTIME_BOOL_OPTION(name, aliased_name) \ + RegisterSetter(bool_options, #name, \ + [&](bool value) { runtime_options.aliased_name = value; }); \ + RegisterGetter(options_getter, options_type, #name, "bool", [&]() { \ + return std::to_string(runtime_options.aliased_name); \ + }) + + ADD_RUNTIME_BOOL_OPTION(runtime_options.enable_eval, enable_eval); + +#undef ADD_RUNTIME_BOOL_OPTION + #define ADD_POPART_ENUM_OPTION_ALIAS(name, aliased_name, EnumType) \ RegisterSetter(uint64_options, #name, [&](std::uint64_t value) { \ PADDLE_ENFORCE_LT( \ @@ -171,6 +188,7 @@ IpuStrategy::IpuStrategy() { ADD_POPART_UINT64_OPTION_ALIAS(merge_var_update_mem_threshold, mergeVarUpdateMemThreshold); ADD_POPART_UINT64_OPTION_ALIAS(loose_threshold_at_peak, looseThresholdAtPeak); + ADD_POPART_UINT64_OPTION_ALIAS(replicated_graph_count, replicatedGraphCount); ADD_POPART_UINT64_OPTION_ALIAS(accumulation_factor, accumulationFactor); ADD_POPART_UINT64_OPTION_ALIAS(swap_limit_scheduler, swapLimitScheduler); ADD_POPART_UINT64_OPTION_ALIAS(global_replication_factor, @@ -462,12 +480,30 @@ void IpuStrategy::SetTensorLocation(const std::string& tensor, } else if (opt == "use_io_tiles_to_store") { settings->location.storageTileSet = value > 0 ? popart::TileSet::IO : popart::TileSet::Compute; + } else if (opt == "sharding_domain_with_all") { + settings->location.shardingDomain = + popart::CommGroup(popart::CommGroupType::All, value); + } else if (opt == "sharding_domain_with_consecutive") { + settings->location.shardingDomain = + popart::CommGroup(popart::CommGroupType::Consecutive, value); + } else if (opt == "sharding_domain_with_orthogonal") { + settings->location.shardingDomain = + popart::CommGroup(popart::CommGroupType::Orthogonal, value); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Unknown option ' %s' for tensor location: %s", opt, tensor)); } } +void IpuStrategy::SetAccumulateOuterFragmentSettings( + const std::uint64_t& schedule, const std::vector& values) { + VLOG(10) << "SetAccumulateOuterFragmentSettings schedule:" << schedule; + auto schedule_ = + static_cast(schedule); + popart_options.accumulateOuterFragmentSettings = + popart::AccumulateOuterFragmentSettings(schedule_, values); +} + void IpuStrategy::AddCustomOp(const std::string& paddle_op, const std::string& popart_op, const std::string& domain, int version) { diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index 571fb1e163718388a779e128fb6aaf76659d7183..786e2419cc0be91394157edb26dd9c36d9c2a67c 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -24,6 +24,11 @@ namespace paddle { namespace platform { namespace ipu { +struct RuntimeOptions { + // enable the eval mode in training by switching optimizers. 
+ bool enable_eval = false; +}; + class IpuStrategy { public: IpuStrategy(); @@ -32,19 +37,24 @@ class IpuStrategy { // training flag, true for training bool is_training = true; - // save the onnx model lowered by paddle program description - bool save_init_onnx = false; - - // save the trained model - bool save_onnx_checkpoint = false; - // average sharding, debugging used bool need_avg_shard = false; // flag for fp16, true for pure fp16 bool enable_fp16 = false; - // Number ipus total needed, replica * ipu_per_replica + // enable transfer cast Op target from fp32 to fp16 in fp16 mode + bool transfer_cast_op = true; + + // The mode of Adam/Lamb optimizer + // false: The standard Adam/Lamb optimizer + // true: The Adam_No_Bias/Lamb_No_Bias optimizer from PopART + bool use_no_bias_optimizer = false; + + // enable distributed computing for POD128 or POD256 + bool enable_distribution = false; + + // Number ipus total needed, local_replica * ipu_per_replica int num_ipus = 1; // batches per step @@ -53,8 +63,8 @@ class IpuStrategy { // micro batch-size int micro_batch_size = 1; - // save paddle model per n steps - int save_per_n_step = 1; + // random seed + std::uint64_t random_seed = std::numeric_limits::max(); // TODO(alleng) remove this param // available memory proportion, 0.0f for disable @@ -67,6 +77,29 @@ class IpuStrategy { // defaultMaxWeightNorm for adam optimizer float max_weight_norm = 65504.0f; + // file path for dumping compiled model in onnx format + std::string onnx_dump_path; + + // Data type to use for tensor that stores first-order momentum optimizer + // state. FLOAT or FLOAT16 + std::string accl1_type = "FLOAT"; + + // Data type to use for tensor that stores second-order momentum optimizer + // state. FLOAT or FLOAT16 + std::string accl2_type = "FLOAT"; + + // Data type to use for tensor that stores third-order momentum optimizer + // state. 
FLOAT or FLOAT16 + std::string accl3_type = "FLOAT"; + + // WeightDecayMode for setting the optimizer + // if set, it will override other settings + // value must be one of "decay" or "l2_regularization" or not set + std::string weight_decay_mode = ""; + + // Runtime Options + RuntimeOptions runtime_options; + // popart session option popart::SessionOptions popart_options; @@ -86,6 +119,8 @@ class IpuStrategy { const std::string &value); void SetTensorLocation(const std::string &tensor, const std::string &option, std::uint64_t value); + void SetAccumulateOuterFragmentSettings(const std::uint64_t &schedule, + const std::vector &values); void AddCustomOp(const std::string &paddle_op, const std::string &popart_op, const std::string &domain, int version); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc index c980bb780cfc0b9088fca71131d1b014b35d483c..7d92835534513c7937d7cbe2f19e367550e751d7 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc @@ -34,15 +34,36 @@ Node *logical_not_handler(Graph *graph, Node *node) { {GetOutputVarNode("Out", node)}, {}); } +Node *logical_or_handler(Graph *graph, Node *node) { + return CreateBaseOp(graph, node, "popart_logical_or", + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, + {GetOutputVarNode("Out", node)}, {}); +} + +Node *logical_and_handler(Graph *graph, Node *node) { + return CreateBaseOp(graph, node, "popart_logical_and", + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, + {GetOutputVarNode("Out", node)}, {}); +} + Node *greater_than_handler(Graph *graph, Node *node) { return CreateBaseOp(graph, node, "popart_greater", {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, {GetOutputVarNode("Out", node)}, {}); } +Node *less_than_handler(Graph *graph, Node *node) { + return CreateBaseOp(graph, node, "popart_less", + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, + {GetOutputVarNode("Out", node)}, {}); +} + REGISTER_HANDLER(equal, equal_handler); REGISTER_HANDLER(logical_not, logical_not_handler); +REGISTER_HANDLER(logical_or, logical_or_handler); +REGISTER_HANDLER(logical_and, logical_and_handler); REGISTER_HANDLER(greater_than, greater_than_handler); +REGISTER_HANDLER(less_than, less_than_handler); } // namespace } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc index d4a14a6d8409f9b50247f747016f5284f11037da..ba6675f40f400531896461e5fa1ce2e9bece72af 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc @@ -98,6 +98,12 @@ Node *matmul_handler(Graph *graph, Node *node) { if (x_rank == 1) { perm = std::vector{0}; } else if (x_rank == 2) { + if (!transpose_x && !transpose_y && is_float_equal(alpha, 1.0f)) { + return CreateBaseOp( + graph, node, "popart_matmul", + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, + node->outputs); + } return CreateGemm(graph, node, {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, node->outputs, transpose_x, transpose_y, alpha); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc index 
3ec1999edc4f0549cc348d9da39e3e23b76a1a91..0339097d587900a1a18e122b4b91eff21144c502 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc @@ -32,30 +32,10 @@ const std::string GenerateOpName() { const std::string CreateOpIdentifyId(Node *node) { // format: - // if has custom op_namescope: - // {op_namescope}/op_type/_gen_* - // else: - // {op_type}/{out_var0}/{out_var1}/.../_gen_* + // op_type/_gen_* // this name will be used as op name when exporting onnx model from popart auto op_type = node->Name(); - std::string op_namescope; - if (node->Op()->HasAttr("op_namescope")) { - op_namescope = - BOOST_GET_CONST(std::string, node->Op()->GetAttr("op_namescope")); - } else { - op_namescope = "/"; - } - - if (op_namescope != "/") { - return {op_namescope + op_type + "/" + GenerateOpName()}; - } else { - std::string op_out = ""; - for (auto *out_node : node->outputs) { - op_out += "/"; - op_out += out_node->Name(); - } - return {op_type + op_out + "/" + GenerateOpName()}; - } + return {op_type + "/" + GenerateOpName()}; } Node *MakeVarNode(Graph *graph, Node *node) { @@ -122,6 +102,12 @@ Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type, if (node->Op()->HasAttr(sMatmulSerializeMode)) { CopyOpAttr(sMatmulSerializeMode, node->Op(), new_node->Op()); } + if (node->Op()->HasAttr(sAvailMemAttribute)) { + CopyOpAttr(sAvailMemAttribute, node->Op(), new_node->Op()); + } + if (node->Op()->HasAttr(sOpNamescope)) { + CopyOpAttr(sOpNamescope, node->Op(), new_node->Op()); + } { new_node->Op()->SetAttr(sOpIdentifyIdAttr, CreateOpIdentifyId(node)); new_node->Op()->Flush(); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc index 0919afef4d83aee2ab0a17a7a09c23d3f550f233..8bd07943688380aaa32e2e94f81e3bb516be78f4 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc @@ -54,10 +54,36 @@ Node *checkpointoutput_handler(Graph *graph, Node *node) { node->outputs); } +Node *custom_nll_loss_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto reduction = BOOST_GET_CONST(int, op->GetAttr("reduction")); + auto ignoreIndex = BOOST_GET_CONST(int, op->GetAttr("ignoreIndex")); + auto inputIsLogProbability = + BOOST_GET_CONST(bool, op->GetAttr("inputIsLogProbability")); + return CreateBaseOp(graph, node, "popart_nllloss_v2", node->inputs, + node->outputs, + {{"reduction", reduction}, + {"ignoreIndex", ignoreIndex}, + {"inputIsLogProbability", inputIsLogProbability}}); +} + +Node *identity_handler(Graph *graph, Node *node) { + return CreateBaseOp(graph, node, "popart_identity", node->inputs, + node->outputs); +} + +Node *detach_handler(Graph *graph, Node *node) { + return CreateBaseOp(graph, node, "popart_detach_v2", node->inputs, + node->outputs); +} + REGISTER_HANDLER(custom_op, custom_op_handler); REGISTER_HANDLER(print, print_handler); REGISTER_HANDLER(popart_optimizer, popart_optimizer_handler); REGISTER_HANDLER(checkpointoutput, checkpointoutput_handler); +REGISTER_HANDLER(custom_nll_loss, custom_nll_loss_handler); +REGISTER_HANDLER(identity, identity_handler); +REGISTER_HANDLER(detach, detach_handler); } // namespace } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc 
b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc index db429d2f6228455bd4ca1a47d117ddf2ad286e65..6ccb5441f8375b92b9566ed5d72f1005d5c1ab8c 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc @@ -49,6 +49,9 @@ Node *fill_constant_handler(Graph *graph, Node *node) { case framework::proto::VarType::INT64: value = std::vector(size, value_); break; + case framework::proto::VarType::BOOL: + value = std::vector(size, value_); + break; default: PADDLE_THROW( platform::errors::Unimplemented("fill_constant dtype: %d", dtype_)); @@ -417,6 +420,45 @@ Node *assign_handler(Graph *graph, Node *node) { {GetOutputVarNode("Out", node)}, {}); } +Node *assign_value_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto dtype_ = BOOST_GET_CONST(int, op->GetAttr("dtype")); + auto dtype = VarType2OnnxDtype(dtype_); + auto dims_ = BOOST_GET_CONST(std::vector, op->GetAttr("shape")); + std::vector dims(dims_.begin(), dims_.end()); + Attribute values; + std::string value_name; + switch (dtype_) { + case framework::proto::VarType::BOOL: { + value_name = "bool_values"; + auto vec_int = BOOST_GET_CONST(std::vector, op->GetAttr(value_name)); + std::vector vec_bool(vec_int.begin(), vec_int.end()); + values = vec_bool; + } break; + case framework::proto::VarType::INT32: + value_name = "int32_values"; + values = BOOST_GET_CONST(std::vector, op->GetAttr(value_name)); + break; + case framework::proto::VarType::FP32: + value_name = "fp32_values"; + values = BOOST_GET_CONST(std::vector, op->GetAttr(value_name)); + break; + case framework::proto::VarType::INT64: + value_name = "int64_values"; + values = BOOST_GET_CONST(std::vector, op->GetAttr(value_name)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported data type(code %d) for AssignValue operator, only " + "supports bool, int32, float32 and int64.", + dtype)); + } + return CreateConst(graph, node, node->inputs, node->outputs, + AttributeMap{ + {"value", values}, {"dims", dims}, {"dtype", dtype}, + }); +} + Node *fill_any_like_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto value = BOOST_GET_CONST(float, op->GetAttr("value")); @@ -482,6 +524,41 @@ Node *one_hot_handler(Graph *graph, Node *node) { } } +Node *one_hot_v2_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto depth = BOOST_GET_CONST(int, op->GetAttr("depth")); + auto allow_out_of_range = + BOOST_GET_CONST(bool, op->GetAttr("allow_out_of_range")); + if (allow_out_of_range) { + PADDLE_THROW(platform::errors::Unimplemented( + "Do not support allow_out_of_range=True")); + } else { + auto depth_tensor = + CreateConst(graph, node, {}, {}, {{"value", std::vector{depth}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::INT32}}); + Node *value_tensor = nullptr; + if (GetOutputVarNode("Out", node)->Var()->GetDataType() == + framework::proto::VarType::FP16) { + value_tensor = + CreateConst(graph, node, {}, {}, {{"value", std::vector{0, 1}}, + {"dims", std::vector{2}}, + {"dtype", ONNXDataType::FLOAT16}}); + } else { + value_tensor = + CreateConst(graph, node, {}, {}, {{"value", std::vector{0, 1}}, + {"dims", std::vector{2}}, + {"dtype", ONNXDataType::FLOAT}}); + } + + return CreateBaseOp(graph, node, "popart_onehot", + {GetInputVarNode("X", node), depth_tensor->outputs[0], + value_tensor->outputs[0]}, + {GetOutputVarNode("Out", node)}, + {{"axis", int64_t{-1}}}); + } +} + Node *split_handler(Graph *graph, Node 
*node) { auto *op = node->Op(); auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); @@ -510,10 +587,12 @@ REGISTER_HANDLER(shape, shape_handler); REGISTER_HANDLER(slice, slice_handler); REGISTER_HANDLER(expand, expand_handler); REGISTER_HANDLER(assign, assign_handler); +REGISTER_HANDLER(assign_value, assign_value_handler); REGISTER_HANDLER(fill_any_like, fill_any_like_handler); REGISTER_HANDLER(lookup_table_v2, lookup_table_v2_handler); REGISTER_HANDLER(split, split_handler); REGISTER_HANDLER(one_hot, one_hot_handler); +REGISTER_HANDLER(one_hot_v2, one_hot_v2_handler); } // namespace } // namespace ipu diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 14f516235a720c1fb8f46fe6606ac8f0bdb149f9..57d6c5e119ccfa51a40d9f34d47c070c347d8546 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -51,16 +51,20 @@ XPUOpMap& get_kl2_ops() { {"clip", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"concat_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"concat", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"conv2d_transpose_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"conv2d_transpose", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"depthwise_conv2d_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"depthwise_conv2d", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"dropout_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index e104170ca24954b356232af4e0c3e97cfb222858..2c5f24d28c6d6b87172fe9e4459908c57c3638db 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -916,6 +916,11 @@ class DeviceContextPool { size_t size() const { return device_contexts_.size(); } + const std::map>>& + device_contexts() const { + return device_contexts_; + } + private: static DeviceContextPool* pool; std::map>> diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index a23bb1230e128657e0bd416d7e1875997e6cf6e8..6817fa4bf04b9fa7e0b95ebcd5fef062fc72d7c6 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -62,8 +62,10 @@ int TensorDtype2NumpyDtype(phi::DataType dtype) { return pybind11::detail::npy_api::NPY_INT32_; case phi::DataType::INT64: return pybind11::detail::npy_api::NPY_INT64_; + case phi::DataType::BFLOAT16: + return pybind11::detail::NPY_UINT16_; case phi::DataType::FLOAT16: return pybind11::detail::NPY_FLOAT16_; case phi::DataType::FLOAT32: return pybind11::detail::npy_api::NPY_FLOAT_; case phi::DataType::FLOAT64: diff
--git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index ed42d0792eafbc8661883a7e8d5b396fac14686f..bbaa7e3dd6471587c82d271ef881276818dd1b79 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -4264,6 +4264,7 @@ All parameter, weight, gradient are variables in Paddle. platform::ipu::IpuBackend::GetInstance()); }, py::return_value_policy::reference) + .def("weights_to_host", &platform::ipu::IpuBackend::WeightsToHost) .def("detach", &platform::ipu::IpuBackend::Detach) .def("reset", &platform::ipu::IpuBackend::Reset) .def("set_scope", &platform::ipu::IpuBackend::SetScope) @@ -4311,6 +4312,15 @@ All parameter, weight, gradient are variables in Paddle. option_name, option.first.cast(), option.second.cast()); } + } else if (option_name == "accumulate_outer_fragment") { + for (auto option : element.second.cast()) { + std::vector values; + for (auto value : option.second.cast()) { + values.push_back(value.cast()); + } + self.SetAccumulateOuterFragmentSettings( + option.first.cast(), values); + } } else if (option_name == "custom_op") { std::string paddle_op; std::string popart_op; diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc index 0500a8123044cd05695c5167b1afaa48a6027b57..5ac51fb67155780fe5a3f168e6bfede7c7569175 100644 --- a/paddle/infrt/api/infrt_api.cc +++ b/paddle/infrt/api/infrt_api.cc @@ -129,7 +129,7 @@ class PredictExecutor : public MlirToRuntimeTranslator { auto arg = predict_func.getArgument(i); auto type = arg.getType(); // this param is TensorMap - if (type.isa()) { + if (type.isa()) { auto* value = new host_context::Value(std::move(*map)); arguments_.push_back(value); AddValue(predict_func.getArgument(i), value); diff --git a/paddle/infrt/dialect/dense_tensor.td b/paddle/infrt/dialect/dense_tensor.td index 59df4e9697370e9d8db4bbc0a5d69e8ef03950a5..822a4879e6f59c07c189ea8e935832d4c26cc6d1 100644 --- a/paddle/infrt/dialect/dense_tensor.td +++ b/paddle/infrt/dialect/dense_tensor.td @@ -106,7 +106,7 @@ def LoadParamsOp : DT_Op<"load_params", [NoSideEffect]> { // input path of model params. let arguments = (ins StrAttr:$path); - let results = (outs DenseTensorMap:$out); + let results = (outs DenseHostTensorMap:$out); let assemblyFormat = "`(``)`attr-dict"; } @@ -121,7 +121,7 @@ def TensorMapGetTensorOp : DT_Op<"tensor_map_get_tensor", [NoSideEffect]> { // input path of model params. let arguments = (ins - DenseTensorMap:$map, + DenseHostTensorMap:$map, StrAttr:$name ); let results = (outs DenseTensor:$output); @@ -136,7 +136,7 @@ def TensorMapGetSizeOp : DT_Op<"tensor_map_get_size", [NoSideEffect]> { An operation that get the size of a TensorMap. 
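
For illustration only, a minimal sketch of the call that the accumulate_outer_fragment branch above boils down to. The IpuStrategyStub type and the int element type of the value vector are assumptions made for this sketch; the real class is the IpuStrategy declared earlier in this diff.

    #include <cstdint>
    #include <vector>

    // Stand-in for the real IpuStrategy, declared only to make the call shape
    // visible; it is not the Paddle class.
    struct IpuStrategyStub {
      void SetAccumulateOuterFragmentSettings(const std::uint64_t& schedule,
                                              const std::vector<int>& values) {
        (void)schedule;
        (void)values;  // no-op in this sketch
      }
    };

    void AccumulateOuterFragmentDemo() {
      IpuStrategyStub strategy;
      // The Python dict key becomes the schedule, the dict's list becomes values.
      strategy.SetAccumulateOuterFragmentSettings(3, std::vector<int>{0, 1});
    }
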
}]; - let arguments = (ins DenseTensorMap:$map); + let arguments = (ins DenseHostTensorMap:$map); let results = (outs I32:$size); let assemblyFormat = "`(` $map `)` attr-dict `->` type($size)"; } diff --git a/paddle/infrt/dialect/infrt/ir/infrt_base.td b/paddle/infrt/dialect/infrt/ir/infrt_base.td index 86cfc375330b19878528645a2e810efb797e153f..9b1d2132292df708b7c170442be702417593cfb4 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_base.td +++ b/paddle/infrt/dialect/infrt/ir/infrt_base.td @@ -83,7 +83,7 @@ def DenseTensor : Infrt_Type<"DenseTensor"> { ); } -def DenseTensorMap : Infrt_Type<"DenseTensorMap"> { +def DenseHostTensorMap : Infrt_Type<"DenseHostTensorMap"> { let summary = "infrt dense tensor map"; let description = [{dense_tensor map}]; let parameters = (ins); diff --git a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc index f8d8f514749f802299600acac60b12de70a8d3fe..eb69a95c583f2a7d987e1e5f4617fcd34e0dad28 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc @@ -91,7 +91,7 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { parser.getContext(), shape, elementType, lod_level); } if (keyword == "dense_tensor_map") { - return DenseTensorMapType::get(parser.getContext()); + return DenseHostTensorMapType::get(parser.getContext()); } if (keyword == "dense_tensor") { // parse DenseTensor, for example: !i=Infrt.tensor @@ -162,7 +162,7 @@ void InfrtDialect::printType(::mlir::Type type, << lod_tensor_type.getLod_level() << ">"; return; } - if (type.isa()) { + if (type.isa()) { os << "dense_tensor_map"; return; } @@ -180,12 +180,6 @@ void InfrtDialect::printType(::mlir::Type type, os << "tensor_list"; return; } - // print DenseTensorType, for example: !infrt.dense_tensor - if (type.isa()) { - os << "dense_tensor_map"; - return; - } - llvm_unreachable("unknown infrt type."); } diff --git a/paddle/infrt/dialect/phi/data_type.h b/paddle/infrt/dialect/phi/data_type.h index bd258cb1038792e52667b0ef39c65b16c6210eb3..8e831c8c27d5016745744006005ff3690e5bb324 100644 --- a/paddle/infrt/dialect/phi/data_type.h +++ b/paddle/infrt/dialect/phi/data_type.h @@ -23,16 +23,16 @@ namespace infrt { -phi::Backend ConvertTargetToPhi(TargetType target); -TargetType ConvertTargetFromPhi(phi::Backend backend); +::phi::Backend ConvertTargetToPhi(TargetType target); +TargetType ConvertTargetFromPhi(::phi::Backend backend); -phi::DataType ConvertPrecisionToPhi(PrecisionType precision); -PrecisionType ConvertPrecisionFromPhi(phi::DataType datatype); +::phi::DataType ConvertPrecisionToPhi(PrecisionType precision); +PrecisionType ConvertPrecisionFromPhi(::phi::DataType datatype); -phi::DataLayout ConvertLayoutToPhi(LayoutType layout); -LayoutType ConvertLayoutFromPhi(phi::DataLayout layout); +::phi::DataLayout ConvertLayoutToPhi(LayoutType layout); +LayoutType ConvertLayoutFromPhi(::phi::DataLayout layout); -phi::KernelKey ConvertPlaceToPhi(const Place& place); -Place ConvertPlaceFromPhi(phi::TensorArgDef tensor_arg); +::phi::KernelKey ConvertPlaceToPhi(const Place& place); +Place ConvertPlaceFromPhi(::phi::TensorArgDef tensor_arg); } // namespace infrt diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_base.td b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td index 5d7338ec4292ed49112c3cce45a30816e686886d..8e21283183d036ac26c117a0a209ba92d1f9febc 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_base.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td @@ -37,4 +37,8 @@ def Allocator 
: PHI_Type<"Allocator"> { let assemblyFormat = "`<` $target `>`"; } +def PD_DenseTensorMap : PHI_Type<"DenseTensorMap"> { + let mnemonic = "dense_tensor_map"; +} + #endif diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index 1fda2d9d8886008c6415b5a1cf36d53c1500707a..3af7033d2f4c7f434e00d25619df8c5ecf85c759 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -51,12 +51,46 @@ class CreateContextOp let results = (outs Context:$output); } +def PDT_LoadParamsOp : PDT_Op<"load_params", [NoSideEffect]> { + // input path of model params. + let arguments = (ins StrAttr:$path); + let results = (outs PD_DenseTensorMap:$out); + + let assemblyFormat = "`(``)`attr-dict"; +} + +def PDT_LoadCombinedParamsOp : PDT_Op<"load_combined_params", [NoSideEffect]> { + // input path of model params. + let arguments = (ins StrAttr:$model_path, StrAttr:$params_path); + let results = (outs PD_DenseTensorMap:$out); + + let assemblyFormat = "`(``)`attr-dict"; +} + +def PDT_TensorMapGetSizeOp : PDT_Op<"tensor_map_get_size", [NoSideEffect]> { + let arguments = (ins PD_DenseTensorMap:$map); + let results = (outs I32:$size); + let assemblyFormat = "`(` $map `)` attr-dict `->` type($size)"; +} + +class TensorMapGetTensorOp: + PDT_Op<"tensor_map_get_tensor"> { + let arguments = (ins + PD_DenseTensorMap:$map, + StrAttr:$name + ); + let results = (outs DenseTensor:$output); + let assemblyFormat = "`(` operands `)` attr-dict `->` type($output)"; + let verifier = ?; +} + def PDT_CreateCPUDenseTensorOp : CreateDenseTensorOp<"cpu">; def PDT_CreateGPUDenseTensorOp : CreateDenseTensorOp<"gpu">; def PDT_FillDenseTensorOp_f32 : FillDenseTensorOp; def PDT_CreateCPUContextOp : CreateContextOp<"cpu">; def PDT_CreateGPUContextOp : CreateContextOp<"gpu">; def PDT_PrintDenseTensor : PrintDenseTensorOp; +def PDT_TensorMapGetTensorOp: TensorMapGetTensorOp; def FakeKernelOp : PDT_Op<"fake_phi_kernel"> { let arguments = (ins Context:$dev_ctx, DenseTensor:$x, DenseTensor:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y); diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index bcd44540b336eee6d9a76fc14057e8454b9ae329..7e90f225cffa753fdc8f1ee39cd5fd69d676d8c9 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -351,18 +351,26 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( auto attrs = op->getAttrs(); // MLIR's underlying attr storage type is `Builtin_Dictionary`, and its - // elements - // are sorted by name. The following code adapts the order of function - // signatures - // of the phi operator library. + // elements are sorted by name. The following code adapts the order of + // function signatures of the phi operator library. 
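
A minimal sketch (not the translator's actual code) of the reordering idea described in the comment above: attributes arrive from MLIR sorted by name, and each one is moved to the position it holds in the attribute order the kernel was registered with.

    #include <string>
    #include <vector>

    // Position of `attr_name` in the kernel's registered attribute order,
    // or -1 if the kernel never declared such an attribute.
    static int AttrOffsetInRegisteredOrder(
        const std::string& attr_name,
        const std::vector<std::string>& registered) {
      for (size_t i = 0; i < registered.size(); ++i) {
        if (registered[i] == attr_name) return static_cast<int>(i);
      }
      return -1;
    }

For a kernel registered with the order {"dims", "lod", "layout", "precision"}, the attribute "layout" arrives second when the names are sorted alphabetically, but it is still placed at index 2 so that it lines up with the kernel's signature.
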
llvm::SmallVector tmp; tmp.resize(attrs.size()); const std::string& kernel_name = op->getName().getStringRef().str(); const auto& attr_names = kernel_registry.GetAttrNameList(kernel_name); - if (attrs.size() && attr_names.empty()) { - LOG(WARNING) << "The kernel `" << kernel_name - << "` has no specified attr order."; + if (attrs.size()) { + if (attr_names.empty()) { + LOG(WARNING) << "The kernel `" << kernel_name + << "` has not been registered with " + "`KernelRegistry::AddKernelWithAttrs()`."; + } else { + CHECK_EQ(attr_names.size(), attrs.size()) + << "The number of kernel `" << kernel_name + << "` attributes specified by mlir (" << attrs.size() + << ") is inconsistent with the registration (" << attr_names.size() + << ")."; + } } + auto get_offset = [](const char* attr, const std::vector& names, const std::string& kernel_name) -> int { @@ -385,7 +393,7 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( } else { offset = i; } - CHECK_NE(offset, -1); + CHECK_GT(offset, -1); if (auto v = EmitAttribute(attr.getValue())) { tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute(attr.getValue())) { diff --git a/paddle/infrt/host_context/paddle_mlir.cc b/paddle/infrt/host_context/paddle_mlir.cc index 29328520212fd4d020afc28c1e48d2db604414bc..e161dc47075bb3e87399477b3112a4c4c57cec1c 100644 --- a/paddle/infrt/host_context/paddle_mlir.cc +++ b/paddle/infrt/host_context/paddle_mlir.cc @@ -79,7 +79,7 @@ mlir::FuncOp MLIRModelGenImpl::UpdateModelModule( llvm::SmallVector MLIRModelGenImpl::GetModelInputsType( const infrt::paddle::framework_proto::ProgramDesc &program) { llvm::SmallVector operandTypes; - operandTypes.push_back(infrt::DenseTensorMapType::get(context_)); + operandTypes.push_back(infrt::DenseHostTensorMapType::get(context_)); for (auto &op_desc : main_block_.ops()) { if (op_desc.type() != "feed") continue; for (int var_idx = 0; var_idx < op_desc.outputs_size(); ++var_idx) { diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index 1f0b1dabd94d8dcf28e8e0543a8e3b12ed250704..5b92d183b79da21cf9552e8a2f238928962f5832 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -34,6 +34,7 @@ #ifdef INFRT_WITH_PHI #include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/infrt/backends/host/phi_context.h" +#include "paddle/infrt/tensor/phi/tensor_map.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" @@ -84,22 +85,23 @@ using ValueVariantType = #ifdef INFRT_WITH_GPU backends::GpuPhiContext, ::phi::GPUContext, -#endif +#endif // INFRT_WITH_GPU ::phi::CPUContext, - std::vector, - std::vector, - paddle::experimental::ScalarBase, - paddle::experimental::ScalarArrayBase, - std::vector, - phi::MetaConfig, + std::vector, + std::vector<::phi::DenseTensor*>, + paddle::experimental::ScalarBase<::phi::DenseTensor>, + paddle::experimental::ScalarArrayBase<::phi::DenseTensor>, + std::vector<::phi::MetaTensor*>, + ::phi::MetaConfig, paddle::experimental::Backend, paddle::experimental::DataLayout, paddle::experimental::DataType, + ::infrt::phi::DenseTensorMap, +#endif // INFRT_WITH_PHI #ifdef INFRT_WITH_TRT ::infrt::backends::tensorrt::TrtEngine, ::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol, #endif // INFRT_WITH_TRT -#endif std::vector, std::vector, std::vector, @@ -136,6 +138,7 @@ class Value : public common::Object { explicit Value(tensor::DenseHostTensor&& x) : data(std::move(x)) {} explicit Value(MlirFunctionExecutable* x) : data(x) {} 
#ifdef INFRT_WITH_PHI + explicit Value(::infrt::phi::DenseTensorMap&& x) : data(std::move(x)) {} explicit Value(::phi::CPUContext&& x) : data(std::move(x)) {} explicit Value(backends::CpuPhiContext&& x) : data(std::move(x)) {} #ifdef INFRT_WITH_GPU diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc index 6d16b814c6b02b08e279190d5a685d65c124942d..c8b1bd8c9ebd26bb6f0b4dab4f84c578ab4e5320 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -13,8 +13,11 @@ // limitations under the License. #include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" +#include "paddle/infrt/common/string.h" #include "paddle/infrt/dialect/phi/data_type.h" #include "paddle/infrt/kernel/phi/context_kernels.h" +#include "paddle/infrt/paddle/model_parser.h" +#include "paddle/infrt/paddle/scope.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/place.h" @@ -22,6 +25,18 @@ #include #endif +namespace paddle { +namespace platform { +using DeviceContext = ::phi::DeviceContext; +} // namespace platform +namespace framework { +using LoDTensor = ::phi::DenseTensor; +void DeserializeFromStream(std::istream& is, + LoDTensor* tensor, + const platform::DeviceContext& dev_ctx); +} +} // namespace paddle + namespace infrt { namespace kernel { namespace phi { @@ -130,6 +145,89 @@ void PrintDenseTensor(::phi::DenseTensor* dense_tensor) { std::cout << "]\n"; #undef PRINT_META_DATA } + +::infrt::phi::DenseTensorMap LoadParams( + host_context::Attribute path) { + const auto& file_path = path.get(); + std::cout << "loading params from: " << file_path << std::endl; + ::infrt::phi::DenseTensorMap map; + + const std::string model_path = file_path + "/__model__"; + auto pb_proto_prog = paddle::LoadProgram(model_path); + auto main_block = pb_proto_prog->blocks(0); + + for (auto& var : main_block.vars()) { + if (var.name() == "feed" || var.name() == "fetch" || !var.persistable()) + continue; + std::string param_path = file_path + "/" + var.name(); + std::ifstream param_file(param_path, std::ios::binary); + switch (var.type().type()) { + case ::paddle::framework::proto::VarType_Type_LOD_TENSOR: { + std::unique_ptr<::phi::DenseTensor> tensor{ + std::make_unique<::phi::DenseTensor>()}; + ::phi::CPUContext ctx; + ::paddle::framework::DeserializeFromStream( + param_file, tensor.get(), ctx); + map.SetDenseTensor(var.name(), std::move(tensor)); + } break; + default: { + LOG(WARNING) << "Var `" << var.name() << "` type `" + << static_cast(var.type().type()) + << "` has not been supported now."; + } + } + } + return map; +} + +::infrt::phi::DenseTensorMap LoadCombinedParams( + host_context::Attribute model_path, + host_context::Attribute params_path) { + const auto& model = model_path.get(); + std::cout << "loading params from: " << model << std::endl; + ::infrt::phi::DenseTensorMap map; + + auto pb_proto_prog = paddle::LoadProgram(model); + auto main_block = pb_proto_prog->blocks(0); + + std::ifstream param_file(params_path.get(), std::ios::binary); + + std::set tmp; + for (auto& var : main_block.vars()) { + if (var.name() == "feed" || var.name() == "fetch" || !var.persistable()) { + continue; + } + if (var.type().type() == + ::paddle::framework::proto::VarType_Type_LOD_TENSOR) { + tmp.emplace(var.name()); + } else { + llvm_unreachable("the tensor type is illegal."); + } + } + + for (auto& var : tmp) { + std::unique_ptr<::phi::DenseTensor> tensor{ + std::make_unique<::phi::DenseTensor>()}; + 
::phi::CPUContext ctx; + ::paddle::framework::DeserializeFromStream(param_file, tensor.get(), ctx); + map.SetDenseTensor(var, std::move(tensor)); + } + + return map; +} + +::phi::DenseTensor TensorMapGetTensor( + const ::infrt::phi::DenseTensorMap& map, + host_context::Attribute name) { + auto* tensor = map.GetDenseTensor(name.get()); + CHECK(tensor); + return *tensor; +} + +int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map) { + return map.size(); +} + } // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h index 47d89506e2aa615b0bc425a4c373c904d937e03f..6cfcc6f91be05938952c41812c1ee3fff4456075 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -17,6 +17,7 @@ #include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/tensor/phi/tensor_map.h" #include "paddle/phi/core/dense_tensor.h" namespace infrt { @@ -41,6 +42,19 @@ void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, host_context::Attribute> values); void PrintDenseTensor(::phi::DenseTensor* dense_tensor); +infrt::phi::DenseTensorMap LoadParams( + host_context::Attribute path); + +::phi::DenseTensor TensorMapGetTensor( + const ::infrt::phi::DenseTensorMap& map, + host_context::Attribute name); + +::infrt::phi::DenseTensorMap LoadCombinedParams( + host_context::Attribute model_path, + host_context::Attribute params_path); + +int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map); + } // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc index 08c2e19deddfe480faec6d5468b3f222abee7e03..5a314817c242053697ac0d8cda1ad2849b58b4ee 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc @@ -37,15 +37,16 @@ TEST(utils, registry) { CHECK_EQ(count, 2U); } -class FancyAllocator : public phi::Allocator { +class FancyAllocator : public ::phi::Allocator { public: - static void Delete(phi::Allocation* allocation) { + static void Delete(::phi::Allocation* allocation) { ::operator delete(allocation->ptr()); } AllocationPtr Allocate(size_t bytes_size) override { void* data = ::operator new(bytes_size); - auto* allocation = new phi::Allocation(data, bytes_size, phi::CPUPlace()); + auto* allocation = + new ::phi::Allocation(data, bytes_size, ::phi::CPUPlace()); return AllocationPtr(allocation, Delete); } }; @@ -56,20 +57,20 @@ TEST(ElementwiseAdd, launcher_registry) { ASSERT_GE(registry.size(), 1UL); auto creator = registry.GetKernel("phi_cpu.add.float32.any"); - const phi::DDim dims({1, 2}); - const phi::DataType dtype{phi::DataType::FLOAT32}; - const phi::DataLayout layout{phi::DataLayout::NHWC}; - const phi::LoD lod{}; - phi::DenseTensorMeta meta(dtype, dims, layout, lod); + const ::phi::DDim dims({1, 2}); + const ::phi::DataType dtype{::phi::DataType::FLOAT32}; + const ::phi::DataLayout layout{::phi::DataLayout::NHWC}; + const ::phi::LoD lod{}; + ::phi::DenseTensorMeta meta(dtype, dims, layout, lod); - auto fancy_allocator = std::unique_ptr(new FancyAllocator); + auto fancy_allocator = std::unique_ptr<::phi::Allocator>(new FancyAllocator); auto* alloc = fancy_allocator.get(); - 
phi::DenseTensor a(alloc, meta); - phi::DenseTensor b(alloc, meta); - phi::DenseTensor c(alloc, meta); + ::phi::DenseTensor a(alloc, meta); + ::phi::DenseTensor b(alloc, meta); + ::phi::DenseTensor c(alloc, meta); - auto place = phi::CPUPlace(); + auto place = ::phi::CPUPlace(); float* a_data = a.mutable_data(place); float* b_data = b.mutable_data(place); float* c_data = c.mutable_data(place); @@ -78,7 +79,7 @@ TEST(ElementwiseAdd, launcher_registry) { b_data[i] = 2.f; } - phi::CPUContext context; + ::phi::CPUContext context; context.SetAllocator(alloc); context.Init(); diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc index 36d40118f16a0bd1779765064caaac6dbe414772..08683d7cb66ad434d4ed52c057eb0c9f4faef6f6 100644 --- a/paddle/infrt/kernel/phi/registry.cc +++ b/paddle/infrt/kernel/phi/registry.cc @@ -53,6 +53,19 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) { INFRT_KERNEL(infrt::kernel::phi::CreateGPUDenseTensor), {"dims", "lod", "layout", "precision"}); #endif + registry->AddKernelWithAttrs("phi_dt.load_params", + INFRT_KERNEL(infrt::kernel::phi::LoadParams), + {"path"}); + registry->AddKernelWithAttrs( + "phi_dt.load_combined_params", + INFRT_KERNEL(infrt::kernel::phi::LoadCombinedParams), + {"model_path", "params_path"}); + registry->AddKernelWithAttrs( + "phi_dt.tensor_map_get_tensor", + INFRT_KERNEL(infrt::kernel::phi::TensorMapGetTensor), + {"name"}); + registry->AddKernel("phi_dt.tensor_map_get_size", + INFRT_KERNEL(infrt::kernel::phi::TensorMapGetSize)); } } // namespace kernel diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index a9077220cfc709116479a5d91b39d56ad4007af8..407ae16c19c499a5feec269f39f5f907aedc84d4 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -68,14 +68,14 @@ int32_t TensorMapGetSize(TensorMap map) { return map.size(); } // TODO(wilber): Maybe we should place TensorList type in dt dialect. #ifdef INFRT_WITH_PHI -phi::DenseTensor TensorListGetTensor(std::vector list, - Attribute idx) { +::phi::DenseTensor TensorListGetTensor(std::vector<::phi::DenseTensor *> list, + Attribute idx) { CHECK_LT(idx.get(), static_cast(list.size())) << "idx should less than list size"; return *list[idx.get()]; } -int32_t TensorListGetSize(const std::vector &list) { +int32_t TensorListGetSize(const std::vector<::phi::DenseTensor *> &list) { return list.size(); } #endif diff --git a/paddle/infrt/tensor/CMakeLists.txt b/paddle/infrt/tensor/CMakeLists.txt index 95b2e8f6839263cb5228074dd7bc90dc933bc772..95d4090a9a3f7dce6f3f395a4bf1d20362441dd5 100644 --- a/paddle/infrt/tensor/CMakeLists.txt +++ b/paddle/infrt/tensor/CMakeLists.txt @@ -1,5 +1,7 @@ core_gather_headers() +add_subdirectory(phi) + gather_srcs(infrt_src SRCS tensor_map.cc tensor_metadata.cc diff --git a/paddle/infrt/tensor/phi/CMakeLists.txt b/paddle/infrt/tensor/phi/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..97e26661266e930c3429493b3091b5c61fbf6b7e --- /dev/null +++ b/paddle/infrt/tensor/phi/CMakeLists.txt @@ -0,0 +1,3 @@ +gather_srcs(infrt_src SRCS + tensor_map.cc +) diff --git a/paddle/infrt/tensor/phi/tensor_map.cc b/paddle/infrt/tensor/phi/tensor_map.cc new file mode 100644 index 0000000000000000000000000000000000000000..7690322aed4a3dab781aa0baae67b3a5783cbf46 --- /dev/null +++ b/paddle/infrt/tensor/phi/tensor_map.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/tensor/phi/tensor_map.h" +#include "llvm/Support/ErrorHandling.h" + +namespace infrt { +namespace phi { + +void DenseTensorMap::SetDenseTensor( + const std::string& name, std::unique_ptr<::phi::DenseTensor>&& tensor) { + std::lock_guard lock(mu_); + auto it = map_.emplace(std::make_pair(name, std::move(tensor))); + if (!it.second) { + llvm_unreachable("dense tensor map insert failed."); + } +} + +::phi::DenseTensor* DenseTensorMap::GetDenseTensor( + const std::string& name) const { + std::lock_guard lock(mu_); + auto it = map_.find(name); + if (it != map_.end()) { + return it->second.get(); + } + LOG(WARNING) << "can not find `" << name << "` in the tensor map."; + return nullptr; +} + +size_t DenseTensorMap::size() const { + std::lock_guard lock(mu_); + return map_.size(); +} + +} // namespace phi +} // namespace infrt diff --git a/paddle/infrt/tensor/phi/tensor_map.h b/paddle/infrt/tensor/phi/tensor_map.h new file mode 100644 index 0000000000000000000000000000000000000000..1b9fbdd9defc735131c0ead15b38c98dc619650a --- /dev/null +++ b/paddle/infrt/tensor/phi/tensor_map.h @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
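
A small, hypothetical usage sketch of the DenseTensorMap implemented above; only the ownership and lookup behaviour are shown, the tensor contents do not matter here.

    #include <memory>

    #include "paddle/infrt/tensor/phi/tensor_map.h"
    #include "paddle/phi/core/dense_tensor.h"

    void DenseTensorMapDemo() {
      infrt::phi::DenseTensorMap map;
      // Ownership moves into the map; inserting the same name twice is fatal
      // (llvm_unreachable) in SetDenseTensor above.
      map.SetDenseTensor("fc_bias", std::make_unique<::phi::DenseTensor>());
      ::phi::DenseTensor* found = map.GetDenseTensor("fc_bias");    // non-null
      ::phi::DenseTensor* missing = map.GetDenseTensor("unknown");  // nullptr, logs a warning
      (void)found;
      (void)missing;
      // map.size() == 1 at this point.
    }
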
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace infrt { +namespace phi { + +class DenseTensorMap { + public: + DenseTensorMap() = default; + DenseTensorMap(DenseTensorMap&& other) : map_(std::move(other.map_)) {} + void SetDenseTensor(const std::string& name, + std::unique_ptr<::phi::DenseTensor>&& tensor); + ::phi::DenseTensor* GetDenseTensor(const std::string& name) const; + size_t size() const; + + private: + mutable std::mutex mu_; + std::unordered_map> map_; +}; + +} // namespace phi +} // namespace infrt diff --git a/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in b/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in index 7aeb3f8a4d0513deaed6bda73a591790b633d0db..9e3773edd77b03463d6a736754b5579808c14a90 100644 --- a/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in +++ b/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in @@ -12,3 +12,30 @@ func @load_tensor_map() { infrt.return } + +func @load_phi_tensor_map() { + %map = phi_dt.load_params(){path="@CMAKE_BINARY_DIR@/multi_fc_model"} + %size = phi_dt.tensor_map_get_size(%map) -> i32 + infrt.print.i32 %size + + %a = phi_dt.tensor_map_get_tensor(%map) {name="fc_bias"} -> !infrt.dense_tensor + + // CHECK: dense_tensor: shape=shape[2], value=[0,0] + phi_dt.print_tensor (%a : !infrt.dense_tensor) + + infrt.return +} + +func @load_combined_phi_tensor_map() { + %map = phi_dt.load_combined_params(){model_path="@CMAKE_BINARY_DIR@/multi_fc_model/fc.pdmodel", + params_path="@CMAKE_BINARY_DIR@/multi_fc_model/fc.pdiparams"} + %size = phi_dt.tensor_map_get_size(%map) -> i32 + infrt.print.i32 %size + + %a = phi_dt.tensor_map_get_tensor(%map) {name="fc_bias"} -> !infrt.dense_tensor + + // CHECK: dense_tensor: shape=shape[2], value=[0,0] + phi_dt.print_tensor (%a : !infrt.dense_tensor) + + infrt.return +} diff --git a/paddle/phi/api/include/context_pool.h b/paddle/phi/api/include/context_pool.h new file mode 100644 index 0000000000000000000000000000000000000000..754833a2ddab3601f61069a916aea05181425c8f --- /dev/null +++ b/paddle/phi/api/include/context_pool.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/macros.h" +#include "paddle/utils/flat_hash_map.h" + +namespace phi { +class DeviceContext; +class CPUContext; +class GPUContext; +} // namespace phi + +namespace paddle { +namespace experimental { + +template +struct DefaultDeviceContextType; + +template <> +struct DefaultDeviceContextType { + using TYPE = phi::CPUContext; +}; + +template <> +struct DefaultDeviceContextType { + using TYPE = phi::GPUContext; +}; + +/** + * The DeviceContextPool here is just a mirror of the DeviceContextPool in + * fluid, and does not manage the life cycle of the DeviceContext. + * It is mainly used for external custom operator calls and high-performance + * C++ APIs. 
+ * + * Since DeviceContextPool in fluid is a global singleton, it always exists + * in program running, so DeviceContextPool here can always access the correct + * DeviceContext pointer. + * + * In order not to depend on the fluid's DeviceContextPool, + * the DeviceContextPool here needs to be initialized in the fluid, and cannot + * be initialized by itself. + */ +class DeviceContextPool { + public: + static DeviceContextPool& Instance(); + + const phi::DeviceContext* Get(const Place& place) const; + + phi::DeviceContext* GetMutable(const Place& place); + + template + const typename DefaultDeviceContextType::TYPE* Get( + const Place& place) const { + return reinterpret_cast::TYPE*>( + Get(place)); + } + + private: + DeviceContextPool(); + paddle::flat_hash_map + context_map_; + + DISABLE_COPY_AND_ASSIGN(DeviceContextPool); +}; + +} // namespace experimental +} // namespace paddle diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 4cbca07236208281f38984022d17b6cb88af8ed8..50c267f653564ebee770c058fdf5fb3af14e9c23 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -135,8 +135,9 @@ add_custom_command( cc_library(op_meta_info SRCS op_meta_info.cc DEPS phi_tensor_raw) cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS phi) +cc_library(context_pool SRCS context_pool.cc DEPS phi_context phi_enforce place) -cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory) +cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory context_pool) cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor) cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform) cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform) diff --git a/paddle/phi/api/lib/context_pool.cc b/paddle/phi/api/lib/context_pool.cc new file mode 100644 index 0000000000000000000000000000000000000000..d1408a88d6ff784039f9e45393d9aec9ff37df2a --- /dev/null +++ b/paddle/phi/api/lib/context_pool.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/api/include/context_pool.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/enforce.h" + +namespace paddle { +namespace experimental { + +DeviceContextPool& DeviceContextPool::Instance() { + static DeviceContextPool g_device_context_pool; + return g_device_context_pool; +} + +const phi::DeviceContext* DeviceContextPool::Get(const Place& place) const { + auto it = context_map_.find(place); + PADDLE_ENFORCE_NE( + it, + context_map_.end(), + phi::errors::NotFound("The DeviceContext of %s does not exists.", place)); + return it->second; +} + +phi::DeviceContext* DeviceContextPool::GetMutable(const Place& place) { + return const_cast(Get(place)); +} + +DeviceContextPool::DeviceContextPool() { + // We need to make sure that the correct value exists + // whenever we get the DeviceContext from DeviceContextPool + const auto& device_contexts = + paddle::platform::DeviceContextPool::Instance().device_contexts(); + for (const auto& pair : device_contexts) { + // only get CPU and GPU DeviceContext now, add other DeviceContext type + // later if needed + if (platform::is_cpu_place(pair.first) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + || + platform::is_gpu_place(pair.first)) { +#else + ) { +#endif + const phi::DeviceContext* dev_ctx = pair.second.get().get(); + VLOG(3) << "Init phi DeviceContextPool: insert {" << pair.first << ", " + << dev_ctx << "}"; + context_map_[pair.first] = dev_ctx; + } + } +} + +} // namespace experimental +} // namespace paddle diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index e280ab626da74a9b0951925f7472fa49996691cb..8bf5f3b481a0e041b439ffd99a8ac017f4aae50e 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -180,6 +180,7 @@ std::shared_ptr PrepareData( const phi::TensorArgDef& target_args_def, const TransformFlag& transform_flag) { const auto& tensor_in = input.impl(); + VLOG(6) << tensor_in->dtype() << "\t" << target_args_def.dtype; if (!transform_flag.NeedTransform() || !tensor_in->initialized() || (!NeedTransformPlace( tensor_in->place(), target_args_def.backend, transform_flag) && diff --git a/paddle/phi/api/lib/kernel_dispatch.cc b/paddle/phi/api/lib/kernel_dispatch.cc index 0e3ca1af4967c2bf2ae302ea656a31198d187f01..5e334b9b727dc19f6bba7dc8c9c15b2414d13abb 100644 --- a/paddle/phi/api/lib/kernel_dispatch.cc +++ b/paddle/phi/api/lib/kernel_dispatch.cc @@ -14,6 +14,7 @@ limitations under the License. 
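
A hypothetical caller of the mirror pool implemented above; it can only look up contexts that the fluid-side pool has already created, as the constructor comment explains.

    #include "paddle/phi/api/include/context_pool.h"
    #include "paddle/phi/common/place.h"

    void ContextPoolDemo() {
      auto& pool = paddle::experimental::DeviceContextPool::Instance();
      // Read-only lookup; raises a NotFound error if the fluid pool never
      // created a context for this place.
      const phi::DeviceContext* ctx = pool.Get(phi::CPUPlace());
      // Mutable variant, used by kernel dispatch (see kernel_dispatch.cc below).
      phi::DeviceContext* mutable_ctx = pool.GetMutable(phi::CPUPlace());
      (void)ctx;
      (void)mutable_ctx;
    }
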
*/ #include "paddle/phi/api/lib/kernel_dispatch.h" +#include "paddle/phi/api/include/context_pool.h" #include "paddle/phi/core/compat/convert_utils.h" namespace paddle { @@ -52,8 +53,8 @@ std::size_t CountLeadingZeros(uint64_t val) { } // namespace detail phi::DeviceContext* GetDeviceContextByBackend(phi::Backend backend) { - auto& pool = paddle::platform::DeviceContextPool::Instance(); - return pool.Get(phi::TransToPhiPlace(backend)); + auto& pool = paddle::experimental::DeviceContextPool::Instance(); + return pool.GetMutable(phi::TransToPhiPlace(backend)); } DataType ParseDataType(DataType dtype) { return dtype; } diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index 644bf3679af2a3ebf05f739a6e8d42011c7e664c..2b5254d3d5f142b5269a667a7a9bf6780c9bb932 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -92,4 +92,20 @@ std::string GetGlobalDeviceType(size_t device_type_id) { return global_registered_device_type[device_type_id]; } +constexpr static int kAllocationTypeBitLength = 8; +constexpr static int kDeviceTypeIDBitLength = 8; +constexpr static int kDeviceIDBitLength = 8; + +uint32_t Place::Hash::operator()(const Place &place) const { + uint32_t hash_value = 0; + // |----31-24------|-----23-16------|-----15-08----|---7-0----| + // | For extension | AllocationType | DeviceTypeID | DeviceID | + hash_value |= (static_cast(place.alloc_type_) + << (kDeviceIDBitLength + kDeviceTypeIDBitLength)); + hash_value |= + (static_cast(place.device_type_id_) << kDeviceIDBitLength); + hash_value |= static_cast(place.device); + return hash_value; +} + } // namespace phi diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h index 36fb910cad6c705952a0e3858eb09810d1ea6f5f..53ddd499a7e24e09ded721619407b8679cab7628 100644 --- a/paddle/phi/common/place.h +++ b/paddle/phi/common/place.h @@ -73,31 +73,23 @@ class Place { std::string DebugString() const; + struct Hash { + // Note: Now the number of bits we need does not exceed 32 bits, so there is + // no need to use 64 bits. If needed in the future, it can be expanded, + // but now we don’t over-design. 
+ uint32_t operator()(const Place& place) const; + }; + + uint32_t HashValue() const { return Hash()(*this); } + inline bool operator==(const Place& rhs) const { - if (alloc_type_ != rhs.GetType()) { - return false; - } - if (alloc_type_ == AllocationType::CPU || - alloc_type_ == AllocationType::GPUPINNED || - alloc_type_ == AllocationType::NPUPINNED) { - return true; - } - if (alloc_type_ == AllocationType::CUSTOM) { - return device_type_id_ == rhs.device_type_id_ && - device == rhs.GetDeviceId(); - } - return device == rhs.GetDeviceId(); + return HashValue() == rhs.HashValue(); + } + inline bool operator!=(const Place& rhs) const { + return HashValue() != rhs.HashValue(); } - inline bool operator!=(const Place& rhs) const { return !(*this == rhs); } inline bool operator<(const Place& rhs) const { - if (alloc_type_ != rhs.GetType()) { - return static_cast(alloc_type_) < static_cast(rhs.GetType()); - } - if (alloc_type_ == AllocationType::CUSTOM && - device_type_id_ != rhs.device_type_id_) { - return device_type_id_ < rhs.device_type_id_; - } - return device < rhs.GetDeviceId(); + return HashValue() < rhs.HashValue(); } public: @@ -206,3 +198,10 @@ class CustomPlace : public Place { std::ostream& operator<<(std::ostream&, const Place&); } // namespace phi + +namespace paddle { +namespace experimental { +using AllocationType = phi::AllocationType; +using Place = phi::Place; +} // namespace experimental +} // namespace paddle diff --git a/paddle/phi/core/dense_tensor.cc b/paddle/phi/core/dense_tensor.cc index 7a0f50533360d71e8cd025a520d753c366c08edb..2e185fc0ca22bce314906cc3c6043ad0e0912cac 100644 --- a/paddle/phi/core/dense_tensor.cc +++ b/paddle/phi/core/dense_tensor.cc @@ -110,8 +110,9 @@ void* DenseTensor::AllocateFrom(Allocator* allocator, template const T* DenseTensor::data() const { check_memory_size(); - PADDLE_ENFORCE( - (dtype() == paddle::experimental::CppTypeToDataType::Type()), + PADDLE_ENFORCE_EQ( + dtype(), + paddle::experimental::CppTypeToDataType::Type(), phi::errors::InvalidArgument( "The type of data we are trying to retrieve does not match the " "type of data currently contained in the container.")); diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 37d1a234b5767a3873bda6b41e6e410df1c452af..b680222f863505c57464abc3153ea9ff6ca19f6b 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -64,6 +64,45 @@ void BilinearTensorProductGradInferMeta(const MetaTensor& x, } } +void ConvTransposeGradInferMeta(const MetaTensor& x, + const MetaTensor& filter, + const MetaTensor& dout, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + MetaTensor* dx, + MetaTensor* dfilter) { + GeneralBinaryGradInferMeta(x, filter, dx, dfilter); +} + +void Conv2dTransposeDoubleGradInferMeta(const MetaTensor& x, + const MetaTensor& filter, + const MetaTensor& dout, + const MetaTensor& ddx, + const MetaTensor& ddfilter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + MetaTensor* dx, + MetaTensor* dfilter, + MetaTensor* ddout) { + GeneralBinaryGradInferMeta(x, filter, dx, dfilter); + + if (ddout) { + ddout->share_meta(dout); + } +} + 
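
A worked example of the Place::Hash packing introduced above. The numeric value of AllocationType::GPU is not shown in this diff, so the value 2 below is an assumption made purely for illustration.

    #include <cassert>
    #include <cstdint>

    void PlaceHashExample() {
      // Layout from place.cc above: | ext | AllocationType | DeviceTypeID | DeviceID |
      const uint32_t alloc_type = 2;      // assumed value of AllocationType::GPU
      const uint32_t device_type_id = 0;  // ordinary (non-custom) device
      const uint32_t device_id = 3;
      const uint32_t hash =
          (alloc_type << 16) | (device_type_id << 8) | device_id;
      assert(hash == 0x00020003u);
      // operator==, operator!= and operator< on Place now reduce to comparing
      // this single packed value.
    }
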
void GatherNdGradInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& out_grad, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 260fbfe7197912fd3dd5b9103a0a991a45d55816..5c49a58a715a40842ea321169e200adf54124805 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -17,6 +17,9 @@ limitations under the License. */ #include #include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/infermeta/binary.h" +#include "paddle/phi/infermeta/multiary.h" +#include "paddle/phi/infermeta/ternary.h" #include "paddle/phi/infermeta/unary.h" namespace phi { @@ -34,6 +37,37 @@ void BilinearTensorProductGradInferMeta(const MetaTensor& x, MetaTensor* dweight, MetaTensor* dbias); +void ConvTransposeGradInferMeta(const MetaTensor& x, + const MetaTensor& filter, + const MetaTensor& dout, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + MetaTensor* dx, + MetaTensor* dfilter); + +void Conv2dTransposeDoubleGradInferMeta(const MetaTensor& x, + const MetaTensor& filter, + const MetaTensor& dout, + const MetaTensor& ddx, + const MetaTensor& ddfilter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + MetaTensor* dx, + MetaTensor* dfilter, + MetaTensor* ddout); + void GatherNdGradInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& out_grad, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index aabb944db30b9f30394f092c245bc0307d8bbf3f..36a049eca0f30c4d5d292d23b94cbead53c71208 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -17,8 +17,10 @@ limitations under the License. */ #include #include #include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/layout.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/cpu/conv_util.h" @@ -312,51 +314,6 @@ void CompareAllInferMeta(const MetaTensor& x, out->set_dtype(DataType::BOOL); } -void CrossInferMeta(const MetaTensor& x, - const MetaTensor& y, - int axis, - MetaTensor* out) { - auto x_dim = x.dims(); - auto y_dim = y.dims(); - auto dim = axis; - - bool dims_match = phi::funcs::CheckDims(x_dim, y_dim); - PADDLE_ENFORCE_EQ( - dims_match, - true, - phi::errors::InvalidArgument("The 'shape' of Input(X) should be equal to " - "the 'shape' of Input(Y). But received " - "Input(X).dimensions = [%s], " - "Input(Y).dimensions = [%s]", - x_dim, - y_dim)); - - if (dim != DDim::kMaxRank) { - PADDLE_ENFORCE_EQ( - dim < x_dim.size() && dim >= (0 - x_dim.size()), - true, - phi::errors::OutOfRange( - "Attr(dim) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(dim) = %d.", - x_dim.size(), - x_dim.size() - 1, - dim)); - if (dim < 0) { - dim += x_dim.size(); - } - PADDLE_ENFORCE_EQ(x_dim[dim] == 3 && y_dim[dim] == 3, - true, - phi::errors::InvalidArgument( - "Input(X/Y).dims()[dim] should be equal to 3." 
- "But received Input(X/Y).dims()[dim] = %d.", - x_dim[dim])); - } - out->set_dims(x_dim); - out->set_dtype(x.dtype()); - out->set_layout(x.layout()); - out->share_lod(x); -} - void ConvInferMeta(const MetaTensor& input, const MetaTensor& filter, const std::vector& strides, @@ -512,6 +469,241 @@ void ConvInferMeta(const MetaTensor& input, out->set_dtype(input.dtype()); } +void ConvTransposeInferMeta(const MetaTensor& x, + const MetaTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + MetaTensor* out, + MetaConfig config) { + auto x_dims = x.dims(); + auto filter_dims = filter.dims(); + + std::vector paddings_ = paddings; + std::vector dilations_ = dilations; + + const DataLayout data_layout = + config.is_run_mkldnn_kernel + ? DataLayout::kNCHW + : paddle::framework::StringToDataLayout(data_format); + + PADDLE_ENFORCE_EQ( + x_dims.size() == 4 || x_dims.size() == 5, + true, + errors::InvalidArgument("Input of Op(conv_transpose) should be 4-D or " + "5-D Tensor. But received: %u-D Tensor, " + "the shape of input is [%s]", + x_dims.size(), + x_dims)); + PADDLE_ENFORCE_EQ( + x_dims.size(), + filter_dims.size(), + errors::InvalidArgument( + "The input's dimension size and filter's dimension size of " + "Op (conv_transpose) should be equal. But received: the shape of " + "input is [%s], the dimension size of input is [%d], the shape " + "of filter is [%s], the dimension size of filter is [%d]. ", + x_dims, + x_dims.size(), + filter_dims, + filter_dims.size())); + + int stride_size = strides.size(); + for (int i = 0; i < stride_size; ++i) { + PADDLE_ENFORCE_GT( + strides[i], + 0, + errors::InvalidArgument( + "The stride of Op(Conv) should be larget than 0, but received " + "stride is %d.", + strides[i])); + } + + int in_sub_stride_size = x_dims.size() - stride_size; + + PADDLE_ENFORCE_EQ( + x_dims.size() - strides.size(), + 2U, + errors::InvalidArgument( + "The input's dimension size minus Attr(stride)'s size must " + "be euqal to 2 for Op(conv_transpose). But received: [%d], the " + "input's dimension size is [%d], the shape of input " + "is [%s], the Attr(stride)'s size is [%d].", + in_sub_stride_size, + x_dims.size(), + x_dims, + strides.size())); + if (output_size.size()) + PADDLE_ENFORCE_EQ( + output_size.size(), + strides.size(), + errors::InvalidArgument( + "The Attr(output_size) and Attr(stride) of Op(conv_transpose) " + "should be the same.")); + if (output_padding.size()) + PADDLE_ENFORCE_EQ( + output_padding.size(), + strides.size(), + errors::InvalidArgument( + "The Attr(output_padding) and Attr(stride) of Op(conv_transpose) " + "should be the same.")); + + const int64_t C = + (data_layout != DataLayout::kNHWC ? x_dims[1] + : x_dims[x_dims.size() - 1]); + PADDLE_ENFORCE_EQ( + C, + filter_dims[0], + errors::InvalidArgument( + "The number of input channels should be equal to filter channels " + "for Op(conv_transpose). But received: the input's channels is " + "[%d], the shape of input is [%s], the filter's channels is [%d], " + "the shape of filter is [%s]. The data_format is %s." 
+ "The error may come from wrong data_format setting.", + C, + x_dims, + filter_dims[0], + filter_dims, + data_format)); + + DDim x_data_dims; + if (data_layout != DataLayout::kNHWC) { + x_data_dims = slice_ddim(x_dims, 2, x_dims.size()); + } else { + x_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); + } + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize); + + std::vector output_shape({x_dims[0]}); + if (data_layout != DataLayout::kNHWC) { + output_shape.push_back(filter_dims[1] * groups); + } + const int offset = (data_layout != DataLayout::kNHWC ? 2 : 1); + for (size_t i = 0; i < strides.size(); ++i) { + auto filter_extent = dilations_[i] * (filter_dims[i + 2] - 1) + 1; + auto infer_shape = (config.is_runtime || x_dims[i + offset] > 0) + ? (x_dims[i + offset] - 1) * strides[i] - + paddings_[2 * i] - paddings_[2 * i + 1] + + filter_extent + : -1; + if (output_size.size()) { + if (config.is_runtime) { + PADDLE_ENFORCE_GE( + output_size[i], + infer_shape, + errors::InvalidArgument( + "output_size of Op(ConvTransposeOp) should not be " + "less than the infered output size. But received output_size = " + "[%s], whose dim %d is less than the infered output size [%s]", + make_ddim(output_size).to_str(), + i, + infer_shape)); + PADDLE_ENFORCE_LT( + output_size[i], + infer_shape + strides[i], + errors::InvalidArgument( + "output_size of Op(ConvTransposeOp) should be less " + "than infered size + stride. But received output_size = [%s], " + "whose dim %d is not less than the infered output size (%d) + " + "stride (%d) = %d", + make_ddim(output_size).to_str(), + i, + infer_shape, + strides[i], + infer_shape + strides[i])); + } + output_shape.push_back(output_size[i]); + } else if (output_padding.size()) { + if (config.is_runtime) { + PADDLE_ENFORCE_GE( + output_padding[i], + 0, + errors::InvalidArgument( + "output_padding of Op(ConvTransposeOp) should not be " + "less than the 0. But received output_padding = " + "[%s], whose dim %d is less than 0", + make_ddim(output_padding).to_str(), + i)); + PADDLE_ENFORCE_LT( + output_padding[i], + std::max(strides[i], dilations_[i]), + errors::InvalidArgument( + "output_padding of Op(ConvTransposeOp) should be less " + "than either stride or dilation. But received output_size = " + "[%s], " + "whose dim %d is not less than either stride (%d) or " + "dilation (%d)", + make_ddim(output_size).to_str(), + i, + strides[i], + dilations_[i])); + } + output_shape.push_back((infer_shape + output_padding[i])); + } else { + output_shape.push_back(infer_shape); + } + } + if (data_layout == DataLayout::kNHWC) { + output_shape.push_back(filter_dims[1] * groups); + } + + out->set_dims(make_ddim(output_shape)); + out->set_dtype(x.dtype()); +} + +void CrossInferMeta(const MetaTensor& x, + const MetaTensor& y, + int axis, + MetaTensor* out) { + auto x_dim = x.dims(); + auto y_dim = y.dims(); + auto dim = axis; + + bool dims_match = phi::funcs::CheckDims(x_dim, y_dim); + PADDLE_ENFORCE_EQ( + dims_match, + true, + phi::errors::InvalidArgument("The 'shape' of Input(X) should be equal to " + "the 'shape' of Input(Y). 
But received " + "Input(X).dimensions = [%s], " + "Input(Y).dimensions = [%s]", + x_dim, + y_dim)); + + if (dim != DDim::kMaxRank) { + PADDLE_ENFORCE_EQ( + dim < x_dim.size() && dim >= (0 - x_dim.size()), + true, + phi::errors::OutOfRange( + "Attr(dim) is out of range, It's expected " + "to be in range of [-%d, %d]. But received Attr(dim) = %d.", + x_dim.size(), + x_dim.size() - 1, + dim)); + if (dim < 0) { + dim += x_dim.size(); + } + PADDLE_ENFORCE_EQ(x_dim[dim] == 3 && y_dim[dim] == 3, + true, + phi::errors::InvalidArgument( + "Input(X/Y).dims()[dim] should be equal to 3." + "But received Input(X/Y).dims()[dim] = %d.", + x_dim[dim])); + } + out->set_dims(x_dim); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); +} + void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index d770a096de7c922c674b7edda55ae8cb531a6d00..9a54c4c5fa62d4c58e527b9efbf2e977f72354ec 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -83,6 +83,19 @@ void ConvInferMeta(const MetaTensor& input, MetaTensor* out, MetaConfig config = MetaConfig()); +void ConvTransposeInferMeta(const MetaTensor& x, + const MetaTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void CrossInferMeta(const MetaTensor& x, const MetaTensor& y, int axis, diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 3e9da9a217a0a8837d7edadc70401fdad04b4869..3faf42fe1ab1a27e8d2ffafc4847b37aa6e700b8 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -305,11 +305,48 @@ void BatchNormInferMeta(const MetaTensor& x, y->set_dims(x_dims); mean_out->set_dims({C}); variance_out->set_dims({C}); - saved_mean->set_dims({C}); - saved_variance->set_dims({C}); + if (saved_mean) { + saved_mean->set_dims({C}); + } + if (saved_variance) { + saved_variance->set_dims({C}); + } y->share_lod(x); } +void BatchNormInferInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const MetaTensor& bias, + const MetaTensor& mean, + const MetaTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout, + MetaTensor* y, + MetaTensor* mean_out, + MetaTensor* variance_out, + MetaConfig config) { + BatchNormInferMeta(x, + scale, + bias, + mean, + variance, + momentum, + epsilon, + data_layout, + /*is_test=*/true, + /*use_global_stats=*/false, + /*trainable_statistics=*/false, + /*fuse_with_relu=*/false, + y, + mean_out, + variance_out, + /*saved_mean=*/nullptr, + /*saved_variance=*/nullptr, + /*reserve_space=*/nullptr, + config); +} + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, @@ -689,3 +726,4 @@ void WhereInferMeta(const MetaTensor& condition, } // namespace phi PD_REGISTER_INFER_META_FN(batch_norm, phi::BatchNormInferMeta); +PD_REGISTER_INFER_META_FN(batch_norm_infer, phi::BatchNormInferInferMeta); diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 068766c0e11671c93285c077ab2328ac20134a13..e9b5d8c872fb9226802a1f331bd4b44a6039e208 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -92,6 +92,19 @@ void BatchNormInferMeta(const MetaTensor& x, 
MetaTensor* reserve_space, MetaConfig config = MetaConfig()); +void BatchNormInferInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const MetaTensor& bias, + const MetaTensor& mean, + const MetaTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout, + MetaTensor* y, + MetaTensor* mean_out, + MetaTensor* variance_out, + MetaConfig config = MetaConfig()); + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 0f51839553158b6dce7ac90006c5c72ee8e3b57b..7c5f38744f8923805d1e9b521c58813293cdce9b 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -877,6 +877,77 @@ void PadInferMeta(const MetaTensor& input, out->set_dtype(input.dtype()); } +void Pad3dInferMeta(const MetaTensor& x, + const ScalarArray& paddings_scalar_array, + const std::string& mode, + float value, + const std::string& data_format, + MetaTensor* out, + MetaConfig config) { + auto x_dim = x.dims(); + PADDLE_ENFORCE_EQ(x_dim.size(), + 5, + errors::InvalidArgument( + "The size of Input(X)'s dimension should be equal to " + "5, but received %d. ", + x_dim.size())); + + std::vector out_dims(x_dim.size()); + out_dims[0] = x_dim[0]; + if (paddings_scalar_array.FromTensor()) { + if (config.is_runtime) { + PADDLE_ENFORCE_EQ( + paddings_scalar_array.GetData().size(), + 6, + errors::InvalidArgument("Shape of Input(Paddings) should be equal to " + "[6], but received [%d].", + paddings_scalar_array.GetData().size())); + } + out_dims[1] = x_dim[1]; + out_dims[2] = x_dim[2]; + out_dims[3] = x_dim[3]; + } else { + auto paddings = paddings_scalar_array.GetData(); + + PADDLE_ENFORCE_EQ( + paddings.size(), + 6, + errors::InvalidArgument( + "Size of paddings should be equal to 6, but received %d.", + static_cast(paddings.size()))); + if (data_format == "NCDHW") { + out_dims[1] = x_dim[1]; // channel + out_dims[2] = ((!config.is_runtime) && (x_dim[2] < 0)) + ? x_dim[2] + : (x_dim[2] + paddings[4] + paddings[5]); // depth + + out_dims[3] = ((!config.is_runtime) && (x_dim[3] < 0)) + ? x_dim[3] + : (x_dim[3] + paddings[2] + paddings[3]); // height + + out_dims[4] = ((!config.is_runtime) && (x_dim[4] < 0)) + ? x_dim[4] + : (x_dim[4] + paddings[0] + paddings[1]); // width + } else { // NDHWC + out_dims[4] = x_dim[4]; // channel + + out_dims[1] = ((!config.is_runtime) && (x_dim[1] < 0)) + ? x_dim[1] + : (x_dim[1] + paddings[4] + paddings[5]); // depth + out_dims[2] = ((!config.is_runtime) && (x_dim[2] < 0)) + ? x_dim[2] + : (x_dim[2] + paddings[2] + paddings[3]); // height + out_dims[3] = ((!config.is_runtime) && (x_dim[3] < 0)) + ? 
x_dim[3] + : (x_dim[3] + paddings[0] + paddings[1]); // width + } + } + + out->set_dims(phi::make_ddim(out_dims)); + out->set_dtype(x.dtype()); + out->share_lod(x); +} + void PixelShuffleInferMeta(const MetaTensor& x, int upscale_factor, const std::string& data_format, @@ -1668,6 +1739,17 @@ void TransposeInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void TransposeGradInferMeta(const MetaTensor& x, + const std::vector& axis, + MetaTensor* out) { + std::vector reversed_axis(axis); + for (size_t i = 0; i < axis.size(); i++) { + reversed_axis[axis[i]] = i; + } + + TransposeInferMeta(x, reversed_axis, out); +} + void UnbindInferMeta(const MetaTensor& x, int axis, std::vector* outs) { @@ -1907,6 +1989,7 @@ void OneHotInferMeta(const MetaTensor& x, auto out_dims = phi::make_ddim(out_dims_vec); out->set_dims(out_dims); out->share_lod(x); + out->set_dtype(phi::DataType::FLOAT32); } diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 2d51bac995d5142871873dd4a12c22b4bf2de55e..d84283a65c4d19445dce61e9cf8ee6f70a83905f 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -147,6 +147,14 @@ void PadInferMeta(const MetaTensor& input, MetaTensor* out, MetaConfig config = MetaConfig()); +void Pad3dInferMeta(const MetaTensor& x, + const ScalarArray& paddings, + const std::string& mode, + float value, + const std::string& data_format, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void PixelShuffleInferMeta(const MetaTensor& x, int upscale_factor, const std::string& data_format, @@ -255,6 +263,10 @@ void TransposeInferMeta(const MetaTensor& x, const std::vector& axis, MetaTensor* out); +void TransposeGradInferMeta(const MetaTensor& x, + const std::vector& axis, + MetaTensor* out); + void UnbindInferMeta(const MetaTensor& x, int axis, std::vector* outs); diff --git a/paddle/phi/kernels/batch_norm_kernel.cc b/paddle/phi/kernels/batch_norm_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..a0de7842b9e0d417296ab7e965397a691041a679 --- /dev/null +++ b/paddle/phi/kernels/batch_norm_kernel.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
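+//
+// Illustrative usage sketch (not part of the patch): BatchNormInferKernel is
+// a thin inference-only wrapper around BatchNormKernel, so a caller supplies
+// only the running statistics and receives y / mean_out / variance_out.
+// The tensor names and the momentum/epsilon values below are assumptions
+// chosen for the example, not values taken from this change:
+//
+//   phi::BatchNormInferKernel<float, phi::CPUContext>(
+//       dev_ctx, x, scale, bias, mean, variance,
+//       /*momentum=*/0.9f, /*epsilon=*/1e-5f, /*data_layout=*/"NCHW",
+//       &y, &mean_out, &variance_out);
+//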
+ +#include "paddle/phi/kernels/batch_norm_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { + +template +void BatchNormInferKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& mean, + const DenseTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout, + DenseTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out) { + // Since saved_mean and saved_variance are used regardless of whether + // they are in test mode, temporary variables need to be created here + // to be compatible + auto saved_mean = phi::EmptyLike(dev_ctx, *mean_out); + auto saved_variance = phi::EmptyLike(dev_ctx, *variance_out); + BatchNormKernel(dev_ctx, + x, + scale, + bias, + mean, + variance, + momentum, + epsilon, + data_layout, + /*is_test=*/true, + /*use_global_stats=*/false, + /*trainable_statistics=*/false, + /*fuse_with_relu=*/false, + y, + mean_out, + variance_out, + &saved_mean, + &saved_variance, + /*reserve_space=*/nullptr); +} + +} // namespace phi + +PD_REGISTER_KERNEL(batch_norm_infer, + CPU, + ALL_LAYOUT, + phi::BatchNormInferKernel, + float, + double) {} +#ifdef PADDLE_WITH_CUDA +PD_REGISTER_KERNEL(batch_norm_infer, + GPU, + ALL_LAYOUT, + phi::BatchNormInferKernel, + float, + double, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} +#endif +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(batch_norm_infer, + GPU, + ALL_LAYOUT, + phi::BatchNormInferKernel, + float, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/batch_norm_kernel.h b/paddle/phi/kernels/batch_norm_kernel.h index 7ddf32e27c7d73a7249d92f7835afdf6b8f3ed5a..be589e43647c1c6b433deec27465c328a67b89e5 100644 --- a/paddle/phi/kernels/batch_norm_kernel.h +++ b/paddle/phi/kernels/batch_norm_kernel.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/phi/core/dense_tensor.h" namespace phi { @@ -40,4 +41,18 @@ void BatchNormKernel(const Context& dev_ctx, DenseTensor* saved_variance, DenseTensor* reserve_space); +template +void BatchNormInferKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& mean, + const DenseTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout, + DenseTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out); + } // namespace phi diff --git a/paddle/phi/kernels/conv_transpose_grad_kernel.h b/paddle/phi/kernels/conv_transpose_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..2b1c0c1a934cf64dad552b36ce9cfd3808be6810 --- /dev/null +++ b/paddle/phi/kernels/conv_transpose_grad_kernel.h @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
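+//
+// Contract of the declarations below: each backward kernel consumes the
+// forward inputs (x, filter) together with the upstream gradient dout and
+// fills dx and/or dfilter; the double-grad variant additionally takes
+// ddx / ddfilter and produces ddout. Either output pointer may be null when
+// that gradient is not requested (see the GPU depthwise implementation,
+// which returns early when both dx and dfilter are null).
+//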
+ +#pragma once + +#include +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void Conv2dTransposeGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter); + +template +void Conv2dTransposeDoubleGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const DenseTensor& ddx, + const DenseTensor& ddfilter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter, + DenseTensor* ddout); + +template +void Conv3dTransposeGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter); + +template +void DepthwiseConv2dTransposeGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter); + +} // namespace phi diff --git a/paddle/phi/kernels/conv_transpose_kernel.h b/paddle/phi/kernels/conv_transpose_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..de56f13ddf73e5c33e49227468e565e374d14c84 --- /dev/null +++ b/paddle/phi/kernels/conv_transpose_kernel.h @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
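+//
+// Shape relation used by these kernels (mirrors ConvTransposeInferMeta):
+// for each spatial dimension,
+//   out = (in - 1) * stride - pad_begin - pad_end
+//         + dilation * (ksize - 1) + 1 + output_padding
+// Worked example with illustrative numbers: in = 4, stride = 2, pad = 1 on
+// both sides, ksize = 3, dilation = 1, output_padding = 0 gives
+//   out = (4 - 1) * 2 - 1 - 1 + 1 * (3 - 1) + 1 + 0 = 7.
+//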
+ +#pragma once + +#include +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void Conv2dTransposeKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* out); + +template +void Conv3dTransposeKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* out); + +template +void DepthwiseConv2dTransposeKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/conv_transpose_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_transpose_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..8d0749500695c5db2f07872e59d295981c598c9e --- /dev/null +++ b/paddle/phi/kernels/cpu/conv_transpose_grad_kernel.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/conv_transpose_grad_kernel.h" +#include "paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void DepthwiseConv2dTransposeGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter) { + ConvTransposeGradRawKernel(ctx, + x, + filter, + dout, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + dx, + dfilter); +} + +} // namespace phi + +PD_REGISTER_KERNEL(conv2d_transpose_grad, + CPU, + ALL_LAYOUT, + phi::Conv2dTransposeGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(conv3d_transpose_grad, + CPU, + ALL_LAYOUT, + phi::Conv3dTransposeGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(depthwise_conv2d_transpose_grad, + CPU, + ALL_LAYOUT, + phi::DepthwiseConv2dTransposeGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/conv_transpose_kernel.cc b/paddle/phi/kernels/cpu/conv_transpose_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..b4cacc850938ea87c34499c68b3aa8821e65943d --- /dev/null +++ b/paddle/phi/kernels/cpu/conv_transpose_kernel.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
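+//
+// Note: on CPU the depthwise variant below simply forwards to the generic
+// ConvTransposeRawKernel; the output_padding / output_size arguments are
+// accepted for interface parity but are not passed on to the raw kernel.
+// Hypothetical call sketch (tensor names and attribute values are
+// placeholders, not taken from this patch):
+//
+//   phi::Conv2dTransposeKernel<float, phi::CPUContext>(
+//       dev_ctx, x, filter, /*strides=*/{2, 2}, /*paddings=*/{1, 1},
+//       /*output_padding=*/{}, /*output_size=*/{},
+//       /*padding_algorithm=*/"EXPLICIT", /*groups=*/1,
+//       /*dilations=*/{1, 1}, /*data_format=*/"NCHW", &out);
+//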
+ +#include "paddle/phi/kernels/conv_transpose_kernel.h" +#include "paddle/phi/kernels/impl/conv_transpose_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void DepthwiseConv2dTransposeKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* out) { + ConvTransposeRawKernel(ctx, + x, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(conv2d_transpose, + CPU, + ALL_LAYOUT, + phi::Conv2dTransposeKernel, + float, + double) {} +PD_REGISTER_KERNEL(conv3d_transpose, + CPU, + ALL_LAYOUT, + phi::Conv3dTransposeKernel, + float, + double) {} +PD_REGISTER_KERNEL(depthwise_conv2d_transpose, + CPU, + ALL_LAYOUT, + phi::DepthwiseConv2dTransposeKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/frobenius_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/frobenius_norm_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..338be9e252da3349cd81cdfa61a8eae4d2d30166 --- /dev/null +++ b/paddle/phi/kernels/cpu/frobenius_norm_grad_kernel.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/frobenius_norm_grad_kernel.h" +#include "paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(frobenius_norm_grad, + CPU, + ALL_LAYOUT, + phi::FrobeniusNormGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/frobenius_norm_kernel.cc b/paddle/phi/kernels/cpu/frobenius_norm_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..77509b953bf39bc472b9f3e8b134b294253d1998 --- /dev/null +++ b/paddle/phi/kernels/cpu/frobenius_norm_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
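+//
+// The reduction computed here is the Frobenius norm over the reduced axes:
+//   ||X||_F = sqrt(sum_i x_i^2)
+// (see FrobeniusNormFunctor in funcs/reduce_functor.h). Small worked example
+// with made-up values: for X = [[3, 4], [0, 0]] reduced over all elements,
+//   ||X||_F = sqrt(9 + 16 + 0 + 0) = 5.
+//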
+ +#include "paddle/phi/kernels/frobenius_norm_kernel.h" +#include "paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + frobenius_norm, CPU, ALL_LAYOUT, phi::FrobeniusNormKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/index_sample_kernel.cc b/paddle/phi/kernels/cpu/index_sample_kernel.cc index 21bf9faee13cfa4da271a7d1b1a9fe482a55da04..b895e4aa7c0e7c3315092a8603a4390cd5f76792 100644 --- a/paddle/phi/kernels/cpu/index_sample_kernel.cc +++ b/paddle/phi/kernels/cpu/index_sample_kernel.cc @@ -41,7 +41,7 @@ void IndexSampleInner(const Context &context, std::vector input_vec; std::vector index_vec; paddle::framework::TensorToVector(input, context, &input_vec); - paddle::framework::TensorToVector(index, context, &index_vec); + paddle::framework::TensorToVector(index, context, &index_vec); std::vector res(index_ids_num); for (int i = 0; i < index_ids_num; i++) { diff --git a/paddle/phi/kernels/cpu/pad3d_grad_kernel.cc b/paddle/phi/kernels/cpu/pad3d_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..b1adb3e206da97918dc69ee4694de1be525b382e --- /dev/null +++ b/paddle/phi/kernels/cpu/pad3d_grad_kernel.cc @@ -0,0 +1,480 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
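+//
+// Padding layout assumed throughout this file (matching the forward kernel
+// and Pad3dInferMeta): paddings = {left, right, top, bottom, front, back},
+// i.e. pad_left = pads[0], pad_top = pads[2], pad_front = pads[4].
+// Illustrative shape example for NCDHW input [N, C, D, H, W] = [1, 1, 2, 3, 4]
+// with paddings = {1, 1, 0, 0, 2, 0}:
+//   D_out = 2 + 2 + 0 = 4,  H_out = 3 + 0 + 0 = 3,  W_out = 4 + 1 + 1 = 6.
+//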
+ +#include "paddle/phi/kernels/pad3d_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void ConstPad3DGradNCDHW(T* d_in_data, + const T* d_out_data, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || + in_h >= in_height || in_w >= in_width)) { + d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] = + d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; + } +} + +template +void ConstPad3DGradNDHWC(T* d_in_data, + const T* d_out_data, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || + in_h >= in_height || in_w >= in_width)) { + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + d_in_data[in_index + c] = d_out_data[out_index + c]; + } + } +} + +template +void ReflectPad3DGradNCDHW(T* d_in_data, + const T* d_out_data, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = std::max(in_d, -in_d); // reflect by 0 + in_d = std::min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth + in_h = std::max(in_h, -in_h); // reflect by 0 + in_h = std::min(in_h, 2 * in_height - in_h - 2); // reflect by in_height + in_w = std::max(in_w, -in_w); // reflect by 0 + in_w = std::min(in_w, 2 * in_width - in_w - 2); // reflect by in_width + + d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += + d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; +} + +template +void ReflectPad3DGradNDHWC(T* d_in_data, + const T* d_out_data, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = std::max(in_d, -in_d); + in_d = std::min(in_d, 2 * in_depth - in_d - 2); + in_h = std::max(in_h, -in_h); + in_h = std::min(in_h, 2 * in_height - in_h - 2); + in_w = std::max(in_w, -in_w); + in_w = std::min(in_w, 2 * in_width - in_w - 2); + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + const int in_index = + (in_d * 
in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + d_in_data[in_index + c] += d_out_data[out_index + c]; + } +} + +template +void ReplicatePad3DGradNCDHW(T* d_in_data, + const T* d_out_data, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w) { + int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); + int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); + int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); + + d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += + d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; +} + +template +void ReplicatePad3DGradNDHWC(T* d_in_data, + const T* d_out_data, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w) { + int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); + int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); + int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + d_in_data[in_index + c] += d_out_data[out_index + c]; + } +} + +template +void CircularPad3DGradNCDHW(T* d_in_data, + const T* d_out_data, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w) { + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += + d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; +} + +template +void CircularPad3DGradNDHWC(T* d_in_data, + const T* d_out_data, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w) { + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + d_in_data[in_index + c] += d_out_data[out_index + c]; + } +} + +template +void Pad3DGradNCDHW(T* d_in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int 
pad_left, + const T* d_out_data, + void (*pad_func)(T*, + const T*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int)) { + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int out_d = 0; out_d < out_depth; ++out_d) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + pad_func(d_in_data, + d_out_data, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + out_d, + out_h, + out_w); + } + } + } + d_in_data += in_depth * in_height * in_width; + d_out_data += out_depth * out_height * out_width; + } + } +} + +template +void Pad3DGradNDHWC(T* d_in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const T* d_out_data, + void (*pad_func)(T*, + const T*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int)) { + for (int n = 0; n < num; ++n) { + for (int out_d = 0; out_d < out_depth; ++out_d) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + pad_func(d_in_data, + d_out_data, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + out_d, + out_h, + out_w); + } + } + } + d_in_data += in_depth * in_height * in_width * channels; + d_out_data += out_depth * out_height * out_width * channels; + } +} + +template +void Pad3dGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const ScalarArray& paddings, + const std::string& mode, + float pad_value, + const std::string& data_format, + DenseTensor* x_grad) { + std::vector pads = paddings.GetData(); + + auto* d_out = &out_grad; + auto* d_in = x_grad; + auto d_in_dims = d_in->dims(); + auto d_out_dims = d_out->dims(); + const T* d_out_data = d_out->data(); + T* d_in_data = dev_ctx.template Alloc(d_in); + phi::funcs::SetConstant()(dev_ctx, d_in, static_cast(0)); + + const int pad_left = pads[0]; + const int pad_top = pads[2]; + const int pad_front = pads[4]; + const int num = d_in_dims[0]; + if (data_format == "NCDHW") { + const int channels = d_in_dims[1]; + const int in_depth = d_in_dims[2]; + const int in_height = d_in_dims[3]; + const int in_width = d_in_dims[4]; + const int out_depth = d_out_dims[2]; + const int out_height = d_out_dims[3]; + const int out_width = d_out_dims[4]; + + std::map + func_map; + + func_map["reflect"] = ReflectPad3DGradNCDHW; + func_map["replicate"] = ReplicatePad3DGradNCDHW; + func_map["circular"] = CircularPad3DGradNCDHW; + func_map["constant"] = ConstPad3DGradNCDHW; + + Pad3DGradNCDHW(d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + d_out_data, + func_map[mode]); + } else { + const int channels = d_in_dims[4]; + const int in_depth = d_in_dims[1]; + const int in_height = d_in_dims[2]; + const int in_width = d_in_dims[3]; + const int out_depth = d_out_dims[1]; + const int out_height = d_out_dims[2]; + const int out_width = d_out_dims[3]; + + std::map + func_map; + + func_map["reflect"] = ReflectPad3DGradNDHWC; + 
func_map["replicate"] = ReplicatePad3DGradNDHWC; + func_map["circular"] = CircularPad3DGradNDHWC; + func_map["constant"] = ConstPad3DGradNDHWC; + + Pad3DGradNDHWC(d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + d_out_data, + func_map[mode]); + } +} +} // namespace phi + +PD_REGISTER_KERNEL( + pad3d_grad, CPU, ALL_LAYOUT, phi::Pad3dGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/pad3d_kernel.cc b/paddle/phi/kernels/cpu/pad3d_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..0dc01f485f3aa9ba6ff0b6d089887ff04847054c --- /dev/null +++ b/paddle/phi/kernels/cpu/pad3d_kernel.cc @@ -0,0 +1,578 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pad3d_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void ConstPad3DFuncNCDHW(const T* in_data, + T* out_data, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w, + const T value) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + out_data[out_d * out_height * out_width + out_h * out_width + out_w] = + (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || + in_h >= in_height || in_w >= in_width) + ? 
value + : in_data[in_d * in_height * in_width + in_h * in_width + in_w]; +} + +template +void ConstPad3DFuncNDHWC(const T* in_data, + T* out_data, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w, + const T value) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + if (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || + in_h >= in_height || in_w >= in_width) { + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = value; + } + } else { + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = in_data[in_index + c]; + } + } +} + +template +void ReflectPad3DFuncNCDHW(const T* in_data, + T* out_data, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w, + const T value) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = std::max(in_d, -in_d); // reflect by 0 + in_d = std::min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth + in_h = std::max(in_h, -in_h); // reflect by 0 + in_h = std::min(in_h, 2 * in_height - in_h - 2); // reflect by in_height + in_w = std::max(in_w, -in_w); // reflect by 0 + in_w = std::min(in_w, 2 * in_width - in_w - 2); // reflect by in_width + + out_data[out_d * out_height * out_width + out_h * out_width + out_w] = + in_data[in_d * in_height * in_width + in_h * in_width + in_w]; +} + +template +void ReflectPad3DFuncNDHWC(const T* in_data, + T* out_data, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w, + const T value) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = std::max(in_d, -in_d); + in_d = std::min(in_d, 2 * in_depth - in_d - 2); + in_h = std::max(in_h, -in_h); + in_h = std::min(in_h, 2 * in_height - in_h - 2); + in_w = std::max(in_w, -in_w); + in_w = std::min(in_w, 2 * in_width - in_w - 2); + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = in_data[in_index + c]; + } +} + +template +void ReplicatePad3DFuncNCDHW(const T* in_data, + T* out_data, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w, + const T value) { + int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); + int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); + int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); + + 
out_data[out_d * out_height * out_width + out_h * out_width + out_w] = + in_data[in_d * in_height * in_width + in_h * in_width + in_w]; +} + +template +void ReplicatePad3DFuncNDHWC(const T* in_data, + T* out_data, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w, + const T value) { + int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); + int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); + int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = in_data[in_index + c]; + } +} + +template +void CircularPad3DFuncNCDHW(const T* in_data, + T* out_data, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w, + const T value) { + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + out_data[out_d * out_height * out_width + out_h * out_width + out_w] = + in_data[in_d * in_height * in_width + in_h * in_width + in_w]; +} + +template +void CircularPad3DFuncNDHWC(const T* in_data, + T* out_data, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w, + const T value) { + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = in_data[in_index + c]; + } +} + +template +void Pad3DNCDHW(const T* in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + T value, + T* out_data, + void (*pad_func)(const T*, + T*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const T)) { + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int out_d = 0; out_d < out_depth; ++out_d) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + pad_func(in_data, + out_data, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + out_d, + out_h, + out_w, + value); + } + } + } + in_data += in_depth * 
in_height * in_width; + out_data += out_depth * out_height * out_width; + } + } +} + +template +void Pad3DNDHWC(const T* in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + T value, + T* out_data, + void (*pad_func)(const T*, + T*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const T)) { + for (int n = 0; n < num; ++n) { + for (int out_d = 0; out_d < out_depth; ++out_d) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + pad_func(in_data, + out_data, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + out_d, + out_h, + out_w, + value); + } + } + } + in_data += in_depth * in_height * in_width * channels; + out_data += out_depth * out_height * out_width * channels; + } +} + +template +void Pad3dKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& paddings, + const std::string& mode, + float pad_value, + const std::string& data_format, + DenseTensor* out) { + T value = static_cast(pad_value); + std::vector pads = paddings.GetData(); + + auto in_dims = x.dims(); + const T* in_data = x.data(); + + if (data_format == "NCDHW") { + out->Resize({in_dims[0], + in_dims[1], + in_dims[2] + pads[4] + pads[5], + in_dims[3] + pads[2] + pads[3], + in_dims[4] + pads[0] + pads[1]}); + } else { + out->Resize({in_dims[0], + in_dims[1] + pads[4] + pads[5], + in_dims[2] + pads[2] + pads[3], + in_dims[3] + pads[0] + pads[1], + in_dims[4]}); + } + + auto out_dims = out->dims(); + T* out_data = dev_ctx.template Alloc(out); + + int channels = in_dims[1]; + int in_depth = in_dims[2]; + int in_height = in_dims[3]; + int in_width = in_dims[4]; + int out_depth = out_dims[2]; + int out_height = out_dims[3]; + int out_width = out_dims[4]; + if (data_format == "NDHWC") { + channels = in_dims[4]; + in_depth = in_dims[1]; + in_height = in_dims[2]; + in_width = in_dims[3]; + out_depth = out_dims[1]; + out_height = out_dims[2]; + out_width = out_dims[3]; + } + + if (mode == "reflect") { + PADDLE_ENFORCE_GT( + in_depth, + pads[4], + errors::InvalidArgument("The depth of Input(X)'s dimension should be " + "greater than pad_front" + " in reflect mode" + ", but received depth(%d) and pad_front(%d).", + in_depth, + pads[4])); + PADDLE_ENFORCE_GT( + in_depth, + pads[5], + errors::InvalidArgument("The depth of Input(X)'s dimension should be " + "greater than pad_back" + " in reflect mode" + ", but received depth(%d) and pad_back(%d).", + in_depth, + pads[5])); + + PADDLE_ENFORCE_GT( + in_height, + pads[2], + errors::InvalidArgument("The height of Input(X)'s dimension should be " + "greater than pad_top" + " in reflect mode" + ", but received depth(%d) and pad_top(%d).", + in_height, + pads[2])); + PADDLE_ENFORCE_GT( + in_height, + pads[3], + errors::InvalidArgument("The height of Input(X)'s dimension should be " + "greater than pad_bottom" + " in reflect mode" + ", but received depth(%d) and pad_bottom(%d).", + in_height, + pads[3])); + + PADDLE_ENFORCE_GT( + in_width, + pads[0], + errors::InvalidArgument("The width of Input(X)'s dimension should be " + "greater than pad_left" + " in reflect mode" + ", but received depth(%d) and pad_left(%d).", + in_width, + 
pads[0])); + PADDLE_ENFORCE_GT( + in_width, + pads[1], + errors::InvalidArgument("The width of Input(X)'s dimension should be " + "greater than pad_right" + " in reflect mode" + ", but received depth(%d) and pad_right(%d).", + in_width, + pads[1])); + } else if (mode == "circular" || mode == "replicate") { + PADDLE_ENFORCE_NE(in_depth * in_height * in_width, + 0, + errors::InvalidArgument( + "The input tensor size can not be 0 for circular " + "or replicate padding mode.")); + } + + const int pad_left = pads[0]; + const int pad_top = pads[2]; + const int pad_front = pads[4]; + const int num = in_dims[0]; + if (data_format == "NCDHW") { + std::map + func_map; + + func_map["reflect"] = ReflectPad3DFuncNCDHW; + func_map["replicate"] = ReplicatePad3DFuncNCDHW; + func_map["circular"] = CircularPad3DFuncNCDHW; + func_map["constant"] = ConstPad3DFuncNCDHW; + Pad3DNCDHW(in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + value, + out_data, + func_map[mode]); + } else { + std::map + func_map; + + func_map["reflect"] = ReflectPad3DFuncNDHWC; + func_map["replicate"] = ReplicatePad3DFuncNDHWC; + func_map["circular"] = CircularPad3DFuncNDHWC; + func_map["constant"] = ConstPad3DFuncNDHWC; + Pad3DNDHWC(in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + value, + out_data, + func_map[mode]); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + pad3d, CPU, ALL_LAYOUT, phi::Pad3dKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/frobenius_norm_grad_kernel.h b/paddle/phi/kernels/frobenius_norm_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..edf3aed8b84934e8d1cb6afbfed633c9c58d0890 --- /dev/null +++ b/paddle/phi/kernels/frobenius_norm_grad_kernel.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void FrobeniusNormGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* dx); +} // namespace phi diff --git a/paddle/phi/kernels/frobenius_norm_kernel.h b/paddle/phi/kernels/frobenius_norm_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..f5f37ee0c0fa5f0a4b32f032a02c1671386b909b --- /dev/null +++ b/paddle/phi/kernels/frobenius_norm_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void FrobeniusNormKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h index b793afb63b1dca9bbd8ad09b83461567de6371ad..9bf1bfecabbf22f5fdc87d9c7426ec7525ac1046 100644 --- a/paddle/phi/kernels/funcs/reduce_functor.h +++ b/paddle/phi/kernels/funcs/reduce_functor.h @@ -17,11 +17,39 @@ namespace phi { namespace funcs { -//////// Sum Functor /////// -struct SumFunctor { +//////// Frobenius Norm Functor /////// +struct FrobeniusNormFunctor { template void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->sum(dim); + y->device(place) = ((x->square()).sum(dim)).sqrt(); + } +}; + +struct FrobeniusNormGradFunctor { + template + void operator()(const DeviceContext& place, + X* x, + Y* y, + DX* dx, + DY* dy, + const Dim& dim, + int size) { + dx->device(place) = y->broadcast(dim); + dx->device(place) = *dx + dx->constant(1e-12f); + dx->device(place) = (*x / *dx) * (dy->broadcast(dim)); + } +}; + +//////// Max Functor /////// +struct MaxFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->maximum(dim); } }; @@ -41,11 +69,11 @@ struct ProdFunctor { } }; -//////// Max Functor /////// -struct MaxFunctor { +//////// Sum Functor /////// +struct SumFunctor { template void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->maximum(dim); + y->device(place) = x->sum(dim); } }; diff --git a/paddle/phi/kernels/funcs/slice.h b/paddle/phi/kernels/funcs/slice.h index 0a50dceb0a00758b2b0ad5f92219812083cb5f24..38b127541650be9d63c840dfd209217a3be4c936 100644 --- a/paddle/phi/kernels/funcs/slice.h +++ b/paddle/phi/kernels/funcs/slice.h @@ -123,5 +123,56 @@ DenseTensor Slice(const Context& dev_ctx, return ret; } +// Use in conv_transpose kernel +template +static void Slice(const Context& ctx, + const DenseTensor* input, + DenseTensor* out, + const std::vector& begin_vec, + const std::vector& end_vec, + const std::vector& axes_vec) { + auto& place = *ctx.eigen_device(); + auto in_dims = input->dims(); + auto offsets = Eigen::DSizes(); + auto extents = Eigen::DSizes(); + for (size_t i = 0; i < D; ++i) { + offsets[i] = 0; + extents[i] = in_dims[i]; + } + + std::vector out_shape_vec = vectorize(in_dims); + for (size_t i = 0; i < axes_vec.size(); ++i) { + offsets[axes_vec[i]] = begin_vec[i]; + extents[axes_vec[i]] = end_vec[i] - begin_vec[i]; + out_shape_vec[axes_vec[i]] = end_vec[i] - begin_vec[i]; + } + + DDim out_dims(make_ddim(out_shape_vec)); + out->Resize(out_dims); + ctx.template Alloc(out); + + auto in_t = + EigenTensor::From(*input); + auto out_t = EigenTensor::From( + *out, out_dims); + + funcs::EigenSlice, T, D>::Eval( + place, out_t, in_t, offsets, extents); + out->Resize(out_dims); +} + +template +static void Slice(const Context& ctx, + const 
DenseTensor* input, + DenseTensor* out, + int64_t begin_idx, + int64_t end_idx, + int64_t axes) { + std::vector begin_vec = {begin_idx}; + std::vector end_vec = {end_idx}; + std::vector axes_vec = {axes}; + Slice(ctx, input, out, begin_vec, end_vec, axes_vec); +} + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..e583e13650aebf2792014a9fb9e46ac12916af61 --- /dev/null +++ b/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu @@ -0,0 +1,157 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/conv_transpose_grad_kernel.h" +#include "paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h" + +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/depthwise_conv.h" + +namespace phi { + +template +void Conv2dTransposeDoubleGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const DenseTensor& ddx, + const DenseTensor& ddfilter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter, + DenseTensor* ddout) { + ConvTransposeGradRawKernel(ctx, + x, + filter, + dout, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + dx, + dfilter); +} + +template +void DepthwiseConv2dTransposeGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_format); + DenseTensor filter_ = filter; + + if (!dx && !dfilter) { + return; + } + + std::vector paddings_ = paddings; + std::vector dilations_ = dilations; + + auto x_dims = x.dims(); + auto filter_dims = filter_.dims(); + + DDim in_data_dims; + if (data_layout != DataLayout::kNHWC) { + in_data_dims = slice_ddim(x_dims, 2, x_dims.size()); + } else { + in_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); + } + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize); + + if 
(dx) { + paddle::operators::math::DepthwiseConvFunctor depthwiseConv; + depthwiseConv(ctx, + dout, + filter_, + strides, + std::vector{ + paddings_[0], paddings_[2], paddings_[1], paddings_[3]}, + dilations_, + dx, + data_layout); + } + + if (dfilter) { + funcs::SetConstant set_zero; + ctx.template Alloc(dfilter); + set_zero(ctx, dfilter, static_cast(0)); + + paddle::operators::math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad( + ctx, + dout, + x, + strides, + std::vector{ + paddings_[0], paddings_[2], paddings_[1], paddings_[3]}, + dilations_, + dfilter, + data_layout); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(conv2d_transpose_grad, + GPU, + ALL_LAYOUT, + phi::Conv2dTransposeGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(conv2d_transpose_grad_grad, + GPU, + ALL_LAYOUT, + phi::Conv2dTransposeDoubleGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(conv3d_transpose_grad, + GPU, + ALL_LAYOUT, + phi::Conv3dTransposeGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(depthwise_conv2d_transpose_grad, + GPU, + ALL_LAYOUT, + phi::DepthwiseConv2dTransposeGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/conv_transpose_kernel.cu b/paddle/phi/kernels/gpu/conv_transpose_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..b7d34a5baf3df530d003b3475bea8702d1440c77 --- /dev/null +++ b/paddle/phi/kernels/gpu/conv_transpose_kernel.cu @@ -0,0 +1,118 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/conv_transpose_kernel.h" +#include "paddle/phi/kernels/impl/conv_transpose_kernel_impl.h" + +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/depthwise_conv.h" + +namespace phi { + +template +void DepthwiseConv2dTransposeKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* out) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_format); + DenseTensor filter_ = filter; + ctx.template Alloc(out); + + PADDLE_ENFORCE_EQ( + groups, + filter_.dims()[0], + errors::InvalidArgument( + "groups should be equal to the 1st dimension of filter_. But " + "received groups is %d and filter dimension[0] is %d", + groups, + filter_.dims()[0])); + + std::vector paddings_ = paddings; + std::vector dilations_ = dilations; + + for (auto v : dilations_) { + PADDLE_ENFORCE_EQ( + v, + 1, + errors::InvalidArgument("dilations should be 1 in depthwise conv. 
" + "But received dilations is %d", + v)); + } + + auto x_dims = x.dims(); + auto filter_dims = filter_.dims(); + + DDim in_data_dims; + if (data_layout != DataLayout::kNHWC) { + in_data_dims = slice_ddim(x_dims, 2, x_dims.size()); + } else { + in_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); + } + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize); + + ctx.template Alloc(out); + + funcs::SetConstant set_zero; + set_zero(ctx, out, static_cast(0)); + + paddle::operators::math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad( + ctx, + *out, + filter, + x, + strides, + std::vector{paddings_[0], paddings_[2], paddings_[1], paddings_[3]}, + dilations_, + out, + data_layout); +} + +} // namespace phi + +PD_REGISTER_KERNEL(conv2d_transpose, + GPU, + ALL_LAYOUT, + phi::Conv2dTransposeKernel, + float, + double) {} +PD_REGISTER_KERNEL(conv3d_transpose, + GPU, + ALL_LAYOUT, + phi::Conv3dTransposeKernel, + float, + double) {} +PD_REGISTER_KERNEL(depthwise_conv2d_transpose, + GPU, + ALL_LAYOUT, + phi::DepthwiseConv2dTransposeKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/frobenius_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/frobenius_norm_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..221bf1cb4c68c7b0ef9b91fa7fb08dd77bcf04da --- /dev/null +++ b/paddle/phi/kernels/gpu/frobenius_norm_grad_kernel.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/frobenius_norm_grad_kernel.h" +#include "paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(frobenius_norm_grad, + GPU, + ALL_LAYOUT, + phi::FrobeniusNormGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu b/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..012237165b739a5698afff6a8922c9ed06bc7265 --- /dev/null +++ b/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/frobenius_norm_kernel.h" +#include "paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + frobenius_norm, GPU, ALL_LAYOUT, phi::FrobeniusNormKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/index_sample_kernel.cu b/paddle/phi/kernels/gpu/index_sample_kernel.cu index 0e042089e1e3d0a20bf3811de3633f5fea0584fa..68573d559664693c9948bbf8bfe89df21dc553c8 100644 --- a/paddle/phi/kernels/gpu/index_sample_kernel.cu +++ b/paddle/phi/kernels/gpu/index_sample_kernel.cu @@ -35,7 +35,7 @@ void LimitGridDim(const Context& ctx, dim3* grid_dim) { #define PREDEFINED_BLOCK_SIZE_X 512 #define PREDEFINED_BLOCK_SIZE 1024 #define MIN(a, b) ((a) < (b) ? (a) : (b)) -} +} // namespace template __global__ void IndexSampleForward(const IndexT* index, diff --git a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..5ca8f3d73daded476052b77459bd68f2184ab290 --- /dev/null +++ b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu @@ -0,0 +1,507 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pad3d_grad_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void Pad3DGradConstNCDHW(const int in_size, + T* d_in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const T* d_out_data) { + CUDA_KERNEL_LOOP(in_index, in_size) { + const int in_w = in_index % in_width; + + int nc = in_index / in_width; + const int in_h = nc % in_height; + + nc /= in_height; + const int in_d = nc % in_depth; + + nc /= in_depth; + + const int out_d = in_d + pad_front; + const int out_h = in_h + pad_top; + const int out_w = in_w + pad_left; + d_in_data[in_index] = + d_out_data[nc * out_depth * out_height * out_width + + out_d * out_height * out_width + out_h * out_width + out_w]; + } +} + +template +__global__ void Pad3DGradConstNDHWC(const int in_size, + T* d_in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const T* d_out_data) { + CUDA_KERNEL_LOOP(in_index, in_size) { + const int c = in_index % channels; + int n = in_index / channels; + + const int in_w = n % in_width; + n /= in_width; + + const int in_h = n % in_height; 
+ n /= in_height; + + const int in_d = n % in_depth; + n /= in_depth; + + const int out_d = in_d + pad_front; + const int out_h = in_h + pad_top; + const int out_w = in_w + pad_left; + + d_in_data[in_index] = + d_out_data[n * out_depth * out_height * out_width * channels + + out_d * out_height * out_width * channels + + out_h * out_width * channels + out_w * channels + c]; + } +} + +template +__global__ void Pad3DGradReflectNCDHW(const int out_size, + T* d_in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const T* d_out_data) { + CUDA_KERNEL_LOOP(out_index, out_size) { + int nc = out_index / out_width; + const int out_w = out_index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = max(in_d, -in_d); + in_h = max(in_h, -in_h); + in_w = max(in_w, -in_w); + + in_d = min(in_d, 2 * in_depth - in_d - 2); + in_h = min(in_h, 2 * in_height - in_h - 2); + in_w = min(in_w, 2 * in_width - in_w - 2); + + paddle::platform::CudaAtomicAdd( + &d_in_data[nc * in_depth * in_height * in_width + + in_d * in_height * in_width + in_h * in_width + in_w], + d_out_data[out_index]); + } +} + +template +__global__ void Pad3DGradReflectNDHWC(const int out_size, + T* d_in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const T* d_out_data) { + CUDA_KERNEL_LOOP(out_index, out_size) { + const int c = out_index % channels; + int n = out_index / channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = max(in_d, -in_d); + in_h = max(in_h, -in_h); + in_w = max(in_w, -in_w); + + in_d = min(in_d, in_depth * 2 - in_d - 2); + in_h = min(in_h, in_height * 2 - in_h - 2); + in_w = min(in_w, in_width * 2 - in_w - 2); + paddle::platform::CudaAtomicAdd( + &d_in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c], + d_out_data[out_index]); + } +} + +template +__global__ void Pad3DGradReplicateNCDHW(const int out_size, + T* d_in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const T* d_out_data) { + CUDA_KERNEL_LOOP(out_index, out_size) { + int nc = out_index / out_width; + const int out_w = out_index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + const int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); + const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); + const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); + + paddle::platform::CudaAtomicAdd( + &d_in_data[nc * in_depth * in_height * in_width + + in_d * in_height * in_width + in_h * in_width + 
in_w], + d_out_data[out_index]); + } +} + +template +__global__ void Pad3DGradReplicateNDHWC(const int out_size, + T* d_in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const T* d_out_data) { + CUDA_KERNEL_LOOP(out_index, out_size) { + const int c = out_index % channels; + int n = out_index / channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + + const int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); + const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); + const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); + + paddle::platform::CudaAtomicAdd( + &d_in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c], + d_out_data[out_index]); + } +} + +template +__global__ void Pad3DGradCircularNCDHW(const int out_size, + T* d_in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const T* d_out_data) { + CUDA_KERNEL_LOOP(out_index, out_size) { + int nc = out_index / out_width; + const int out_w = out_index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + paddle::platform::CudaAtomicAdd( + &d_in_data[nc * in_depth * in_height * in_width + + in_d * in_height * in_width + in_h * in_width + in_w], + d_out_data[out_index]); + } +} + +template +__global__ void Pad3DGradCircularNDHWC(const int out_size, + T* d_in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const T* d_out_data) { + CUDA_KERNEL_LOOP(out_index, out_size) { + const int c = out_index % channels; + int n = out_index / channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + paddle::platform::CudaAtomicAdd( + &d_in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c], + d_out_data[out_index]); + } +} + +template +void Pad3dGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const ScalarArray& paddings, + const std::string& mode, + float pad_value, + const std::string& data_format, + DenseTensor* x_grad) { + std::vector pads = paddings.GetData(); + auto* d_out = &out_grad; + auto* d_in = x_grad; + auto d_in_dims = d_in->dims(); + auto d_out_dims = d_out->dims(); + const T* 
d_out_data = d_out->data(); + T* d_in_data = dev_ctx.template Alloc(d_in); + + phi::funcs::SetConstant()(dev_ctx, d_in, static_cast(0)); + + const int pad_left = pads[0]; + const int pad_top = pads[2]; + const int pad_front = pads[4]; + + const int num = d_in_dims[0]; + + auto stream = dev_ctx.stream(); + int block = PADDLE_CUDA_NUM_THREADS; + const int out_size = d_out->numel(); + const int in_size = d_in->numel(); + int grid = (out_size + block - 1) / block; + + if (data_format == "NCDHW") { + const int channels = d_in_dims[1]; + const int in_depth = d_in_dims[2]; + const int in_height = d_in_dims[3]; + const int in_width = d_in_dims[4]; + const int out_depth = d_out_dims[2]; + const int out_height = d_out_dims[3]; + const int out_width = d_out_dims[4]; + + if (mode == "reflect") { + Pad3DGradReflectNCDHW<<>>(out_size, + d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + d_out_data); + } else if (mode == "replicate") { + Pad3DGradReplicateNCDHW<<>>(out_size, + d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + d_out_data); + } else if (mode == "circular") { + Pad3DGradCircularNCDHW<<>>(out_size, + d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + d_out_data); + } else { + grid = (in_size + block - 1) / block; + Pad3DGradConstNCDHW<<>>(in_size, + d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + d_out_data); + } + } else { + const int channels = d_in_dims[4]; + const int in_depth = d_in_dims[1]; + const int in_height = d_in_dims[2]; + const int in_width = d_in_dims[3]; + const int out_depth = d_out_dims[1]; + const int out_height = d_out_dims[2]; + const int out_width = d_out_dims[3]; + if (mode == "reflect") { + Pad3DGradReflectNDHWC<<>>(out_size, + d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + d_out_data); + } else if (mode == "replicate") { + Pad3DGradReplicateNDHWC<<>>(out_size, + d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + d_out_data); + } else if (mode == "circular") { + Pad3DGradCircularNDHWC<<>>(out_size, + d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + d_out_data); + } else { + grid = (in_size + block - 1) / block; + Pad3DGradConstNDHWC<<>>(in_size, + d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + d_out_data); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + pad3d_grad, GPU, ALL_LAYOUT, phi::Pad3dGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/pad3d_kernel.cu b/paddle/phi/kernels/gpu/pad3d_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..2cef77cc0eef96d910d1b4f8c1b0ba736034063a --- /dev/null +++ b/paddle/phi/kernels/gpu/pad3d_kernel.cu @@ -0,0 +1,588 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pad3d_kernel.h" + +#include + +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void Pad3DConstNCDHW(const int nthreads, + const T* in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + T value, + T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int nc = index / out_width; + + const int out_w = index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + out_data[index] = + (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || + in_h >= in_height || in_w >= in_width) + ? value + : in_data[nc * in_depth * in_height * in_width + + in_d * in_height * in_width + in_h * in_width + in_w]; + } +} + +template +__global__ void Pad3DConstNDHWC(const int nthreads, + const T* in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + T value, + T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int n = index / channels; + const int c = index % channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + const int in_d = out_d - pad_front; + const int in_h = out_h - pad_top; + const int in_w = out_w - pad_left; + + out_data[index] = + (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || + in_h >= in_height || in_w >= in_width) + ? 
value + : in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c]; + } +} + +template +__global__ void Pad3DReflectNCDHW(const int nthreads, + const T* in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int nc = index / out_width; + + const int out_w = index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = max(in_d, -in_d); // reflect by 0 + in_d = min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth + in_h = max(in_h, -in_h); // reflect by 0 + in_h = min(in_h, 2 * in_height - in_h - 2); // reflect by in_height + in_w = max(in_w, -in_w); // reflect by 0 + in_w = min(in_w, 2 * in_width - in_w - 2); // reflect by in_width + out_data[index] = + in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * + in_width + + in_w]; + } +} + +template +__global__ void Pad3DReflectNDHWC(const int nthreads, + const T* in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int n = index / channels; + const int c = index % channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = max(in_d, -in_d); + in_d = min(in_d, 2 * in_depth - in_d - 2); + in_h = max(in_h, -in_h); + in_h = min(in_h, 2 * in_height - in_h - 2); + in_w = max(in_w, -in_w); + in_w = min(in_w, 2 * in_width - in_w - 2); + + out_data[index] = in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c]; + } +} + +template +__global__ void Pad3DReplicateNCDHW(const int nthreads, + const T* in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int nc = index / out_width; + + const int out_w = index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); + int in_h = min(in_height - 1, max(out_h - pad_top, 0)); + int in_w = min(in_width - 1, max(out_w - pad_left, 0)); + + out_data[index] = + in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * + in_width + + in_w]; + } +} + +template +__global__ void Pad3DReplicateNDHWC(const int nthreads, + const T* in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int 
pad_top, + const int pad_left, + T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int n = index / channels; + const int c = index % channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + + int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); + int in_h = min(in_height - 1, max(out_h - pad_top, 0)); + int in_w = min(in_width - 1, max(out_w - pad_left, 0)); + + out_data[index] = in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c]; + } +} + +template +__global__ void Pad3DCircularNCDHW(const int nthreads, + const T* in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int nc = index / out_width; + + const int out_w = index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + out_data[index] = + in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * + in_width + + in_w]; + } +} + +template +__global__ void Pad3DCircularNDHWC(const int nthreads, + const T* in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int n = index / channels; + const int c = index % channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + out_data[index] = in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c]; + } +} + +template +void Pad3dKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& paddings, + const std::string& mode, + float pad_value, + const std::string& data_format, + DenseTensor* out) { + std::vector pads = paddings.GetData(); + + auto in_dims = x.dims(); + const T* in_data = x.data(); + auto out_dims = out->dims(); + T value = static_cast(pad_value); + + if (data_format == "NCDHW") { + out_dims[0] = in_dims[0]; + out_dims[1] = in_dims[1]; + out_dims[2] = in_dims[2] + pads[4] + pads[5]; + out_dims[3] = in_dims[3] + pads[2] + pads[3]; + out_dims[4] = in_dims[4] + pads[0] + pads[1]; + } else { + out_dims[0] = in_dims[0]; + out_dims[1] = in_dims[1] + pads[4] + pads[5]; + out_dims[2] = in_dims[2] + pads[2] + pads[3]; + out_dims[3] = in_dims[3] + pads[0] + pads[1]; + out_dims[4] = in_dims[4]; + } + out->Resize(out_dims); + T* out_data = dev_ctx.template Alloc(out); + + int channels = in_dims[1]; + int in_depth = in_dims[2]; + int in_height = in_dims[3]; + int 
in_width = in_dims[4]; + int out_depth = out_dims[2]; + int out_height = out_dims[3]; + int out_width = out_dims[4]; + if (data_format == "NDHWC") { + channels = in_dims[4]; + in_depth = in_dims[1]; + in_height = in_dims[2]; + in_width = in_dims[3]; + out_depth = out_dims[1]; + out_height = out_dims[2]; + out_width = out_dims[3]; + } + + if (mode == "reflect") { + PADDLE_ENFORCE_GT( + in_depth, + pads[4], + errors::InvalidArgument("The depth of Input(X)'s dimension should be " + "greater than pad_front" + " in reflect mode" + ", but received depth(%d) and pad_front(%d).", + in_depth, + pads[4])); + PADDLE_ENFORCE_GT( + in_depth, + pads[5], + errors::InvalidArgument("The depth of Input(X)'s dimension should be " + "greater than pad_back" + " in reflect mode" + ", but received depth(%d) and pad_back(%d).", + in_depth, + pads[5])); + + PADDLE_ENFORCE_GT( + in_height, + pads[2], + errors::InvalidArgument("The height of Input(X)'s dimension should be " + "greater than pad_top" + " in reflect mode" + ", but received depth(%d) and pad_top(%d).", + in_height, + pads[2])); + PADDLE_ENFORCE_GT( + in_height, + pads[3], + errors::InvalidArgument("The height of Input(X)'s dimension should be " + "greater than pad_bottom" + " in reflect mode" + ", but received depth(%d) and pad_bottom(%d).", + in_height, + pads[3])); + + PADDLE_ENFORCE_GT( + in_width, + pads[0], + errors::InvalidArgument("The width of Input(X)'s dimension should be " + "greater than pad_left" + " in reflect mode" + ", but received depth(%d) and pad_left(%d).", + in_width, + pads[0])); + PADDLE_ENFORCE_GT( + in_width, + pads[1], + errors::InvalidArgument("The width of Input(X)'s dimension should be " + "greater than pad_right" + " in reflect mode" + ", but received depth(%d) and pad_right(%d).", + in_width, + pads[1])); + } else if (mode == "circular" || mode == "replicate") { + PADDLE_ENFORCE_NE(in_depth * in_height * in_width, + 0, + errors::InvalidArgument( + "The input tensor size can not be 0 for circular " + "or replicate padding mode.")); + } + + const int pad_left = pads[0]; + const int pad_top = pads[2]; + const int pad_front = pads[4]; + const int num = in_dims[0]; + + auto stream = dev_ctx.stream(); + int block = PADDLE_CUDA_NUM_THREADS; + const int out_size = out->numel(); + int grid = (out_size + block - 1) / block; + + if (data_format == "NCDHW") { + if (mode == "reflect") { + Pad3DReflectNCDHW<<>>(out_size, + in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + out_data); + } else if (mode == "replicate") { + Pad3DReplicateNCDHW<<>>(out_size, + in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + out_data); + } else if (mode == "circular") { + Pad3DCircularNCDHW<<>>(out_size, + in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + out_data); + } else { + Pad3DConstNCDHW<<>>(out_size, + in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + value, + out_data); + } + } else { + if (mode == "reflect") { + Pad3DReflectNDHWC<<>>(out_size, + in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + out_data); + } else if (mode == "replicate") { + Pad3DReplicateNDHWC<<>>(out_size, + in_data, + 
num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + out_data); + } else if (mode == "circular") { + Pad3DCircularNDHWC<<>>(out_size, + in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + out_data); + } else { + Pad3DConstNDHWC<<>>(out_size, + in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + value, + out_data); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(pad3d, + GPU, + ALL_LAYOUT, + phi::Pad3dKernel, + phi::dtype::float16, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..2893bd74b1bce691ad9b9e3333e6afbf2a2850fd --- /dev/null +++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu @@ -0,0 +1,1122 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/conv_transpose_grad_kernel.h" + +#include +#include "paddle/phi/backends/dynload/cudnn.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/padding.h" +#include "paddle/phi/kernels/funcs/slice.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/operators/conv_miopen_helper.h" +#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h" +#else +#include "paddle/fluid/operators/conv_cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h" +#endif + +namespace phi { + +using GPUDNNDataLayout = paddle::platform::DataLayout; + +template +void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter) { + const T* filter_data = filter.data(); + std::vector paddings_ = paddings; + std::vector dilations_ = + dilations; // cudnn v5 does not support dilations + const GPUDNNDataLayout data_layout = + (data_format != "NHWC" ? 
GPUDNNDataLayout::kNCHW + : GPUDNNDataLayout::kNHWC); + + // if channel_last, transpose to channel_first + DenseTensor x_transpose; + DenseTensor dout_transpose; + std::vector x_vec = vectorize(x.dims()); + std::vector out_vec = vectorize(dout.dims()); + if (data_layout == GPUDNNDataLayout::kNHWC) { + if (strides.size() == 2U) { + std::vector axis = {0, 3, 1, 2}; + for (size_t i = 0; i < axis.size(); ++i) { + x_vec[i] = x.dims()[axis[i]]; + out_vec[i] = dout.dims()[axis[i]]; + } + x_transpose = Transpose(ctx, x, axis); + dout_transpose = Transpose(ctx, dout, axis); + } else if (strides.size() == 3U) { + std::vector axis = {0, 4, 1, 2, 3}; + for (size_t i = 0; i < axis.size(); ++i) { + x_vec[i] = x.dims()[axis[i]]; + out_vec[i] = dout.dims()[axis[i]]; + } + x_transpose = Transpose(ctx, x, axis); + dout_transpose = Transpose(ctx, dout, axis); + } + } else { + x_transpose = x; + dout_transpose = dout; + } + + // update padding and dilation + auto x_dims = x_transpose.dims(); + auto filter_dims = filter.dims(); + DDim x_data_dims; + x_data_dims = slice_ddim(x_dims, 2, x_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize); + + int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings_, data_dim); + + std::vector x_pad(x_dims.size() * 2, 0); + DenseTensor transformed_dout; + std::vector padding_common(data_dim, 0); + if (!is_sys_pad) { + std::vector padding_diff(data_dim); + std::vector new_dout_shape_vec(data_dim + 2); + new_dout_shape_vec[0] = dout_transpose.dims()[0]; + new_dout_shape_vec[1] = dout_transpose.dims()[1]; + + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings_[2 * i] - paddings_[2 * i + 1]); + padding_common[i] = std::min(paddings_[2 * i], paddings_[2 * i + 1]); + new_dout_shape_vec[i + 2] = + dout_transpose.dims()[i + 2] + padding_diff[i]; + x_pad[2 * i + 4] = paddings_[2 * i] - padding_common[i]; + x_pad[2 * i + 4 + 1] = paddings_[2 * i + 1] - padding_common[i]; + } + + transformed_dout.Resize(make_ddim(new_dout_shape_vec)); + ctx.template Alloc(&transformed_dout); + + const int rank = x_transpose.dims().size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction( + ctx, x_pad, dout_transpose, pad_value, &transformed_dout); + } break; + case 5: { + funcs::PadFunction( + ctx, x_pad, dout_transpose, pad_value, &transformed_dout); + } break; + default: + PADDLE_THROW(errors::InvalidArgument( + "Op(ConvTranspose) only supports 4-D or 5-D x DenseTensor.")); + } + } else { + transformed_dout = dout_transpose; + if (paddings_.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[2 * i]; + } + } + } + + const T* x_data = x_transpose.data(); + const T* dout_data = transformed_dout.data(); + out_vec = vectorize(transformed_dout.dims()); + + // ------------------- cudnn descriptors --------------------- + GPUDNNDataLayout layout; + + if (strides.size() == 2U) { + layout = GPUDNNDataLayout::kNCHW; + } else { + layout = GPUDNNDataLayout::kNCDHW; + } + + int iwo_groups = groups; + int c_groups = 1; +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) + iwo_groups = 1; + c_groups = groups; + groups = 1; +#endif + + auto dtype = paddle::platform::CudnnDataType::type; + + 
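+  // args1 describes the dx path: a forward cuDNN/MIOpen convolution of dout
+  // with filter, written into x-shaped dx. args2 describes the dfilter path: a
+  // backward-filter pass that treats dout as the convolution input and x as
+  // its output. Both share padding_common, strides and dilations_.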
paddle::operators::ConvArgs args1{&transformed_dout, + &filter, + &x_transpose, + strides, + padding_common, + dilations_, + dtype}; + paddle::operators::ConvArgs args2{&transformed_dout, + &filter, + &x_transpose, + strides, + padding_common, + dilations_, + dtype}; + +#ifdef PADDLE_WITH_HIP + miopenConvFwdAlgorithm_t data_algo{}; + miopenConvBwdWeightsAlgorithm_t filter_algo{}; +#else + cudnnConvolutionFwdAlgo_t data_algo{}; + cudnnConvolutionBwdFilterAlgo_t filter_algo{}; +#endif + + auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout); + size_t workspace_size = 0; + auto handle = ctx.cudnn_handle(); + bool deterministic = FLAGS_cudnn_deterministic; + T* dx_data = nullptr; + T* dfilter_data = nullptr; + + if (dx) { + dx_data = ctx.template Alloc(dx); + args1.handle = handle; + args1.idesc.set(transformed_dout, iwo_groups); + args1.wdesc.set(filter, layout_tensor, iwo_groups); + args1.odesc.set(x_transpose, iwo_groups); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations_, + paddle::platform::AllowTF32Cudnn(), + c_groups); +#ifdef PADDLE_WITH_HIP + using search1 = + paddle::operators::SearchAlgorithm; + workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); + data_algo = + search1::Find(args1, false, deterministic, workspace_size, ctx); +#else + using search1 = + paddle::operators::SearchAlgorithm; + data_algo = search1::Find(args1, false, deterministic, ctx); + workspace_size = + std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); +#endif + } + + if (dfilter) { + dfilter_data = ctx.template Alloc(dfilter); + args2.handle = handle; + args2.idesc.set(transformed_dout, iwo_groups); + args2.wdesc.set(*dfilter, layout_tensor, iwo_groups); + args2.odesc.set(x_transpose, iwo_groups); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations_, + paddle::platform::AllowTF32Cudnn(), + c_groups); +#ifdef PADDLE_WITH_HIP + using search2 = + paddle::operators::SearchAlgorithm; + workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); + filter_algo = + search2::Find(args2, false, deterministic, workspace_size, ctx); +#else + using search2 = + paddle::operators::SearchAlgorithm; + filter_algo = search2::Find(args2, false, deterministic, ctx); + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2, filter_algo)); +#endif + } + + // ------------------- cudnn conv backward data --------------------- + // FIxME(typhoonzero): template type T may not be the same as cudnn call. + int x_offset = x.numel() / x.dims()[0] / groups; + int dout_offset = + transformed_dout.numel() / transformed_dout.dims()[0] / groups; + int filter_offset = filter.numel() / groups; + paddle::operators::ScalingParamType alpha = 1.0f; + paddle::operators::ScalingParamType beta = 0.0f; + auto workspace_handle = ctx.cudnn_workspace_handle(); + if (dx) { + // Because beta is zero, it is unnecessary to reset dx. 
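+    // Run the convolution once per group: the x / dout pointers advance by
+    // numel / batch / groups (x_offset, dout_offset) and the filter pointer by
+    // numel / groups (filter_offset); miopenConvolutionForward on HIP,
+    // cudnnConvolutionForward otherwise.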
+ for (int g = 0; g < groups; g++) { +#ifdef PADDLE_WITH_HIP + auto cudnn_func = [&](void* cudnn_workspace) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenConvolutionForward(handle, + &alpha, + args1.idesc.desc(), + dout_data + dout_offset * g, + args1.wdesc.desc(), + filter_data + filter_offset * g, + args1.cdesc.desc(), + data_algo, + &beta, + args1.odesc.desc(), + dx_data + x_offset * g, + cudnn_workspace, + workspace_size)); + }; +#else // PADDLE_WITH_HIP + auto cudnn_func = [&](void* cudnn_workspace) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cudnnConvolutionForward(handle, + &alpha, + args1.idesc.desc(), + dout_data + dout_offset * g, + args1.wdesc.desc(), + filter_data + filter_offset * g, + args1.cdesc.desc(), + data_algo, + cudnn_workspace, + workspace_size, + &beta, + args1.odesc.desc(), + dx_data + x_offset * g)); + }; +#endif // PADDLE_WITH_HIP + workspace_handle.RunFunc(cudnn_func, workspace_size); + } + + if (data_layout == GPUDNNDataLayout::kNHWC) { + DenseTensor dx_transpose; + DenseTensor dx_nchw; + dx_nchw.ShareDataWith(*dx); + dx_nchw.Resize(make_ddim(x_vec)); + if (strides.size() == 2U) { + std::vector axis = {0, 2, 3, 1}; + dx_transpose = Transpose(ctx, dx_nchw, axis); + *dx = dx_transpose; + } else if (strides.size() == 3U) { + std::vector axis = {0, 2, 3, 4, 1}; + dx_transpose = Transpose(ctx, dx_nchw, axis); + *dx = dx_transpose; + } + } + } + + // ------------------- cudnn conv backward filter --------------------- + if (dfilter) { + // Because beta is zero, it is unnecessary to reset dfilter. + // Gradient with respect to the filter + for (int g = 0; g < groups; g++) { +#ifdef PADDLE_WITH_HIP + auto cudnn_func = [&](void* cudnn_workspace) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardWeights( + handle, + &alpha, + args2.odesc.desc(), + x_data + x_offset * g, + args2.idesc.desc(), + dout_data + dout_offset * g, + args2.cdesc.desc(), + filter_algo, + &beta, + args2.wdesc.desc(), + dfilter_data + filter_offset * g, + cudnn_workspace, + workspace_size)); + }; +#else // PADDLE_WITH_HIP + auto cudnn_func = [&](void* cudnn_workspace) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardFilter( + handle, + &alpha, + args2.idesc.desc(), + dout_data + dout_offset * g, + args2.odesc.desc(), + x_data + x_offset * g, + args2.cdesc.desc(), + filter_algo, + cudnn_workspace, + workspace_size, + &beta, + args2.wdesc.desc(), + dfilter_data + filter_offset * g)); + }; +#endif // PADDLE_WITH_HIP + workspace_handle.RunFunc(cudnn_func, workspace_size); + } + } +} + +template +void Conv2dTransposeGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings_, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter) { + ConvTransposeGradRawGPUDNNKernel(ctx, + x, + filter, + dout, + strides, + paddings_, + padding_algorithm, + groups, + dilations_, + data_format, + dx, + dfilter); +} + +/* + * Inputs: I, filter, dout, ddI, ddfilter + * Outputs: ddout, dfilter, dI + * ddo = conv_bp_data(filter, ddI) + conv_bp_data(ddfilter, I) + * dfilter = conv_bp_filter(dout, ddI) + * dI = conv(dout, ddfilter) + */ +template +void Conv2dTransposeDoubleGradGPUDNNKernel( + const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const 
DenseTensor& ddx, + const DenseTensor& ddfilter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter, + DenseTensor* ddout) { + if (dx) { + ctx.template Alloc(dx); + } + if (dfilter) { + ctx.template Alloc(dfilter); + } + if (ddout) { + ctx.template Alloc(ddout); + funcs::SetConstant set_zero; + set_zero(ctx, ddout, static_cast(0)); + } + + const T* filter_ = filter.data(); + const T* dout_ = dout.data(); + const T* ddx_ = nullptr; + const T* ddfilter_ = nullptr; + T* dx_ = nullptr; + T* dfilter_ = nullptr; + T* ddout_ = nullptr; + T* transformed_dx_ = nullptr; + + std::vector paddings_ = paddings; + std::vector dilations_ = dilations; + + bool deterministic = FLAGS_cudnn_deterministic; + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // transform DenseTensors to channel first----------- + DenseTensor transformed_x_channel(x.type()); + DenseTensor transformed_dout_channel(dout.type()); + DenseTensor transformed_ddx_channel(x.type()); + + DenseTensor transformed_dx_channel(x.type()); + DenseTensor transformed_ddout_channel(dout.type()); + + if (channel_last) { + ResizeToChannelFirst(ctx, &x, &transformed_x_channel); + TransToChannelFirst(ctx, &x, &transformed_x_channel); + + ResizeToChannelFirst(ctx, &dout, &transformed_dout_channel); + TransToChannelFirst(ctx, &dout, &transformed_dout_channel); + + ResizeToChannelFirst(ctx, &ddx, &transformed_ddx_channel); + TransToChannelFirst(ctx, &ddx, &transformed_ddx_channel); + + if (dx) { + ResizeToChannelFirst(ctx, dx, &transformed_dx_channel); + ctx.template Alloc(&transformed_dx_channel); + } + if (ddout) { + ResizeToChannelFirst(ctx, ddout, &transformed_ddout_channel); + } + } else { + transformed_x_channel = x; + transformed_dout_channel = dout; + transformed_ddx_channel = ddx; + + if (dx) { + transformed_dx_channel = *dx; + } + } + std::vector out_vec = vectorize(transformed_dout_channel.dims()); + + auto x_dims = transformed_x_channel.dims(); + auto filter_dims = filter.dims(); + DDim x_data_dims = slice_ddim(x_dims, 2, x_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize); + + int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings_, data_dim); + DenseTensor transformed_x(x.type()); + DenseTensor transformed_ddx(x.type()); + + DenseTensor transformed_dout(dout.type()); + + std::vector padding_common(data_dim, 0); + std::vector input_pad(x.dims().size() * 2, 0); + + if (!is_sys_pad) { + // get pad + std::vector padding_diff(data_dim); + std::vector new_input_shape_vec(data_dim + 2); + std::vector new_output_grad_shape_vec(data_dim + 2); + + new_input_shape_vec[0] = transformed_x_channel.dims()[0]; + new_input_shape_vec[1] = transformed_x_channel.dims()[1]; + + new_output_grad_shape_vec[0] = transformed_dout_channel.dims()[0]; + new_output_grad_shape_vec[1] = transformed_dout_channel.dims()[1]; + + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings_[2 * i] - paddings_[2 * i + 1]); + padding_common[i] = std::min(paddings_[2 * i], paddings_[2 * i + 1]); + new_input_shape_vec[i + 2] = + transformed_x_channel.dims()[i + 2] 
+ padding_diff[i]; + + new_output_grad_shape_vec[i + 2] = + transformed_dout_channel.dims()[i + 2] + padding_diff[i]; + + input_pad[2 * i + 4] = paddings_[2 * i] - padding_common[i]; + input_pad[2 * i + 4 + 1] = paddings_[2 * i + 1] - padding_common[i]; + } + DDim new_input_shape(make_ddim(new_input_shape_vec)); + transformed_x.Resize(new_input_shape); + transformed_ddx.Resize(new_input_shape); + transformed_dout.Resize(make_ddim(new_output_grad_shape_vec)); + + ctx.template Alloc(&transformed_x); + ctx.template Alloc(&transformed_ddx); + ctx.template Alloc(&transformed_dout); + + // pad for input + const int rank = x.dims().size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction( + ctx, input_pad, transformed_x_channel, pad_value, &transformed_x); + funcs::PadFunction(ctx, + input_pad, + transformed_dout_channel, + pad_value, + &transformed_dout); + funcs::PadFunction(ctx, + input_pad, + transformed_ddx_channel, + pad_value, + &transformed_ddx); + } break; + case 5: { + funcs::PadFunction( + ctx, input_pad, transformed_x_channel, pad_value, &transformed_x); + funcs::PadFunction(ctx, + input_pad, + transformed_ddx_channel, + pad_value, + &transformed_ddx); + } break; + default: + PADDLE_THROW(errors::InvalidArgument( + "ConvOp only support tensors with 4 or 5 dimensions.")); + } + } else { + transformed_x = transformed_x_channel; + transformed_dout = transformed_dout_channel; + transformed_ddx = transformed_ddx_channel; + + if (paddings_.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[2 * i]; + } + } + } + + std::vector starts(data_dim, 0); + std::vector ends(data_dim, 0); + std::vector axes(data_dim, 0); + for (size_t i = 0; i < data_dim; ++i) { + starts[i] = input_pad[2 * i + 4] * (strides[i] + 1); + ends[i] = starts[i] + out_vec[i + 2]; + axes[i] = i + 2; + } + + std::vector transformed_out_vec = out_vec; + for (size_t i = 0; i < data_dim; ++i) { + transformed_out_vec[i + 2] = + out_vec[i + 2] + + (input_pad[2 * i + 4] + input_pad[2 * i + 5]) * strides[i] - + 2 * padding_common[i] + paddings_[2 * i] + paddings_[2 * i + 1]; + } + + if (!is_sys_pad) { + transformed_ddout_channel.Resize(make_ddim(transformed_out_vec)); + ctx.template Alloc(&transformed_ddout_channel); + } else { + ctx.template Alloc(ddout); + transformed_ddout_channel = *ddout; + transformed_ddout_channel.Resize(make_ddim(transformed_out_vec)); + } + + const T* x_ = transformed_x.data(); + + int iwo_group = groups; + int c_group = 1; +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) + iwo_group = 1; + c_group = groups; + groups = 1; +#endif + auto dtype = paddle::platform::CudnnDataType::type; + + auto handle = ctx.cudnn_handle(); + + paddle::operators::ConvArgs args1{&transformed_ddout_channel, + &filter, + &transformed_ddx, + strides, + padding_common, + dilations_, + dtype}; + paddle::operators::ConvArgs args2{&transformed_ddout_channel, + &ddfilter, + &transformed_x, + strides, + padding_common, + dilations_, + dtype}; + + paddle::operators::ConvArgs args3{&transformed_dout, + dfilter, + &transformed_ddx_channel, + strides, + padding_common, + dilations_, + dtype}; + paddle::operators::ConvArgs args4{&transformed_dout, + &ddfilter, + &transformed_dx_channel, + strides, + padding_common, + dilations_, + dtype}; +#ifdef PADDLE_WITH_HIP + miopenConvBwdDataAlgorithm_t bwd_algo1 = + static_cast(0); + miopenConvBwdDataAlgorithm_t bwd_algo2 = + 
static_cast(0); + miopenConvFwdAlgorithm_t data_algo = static_cast(0); + miopenConvBwdWeightsAlgorithm_t filter_algo = + static_cast(0); +#else + cudnnConvolutionBwdDataAlgo_t bwd_algo1 = + static_cast(0); + cudnnConvolutionBwdDataAlgo_t bwd_algo2 = + static_cast(0); + cudnnConvolutionFwdAlgo_t data_algo = + static_cast(0); + cudnnConvolutionBwdFilterAlgo_t filter_algo = + static_cast(0); +#endif + + auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW); + + // ddo = conv(ddI, filter) + conv(I, ddfilter) + size_t workspace_size = 0; + + T* transformed_ddout_channel_ = nullptr; + + if (ddout) { + ddout_ = ddout->data(); + transformed_ddout_channel_ = transformed_ddout_channel.data(); + + args1.handle = handle; + args1.idesc.set(transformed_ddout_channel, iwo_group); + args1.wdesc.set(filter, layout, iwo_group); + args1.odesc.set(transformed_ddx, iwo_group); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations_, + paddle::platform::AllowTF32Cudnn(), + c_group); +#ifdef PADDLE_WITH_HIP + using search1 = + paddle::operators::SearchAlgorithm; + workspace_size = search1::GetWorkspaceSize(args1); + bwd_algo1 = + search1::Find(args1, false, deterministic, workspace_size, ctx); +#else + using search1 = + paddle::operators::SearchAlgorithm; + bwd_algo1 = search1::Find(args1, false, deterministic, ctx); + workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1); +#endif + + ddfilter_ = ddfilter.data(); + args2.handle = handle; + args2.idesc.set(transformed_ddout_channel, iwo_group); + args2.wdesc.set(ddfilter, layout, iwo_group); + args2.odesc.set(transformed_x, iwo_group); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations_, + paddle::platform::AllowTF32Cudnn(), + c_group); +#ifdef PADDLE_WITH_HIP + using search2 = + paddle::operators::SearchAlgorithm; + workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); + bwd_algo2 = + search2::Find(args2, false, deterministic, workspace_size, ctx); +#else + using search2 = + paddle::operators::SearchAlgorithm; + bwd_algo2 = search2::Find(args2, false, deterministic, ctx); + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2, bwd_algo2)); +#endif + } + + if (dfilter) { + dfilter_ = dfilter->data(); + args3.handle = handle; + args3.idesc.set(transformed_dout, iwo_group); + args3.wdesc.set(*dfilter, layout, iwo_group); + + args3.odesc.set(transformed_ddx_channel, iwo_group); + + args3.cdesc.set(dtype, + padding_common, + strides, + dilations_, + paddle::platform::AllowTF32Cudnn(), + c_group); +#ifdef PADDLE_WITH_HIP + using search3 = + paddle::operators::SearchAlgorithm; + workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); + filter_algo = + search3::Find(args3, false, deterministic, workspace_size, ctx); +#else + using search3 = + paddle::operators::SearchAlgorithm; + filter_algo = search3::Find(args3, false, deterministic, ctx); + workspace_size = + std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); +#endif + } + + if (dx) { + transformed_dx_ = transformed_dx_channel.data(); + + args4.handle = handle; + args4.idesc.set(transformed_dout, iwo_group); + args4.wdesc.set(ddfilter, layout, iwo_group); + args4.odesc.set(transformed_dx_channel, iwo_group); + args4.cdesc.set(dtype, + padding_common, + strides, + dilations_, + paddle::platform::AllowTF32Cudnn(), + c_group); +#ifdef PADDLE_WITH_HIP + using search4 = + paddle::operators::SearchAlgorithm; + workspace_size = std::max(workspace_size, 
search4::GetWorkspaceSize(args4)); + data_algo = + search4::Find(args4, false, deterministic, workspace_size, ctx); +#else + using search4 = + paddle::operators::SearchAlgorithm; + data_algo = search4::Find(args4, false, deterministic, ctx); + workspace_size = + std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); +#endif + } + + int i_n, i_c, i_d, i_h, i_w; + paddle::operators::GetNCDHW(transformed_x.dims(), + GPUDNNDataLayout::kNCHW, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + + int o_n, o_c, o_d, o_h, o_w; + paddle::operators::GetNCDHW(transformed_dout.dims(), + GPUDNNDataLayout::kNCHW, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + + int group_offset_in = + transformed_x.numel() / transformed_x.dims()[0] / groups; + int group_offset_out = + transformed_dout.numel() / transformed_dout.dims()[0] / groups; + int group_offset_filter = filter.numel() / groups; + + paddle::operators::ScalingParamType alpha = 1.0f; + paddle::operators::ScalingParamType beta = 0.0f; + + auto wkspace_handle = ctx.cudnn_workspace_handle(); + + if (ddout) { + ddx_ = transformed_ddx.data(); + for (int i = 0; i < groups; i++) { +#ifdef PADDLE_WITH_HIP + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args1.odesc.desc(), + ddx_ + i * group_offset_in, + args1.wdesc.desc(), + filter_ + i * group_offset_filter, + args1.cdesc.desc(), + bwd_algo1, + &beta, + args1.idesc.desc(), + transformed_ddout_channel_ + i * group_offset_out, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else // PADDLE_WITH_HIP + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData( + handle, + &alpha, + args1.wdesc.desc(), + filter_ + i * group_offset_filter, + args1.odesc.desc(), + ddx_ + i * group_offset_in, + args1.cdesc.desc(), + bwd_algo1, + workspace_ptr, + workspace_size, + &beta, + args1.idesc.desc(), + transformed_ddout_channel_ + i * group_offset_out)); + }, + workspace_size); +#endif // PADDLE_WITH_HIP + } + + for (int i = 0; i < groups; i++) { +#ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + DenseTensor conv_x_ddfilter(dout.type()); + conv_x_ddfilter.Resize(transformed_ddout_channel.dims()); + T* conv_x_ddfilter_data = ctx.template Alloc(&conv_x_ddfilter); + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args2.odesc.desc(), + x_ + i * group_offset_in, + args2.wdesc.desc(), + ddfilter_ + i * group_offset_filter, + args2.cdesc.desc(), + bwd_algo2, + &beta, + args2.idesc.desc(), + conv_x_ddfilter_data + i * group_offset_out, + workspace_ptr, + workspace_size)); + }, + workspace_size); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenOpTensor( + handle, + miopenTensorOpAdd, + &alpha, + args2.idesc.desc(), + transformed_ddout_channel_ + i * group_offset_out, + &alpha, + args2.idesc.desc(), + conv_x_ddfilter_data + i * group_offset_out, + &beta, + args2.idesc.desc(), + transformed_ddout_channel_ + i * group_offset_out)); +#else // PADDLE_WITH_HIP + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData( + handle, + &alpha, + args2.wdesc.desc(), + ddfilter_ + i * group_offset_filter, + args2.odesc.desc(), + x_ + i * group_offset_in, + args2.cdesc.desc(), + bwd_algo2, + workspace_ptr, + workspace_size, + &alpha, + args2.idesc.desc(), + transformed_ddout_channel_ + i * 
group_offset_out)); + }, + workspace_size); +#endif // PADDLE_WITH_HIP + } + + if ((!is_sys_pad) && (!channel_last)) { + if (strides.size() == 2U) { + funcs::Slice( + ctx, &transformed_ddout_channel, ddout, starts, ends, axes); + } else if (!is_sys_pad && strides.size() == 3U) { + funcs::Slice( + ctx, &transformed_ddout_channel, ddout, starts, ends, axes); + } + } else if ((!is_sys_pad) && (channel_last)) { + if (strides.size() == 2U) { + funcs::Slice(ctx, + &transformed_ddout_channel, + &transformed_ddout_channel, + starts, + ends, + axes); + } else if (!is_sys_pad && strides.size() == 3U) { + funcs::Slice(ctx, + &transformed_ddout_channel, + &transformed_ddout_channel, + starts, + ends, + axes); + } + + TransToChannelLast(ctx, &transformed_ddout_channel, ddout); + } + } + + T* transformed_dout_channel_ = transformed_dout.data(); + if (dfilter) { + ddx_ = transformed_ddx_channel.data(); + for (int i = 0; i < groups; i++) { +#ifdef PADDLE_WITH_HIP + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenConvolutionBackwardWeights( + handle, + &alpha, + args3.odesc.desc(), + ddx_ + i * group_offset_in, + args3.idesc.desc(), + transformed_dout_channel_ + i * group_offset_out, + args3.cdesc.desc(), + filter_algo, + &beta, + args3.wdesc.desc(), + dfilter_ + i * group_offset_filter, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else // PADDLE_WITH_HIP + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardFilter( + handle, + &alpha, + args3.idesc.desc(), + transformed_dout_channel_ + i * group_offset_out, + args3.odesc.desc(), + ddx_ + i * group_offset_in, + args3.cdesc.desc(), + filter_algo, + workspace_ptr, + workspace_size, + &beta, + args3.wdesc.desc(), + dfilter_ + i * group_offset_filter)); + }, + workspace_size); +#endif // PADDLE_WITH_HIP + } + } + + if (dx) { + ddfilter_ = ddfilter.data(); + for (int i = 0; i < groups; i++) { +#ifdef PADDLE_WITH_HIP + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionForward( + handle, + &alpha, + args4.idesc.desc(), + transformed_dout_channel_ + i * group_offset_out, + args4.wdesc.desc(), + ddfilter_ + i * group_offset_filter, + args4.cdesc.desc(), + data_algo, + &beta, + args4.odesc.desc(), + transformed_dx_ + i * group_offset_in, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else // PADDLE_WITH_HIP + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionForward( + handle, + &alpha, + args4.idesc.desc(), + transformed_dout_channel_ + i * group_offset_out, + args4.wdesc.desc(), + ddfilter_ + i * group_offset_filter, + args4.cdesc.desc(), + data_algo, + workspace_ptr, + workspace_size, + &beta, + args4.odesc.desc(), + transformed_dx_ + i * group_offset_in)); + }, + workspace_size); +#endif // PADDLE_WITH_HIP + } + if (channel_last) { + TransToChannelLast(ctx, &transformed_dx_channel, dx); + } + } +} + +template +void Conv3dTransposeGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings_, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter) { + ConvTransposeGradRawGPUDNNKernel(ctx, + x, + filter, + dout, + 
strides, + paddings_, + padding_algorithm, + groups, + dilations_, + data_format, + dx, + dfilter); +} + +} // namespace phi + +using float16 = phi::dtype::float16; + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL(conv2d_transpose_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv2dTransposeGradGPUDNNKernel, + float, + float16) {} +PD_REGISTER_KERNEL(conv2d_transpose_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv2dTransposeDoubleGradGPUDNNKernel, + float, + float16) {} +PD_REGISTER_KERNEL(conv3d_transpose_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3dTransposeGradGPUDNNKernel, + float, + float16) {} +#else +PD_REGISTER_KERNEL(conv2d_transpose_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv2dTransposeGradGPUDNNKernel, + float, + double, + float16) {} +PD_REGISTER_KERNEL(conv2d_transpose_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv2dTransposeDoubleGradGPUDNNKernel, + float, + double, + float16) {} +PD_REGISTER_KERNEL(conv3d_transpose_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3dTransposeGradGPUDNNKernel, + float, + double, + float16) {} +#endif diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..5de2df4a70c88e5ead803493555438ae675cf45e --- /dev/null +++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu @@ -0,0 +1,381 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/conv_transpose_kernel.h" + +#include +#include "paddle/phi/backends/dynload/cudnn.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/padding.h" +#include "paddle/phi/kernels/funcs/slice.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/operators/conv_miopen_helper.h" +#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h" +#else +#include "paddle/fluid/operators/conv_cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h" +#endif + +namespace phi { + +using GPUDNNDataLayout = paddle::platform::DataLayout; + +template +void ConvTransposeRawGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* out) { + std::vector paddings_ = paddings; + std::vector dilations_ = + dilations; // cudnn v5 does not support dilations + const T* filter_data = filter.data(); + const GPUDNNDataLayout data_layout = + (data_format != "NHWC" ? 
GPUDNNDataLayout::kNCHW + : GPUDNNDataLayout::kNHWC); + std::vector x_vec = vectorize(x.dims()); + std::vector out_vec = vectorize(out->dims()); + // if channel_last, transpose to channel_first + DenseTensor x_transpose; + if (data_layout == GPUDNNDataLayout::kNHWC) { + if (strides.size() == 2U) { + std::vector axis = {0, 3, 1, 2}; + for (size_t i = 0; i < axis.size(); ++i) { + x_vec[i] = x.dims()[axis[i]]; + out_vec[i] = out->dims()[axis[i]]; + } + x_transpose = Transpose(ctx, x, axis); + } else if (strides.size() == 3U) { + std::vector axis = {0, 4, 1, 2, 3}; + for (size_t i = 0; i < axis.size(); ++i) { + x_vec[i] = x.dims()[axis[i]]; + out_vec[i] = out->dims()[axis[i]]; + } + x_transpose = Transpose(ctx, x, axis); + } + } else { + x_transpose = x; + } + + // update padding and dilation + auto x_dims = x_transpose.dims(); + auto filter_dims = filter.dims(); + DDim x_data_dims; + x_data_dims = slice_ddim(x_dims, 2, x_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize); + + int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings_, data_dim); + + std::vector x_pad(x_dims.size() * 2, 0); + DenseTensor transformed_x; + std::vector padding_common(data_dim, 0); + if (!is_sys_pad) { + std::vector padding_diff(data_dim); + std::vector new_x_shape_vec(data_dim + 2); + new_x_shape_vec[0] = x_dims[0]; + new_x_shape_vec[1] = x_dims[1]; + + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings_[2 * i] - paddings_[2 * i + 1]); + padding_common[i] = std::min(paddings_[2 * i], paddings_[2 * i + 1]); + new_x_shape_vec[i + 2] = x_dims[i + 2] + padding_diff[i]; + x_pad[2 * i + 4] = paddings_[2 * i] - padding_common[i]; + x_pad[2 * i + 4 + 1] = paddings_[2 * i + 1] - padding_common[i]; + } + DDim new_x_shape(make_ddim(new_x_shape_vec)); + transformed_x.Resize(new_x_shape); + ctx.template Alloc(&transformed_x); + + const int rank = x_dims.size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction( + ctx, x_pad, x_transpose, pad_value, &transformed_x); + } break; + case 5: { + funcs::PadFunction( + ctx, x_pad, x_transpose, pad_value, &transformed_x); + } break; + default: + PADDLE_THROW(errors::InvalidArgument( + "Op(ConvTranspose) only supports 4-D or 5-D x DenseTensor.")); + } + } else { + transformed_x = x_transpose; + if (paddings_.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[2 * i]; + } + } + } + + std::vector starts(data_dim, 0); + std::vector ends(data_dim, 0); + std::vector axes(data_dim, 0); + for (size_t i = 0; i < data_dim; ++i) { + starts[i] = x_pad[2 * i + 4] * (strides[i] + 1); + ends[i] = starts[i] + out_vec[i + 2]; + axes[i] = i + 2; + } + + const T* x_data = transformed_x.data(); + x_vec = vectorize(transformed_x.dims()); + + std::vector transformed_out_vec = out_vec; + for (size_t i = 0; i < data_dim; ++i) { + transformed_out_vec[i + 2] = + out_vec[i + 2] + (x_pad[2 * i + 4] + x_pad[2 * i + 5]) * strides[i] - + 2 * padding_common[i] + paddings_[2 * i] + paddings_[2 * i + 1]; + } + + DenseTensor transformed_out; + if (!is_sys_pad) { + transformed_out.Resize(make_ddim(transformed_out_vec)); + ctx.template Alloc(&transformed_out); + } else { + ctx.template Alloc(out); + 
transformed_out.ShareDataWith(*out); + transformed_out.Resize(make_ddim(transformed_out_vec)); + } + T* transformed_out_data = transformed_out.data(); + + GPUDNNDataLayout layout; + + int iwo_groups = groups; + int c_groups = 1; +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) + iwo_groups = 1; + c_groups = groups; + groups = 1; +#endif + + if (strides.size() == 2U) { + layout = GPUDNNDataLayout::kNCHW; + } else { + layout = GPUDNNDataLayout::kNCDHW; + } + + size_t workspace_size = 0; +#ifdef PADDLE_WITH_HIP + miopenConvBwdDataAlgorithm_t algo{}; +#else + cudnnConvolutionBwdDataAlgo_t algo{}; +#endif + // ------------------- cudnn conv algorithm --------------------- + auto handle = ctx.cudnn_handle(); + auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout); + bool deterministic = FLAGS_cudnn_deterministic; + + auto dtype = paddle::platform::CudnnDataType::type; + // ------------------- cudnn descriptors --------------------- + paddle::operators::ConvArgs args{&transformed_out, + &filter, + &transformed_x, + strides, + padding_common, + dilations_, + dtype}; + args.handle = handle; + args.idesc.set(transformed_out, iwo_groups); + args.wdesc.set(filter, layout_tensor, iwo_groups); + args.odesc.set(transformed_x, iwo_groups); + args.cdesc.set(dtype, + padding_common, + strides, + dilations_, + paddle::platform::AllowTF32Cudnn(), + c_groups); + +#ifdef PADDLE_WITH_HIP + using search = + paddle::operators::SearchAlgorithm; + workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); + algo = search::Find(args, false, deterministic, workspace_size, ctx); +#else + using search = + paddle::operators::SearchAlgorithm; + algo = search::Find(args, false, deterministic, ctx); + workspace_size = + std::max(workspace_size, search::GetWorkspaceSize(args, algo)); +#endif + + // ------------------- cudnn conv transpose forward --------------------- + int x_offset = transformed_x.numel() / transformed_x.dims()[0] / groups; + int out_offset = transformed_out.numel() / transformed_out.dims()[0] / groups; + int filter_offset = filter.numel() / groups; + paddle::operators::ScalingParamType alpha = 1.0f; + paddle::operators::ScalingParamType beta = 0.0f; + auto workspace_handle = ctx.cudnn_workspace_handle(); + for (int g = 0; g < groups; g++) { +#ifdef PADDLE_WITH_HIP + auto cudnn_func = [&](void* cudnn_workspace) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args.odesc.desc(), + x_data + x_offset * g, + args.wdesc.desc(), + filter_data + filter_offset * g, + args.cdesc.desc(), + algo, + &beta, + args.idesc.desc(), + transformed_out_data + out_offset * g, + cudnn_workspace, + workspace_size)); + }; +#else // PADDLE_WITH_HIP + auto cudnn_func = [&](void* cudnn_workspace) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData( + handle, + &alpha, + args.wdesc.desc(), + filter_data + filter_offset * g, + args.odesc.desc(), + x_data + x_offset * g, + args.cdesc.desc(), + algo, + cudnn_workspace, + workspace_size, + &beta, + args.idesc.desc(), + transformed_out_data + out_offset * g)); + }; +#endif // PADDLE_WITH_HIP + workspace_handle.RunFunc(cudnn_func, workspace_size); + } + if (!is_sys_pad && strides.size() == 2U) { + funcs::Slice(ctx, &transformed_out, out, starts, ends, axes); + } else if (!is_sys_pad && strides.size() == 3U) { + funcs::Slice(ctx, &transformed_out, out, starts, ends, axes); + } + + if (data_layout == GPUDNNDataLayout::kNHWC) { + DenseTensor out_transpose; + DenseTensor out_nchw; + 
out_nchw.ShareDataWith(*out); + out_nchw.Resize(make_ddim(out_vec)); + + if (strides.size() == 2U) { + out_transpose = Transpose(ctx, out_nchw, {0, 2, 3, 1}); + } else if (strides.size() == 3U) { + out_transpose = Transpose(ctx, out_nchw, {0, 2, 3, 4, 1}); + } + *out = out_transpose; + } +} + +template +void Conv2dTransposeGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* out) { + ConvTransposeRawGPUDNNKernel(ctx, + x, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + out); +} + +template +void Conv3dTransposeGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* out) { + ConvTransposeRawGPUDNNKernel(ctx, + x, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + out); +} + +} // namespace phi + +using float16 = phi::dtype::float16; + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL(conv2d_transpose, + GPUDNN, + ALL_LAYOUT, + phi::Conv2dTransposeGPUDNNKernel, + float, + float16) {} +PD_REGISTER_KERNEL(conv3d_transpose, + GPUDNN, + ALL_LAYOUT, + phi::Conv3dTransposeGPUDNNKernel, + float, + float16) {} +#else +PD_REGISTER_KERNEL(conv2d_transpose, + GPUDNN, + ALL_LAYOUT, + phi::Conv2dTransposeGPUDNNKernel, + float, + double, + float16) {} +PD_REGISTER_KERNEL(conv3d_transpose, + GPUDNN, + ALL_LAYOUT, + phi::Conv3dTransposeGPUDNNKernel, + float, + double, + float16) {} +#endif diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..d4fd952a670012900b2152a9236c5c6a2861c6c7 --- /dev/null +++ b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h @@ -0,0 +1,364 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/kernels/conv_transpose_grad_kernel.h" + +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" +#include "paddle/phi/kernels/funcs/slice.h" + +namespace phi { + +template +void ConvTransposeGradRawKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_format); + // For filter, we do not use const pointer because we will do reshape, + // but we should avoid modifying its value. + DenseTensor filter_ = filter; + + if ((!dx) && (!dfilter)) { + return; + } + + std::vector paddings_ = paddings; + std::vector dilations_ = dilations; + + auto x_dims = x.dims(); + auto filter_dims = filter_.dims(); + auto dout_dims = dout.dims(); + const int batch_size = static_cast(x.dims()[0]); + + DDim in_data_dims; + if (data_layout != DataLayout::kNHWC) { + in_data_dims = slice_ddim(x_dims, 2, x_dims.size()); + } else { + in_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); + } + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize); + + // x_shape_vec: {n, c, h, w} or {n, c, d, h, w} for channel_first + // x_shape_vec: {n, h, w, c} or {n, d, h, w, c} for channel_last + std::vector x_shape_vec = vectorize(x.dims()); + // filter_shape_vec: {i_c, o_c, k_h, k_w} or {i_c, o_c, k_d, k_h, k_w} + std::vector filter_shape_vec = vectorize(filter_.dims()); + + // use col_shape in the im2col and col2im (or vol2col and col2vol) + // calculation + // col_shape_vec: {o_c, k_h, k_w, h, w} or {o_c, k_d, k_h, k_w, d, h, w} for + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + if (data_layout != DataLayout::kNHWC) { + col_shape_vec[0] = dout_dims[1]; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = x_shape_vec[j + 2]; + } + } else { + col_shape_vec[0] = dout_dims[dout_dims.size() - 1]; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = x_shape_vec[j + 1]; + } + } + DDim col_shape(make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (o_c * k_h * k_w, h * w) or (o_c * k_d * k_h * k_w, d * h * w) + DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1); + + // output size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first + // output size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last + DDim output_shape = slice_ddim(dout.dims(), 1, dout.dims().size()); + + // x matrix size: (i_c, h * w) or (i_c, d * h * w) for channel_first + // x matrix size: (h * w, i_c) or (d * h * w, i_c) for channel_last + DDim x_matrix_shape; + if (data_layout != DataLayout::kNHWC) { + x_matrix_shape = {x_dims[1], col_matrix_shape[1]}; + } else { + 
x_matrix_shape = {col_matrix_shape[1], x_dims[x_dims.size() - 1]}; + } + + // filter size: (i_c, o_c/g * k_h * k_w) or (i_c, o_c/g * k_d * k_h * k_w) + DDim filter_matrix_shape; + if (data_layout != DataLayout::kNHWC) { + filter_matrix_shape = {x_dims[1], col_matrix_shape[0] / groups}; + } else { + filter_matrix_shape = {x_dims[x_dims.size() - 1], + col_matrix_shape[0] / groups}; + } + filter_.Resize(filter_matrix_shape); + + int in_step = (data_layout != DataLayout::kNHWC + ? static_cast(x_dims[1]) / groups + : static_cast(x_dims[x_dims.size() - 1]) / groups); + int col_step = static_cast(col_matrix_shape[0]) / groups; + + // convolution transpose grad on x: + // im2col + gemm (similar to conv-forward) + // x need to compute gradient + auto blas = funcs::GetBlas(ctx); + if (dx || dfilter) { + DenseTensor col; + col.Resize(col_shape); + ctx.template Alloc(&col); + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. + DenseTensor col_matrix; + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + + DenseTensor dfilter_; + funcs::SetConstant set_zero; + + paddle::operators::math:: + Im2ColFunctor + im2col; + paddle::operators::math::Vol2ColFunctor vol2col; + funcs::ConcatFunctor concat_functor; + + if (dx) { + ctx.template Alloc(dx); + set_zero(ctx, dx, static_cast(0)); + } + if (dfilter) { // dfilter_ size (i_c, o_c/g, k_h, k_w) + ctx.template Alloc(dfilter); + set_zero(ctx, dfilter, static_cast(0)); + dfilter_ = *dfilter; + dfilter_.Resize(filter_matrix_shape); + } + + size_t D = x.dims().size(); + for (int i = 0; i < batch_size; i++) { + // batch with size (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for + // channel_first + // batch with size (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for + // channel_last + DenseTensor dout_batch = dout.Slice(i, i + 1).Resize(output_shape); + + if (data_dim == 2U) { + // im2col: dy -> col matrix + // from (o_c, o_h, o_w) to (o_c * k_h * k_w, i_h * i_w) for + // channel_first + // from (o_h, o_w, o_c) to (o_c * k_h * k_w, i_h * i_w) for + // channel_last + im2col(ctx, + dout_batch, + dilations_, + strides, + std::vector{ + paddings_[0], paddings_[2], paddings_[1], paddings_[3]}, + &col, + data_layout); + } else if (data_dim == 3U) { + // vol2col: dy -> col_matrix + // from (o_c, o_d, o_h, o_w) to (o_c * k_d * k_h * k_w, i_d * i_h * + // i_w) for channel_first + // from (o_d, o_h, o_w, o_c) to (i_d * i_h * i_w, o_c * k_d * k_h * + // k_w) for channel_last + vol2col( + ctx, dout_batch, dilations_, strides, paddings_, &col, data_layout); + } + if (dx) { + // batch with size (i_c, i_h, i_w) or (i_h, i_w, i_c) + DenseTensor dx_batch = dx->Slice(i, i + 1).Resize(x_matrix_shape); + + // gemm: dx = filter * dy + // (i_c, o_c * k_h * k_w) * (o_c * k_h * k_w, i_h * i_w) -> (i_c, i_h + // * i_w) + // or + // (i_c, o_c * k_d * k_h * k_w) * (o_c * k_d * k_h * k_w, i_d * i_h * + // i_w) -> (i_c, + // i_d, i_h, i_w) + // gemm: dx = dy^T * filter^T for channel_last + + std::vector dx_batch_vec; + for (int g = 0; g < groups; g++) { + // dx_slice: (i_c/g, i_h * i_w) or (i_c/g, i_d * i_h * i_w) + // for channel_first + // dx_slice: (i_h * i_w, i_c/g) or (i_d * i_h * i_w, i_c/g) + // for channel_last + // filter_slice: (i_c/g, o_c/g * k_h * k_w) + DenseTensor filter_slice = + filter_.Slice(g * in_step, (g + 1) * in_step); + // col_matrix_slice: (o_c/g * k_h * k_w, h * w) or (o_c/g * k_d * + // k_h * k_w, d * h * w) + DenseTensor col_matrix_slice = + 
col_matrix.Slice(g * col_step, (g + 1) * col_step); + if (data_layout != DataLayout::kNHWC) { + DenseTensor dx_slice = + dx_batch.Slice(g * in_step, (g + 1) * in_step); + blas.MatMul(filter_slice, + false, + col_matrix_slice, + false, + static_cast(1.0), + &dx_slice, + static_cast(0.0)); + } else { + DenseTensor dx_slice; + funcs::Slice( + ctx, &dx_batch, &dx_slice, g * in_step, (g + 1) * in_step, 1); + blas.MatMul(col_matrix_slice, + true, + filter_slice, + true, + static_cast(1.0), + &dx_slice, + static_cast(0.0)); + DDim dx_slice_shape; + if (data_dim == 2U) { + dx_slice_shape = {x_dims[1], x_dims[2], in_step}; + } else { + dx_slice_shape = {x_dims[1], x_dims[2], x_dims[3], in_step}; + } + dx_slice = dx_slice.Resize(dx_slice_shape); + dx_batch_vec.push_back(dx_slice); + } + } + if (data_layout == DataLayout::kNHWC) { + concat_functor(ctx, dx_batch_vec, static_cast(D - 2), &dx_batch); + } + } + if (dfilter) { + // x batch: (i_c, i_h * i_w) or (i_h, i_w * i_c) + DenseTensor in_batch = x.Slice(i, i + 1).Resize(x_matrix_shape); + // gemm: d_filter = x * dy^T + // (i_c, i_h * i_w) * (i_h * i_w, o_c * k_h * k_w) -> (i_c, o_c * k_h + // * k_w) + // or + // (i_c, i_d * i_h * i_w) * (i_d * i_h * i_w, o_c * k_d * k_h * k_w) + // -> (i_c, o_c * k_d * + // k_h * k_w) + // gemm: d_filter = x^T * dy^T for channel_last + + for (int g = 0; g < groups; g++) { + DenseTensor dfilter_slice = + dfilter_.Slice(g * in_step, (g + 1) * in_step); + DenseTensor col_matrix_slice = + col_matrix.Slice(g * col_step, (g + 1) * col_step); + if (data_layout != DataLayout::kNHWC) { + DenseTensor in_batch_slice = + in_batch.Slice(g * in_step, (g + 1) * in_step); + blas.MatMul(in_batch_slice, + false, + col_matrix_slice, + true, + static_cast(1.0), + &dfilter_slice, + static_cast(1.0)); + } else { + DenseTensor in_batch_slice; + funcs::Slice(ctx, + &in_batch, + &in_batch_slice, + g * in_step, + (g + 1) * in_step, + 1); + blas.MatMul(in_batch_slice, + true, + col_matrix_slice, + true, + static_cast(1.0), + &dfilter_slice, + static_cast(1.0)); + } + } + } + } + } +} + +template +void Conv2dTransposeGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter) { + ConvTransposeGradRawKernel(ctx, + x, + filter, + dout, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + dx, + dfilter); +} + +template +void Conv3dTransposeGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter) { + ConvTransposeGradRawKernel(ctx, + x, + filter, + dout, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + dx, + dfilter); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..ee2faf761fe3263c892248ca2f243f7f86d7d038 --- /dev/null +++ 
b/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h @@ -0,0 +1,278 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/conv_transpose_kernel.h" + +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" +#include "paddle/phi/kernels/funcs/slice.h" + +namespace phi { + +template +void ConvTransposeRawKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* out) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_format); + // The filter will be reshaped, so it should not be constant + DenseTensor filter_ = filter; + std::vector paddings_ = paddings; + std::vector dilations_ = dilations; + + auto x_dims = x.dims(); + auto filter_dims = filter_.dims(); + auto out_dims = out->dims(); + const int batch_size = static_cast(x.dims()[0]); + + DDim in_data_dims; + if (data_layout != DataLayout::kNHWC) { + in_data_dims = slice_ddim(x_dims, 2, x_dims.size()); + } else { + in_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); + } + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize); + + // x_shape_vec: {n, c, h, w} or {n, c, d, h, w} for channel_first + // x_shape_vec: {n, h, w, c} or {n, d, h, w, c} for channel_last + std::vector x_shape_vec = vectorize(x.dims()); + // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} + std::vector filter_shape_vec = vectorize(filter_.dims()); + + // use col_shape in the im2col and col2im (or vol2col and col2vol) + // calculation + // col_shape_vec: {o_c/g, k_h, k_w, h, w} or {o_c/g, k_d, k_h, k_w, d, h, w} + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + if (data_layout != DataLayout::kNHWC) { + col_shape_vec[0] = out_dims[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = x_shape_vec[j + 2]; + } + } else { + col_shape_vec[0] = out_dims[out_dims.size() - 1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = x_shape_vec[j + 1]; + } + } + DDim col_shape(make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (o_c/g * k_h * k_w, h * w) or (o_c/g * k_d * k_h * k_w, d * h * w) + DDim 
col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1); + + DenseTensor col; + col.Resize(col_shape); + ctx.template Alloc(&col); + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. + DenseTensor col_matrix; + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + + // out size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first + // out size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last + DDim out_shape = slice_ddim(out->dims(), 1, out->dims().size()); + + // x matrix size: (i_c, h * w) or (i_c, d * h * w) for channel_first + // x matrix size: (h * w, i_c) or (d * h * w, i_c) for channel_last + DDim x_matrix_shape; + if (data_layout != DataLayout::kNHWC) { + x_matrix_shape = {x_dims[1], col_matrix_shape[1]}; + } else { + x_matrix_shape = {col_matrix_shape[1], x_dims[x_dims.size() - 1]}; + } + + // filter size: (i_c, o_c/g * k_h * k_w) or (i_c, o_c/g * k_d * k_h * k_w) + DDim filter_matrix_shape; + if (data_layout != DataLayout::kNHWC) { + filter_matrix_shape = {x_dims[1], col_matrix_shape[0]}; + } else { + filter_matrix_shape = {x_dims[x_dims.size() - 1], col_matrix_shape[0]}; + } + filter_.Resize(filter_matrix_shape); + + ctx.template Alloc(out); + + funcs::SetConstant set_zero; + + auto blas = funcs::GetBlas(ctx); + set_zero(ctx, out, static_cast(0)); + + int in_step = (data_layout != DataLayout::kNHWC + ? static_cast(x_dims[1]) / groups + : static_cast(x_dims[x_dims.size() - 1]) / groups); + + int out_step = + (data_layout != DataLayout::kNHWC + ? static_cast(out_dims[1]) / groups + : static_cast(out_dims[out_dims.size() - 1]) / groups); + paddle::operators::math:: + Col2ImFunctor + col2im; + paddle::operators::math::Col2VolFunctor col2vol; + funcs::ConcatFunctor concat_functor; + + // convolution transpose: gemm + col2im or col2vol (similar to conv-backward + // on x) + size_t D = x.dims().size(); + for (int i = 0; i < batch_size; i++) { + // batch with size (i_c, h * w) or (i_c, d * h * w) for channel_first + // batch with size (h * w, i_c) or (d * h * w, i_c) for channel_last + DenseTensor x_batch = x.Slice(i, i + 1).Resize(x_matrix_shape); + + // out size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first + // out size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last + DenseTensor out_batch = out->Slice(i, i + 1).Resize(out_shape); + + std::vector out_batch_vec; + for (int g = 0; g < groups; g++) { + int64_t start = g * in_step; + int64_t end = (g + 1) * in_step; + int axes = (data_layout != DataLayout::kNHWC ? 
0 : 1); + DenseTensor filter_slice = filter_.Slice(g * in_step, (g + 1) * in_step); + DenseTensor in_slice, out_slice; + + // col_matrix = filter_slice * x_slice + // of shape (o_c/g * k_h * k_w, h * w) + // or (o_c/g * k_d * k_h * k_w, d * h * w) + if (data_layout != DataLayout::kNHWC) { + in_slice = x_batch.Slice(g * in_step, (g + 1) * in_step); + out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul(filter_slice, + true, + in_slice, + false, + static_cast(1.0), + &col_matrix, + static_cast(0.0)); + } else { + funcs::Slice(ctx, &x_batch, &in_slice, start, end, axes); + start = g * out_step; + end = (g + 1) * out_step; + axes = D - 2; + if (D == 4U) { + funcs::Slice( + ctx, &out_batch, &out_slice, start, end, axes); + } else if (D == 5U) { + funcs::Slice( + ctx, &out_batch, &out_slice, start, end, axes); + } + blas.MatMul(filter_slice, + true, + in_slice, + true, + static_cast(1.0), + &col_matrix, + static_cast(0.0)); + } + + if (data_dim == 2U) { + // col2im: col_matrix -> dy from (o_c/g * k_h * k_w, h * w) to (o_c/g, + // o_h, o_w) or (o_h, o_w, o_c/g) + col2im(ctx, + col, + dilations_, + strides, + std::vector{ + paddings_[0], paddings_[2], paddings_[1], paddings_[3]}, + &out_slice, + data_layout); + } else if (data_dim == 3U) { + // col2vol: col_matrix -> dy from (o_c/g * k_d * k_h * k_w, d * h * w) + // to (o_c/g, o_d, o_h, o_w) or (o_d, o_h, o_w, o_c/g) + col2vol( + ctx, col, dilations_, strides, paddings_, &out_slice, data_layout); + } + if (data_layout == DataLayout::kNHWC) { + out_batch_vec.push_back(out_slice); + } + } + if (data_layout == DataLayout::kNHWC) { + concat_functor(ctx, out_batch_vec, static_cast(D - 2), &out_batch); + } + } +} + +template +void Conv2dTransposeKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* out) { + ConvTransposeRawKernel(ctx, + x, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + out); +} + +template +void Conv3dTransposeKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* out) { + ConvTransposeRawKernel(ctx, + x, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + out); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h b/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..65d903a7fe426c6eed6cba6f38e8c636001d47b0 --- /dev/null +++ b/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/frobenius_norm_grad_kernel.h" + +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/reduce_grad.h" + +namespace phi { + +template +void FrobeniusNormGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* dx) { + ReduceGradKernel( + ctx, x, dout, out, axis, keep_dim, reduce_all, in_dtype, out_dtype, dx); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h b/paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..8577a4e3c634567a7900a47a942ff7d8b6f3686c --- /dev/null +++ b/paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/frobenius_norm_kernel.h" + +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void FrobeniusNormKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + Reduce( + ctx, x, reduce_all, axis, keep_dim, x.dtype(), out); +} + +} // namespace phi diff --git a/paddle/phi/kernels/pad3d_grad_kernel.h b/paddle/phi/kernels/pad3d_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..38f1e5335e8c240058fb3b52a8ae59a0c438b61c --- /dev/null +++ b/paddle/phi/kernels/pad3d_grad_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void Pad3dGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const ScalarArray& paddings, + const std::string& mode, + float pad_value, + const std::string& data_format, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/pad3d_kernel.h b/paddle/phi/kernels/pad3d_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..d8876c3e7bc74f6f03413f00279bfaa355907c6b --- /dev/null +++ b/paddle/phi/kernels/pad3d_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void Pad3dKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& paddings, + const std::string& mode, + float pad_value, + const std::string& data_format, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/batch_norm_sig.cc b/paddle/phi/ops/compat/batch_norm_sig.cc index 011d4c12ecefc5b69eec4bf15425aaa648666159..fa1fac5d23779597fee7f8a6e4e467c02d6d4c94 100644 --- a/paddle/phi/ops/compat/batch_norm_sig.cc +++ b/paddle/phi/ops/compat/batch_norm_sig.cc @@ -17,21 +17,35 @@ namespace phi { KernelSignature BatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("batch_norm", - {"X", "Scale", "Bias", "Mean", "Variance"}, - {"momentum", - "epsilon", - "data_layout", - "is_test", - "use_global_stats", - "trainable_statistics", - "fuse_with_relu"}, - {"Y", - "MeanOut", - "VarianceOut", - "SavedMean", - "SavedVariance", - "ReserveSpace"}); + bool is_test = paddle::any_cast(ctx.Attr("is_test")); + bool use_global_stats = paddle::any_cast(ctx.Attr("use_global_stats")); + bool trainable_statistics = + paddle::any_cast(ctx.Attr("trainable_statistics")); + bool fuse_with_relu = paddle::any_cast(ctx.Attr("fuse_with_relu")); + // Dispenable `MomentumTensor` is useless now + if (is_test && !use_global_stats && !trainable_statistics && + !fuse_with_relu) { + return KernelSignature("batch_norm_infer", + {"X", "Scale", "Bias", "Mean", "Variance"}, + {"momentum", "epsilon", "data_layout"}, + {"Y", "MeanOut", "VarianceOut"}); + } else { + return KernelSignature("batch_norm", + {"X", "Scale", "Bias", "Mean", "Variance"}, + {"momentum", + "epsilon", + "data_layout", + "is_test", + "use_global_stats", + "trainable_statistics", + "fuse_with_relu"}, + {"Y", + "MeanOut", + "VarianceOut", + "SavedMean", + "SavedVariance", + "ReserveSpace"}); + } } KernelSignature BatchNormGradOpArgumentMapping( diff --git a/paddle/phi/ops/compat/conv_transpose_sig.cc b/paddle/phi/ops/compat/conv_transpose_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..8697168b8274736ef0eb2db58135283928d3611c --- /dev/null +++ 
b/paddle/phi/ops/compat/conv_transpose_sig.cc @@ -0,0 +1,141 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature Conv2dTransposeOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("conv2d_transpose", + {"Input", "Filter"}, + {"strides", + "paddings", + "output_padding", + "output_size", + "padding_algorithm", + "groups", + "dilations", + "data_format"}, + {"Output"}); +} + +KernelSignature Conv2dTransposeGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("conv2d_transpose_grad", + {"Input", "Filter", GradVarName("Output")}, + {"strides", + "paddings", + "output_padding", + "output_size", + "padding_algorithm", + "groups", + "dilations", + "data_format"}, + {GradVarName("Input"), GradVarName("Filter")}); +} + +KernelSignature Conv2dTransposeDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("conv2d_transpose_grad_grad", + {"Input", "Filter", "DOutput", "DDInput", "DDFilter"}, + {"strides", + "paddings", + "output_padding", + "output_size", + "padding_algorithm", + "groups", + "dilations", + "data_format"}, + {"DInput", "DFilter", "DDOutput"}); +} + +KernelSignature Conv3dTransposeOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("conv3d_transpose", + {"Input", "Filter"}, + {"strides", + "paddings", + "output_padding", + "output_size", + "padding_algorithm", + "groups", + "dilations", + "data_format"}, + {"Output"}); +} + +KernelSignature Conv3dTransposeGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("conv3d_transpose_grad", + {"Input", "Filter", GradVarName("Output")}, + {"strides", + "paddings", + "output_padding", + "output_size", + "padding_algorithm", + "groups", + "dilations", + "data_format"}, + {GradVarName("Input"), GradVarName("Filter")}); +} + +KernelSignature DepthwiseConv2dTransposeOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("depthwise_conv2d_transpose", + {"Input", "Filter"}, + {"strides", + "paddings", + "output_padding", + "output_size", + "padding_algorithm", + "groups", + "dilations", + "data_format"}, + {"Output"}); +} + +KernelSignature DepthwiseConv2dTransposeGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("depthwise_conv2d_transpose_grad", + {"Input", "Filter", GradVarName("Output")}, + {"strides", + "paddings", + "output_padding", + "output_size", + "padding_algorithm", + "groups", + "dilations", + "data_format"}, + {GradVarName("Input"), GradVarName("Filter")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(conv2d_transpose, + phi::Conv2dTransposeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv2d_transpose_grad, + phi::Conv2dTransposeGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv2d_transpose_grad_grad, + 
phi::Conv2dTransposeDoubleGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(conv3d_transpose, + phi::Conv3dTransposeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv3d_transpose_grad, + phi::Conv3dTransposeGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_transpose, + phi::DepthwiseConv2dTransposeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_transpose_grad, + phi::DepthwiseConv2dTransposeGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/frobenius_norm_sig.cc b/paddle/phi/ops/compat/frobenius_norm_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..c6dc5ad9014ecd06b902be304d2c2752d0934713 --- /dev/null +++ b/paddle/phi/ops/compat/frobenius_norm_sig.cc @@ -0,0 +1,38 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature FrobeniusNormOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "frobenius_norm", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); +} + +KernelSignature FrobeniusNormGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "frobenius_norm_grad", + {"X", "Out", GradVarName("Out")}, + {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(frobenius_norm, phi::FrobeniusNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(frobenius_norm_grad, + phi::FrobeniusNormGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/pad3d_sig.cc b/paddle/phi/ops/compat/pad3d_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..c43b98fa27e6baef55ad1dcbc11cb764ba9cb944 --- /dev/null +++ b/paddle/phi/ops/compat/pad3d_sig.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature Pad3dOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("Paddings")) { + return KernelSignature( + "pad3d", {"X"}, {"Paddings", "mode", "value", "data_format"}, {"Out"}); + } + + return KernelSignature( + "pad3d", {"X"}, {"paddings", "mode", "value", "data_format"}, {"Out"}); +} + +KernelSignature Pad3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("Paddings")) { + return KernelSignature("pad3d_grad", + {"X", GradVarName("Out")}, + {"Paddings", "mode", "value", "data_format"}, + {GradVarName("X")}); + } + return KernelSignature("pad3d_grad", + {"X", GradVarName("Out")}, + {"paddings", "mode", "value", "data_format"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(pad3d, phi::Pad3dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pad3d_grad, phi::Pad3dGradOpArgumentMapping); diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index bf6556d21e9fc78bc4bdc9d496b60dcb799b3d29..fde3805914d808118b59fffc5d6b219bcadbb2e7 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -14,6 +14,7 @@ import numpy as np import os +from datetime import timedelta from ..fluid.layer_helper import LayerHelper from ..fluid.framework import Variable from ..fluid.framework import OpProtoHolder @@ -73,6 +74,7 @@ class ReduceOp: MAX = 1 MIN = 2 PROD = 3 + AVG = 4 class Group(): @@ -80,11 +82,13 @@ class Group(): The abstract representation of group. """ - def __init__(self, rank, rank_num, id=0, ranks=[]): + def __init__(self, rank, rank_num, id=0, ranks=[], pg=None, name=None): self.rank = rank self.nranks = rank_num self.id = id self.ranks = ranks + self.pg = pg + self.name = name def is_member(self): if self.rank < 0: @@ -99,11 +103,16 @@ class Group(): else: return -1 + @property + def process_group(self): + return self.pg + def __repr__(self): debug_str = "rank: {}, nranks: {}, id: {}, ranks: ".format( self.rank, self.nranks, self.id) debug_str += ", ".join(map(str, self.ranks)) - debug_str += ". 
" + debug_str += "; name: " + debug_str += self.name if self.name else "None" return debug_str @@ -121,6 +130,17 @@ def _get_global_env(): # Dict[int, Group] _group_map = {} +# group map by name : the map of all groups from their names +# Dict[name, Group] +_group_map_by_name = {} + +# Name of the default group for init_parallel_env +_default_group_name = "_default_pg" + +_valid_backend_list = ['nccl', 'gloo', 'hccl'] +_default_store = None # the default tcp store +_default_backend = None + def _get_group_map(): global _group_map @@ -135,10 +155,29 @@ def _get_global_group(): return _get_group_map()[0] +def _get_group_map_by_name(): + global _group_map_by_name + assert _default_group_name in _group_map_by_name, ( + "Call paddle.distributed.init_parallel_env first " + "to initialize the distributed environment.") + return _group_map_by_name + + +def _get_default_group(): + assert _default_group_name in _group_map_by_name, ( + "Call paddle.distributed.init_parallel_env first " + "to initialize the distributed environment.") + return _get_group_map_by_name()[_default_group_name] + + def _new_ring_id(): return len(_get_group_map()) + max(_get_global_env().nrings, 9) +def _new_group_name_id(): + return len(_get_group_map_by_name()) + max(_get_global_env().nrings, 9) + + def get_group(id=0): """ @@ -163,6 +202,194 @@ def get_group(id=0): return gm[id] if id in gm else None +def _new_process_group_impl(backend, store, rank, world_size, group_name, + pg_options): + if backend == "gloo": + gloo_store = core.GlooStore(store) + + pg = None + if backend == "gloo": + pg = core.ProcessGroupGloo(gloo_store, rank, world_size) + elif backend == "nccl": + pg = core.ProcessGroupNCCL(store, rank, world_size) + elif backend == "hccl": + pg = core.ProcessGroupHCCL(store, rank, world_size) + + return pg + + +def _init_parallel_env(rank=None, + world_size=None, + backend="nccl", + timeout=timedelta(0), + pg_options=None): + """ + + Initializes the default distributed environment. + + Args: + rank (int, optional): the rank of the current process or device from 0 to world_size (exclusive). + If you launch your training with paddle.distributed.run or + paddle.distributed.launch module, None can be given. Default: None. + world_size (int, optional): total number of processes or devices. + If you launch your training with paddle.distributed.run or + paddle.distributed.launch module, None can be given. Default: None. + backend (str, optional): the name of the backend used to initialize + the distributed environment. The value can be one of 'nccl' for + GPU, 'gloo' for CPU or 'hccl' for NPU. Default: 'nccl'. + timeout (datetime.timedelta, optional): timeout used for operations of + the group. Default: datetime.timedelta(0) which means no timeout. + pg_options (dict, optional): options for the group. Default: None. + + Returns: + Group: a group. + + Examples: + + .. 
code-block:: python
+
+            # filename: train.py
+            import paddle
+            paddle.distributed.init_parallel_env(0, 1)
+
+            # how to start
+            # python paddle.distributed.run --gpus="0,1" train.py
+
+    """
+
+    global _group_map_by_name
+    global _default_group_name
+    assert _default_group_name not in _group_map_by_name, (
+        "The default distributed environment has been initialized.")
+
+    assert backend in _valid_backend_list, (
+        "Backend must be one of {}, but the given one is: {}".format(
+            _valid_backend_list, backend))
+    _default_backend = backend
+
+    assert isinstance(timeout, timedelta), (
+        "timeout must be of the type datetime.timedelta.")
+
+    if rank is None or world_size is None:
+        assert rank is None and world_size is None, (
+            "rank and world_size should be unset at the same time.")
+        trainer_id = os.getenv("PADDLE_TRAINER_ID", None)
+        trainer_num = os.getenv("PADDLE_TRAINERS_NUM", None)
+        if trainer_id is None or trainer_num is None:
+            warnings.warn("If rank and world_size are both None, please start "
+                          "your training with paddle.distributed.run or "
+                          "paddle.distributed.launch module. Otherwise, "
+                          "init_parallel_env will do nothing.")
+            return None
+        rank = int(trainer_id)
+        world_size = int(trainer_num)
+
+    assert rank >= 0 and world_size > rank and world_size > 1, (
+        "rank must be non-negative and world_size must be the "
+        "maximum rank plus one. Moreover, at least two processes are "
+        "required to create a process group.")
+
+    master_addr = os.getenv("MASTER_ADDR", None)
+    master_port = os.getenv("MASTER_PORT", None)
+    if not master_addr or not master_port:
+        endpoints = os.getenv("PADDLE_MASTER", None)
+        if endpoints is None:
+            endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", None)
+        if not endpoints:
+            raise ValueError(
+                "The environment variables 'MASTER_ADDR' and 'MASTER_PORT' "
+                "must be specified, for example 'export MASTER_ADDR=127.0.0.1' "
+                "and 'export MASTER_PORT=54612'. Or you can start your training "
+                "with paddle.distributed.run or "
+                "paddle.distributed.launch module.")
+        if ',' in endpoints:
+            endpoints = endpoints.split(',')[0]
+        master_addr, master_port = endpoints.split(":")
+
+    master_port = int(master_port)
+
+    is_master = rank == 0
+    global _default_store
+    _default_store = core.TCPStore(master_addr, master_port, is_master,
+                                   world_size, timeout)
+
+    pg = _new_process_group_impl(backend, _default_store, rank, world_size,
+                                 _default_group_name, pg_options)
+    ranks = list(range(world_size))
+    group = Group(
+        rank, world_size, id=0, ranks=ranks, pg=pg, name=_default_group_name)
+
+    paddle.fluid.dygraph.parallel_helper._set_parallel_ctx(True)
+    _group_map_by_name[_default_group_name] = group
+    return group
+
+
+def _new_group(ranks=None,
+               backend=None,
+               group_name=None,
+               timeout=timedelta(0),
+               pg_options=None):
+    """
+    Create a new process group.
+
+    Args:
+        ranks (list, optional): list of ranks for the new group. If None is given,
+            all processes are used. Default: None.
+        backend (str, optional): the name of the backend used to initialize
+            the distributed environment. Default: the one used by init_parallel_env.
+        timeout (datetime.timedelta, optional): timeout used for operations of
+            the group. Default: datetime.timedelta(0).
+        pg_options (dict, optional): options for the group. Default: None.
+
+    Examples:
+
+        ..
code-block:: python
+
+            import paddle
+            paddle.distributed.init_parallel_env(0, 1)
+            paddle.distributed.new_group([0, 1])
+
+            # how to start
+            # python paddle.distributed.run --gpus="0,1" train.py
+
+    """
+    global _default_group_name
+    if group_name is None:
+        group_name = _default_group_name + str(_new_group_name_id())
+    if group_name == _default_group_name:
+        raise ValueError("group_name must be specified and it cannot be '{}' "
+                         "which is used for the default process group created "
+                         "by init_parallel_env.".format(_default_group_name))
+    global_group = _get_default_group()
+    global_rank = global_group.rank
+    global_ranks = global_group.ranks
+    if ranks is None:
+        ranks = global_ranks
+    assert len(ranks) <= len(global_ranks), (
+        "Size of new group must be less than or "
+        "equal to that of the default global group.")
+    size = len(ranks)
+    assert size > 1, "A group must have at least two members."
+    ranks = sorted(ranks)
+    if global_rank in ranks:
+        rank = ranks.index(global_rank)
+        pg = _new_process_group_impl(backend, _default_store, rank, size,
+                                     group_name, pg_options)
+    else:
+        rank = -1
+        pg = None
+    group = Group(
+        rank,
+        size,
+        id=_new_group_name_id(),
+        ranks=ranks,
+        pg=pg,
+        name=group_name)
+    _group_map_by_name[group_name] = group
+
+    return group
+
+
 def barrier(group=None):
     """
diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index b8a696057e7800d9f7d3298762945462861e6b4b..d21b7e4740a6e1b0194c71f039f7706f32fae742 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -542,7 +542,7 @@ class IpuStrategy(object):
     def set_graph_config(self,
                          num_ipus=1,
                          is_training=True,
-                         batch_size=1,
+                         micro_batch_size=1,
                          enable_manual_shard=False):
         """
         Set graph configuration to the IpuStrategy instance.
@@ -571,7 +571,7 @@ class IpuStrategy(object):
             ipu_strategy = static.IpuStrategy()
             ipu_strategy.set_graph_config(num_ipus=1,
                                           is_training=True,
-                                          batch_size=1,
+                                          micro_batch_size=1,
                                           enable_manual_shard=False)
         """
         if num_ipus == 1 and enable_manual_shard:
@@ -581,7 +581,7 @@ class IpuStrategy(object):
         options = {
             'num_ipus': num_ipus,
             'is_training': is_training,
-            'micro_batch_size': batch_size,
+            'micro_batch_size': micro_batch_size,
             'enable_manual_shard': enable_manual_shard,
         }
         self.set_options(options)
@@ -589,6 +589,7 @@ class IpuStrategy(object):
     def set_pipelining_config(self,
                               enable_pipelining=False,
                               batches_per_step=1,
+                              enable_gradient_accumulation=False,
                               accumulation_factor=1):
         """
         Set pipelining configuration to the IpuStrategy instance. Used to optimize the throughput performance.
@@ -598,6 +599,8 @@
                 Default False, which means disabled.
             batches_per_step (int, optional): Set the batches per run in data pipelining mode. Only if enable_pipelining=True, batches_per_step is able to be set > 1.
                 Default 1, which means no data pipelining.
+            enable_gradient_accumulation (bool, optional): Enable to accumulate gradients before updating the weights in training mode. Only if enable_pipelining=True,
+                enable_gradient_accumulation is able to be set True. Default False, which means no gradient accumulation.
             accumulation_factor (int, optional): Specify the number of micro-batches to accumulate
                 before applying the varUpdate. Default 1, which means disable the accumulation.
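The two hunks above rename the batch_size argument of IpuStrategy.set_graph_config to micro_batch_size and add an enable_gradient_accumulation switch to set_pipelining_config. A minimal usage sketch of the renamed options follows; the values are illustrative only and it assumes an IPU-enabled build of Paddle, as these APIs require:

    import paddle
    import paddle.static as static

    paddle.enable_static()

    ipu_strategy = static.IpuStrategy()
    # micro_batch_size replaces the old batch_size argument
    ipu_strategy.set_graph_config(
        num_ipus=2, is_training=True, micro_batch_size=2, enable_manual_shard=True)
    # gradient accumulation is only meaningful when pipelining is enabled
    ipu_strategy.set_pipelining_config(
        enable_pipelining=True,
        batches_per_step=2,
        enable_gradient_accumulation=True,
        accumulation_factor=2)

Callers that previously passed batch_size by keyword need to switch to micro_batch_size, as the updated unit tests later in this patch do.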
@@ -617,6 +620,7 @@ class IpuStrategy(object): ipu_strategy = static.IpuStrategy() ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, + enable_gradient_accumulation=False, accumulation_factor=1) """ enable_manual_shard = self.get_option('enable_manual_shard') @@ -627,6 +631,7 @@ class IpuStrategy(object): options = { 'enable_pipelining': enable_pipelining, 'batches_per_step': batches_per_step, + 'enable_gradient_accumulation': enable_gradient_accumulation, 'accumulation_factor': accumulation_factor, } self.set_options(options) @@ -754,6 +759,56 @@ class IpuStrategy(object): """ return self._ipu_strategy.get_option(option)['value'] + def enable_pattern(self, pattern): + """ + Enable PopART pattern to optimize the graph. + + Args: + pattern(string): the name of the pattern. + + Returns: + None. + + Examples: + .. code-block:: python + + # required: ipu + + import paddle + import paddle.static as static + + paddle.enable_static() + + ipu_strategy = static.IpuStrategy() + ipu_strategy.enable_pattern("ViewSimplifyPattern") + """ + self._ipu_strategy.enable_pattern(pattern) + + def disable_pattern(self, pattern): + """ + Disable PopART pattern. + + Args: + pattern(string): the name of the pattern. + + Returns: + None. + + Examples: + .. code-block:: python + + # required: ipu + + import paddle + import paddle.static as static + + paddle.enable_static() + + ipu_strategy = static.IpuStrategy() + ipu_strategy.disable_pattern("ViewSimplifyPattern") + """ + self._ipu_strategy.disable_pattern(pattern) + @property def num_ipus(self): """ @@ -817,8 +872,8 @@ class IpuCompiledProgram(object): main_prog = static.default_main_program() ipu_strategy = static.IpuStrategy() - ipu_strategy.set_graph_config(num_ipus=1, is_training=True, batch_size=1) - ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, accumulation_factor=1) + ipu_strategy.set_graph_config(num_ipus=1, is_training=True, micro_batch_size=1) + ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, enable_gradient_accumulation=False, accumulation_factor=1) ipu_strategy.set_precision_config(enable_fp16=False) ipu_compiled_program = static.IpuCompiledProgram( @@ -891,8 +946,8 @@ class IpuCompiledProgram(object): main_prog = static.default_main_program() ipu_strategy = static.IpuStrategy() - ipu_strategy.set_graph_config(num_ipus=1, is_training=True, batch_size=1) - ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, accumulation_factor=1) + ipu_strategy.set_graph_config(num_ipus=1, is_training=True, micro_batch_size=1) + ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, enable_gradient_accumulation=False, accumulation_factor=1) ipu_strategy.set_precision_config(enable_fp16=False) program = static.IpuCompiledProgram( diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 9439982858530e1e81156be4b32ef2d91dc4a33a..b4c5a36d288b7ee0f6e771d72b21bd54d1e3d669 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -612,7 +612,7 @@ def grad(outputs, if no_grad_vars is None: no_grad_vars = [] - elif isinstance(no_grad_vars, core.VarBase): + elif isinstance(no_grad_vars, (core.VarBase, core.eager.Tensor)): no_grad_vars = [no_grad_vars] elif isinstance(no_grad_vars, core.eager.Tensor): no_grad_vars = [no_grad_vars] @@ -718,13 +718,13 @@ def to_variable(value, name=None, zero_copy=None, dtype=None): y.shape # [3L, 2L] """ - support_type = (list, 
tuple, np.ndarray, core.VarBase, framework.Variable, - core.Tensor, core.LoDTensor) + support_type = (list, tuple, np.ndarray, core.eager.Tensor, core.VarBase, + framework.Variable, core.Tensor, core.LoDTensor) if not isinstance(value, support_type): raise TypeError( "The type of 'value' in fluid.dygraph.to_variable must be %s, but received %s." % (support_type, type(value))) - if isinstance(value, (core.VarBase, framework.Variable)): + if isinstance(value, (core.eager.Tensor, core.VarBase, framework.Variable)): return value elif isinstance(value, (core.Tensor, core.LoDTensor)): return core.VarBase(value) diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index d8b1883fc62a0fb4575a2e525d7d37a9029cf40d..1a8cc77e4def59ca6bd1b01b903c4a96a4238b15 100644 --- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -30,6 +30,11 @@ final_state_name_mapping = { "y": "Y", "out": "Out", }, + # "elementwise_add": { + # "final_op_name": "final_state_add", + # "x": "X", + # "y": "Y", + # }, "trunc": { "final_op_name": "final_state_trunc", "x": "X", diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 2b67a2029727f6b8f917239094a1b906d5cd6a62..af30b2b2444b44f1b27e8f277eb380557255517d 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -28,6 +28,7 @@ from .math_op_patch import monkey_patch_math_varbase from .parallel import scale_loss from paddle.fluid.data_feeder import convert_dtype, _PADDLE_DTYPE_2_NUMPY_DTYPE import paddle.utils.deprecated as deprecated +from paddle import _C_ops class TensorHookRemoveHelper(object): @@ -782,7 +783,7 @@ def monkey_patch_varbase(): @framework.dygraph_only def clone(self): - return _C_ops_.assign(self) + return _C_ops.assign(self) @framework.dygraph_only def value(self): diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index d0a94238a7aeb21f9d1baf8154cbe3b7f2b77a72..fb787215d910e9924622147b86c328af5e1994de 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -316,7 +316,8 @@ def _dygraph_not_support_(func): def _dygraph_only_(func): def __impl__(*args, **kwargs): - assert in_dygraph_mode( + assert ( + in_dygraph_mode() or _in_eager_mode() ), "We only support '%s()' in dynamic graph mode, please call 'paddle.disable_static()' to enter dynamic graph mode." 
% func.__name__ return func(*args, **kwargs) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 000f08b0a3e282d815c758b5a153ba53ff84c8e0..6350ed18e6666216074f64812768618f98f71ed4 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -5611,6 +5611,8 @@ def transpose(x, perm, name=None): """ if in_dygraph_mode(): + if _in_eager_mode(): + return _C_ops.final_state_transpose(x, perm) out, _ = _C_ops.transpose2(x, 'axis', perm) return out @@ -8550,6 +8552,8 @@ def gather_nd(input, index, name=None): """ if in_dygraph_mode(): + if _in_eager_mode(): + return _C_ops.final_state_gather_nd(input, index) return _C_ops.gather_nd(input, index) check_variable_and_dtype( input, 'input', @@ -8726,6 +8730,8 @@ def scatter_nd_add(ref, index, updates, name=None): """ if in_dygraph_mode(): + if _in_eager_mode(): + return _C_ops.final_state_scatter_nd_add(ref, index, updates) op = getattr(_C_ops, 'scatter_nd_add') return op(ref, index, updates) @@ -15292,6 +15298,8 @@ def gather_tree(ids, parents): """ if in_dygraph_mode(): + if _in_eager_mode(): + return _C_ops.final_state_gather_tree(ids, parents) return _C_ops.gather_tree(ids, parents) else: helper = LayerHelper('gather_tree', **locals()) diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 9348b0b50a1c08e7103dc3cc32169f4a6a40591c..c45045509201df89d6a07b8c0aadc7ef9130cf2f 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -886,6 +886,7 @@ class TestDistributeFpnProposals(LayerTest): refer_level=4, refer_scale=224, rois_num=rois_num_dy) + print(type(multi_rois_dy)) output_dy = multi_rois_dy + [restore_ind_dy] + rois_num_per_level_dy output_dy_np = [] for output in output_dy: @@ -973,4 +974,5 @@ class TestBoxDecoderAndAssign(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py index cc2e14d6d6c2ef237351e372c75ca7e700de3fbf..341ec852c52197f689870f0a6c45141ebe318301 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py @@ -50,3 +50,7 @@ class TestExponentialFamilyException(unittest.TestCase): def test_entropy_exception(self): with self.assertRaises(NotImplementedError): paddle.distribution.ExponentialFamily.entropy(self.dist) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distribution/test_kl.py b/python/paddle/fluid/tests/unittests/distribution/test_kl.py index a1413722446e287688d7e120a3ef31ea67cc798b..55358380c8b23fdfd512b259aca06901d5623e38 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_kl.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_kl.py @@ -112,3 +112,7 @@ class TestKLExpfamilyExpFamily(unittest.TestCase): kl._kl_expfamily_expfamily(self.p, self.q), rtol=config.RTOL.get(config.DEFAULT_DTYPE), atol=config.ATOL.get(config.DEFAULT_DTYPE)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/init_process_group.py b/python/paddle/fluid/tests/unittests/init_process_group.py new file mode 100644 index 0000000000000000000000000000000000000000..90926b1a021d3114b96ed8e7410cceb2e9b7450b --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/init_process_group.py @@ -0,0 +1,49 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import random +import numpy as np +import os +import shutil + +import paddle +from paddle.fluid import core +import datetime +from datetime import timedelta +import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard +from paddle.fluid.dygraph.parallel import ParallelEnv + + +class TestProcessGroupFp32(unittest.TestCase): + def setUp(self): + self.config() + + def config(self): + pass + + def test_init_process_group(self): + paddle.distributed.collective._init_parallel_env() + paddle.distributed.collective._new_group() + with self.assertRaises(ValueError): + paddle.distributed.collective._new_group( + backend="gloo", group_name="_default_pg") + print("test ok\n") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py index 4f17c90de72ad62814c3653700ae21fd6f205b5d..35f4ca17d5eba69c78bfb47955e3604af8fc6854 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py @@ -98,5 +98,117 @@ class TestBase(IPUOpTest): self.check(output_dict) +class TestAssignFp32Value(TestBase): + def set_data_feed(self): + data = np.random.uniform(size=[2, 3, 1]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + data = np.random.uniform(size=[2, 3, 1]) + self.assign_fp32 = data.astype(np.float32) + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + assign = paddle.assign(self.assign_fp32) + out = paddle.fluid.layers.elementwise_add(x, assign) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, 
fetch_list=fetch_list) + return result[0] + + +class TestAssignBoolValue(TestBase): + def set_data_feed(self): + data = np.random.uniform(size=[2, 3, 1]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + data = np.random.choice([True, False], size=(2, 3, 1)) + self.assign_bool = data.astype(np.bool) + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + x = paddle.less_than(x, x) + assign = paddle.assign(self.assign_bool) + out = paddle.logical_and(x, assign) + out = paddle.cast(out, 'float32') + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py index 05a37dcb3d51475e896d6a8f1e9458b31ece85b5..934ad1014282703a4660e25725015fa588bb379a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py @@ -22,33 +22,18 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMod @unittest.skipIf(not paddle.is_compiled_with_ipu(), "core is not compiled with IPU") -class TestBase(IPUOpTest): +class TestGreaterThan(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_data_feed() - self.set_feed_attr() - self.set_op_attrs() + self.set_test_op() @property def fp16_enabled(self): return True - def set_data_feed(self): - x = np.random.randn(3, 4, 5) - y = np.random.randn(3, 4, 5) - self.feed_fp32 = { - "x": x.astype(np.float32), - "y": y.astype(np.float32), - } - self.feed_fp16 = { - "x": x.astype(np.float16), - "y": y.astype(np.float16), - } - - def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed_fp32.values()] - self.feed_list = list(self.feed_fp32.keys()) + def set_test_op(self): + self.op = paddle.fluid.layers.greater_than def set_op_attrs(self): self.attrs = {} @@ -71,7 +56,7 @@ class TestBase(IPUOpTest): shape=self.feed_shape[1], dtype='float32') - out = paddle.fluid.layers.greater_than(x, y, **self.attrs) + out = self.op(x, y, **self.attrs) fetch_list = [out.name] @@ -102,7 +87,7 @@ class TestBase(IPUOpTest): result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] - def test(self): + def run_test_base(self): output_dict = {} for mode in ExecutionMode: if mode > ExecutionMode.IPU_FP32 
and not self.fp16_enabled: @@ -111,29 +96,73 @@ class TestBase(IPUOpTest): self.check(output_dict) + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_data_feed0(self): + x = np.random.randn(3, 4, 5) + y = np.random.randn(3, 4, 5) + self.feed_fp32 = { + "x": x.astype(np.float32), + "y": y.astype(np.float32), + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "y": y.astype(np.float16), + } + self.set_feed_attr() -class TestCase1(TestBase): - def set_data_feed(self): + def set_data_feed1(self): x = np.ones([1, 10]) y = np.ones([10]) self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + self.set_feed_attr() - -class TestCase2(TestBase): - def set_data_feed(self): + def set_data_feed2(self): x = np.ones([1, 10]) y = np.zeros([1, 10]) self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + self.set_feed_attr() - -class TestCase3(TestBase): - def set_data_feed(self): + def set_data_feed3(self): x = np.zeros([1, 10]) y = np.ones([1, 10]) self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + self.set_feed_attr() + + def test_case0(self): + self.set_data_feed0() + self.set_op_attrs() + self.run_test_base() + + def test_case1(self): + self.set_data_feed1() + self.set_op_attrs() + self.run_test_base() + + def test_case2(self): + self.set_data_feed2() + self.set_op_attrs() + self.run_test_base() + + def test_case3(self): + self.set_data_feed3() + self.set_op_attrs() + self.run_test_base() + + +class TestLessThan(TestGreaterThan): + def set_test_op(self): + self.op = paddle.fluid.layers.less_than + + +class TestEqual(TestGreaterThan): + def set_test_op(self): + self.op = paddle.fluid.layers.equal if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py index 026b19eccf18721ba7be062d3fd4516deefb2aa0..76ab1a2c3f311f9443ba7e225c988e2a5133ba27 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import print_function +import unittest import numpy as np -import unittest import paddle +import paddle.static paddle.enable_static() @@ -26,30 +26,31 @@ paddle.enable_static() class TestIpuShard(unittest.TestCase): def _test(self): # build graph - a = paddle.static.data(name='data', shape=[None, 1], dtype='int32') - b = a + 2 # scale : scale * x + bias, ipu_index : no + main_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog): + a = paddle.static.data(name='data', shape=[None, 1], dtype='int32') + b = a + 2 # scale : scale * x + bias, ipu_index : no + + with paddle.static.ipu_shard_guard(index=1): + c = b + 1 # scale, ipu_index : 1 + with paddle.static.ipu_shard_guard(index=2): + d = c * 2 # scale, ipu_index : 2 + with paddle.static.ipu_shard_guard(index=3): + e = d + 3 # scale, ipu_index : 3 + with paddle.static.ipu_shard_guard(index=1): + e = e + 3 # scale, ipu_index : 1 + with paddle.static.ipu_shard_guard(index=2): + e = e + 3 # scale, ipu_index : 2 + + with paddle.static.ipu_shard_guard(index=1): + f = paddle.tensor.pow(e, 2.0) # pow, ipu_index : 1 - with paddle.static.ipu_shard_guard(index=1): - c = b + 1 # scale, ipu_index : 1 with paddle.static.ipu_shard_guard(index=2): - d = c * 2 # scale, ipu_index : 2 - with paddle.static.ipu_shard_guard(index=3): - e = d + 3 # scale, ipu_index : 3 - with paddle.static.ipu_shard_guard(index=1): - e = e + 3 # scale, ipu_index : 1 - with paddle.static.ipu_shard_guard(index=2): - e = e + 3 # scale, ipu_index : 2 - - with paddle.static.ipu_shard_guard(index=1): - f = paddle.tensor.pow(e, 2.0) # pow, ipu_index : 1 + g = f - 1 # scale, ipu_index : 2 - with paddle.static.ipu_shard_guard(index=2): - g = f - 1 # scale, ipu_index : 2 - - h = g + 1 # scale, ipu_index : no + h = g + 1 # scale, ipu_index : no ipu_index_list = [] - main_prog = paddle.static.default_main_program() for op in main_prog.global_block().ops: if op.desc.has_attr("ipu_index"): ipu_index_list.append(op.desc.attr("ipu_index")) @@ -69,30 +70,31 @@ class TestIpuShard(unittest.TestCase): class TestIpuPipeline(unittest.TestCase): def _test(self): # build graph - a = paddle.static.data(name='data', shape=[None, 1], dtype='int32') - b = a + 2 # scale : scale * x + bias, ipu_stage : no + main_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog): + a = paddle.static.data(name='data', shape=[None, 1], dtype='int32') + b = a + 2 # scale : scale * x + bias, ipu_stage : no + + with paddle.static.ipu_shard_guard(stage=1): + c = b + 1 # scale, ipu_stage : 1 + with paddle.static.ipu_shard_guard(stage=2): + d = c * 2 # scale, ipu_stage : 2 + with paddle.static.ipu_shard_guard(stage=3): + e = d + 3 # scale, ipu_stage : 3 + with paddle.static.ipu_shard_guard(stage=1): + e = e + 3 # scale, ipu_stage : 1 + with paddle.static.ipu_shard_guard(stage=2): + e = e + 3 # scale, ipu_stage : 2 + + with paddle.static.ipu_shard_guard(stage=1): + f = paddle.tensor.pow(e, 2.0) # pow, ipu_stage : 1 - with paddle.static.ipu_shard_guard(stage=1): - c = b + 1 # scale, ipu_stage : 1 with paddle.static.ipu_shard_guard(stage=2): - d = c * 2 # scale, ipu_stage : 2 - with paddle.static.ipu_shard_guard(stage=3): - e = d + 3 # scale, ipu_stage : 3 - with paddle.static.ipu_shard_guard(stage=1): - e = e + 3 # scale, ipu_stage : 1 - with paddle.static.ipu_shard_guard(stage=2): - e = e + 3 # scale, ipu_stage : 2 - - with paddle.static.ipu_shard_guard(stage=1): - f = paddle.tensor.pow(e, 2.0) # pow, ipu_stage : 1 - - with paddle.static.ipu_shard_guard(stage=2): - g = 
f - 1 # scale, ipu_stage : 2 + g = f - 1 # scale, ipu_stage : 2 - h = g + 1 # scale, ipu_stage : no + h = g + 1 # scale, ipu_stage : no ipu_index_list = [] - main_prog = paddle.static.default_main_program() for op in main_prog.global_block().ops: if op.desc.has_attr("ipu_stage"): ipu_index_list.append(op.desc.attr("ipu_stage")) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py index f120f5594914e8e555bd0b024fa55224e2781f9b..debd9ed19827cd3cf9137f7a4043550a5065201c 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py @@ -26,7 +26,13 @@ class TestIpuStrategy(unittest.TestCase): def test_set_options(self): ipu_strategy = paddle.static.IpuStrategy() all_option_names = ipu_strategy._ipu_strategy.get_all_option_names() + skip_options = [] + skip_options.append('random_seed') + for option_name in all_option_names: + if option_name in skip_options: + continue + option = ipu_strategy._ipu_strategy.get_option(option_name) option_type = option['type'] option_value = option['value'] @@ -38,9 +44,13 @@ class TestIpuStrategy(unittest.TestCase): set_value = not option_value else: continue - ipu_strategy.set_options({option_name: set_value}) - new_value = ipu_strategy.get_option(option_name) - assert new_value == set_value, f"set {option_name} to {set_value} failed" + + try: + ipu_strategy.set_options({option_name: set_value}) + new_value = ipu_strategy.get_option(option_name) + assert new_value == set_value, f"set {option_name} to {set_value} failed" + except: + raise Exception(f"set {option_name} to {set_value} failed") def test_set_string_options(self): ipu_strategy = paddle.static.IpuStrategy() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py new file mode 100644 index 0000000000000000000000000000000000000000..05572a72ea8b20a2c354d1b8987e1f585f2d8ca4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py @@ -0,0 +1,121 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestLogicalAnd(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_test_op() + + @property + def fp16_enabled(self): + return False + + def set_test_op(self): + self.op = paddle.fluid.layers.logical_and + + def set_op_attrs(self): + self.attrs = {} + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) + y = paddle.static.data( + name=self.feed_list[1], + shape=self.feed_shape[1], + dtype=self.feed_dtype[1]) + + out = self.op(x, y, **self.attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + return result[0] + + def run_test_base(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).astype(np.int32) + + self.check(output_dict, check_shape=True) + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed.values()] + self.feed_list = list(self.feed.keys()) + self.feed_dtype = ['bool', 'bool'] + + def set_data_feed0(self): + x = np.random.choice([True, False], size=(1, 3, 5, 5)) + y = np.random.choice([True, False], size=(1, 3, 5, 5)) + self.feed = { + "x": x.astype('bool'), + "y": y.astype('bool'), + } + self.set_feed_attr() + + def test_case0(self): + self.set_data_feed0() + self.set_op_attrs() + self.run_test_base() + + +class TestLogicalOr(TestLogicalAnd): + def set_test_op(self): + self.op = paddle.fluid.layers.logical_or + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py new file mode 100644 index 0000000000000000000000000000000000000000..33a5dc888c2453fcc90b6fb0724e714a924ffbb4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py @@ -0,0 +1,110 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data1 = np.array([[1], [1], [3], [0]]) + + self.feed = {'x': data1.astype(np.int32)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed.values()] + self.feed_list = list(self.feed.keys()) + + def set_op_attrs(self): + self.attrs = {"depth": 4, "allow_out_of_range": False} + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='int32') + + out = paddle.fluid.layers.one_hot(x, **self.attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + + return result[0] + + def test_base(self): + output_dict = {} + for mode in ExecutionMode: + if (mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled): + break + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +@unittest.skip('does not support allow_out_of_range=True') +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = {"depth": 4, "allow_out_of_range": True} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py new file mode 100644 index 0000000000000000000000000000000000000000..79fc9b04e1674c71337f8c121b4798f7bcfed591 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py @@ -0,0 +1,110 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data1 = np.array([[1], [1], [3], [0]]) + + self.feed = {'x': data1.astype(np.int32)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed.values()] + self.feed_list = list(self.feed.keys()) + + def set_op_attrs(self): + self.attrs = {"depth": 4, "allow_out_of_range": False} + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='int32') + + out = paddle.fluid.input.one_hot(x, **self.attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + + return result[0] + + def test_base(self): + output_dict = {} + for mode in ExecutionMode: + if (mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled): + break + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +@unittest.skip('does not support allow_out_of_range=True') +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = {"depth": 4, "allow_out_of_range": True} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py index 1cc10da3d73444f329bbcbf53694c9b4ff93fdfc..bc9d05c4a87ecf6718ebc1a6cfef3c0b7ad03f31 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py @@ -91,6 +91,15 @@ class TestBase(IPUOpTest): ipu_strategy = paddle.static.IpuStrategy() ipu_strategy.set_graph_config(is_training=True) ipu_strategy.loss_scaling = self.attrs["loss_scaling"] + if "use_no_bias_optimizer" in self.attrs.keys(): + ipu_strategy.set_options({ + "use_no_bias_optimizer": + 
self.attrs["use_no_bias_optimizer"] + }) + if "accl1_type" in self.attrs.keys(): + ipu_strategy.set_options({ + "accl1_type": self.attrs["accl1_type"] + }) program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) @@ -141,6 +150,28 @@ class TestAdamCase2(TestBase): } +@unittest.skip('cpu do not support AdamNoBias') +class TestAdamNoBias(TestBase): + def set_attrs(self): + self.attrs = { + "optimizer": 'adam', + "weight_decay": 0.0, + "loss_scaling": 4.0, + "use_no_bias_optimizer": True, + } + + +@unittest.skip('cpu do not support FLOAT16') +class TestAdamCase3(TestBase): + def set_attrs(self): + self.attrs = { + "optimizer": 'adam', + "weight_decay": 0.0, + "loss_scaling": 4.0, + "accl1_type": "FLOAT16", + } + + @unittest.skip('seems cpu output wrong') class TestLambCase1(TestBase): def set_attrs(self): @@ -161,5 +192,27 @@ class TestLamb(TestBase): } +@unittest.skip('cpu do not support LambNoBias') +class TestLambNoBias(TestBase): + def set_attrs(self): + self.attrs = { + "optimizer": 'lamb', + "weight_decay": 0.1, + "loss_scaling": 6.0, + "use_no_bias_optimizer": True + } + + +@unittest.skip('cpu do not support FLOAT16') +class TestLambCase2(TestBase): + def set_attrs(self): + self.attrs = { + "optimizer": 'lamb', + "weight_decay": 0.1, + "loss_scaling": 6.0, + "accl1_type": "FLOAT16" + } + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py index 3a694873062080d49237299afe6a4171dc4fa242..ba6eb4d38bcf22830f3320ab8c29f30e7173805d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py @@ -95,12 +95,9 @@ class TestBase(IPUOpTest): is_training=self.attrs['is_training']) ipu_strategy.set_precision_config( enable_fp16=self.attrs['enable_fp16']) - ipu_strategy.set_options({ - 'save_per_n_step': self.attrs['save_at_step'] - }) - program = paddle.static.IpuCompiledProgram( - main_prog, ipu_strategy=ipu_strategy).compile( - self.feed_list, fetch_list) + ipu_program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy) + program = ipu_program.compile(self.feed_list, fetch_list) result = [] run_steps = self.attrs['steps'] if save_otherwise_load \ @@ -111,10 +108,9 @@ class TestBase(IPUOpTest): for i in range(run_steps): tmp = exe.run(program, feed=feed, fetch_list=fetch_list) - # currently, we update opt state every sess.run, - # will optimize if save_otherwise_load and \ i == self.attrs['save_at_step'] - 1: + ipu_program._backend.weights_to_host() paddle.static.save(main_prog, self.attrs['model_path'].name) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py index 9a18922f35331bd23ab9cd40ff6a4dea1f446ce5..6702ae4344e91eb07e08931803b821bb73f31944 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py @@ -88,11 +88,10 @@ class TestBase(IPUOpTest): if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) + ipu_strategy.set_graph_config( + is_training=self.is_training, micro_batch_size=2) if exec_mode == ExecutionMode.IPU_POPART_FP16: ipu_strategy.set_precision_config(enable_fp16=True) - # set batch size - 
ipu_strategy.micro_batch_size = 2 program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_concat.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_concat.py index 737c085dde6acf5e3645b2127f42b1d8b5a7aa1d..34b6f6dc8e5453b42e10c45d5423c6e17d2d0506 100644 --- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_concat.py +++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_concat.py @@ -20,6 +20,7 @@ import sys sys.path.append("../") from op_test import OpTest +import paddle from paddle import fluid @@ -115,4 +116,5 @@ class TestSequenceConcatOpError(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_Tensor_type.py b/python/paddle/fluid/tests/unittests/test_Tensor_type.py index 59395b94279ea7ec4fe43221deede7e82be8f38e..f1427d29782b969d9571f79c9a7bc62bf4e77070 100644 --- a/python/paddle/fluid/tests/unittests/test_Tensor_type.py +++ b/python/paddle/fluid/tests/unittests/test_Tensor_type.py @@ -39,6 +39,7 @@ class TensorTypeTest(unittest.TestCase): tensorx = paddle.tensor.logic.Tensor(inx) typex_str = str(type(tensorx)) + expectx = "" self.assertEqual((typex_str == expectx), True) diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index ecac22553cbcda7cc2dae179603f407eddc8652a..d05c9a3c313bb634effd9280e3d9503142166ee4 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -1202,4 +1202,5 @@ class TestMultiTensorAdam(unittest.TestCase): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_addmm_op.py b/python/paddle/fluid/tests/unittests/test_addmm_op.py index 6238d7dd4a1f4574fa1fabf5d531db6d4a64df09..dcf07f495320039a7d1d7b9aa887d41da928cad0 100644 --- a/python/paddle/fluid/tests/unittests/test_addmm_op.py +++ b/python/paddle/fluid/tests/unittests/test_addmm_op.py @@ -27,6 +27,7 @@ class TestAddMMOp(OpTest): # test basic def setUp(self): self.op_type = "addmm" + self.python_api = paddle.addmm self.dtype = np.float64 self.init_dtype_type() self.inputs = { @@ -43,19 +44,19 @@ class TestAddMMOp(OpTest): pass def test_check_output(self): - self.check_output() + self.check_output(check_eager=False) def test_check_grad_normal(self): - self.check_grad(['Input', 'X', 'Y'], 'Out') + self.check_grad(['Input', 'X', 'Y'], 'Out', check_eager=False) def test_check_grad_x(self): - self.check_grad(['X'], 'Out', no_grad_set=None) + self.check_grad(['X'], 'Out', no_grad_set=None, check_eager=False) def test_check_grad_y(self): - self.check_grad(['Y'], 'Out', no_grad_set=None) + self.check_grad(['Y'], 'Out', no_grad_set=None, check_eager=False) def test_check_grad_input(self): - self.check_grad(['Input'], 'Out', no_grad_set=None) + self.check_grad(['Input'], 'Out', no_grad_set=None, check_eager=False) class TestAddMMOpError(unittest.TestCase): @@ -167,6 +168,7 @@ class TestAddMMOp2(TestAddMMOp): # test alpha and beta def setUp(self): self.op_type = "addmm" + self.python_api = paddle.addmm self.dtype = np.float64 self.init_dtype_type() self.inputs = { @@ -252,4 +254,5 @@ class TestAddMMAPI(unittest.TestCase): ''' if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_atan2_op.py 
b/python/paddle/fluid/tests/unittests/test_atan2_op.py index b29ab822f25de3d9b16dd903c863dd36d105dd5d..ca0e2d2ba6ddac6e870169ed57a5612a95d2199f 100644 --- a/python/paddle/fluid/tests/unittests/test_atan2_op.py +++ b/python/paddle/fluid/tests/unittests/test_atan2_op.py @@ -36,6 +36,7 @@ def atan2_grad(x1, x2, dout): class TestAtan2(OpTest): def setUp(self): self.op_type = "atan2" + self.python_api = paddle.atan2 self.init_dtype() x1 = np.random.uniform(-1, -0.1, [15, 17]).astype(self.dtype) @@ -46,10 +47,10 @@ class TestAtan2(OpTest): self.outputs = {'Out': out} def test_check_grad(self): - self.check_grad(['X1', 'X2'], 'Out') + self.check_grad(['X1', 'X2'], 'Out', check_eager=True) def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def init_dtype(self): self.dtype = np.float64 @@ -66,7 +67,8 @@ class TestAtan2_float(TestAtan2): 'Out', user_defined_grads=atan2_grad(self.inputs['X1'], self.inputs['X2'], - 1 / self.inputs['X1'].size)) + 1 / self.inputs['X1'].size), + check_eager=True) class TestAtan2_float16(TestAtan2_float): @@ -129,4 +131,5 @@ class TestAtan2API(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py index b440e745b1082e98a832ea076cc052cbc106eeab..789cfa82658f43d2adb148fe41fd2fb380e96fba 100644 --- a/python/paddle/fluid/tests/unittests/test_base_layer.py +++ b/python/paddle/fluid/tests/unittests/test_base_layer.py @@ -451,4 +451,5 @@ class TestLayerTo(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_bce_loss.py b/python/paddle/fluid/tests/unittests/test_bce_loss.py index ea1a22780f0931395662536457c232e72dbf8aff..1051fa9c1aefa221263056d380284425d12e08fd 100644 --- a/python/paddle/fluid/tests/unittests/test_bce_loss.py +++ b/python/paddle/fluid/tests/unittests/test_bce_loss.py @@ -244,4 +244,5 @@ class TestBceLossOpCase2(OpTest): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_process_group.py b/python/paddle/fluid/tests/unittests/test_collective_process_group.py index 58baa0a2fa9443289f24a7e2f23e18fae4877f95..e00f90f4b0d5f2fcac4c6ee14690ed771a2d889e 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_process_group.py +++ b/python/paddle/fluid/tests/unittests/test_collective_process_group.py @@ -25,6 +25,9 @@ class TestProcessGroup(TestMultipleGpus): def test_process_group_gloo(self): self.run_mnist_2gpu('process_group_gloo.py') + def test_init_process_group(self): + self.run_mnist_2gpu('init_process_group.py') + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py index f92465b739a2a760557663f53dd220ced8f82fa3..bd9ec6b663f604404211cd3a0dad32a5ea37e634 100755 --- a/python/paddle/fluid/tests/unittests/test_compare_op.py +++ b/python/paddle/fluid/tests/unittests/test_compare_op.py @@ -30,12 +30,13 @@ def create_test_class(op_type, typename, callback): a = numpy.random.random(size=(10, 7)).astype(typename) b = numpy.random.random(size=(10, 7)).astype(typename) c = callback(a, b) + self.python_api = eval("paddle." 
+ op_type) self.inputs = {'X': a, 'Y': b} self.outputs = {'Out': c} self.op_type = op_type def test_output(self): - self.check_output() + self.check_output(check_eager=False) def test_errors(self): paddle.enable_static() @@ -338,4 +339,5 @@ class TestCompareOpPlace(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_create_parameter.py b/python/paddle/fluid/tests/unittests/test_create_parameter.py index 763fb64816c9c66055b3ead2886e4ba29e0406f7..199558acd4ef64f4d63c04920ba0b0e0295df96c 100644 --- a/python/paddle/fluid/tests/unittests/test_create_parameter.py +++ b/python/paddle/fluid/tests/unittests/test_create_parameter.py @@ -18,6 +18,7 @@ import numpy as np import paddle.fluid as fluid from paddle.fluid import Program, program_guard from paddle.fluid import ParamAttr, initializer +import paddle class TestCreateParameterError(unittest.TestCase): @@ -50,4 +51,5 @@ class TestCreateParameterError(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_cross_op.py b/python/paddle/fluid/tests/unittests/test_cross_op.py index 8e53a36f0510d95ab4c0e61d61df531ec90dfb3d..6cba72213ff9798ddbc128c88bfcbbcb208c86e1 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_op.py +++ b/python/paddle/fluid/tests/unittests/test_cross_op.py @@ -26,6 +26,7 @@ from paddle.fluid import Program, program_guard class TestCrossOp(OpTest): def setUp(self): self.op_type = "cross" + self.python_api = paddle.cross self.initTestCase() self.inputs = { 'X': np.random.random(self.shape).astype(self.dtype), @@ -47,10 +48,10 @@ class TestCrossOp(OpTest): self.outputs = {'Out': np.array(z_list).reshape(self.shape)} def test_check_output(self): - self.check_output() + self.check_output(check_eager=False) def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out') + self.check_grad(['X', 'Y'], 'Out', check_eager=False) class TestCrossOpCase1(TestCrossOp): @@ -114,14 +115,14 @@ class TestCrossAPI(unittest.TestCase): def test_dygraph_api(self): self.input_data() # case 1: - with fluid.dygraph.guard(): - x = fluid.dygraph.to_variable(self.data_x) - y = fluid.dygraph.to_variable(self.data_y) - z = paddle.cross(x, y) - np_z = z.numpy() - expect_out = np.array([[-1.0, -1.0, -1.0], [2.0, 2.0, 2.0], - [-1.0, -1.0, -1.0]]) - self.assertTrue(np.allclose(expect_out, np_z)) + # with fluid.dygraph.guard(): + # x = fluid.dygraph.to_variable(self.data_x) + # y = fluid.dygraph.to_variable(self.data_y) + # z = paddle.cross(x, y) + # np_z = z.numpy() + # expect_out = np.array([[-1.0, -1.0, -1.0], [2.0, 2.0, 2.0], + # [-1.0, -1.0, -1.0]]) + # self.assertTrue(np.allclose(expect_out, np_z)) # case 2: with fluid.dygraph.guard(): @@ -135,4 +136,5 @@ class TestCrossAPI(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_ctc_align.py b/python/paddle/fluid/tests/unittests/test_ctc_align.py index f5934debfd7b663b24a0949012ea2aa85e07ece8..ffc5bc184efc222d3adb57e158814c0f592b9405 100644 --- a/python/paddle/fluid/tests/unittests/test_ctc_align.py +++ b/python/paddle/fluid/tests/unittests/test_ctc_align.py @@ -20,6 +20,7 @@ import numpy as np from op_test import OpTest from test_softmax_op import stable_softmax import paddle.fluid as fluid +import paddle def CTCAlign(input, lod, blank, merge_repeated, padding=0, input_length=None): @@ -229,4 +230,5 @@ class 
BadInputTestCTCAlignr(unittest.TestCase): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_diag_v2.py b/python/paddle/fluid/tests/unittests/test_diag_v2.py index 9f727608f816c4e818f50f12d4d5cc1fccf04bdb..74e73ca5cdf5a44828b41b7da68643264e6f1e89 100644 --- a/python/paddle/fluid/tests/unittests/test_diag_v2.py +++ b/python/paddle/fluid/tests/unittests/test_diag_v2.py @@ -27,6 +27,7 @@ from paddle.fluid.framework import _test_eager_guard class TestDiagV2Op(OpTest): def setUp(self): self.op_type = "diag_v2" + self.python_api = paddle.diag self.x = np.random.rand(10, 10) self.offset = 0 self.padding_value = 0.0 @@ -267,4 +268,5 @@ class TestDiagV2API(unittest.TestCase): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_diff_op.py b/python/paddle/fluid/tests/unittests/test_diff_op.py index 1ae780f488d2dc6bf37f88505a67723ea867dd94..4a96827bd7c3c56320a58261abe1824786164d10 100644 --- a/python/paddle/fluid/tests/unittests/test_diff_op.py +++ b/python/paddle/fluid/tests/unittests/test_diff_op.py @@ -211,4 +211,5 @@ class TestDiffOpPreAppendAxis(TestDiffOp): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py index f95546f15f0024ccd8b7cd8464f0a8eb70662d8d..27d82fcc8903be20a378a45e0f4f3b01aa3d3bb7 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py @@ -190,4 +190,5 @@ class TestDygraphRemoveWeightNorm(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index 318e826058f2c111f825b113c8ee4676ff87d630..909e00d1a316a283476c6535ad04d23d5be08ced 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -40,16 +40,24 @@ class TestElementwiseAddOp(OpTest): self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} self.outputs = {'Out': self.out} + def check_eager(self): + return (self.use_mkldnn == False and self.axis == -1) + def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.check_output(check_dygraph=(self.use_mkldnn == False)) + self.check_output( + check_dygraph=(self.use_mkldnn == False), + check_eager=self.check_eager()) def test_check_grad_normal(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode if self.dtype == np.float16: return self.check_grad( - ['X', 'Y'], 'Out', check_dygraph=(self.use_mkldnn == False)) + ['X', 'Y'], + 'Out', + check_dygraph=(self.use_mkldnn == False), + check_eager=self.check_eager()) def test_check_grad_ingore_x(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode @@ -59,7 +67,8 @@ class TestElementwiseAddOp(OpTest): ['Y'], 'Out', no_grad_set=set("X"), - check_dygraph=(self.use_mkldnn == False)) + check_dygraph=(self.use_mkldnn == False), + check_eager=self.check_eager()) def test_check_grad_ingore_y(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode @@ -69,7 +78,8 @@ class TestElementwiseAddOp(OpTest): ['X'], 'Out', no_grad_set=set('Y'), - check_dygraph=(self.use_mkldnn == False)) + check_dygraph=(self.use_mkldnn == False), + 
check_eager=self.check_eager()) def init_input_output(self): self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) @@ -123,19 +133,21 @@ class TestBF16ElementwiseAddOp(OpTest): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place) + self.check_output_with_place(place, check_eager=False) def test_check_grad_normal(self): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X', 'Y'], 'Out') + self.check_grad_with_place(place, ['X', 'Y'], 'Out', check_eager=False) def test_check_grad_ingore_x(self): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['Y'], 'Out', no_grad_set=set("X")) + self.check_grad_with_place( + place, ['Y'], 'Out', no_grad_set=set("X"), check_eager=False) def test_check_grad_ingore_y(self): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X'], 'Out', no_grad_set=set('Y')) + self.check_grad_with_place( + place, ['X'], 'Out', no_grad_set=set('Y'), check_eager=False) @skip_check_grad_ci( @@ -586,7 +598,7 @@ class TestComplexElementwiseAddOp(OpTest): self.grad_y = self.grad_out def test_check_output(self): - self.check_output() + self.check_output(check_eager=False) def test_check_grad_normal(self): self.check_grad( diff --git a/python/paddle/fluid/tests/unittests/test_exponential_op.py b/python/paddle/fluid/tests/unittests/test_exponential_op.py index ccbc0a1676302b4c29b524601930cc855847e0fc..7a3ae203be62d644f076ae9b6bc2bf5b8641ccdf 100644 --- a/python/paddle/fluid/tests/unittests/test_exponential_op.py +++ b/python/paddle/fluid/tests/unittests/test_exponential_op.py @@ -209,4 +209,5 @@ class TestExponentialAPI(unittest.TestCase): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fmin_op.py b/python/paddle/fluid/tests/unittests/test_fmin_op.py index 5cdf096be6708c47dd1f56dc97243be70c6d63d5..7231823c375324aa7bbf7d45db14b4457ca4a8dd 100644 --- a/python/paddle/fluid/tests/unittests/test_fmin_op.py +++ b/python/paddle/fluid/tests/unittests/test_fmin_op.py @@ -189,3 +189,8 @@ class TestElementwiseFmin2Op(OpTest): """test_check_grad_ingore_y""" self.check_grad( ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y')) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py b/python/paddle/fluid/tests/unittests/test_gather_nd_op.py index 1dbc1c056128cf0abee1aa4bde30e4d9b3b98ffd..a7331a353afe822ddae09e2e4034e5e6eeedfc1f 100644 --- a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py +++ b/python/paddle/fluid/tests/unittests/test_gather_nd_op.py @@ -22,10 +22,11 @@ import paddle class TestGatherNdOpWithEmptyIndex(OpTest): - #Index has empty element, which means copy entire tensor + # Index has empty element, which means copy entire tensor def setUp(self): self.op_type = "gather_nd" + self.python_api = paddle.gather_nd xnp = np.random.random((5, 20)).astype("float64") self.inputs = {'X': xnp, 'Index': np.array([[], []]).astype("int32")} self.outputs = { @@ -33,24 +34,25 @@ class TestGatherNdOpWithEmptyIndex(OpTest): } def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class TestGatherNdOpWithIndex1(OpTest): def setUp(self): self.op_type = "gather_nd" + self.python_api = paddle.gather_nd xnp = np.random.random((5, 20)).astype("float64") self.inputs = {'X': xnp, 'Index': 
np.array([1]).astype("int32")} self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class TestGatherNdOpWithLowIndex(OpTest): @@ -58,6 +60,7 @@ class TestGatherNdOpWithLowIndex(OpTest): def setUp(self): self.op_type = "gather_nd" + self.python_api = paddle.gather_nd xnp = np.random.uniform(0, 100, (10, 10)).astype("float64") index = np.array([[1], [2]]).astype("int64") @@ -66,10 +69,10 @@ class TestGatherNdOpWithLowIndex(OpTest): self.outputs = {'Out': xnp[tuple(index.T)]} #[[14, 25, 1], [76, 22, 3]] def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class TestGatherNdOpIndex1(OpTest): @@ -77,18 +80,19 @@ class TestGatherNdOpIndex1(OpTest): def setUp(self): self.op_type = "gather_nd" + self.python_api = paddle.gather_nd xnp = np.random.uniform(0, 100, (10, 10)).astype("float64") - index = np.array([1, 2]).astype("int64") + index = np.array([1, 2]).astype("int32") self.inputs = {'X': xnp, 'Index': index} self.outputs = {'Out': xnp[tuple(index.T)]} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class TestGatherNdOpWithSameIndexAsX(OpTest): @@ -96,6 +100,7 @@ class TestGatherNdOpWithSameIndexAsX(OpTest): def setUp(self): self.op_type = "gather_nd" + self.python_api = paddle.gather_nd xnp = np.random.uniform(0, 100, (10, 10)).astype("float64") index = np.array([[1, 1], [2, 1]]).astype("int64") @@ -103,10 +108,10 @@ class TestGatherNdOpWithSameIndexAsX(OpTest): self.outputs = {'Out': xnp[tuple(index.T)]} #[25, 22] def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class TestGatherNdOpWithHighRankSame(OpTest): @@ -114,6 +119,7 @@ class TestGatherNdOpWithHighRankSame(OpTest): def setUp(self): self.op_type = "gather_nd" + self.python_api = paddle.gather_nd shape = (5, 2, 3, 1, 10) xnp = np.random.rand(*shape).astype("float64") index = np.vstack([np.random.randint(0, s, size=2) for s in shape]).T @@ -122,10 +128,10 @@ class TestGatherNdOpWithHighRankSame(OpTest): self.outputs = {'Out': xnp[tuple(index.T)]} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class TestGatherNdOpWithHighRankDiff(OpTest): @@ -133,6 +139,7 @@ class TestGatherNdOpWithHighRankDiff(OpTest): def setUp(self): self.op_type = "gather_nd" + self.python_api = paddle.gather_nd shape = (2, 3, 4, 1, 10) xnp = np.random.rand(*shape).astype("float64") index = np.vstack([np.random.randint(0, s, size=200) for s in shape]).T @@ -142,10 +149,10 @@ class TestGatherNdOpWithHighRankDiff(OpTest): self.outputs = {'Out': xnp[tuple(index.T)].reshape([20, 5, 2])} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) #Test Python API @@ -245,4 +252,5 @@ class TestGatherNdAPI2(unittest.TestCase): if __name__ == 
"__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gather_tree_op.py b/python/paddle/fluid/tests/unittests/test_gather_tree_op.py index 74e2cd9f741441ecec07bfca65b95645b71f5b54..6fe68c5d34ffa8a62586fdf59282d37c4b61d4e5 100644 --- a/python/paddle/fluid/tests/unittests/test_gather_tree_op.py +++ b/python/paddle/fluid/tests/unittests/test_gather_tree_op.py @@ -25,6 +25,7 @@ from paddle.fluid.framework import program_guard, Program class TestGatherTreeOp(OpTest): def setUp(self): self.op_type = "gather_tree" + self.python_api = paddle.nn.functional.gather_tree max_length, batch_size, beam_size = 5, 2, 2 ids = np.random.randint( 0, high=10, size=(max_length, batch_size, beam_size)) @@ -34,7 +35,7 @@ class TestGatherTreeOp(OpTest): self.outputs = {'Out': self.backtrace(ids, parents)} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) @staticmethod def backtrace(ids, parents): @@ -126,4 +127,5 @@ class TestGatherTreeOpError(unittest.TestCase): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py index 7436e9eb7b12623296d7a714e742cc4212c4ca91..4d5f657d51e0beee144c391c159d8fcb16c97630 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py @@ -116,6 +116,54 @@ class TestEagerGrad(TestCase): self.func_simple_example_eager_grad_not_allow_unused() self.func_simple_example_eager_grad_not_allow_unused() + def func_simple_example_eager_grad_duplicate_input(self): + np.random.seed(2021) + paddle.set_device('cpu') + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + try: + # duplicate input will arise RuntimeError errors + dx = fluid.dygraph.grad(out, [x, x]) + except RuntimeError as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("duplicate") > 0 + + def test_simple_example_eager_grad_duplicate_input(self): + with _test_eager_guard(): + self.func_simple_example_eager_grad_duplicate_input() + self.func_simple_example_eager_grad_duplicate_input() + + def func_simple_example_eager_grad_duplicate_output(self): + np.random.seed(2021) + paddle.set_device('cpu') + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + try: + # duplicate output will arise RuntimeError errors + dx = fluid.dygraph.grad([out, out], [x]) + except RuntimeError as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("duplicate") > 0 + + def test_simple_example_eager_grad_duplicate_output(self): + with _test_eager_guard(): + self.func_simple_example_eager_grad_duplicate_output() + self.func_simple_example_eager_grad_duplicate_output() + class TestDygraphDoubleGrad(TestCase): def setUp(self): diff --git 
a/python/paddle/fluid/tests/unittests/test_index_sample_op.py b/python/paddle/fluid/tests/unittests/test_index_sample_op.py index c1a8299592a2b4fc9d70ce760e0f277d3ed9664f..4da03c9643fa97e4d1750e257998a658e079f0f5 100644 --- a/python/paddle/fluid/tests/unittests/test_index_sample_op.py +++ b/python/paddle/fluid/tests/unittests/test_index_sample_op.py @@ -24,6 +24,7 @@ from op_test import OpTest class TestIndexSampleOp(OpTest): def setUp(self): self.op_type = "index_sample" + self.python_api = paddle.index_sample self.config() xnp = np.random.random(self.x_shape).astype(self.x_type) indexnp = np.random.randint( @@ -39,10 +40,10 @@ class TestIndexSampleOp(OpTest): self.outputs = {'Out': out} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) def config(self): """ diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index bff10c9c4ca26d342a6849a0b23a490058d6b7f7..8dc822c69b2c5df34968fbcd39b8d8438700add2 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -1025,4 +1025,5 @@ class TestDiracInitializer3(TestDiracInitializer1): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_inner.py b/python/paddle/fluid/tests/unittests/test_inner.py index de9decd0b8961115b7ee2e6dac44bfb40fcc5c1f..ff9f15ebbfc8204de042d7731ed94035152f46eb 100644 --- a/python/paddle/fluid/tests/unittests/test_inner.py +++ b/python/paddle/fluid/tests/unittests/test_inner.py @@ -163,4 +163,5 @@ class TestMultiplyError(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_io_save_load.py b/python/paddle/fluid/tests/unittests/test_io_save_load.py index 89ca28510b9b929b1fe36e0c9883da020e71555c..83aadbf68d569f904d56abfcab91236bd637095b 100644 --- a/python/paddle/fluid/tests/unittests/test_io_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_io_save_load.py @@ -88,4 +88,5 @@ class TestWhenTrainWithNoGrad(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_isclose_op.py b/python/paddle/fluid/tests/unittests/test_isclose_op.py index aa39284d11349eed027a1a496ce6d8b2b5e92e3d..2bb58d7c5741f2655bdcbffecedf8762704c07f3 100644 --- a/python/paddle/fluid/tests/unittests/test_isclose_op.py +++ b/python/paddle/fluid/tests/unittests/test_isclose_op.py @@ -210,6 +210,9 @@ class TestIscloseOpFloat64(TestIscloseOp): self.atol = np.array([0]).astype("float64") self.equal_nan = False + def test_check_output(self): + self.check_output() + class TestIscloseOpLargeDimInput(TestIscloseOp): def set_args(self): @@ -222,4 +225,5 @@ class TestIscloseOpLargeDimInput(TestIscloseOp): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_log_softmax.py b/python/paddle/fluid/tests/unittests/test_log_softmax.py index 16f954708d4d4149f46a18cfd48e35dfbe147153..423eeaf3ada45e7d04dca3512bdba0b067583222 100644 --- a/python/paddle/fluid/tests/unittests/test_log_softmax.py +++ b/python/paddle/fluid/tests/unittests/test_log_softmax.py @@ -175,4 +175,5 @@ class TestNNFunctionalLogSoftmaxAPI(unittest.TestCase): if __name__ == "__main__": + 
paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py index 6d94144fc7788d0dc79cfb10f97667a257621a04..60dd4948f996e505f59d7e12b92569000843c528 100644 --- a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py @@ -555,4 +555,5 @@ class TestLRScheduler(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_mean_iou.py b/python/paddle/fluid/tests/unittests/test_mean_iou.py index e2e118ac9e3b46499055c2dd46755d5401d5abd5..4e89a9034a341777f09958d9709b64a12020ec28 100644 --- a/python/paddle/fluid/tests/unittests/test_mean_iou.py +++ b/python/paddle/fluid/tests/unittests/test_mean_iou.py @@ -19,6 +19,7 @@ import unittest import numpy as np from op_test import OpTest import paddle.fluid as fluid +import paddle def compute_mean_iou(predictions, labels, num_classes, in_wrongs, in_corrects, @@ -129,4 +130,5 @@ class TestMeanIOUOpError(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_minus_op.py b/python/paddle/fluid/tests/unittests/test_minus_op.py index 54253b17b967871b03628023c5a9fdb339af1828..461ff6a9273cdb39c73901da3f77fca021335f0c 100644 --- a/python/paddle/fluid/tests/unittests/test_minus_op.py +++ b/python/paddle/fluid/tests/unittests/test_minus_op.py @@ -17,6 +17,7 @@ from __future__ import print_function import unittest import numpy as np from op_test import OpTest +import paddle class TestMinusOp(OpTest): @@ -36,4 +37,5 @@ class TestMinusOp(OpTest): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_mv_op.py b/python/paddle/fluid/tests/unittests/test_mv_op.py index e0d23e7871fb231892dcdf9ed32bfd31d3967f58..09ec702671bc99b5fab5c501c285a74089853d06 100644 --- a/python/paddle/fluid/tests/unittests/test_mv_op.py +++ b/python/paddle/fluid/tests/unittests/test_mv_op.py @@ -27,15 +27,16 @@ from op_test import OpTest class TestMVOp(OpTest): def setUp(self): self.op_type = "mv" + self.python_api = paddle.mv self.init_config() self.inputs = {'X': self.x, 'Vec': self.vec} self.outputs = {'Out': np.dot(self.x, self.vec)} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X', 'Vec'], 'Out') + self.check_grad(['X', 'Vec'], 'Out', check_eager=True) def init_config(self): self.x = np.random.random((2, 100)).astype("float64") @@ -107,4 +108,5 @@ class TestMVError(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py index 575bc653618a583e883783cd1fffe1db371eccff..ef912699455d1b4ea2aa2899f20d0e2e09634f77 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_all.py +++ b/python/paddle/fluid/tests/unittests/test_norm_all.py @@ -588,4 +588,5 @@ class API_NormTest(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py index 9e0cf6ddef2d619e4d3b32260f7ddf5f31186ae5..8945d35c131fd8de89e2a421bbbd4b16aa01c9d8 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py +++ 
b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py @@ -315,7 +315,9 @@ class TestSaveLoadAny(unittest.TestCase): paddle.save(tensor, path) t_dygraph = paddle.load(path) np_dygraph = paddle.load(path, return_numpy=True) - self.assertTrue(isinstance(t_dygraph, paddle.fluid.core.VarBase)) + self.assertTrue( + isinstance(t_dygraph, (paddle.fluid.core.VarBase, + paddle.fluid.core.eager.Tensor))) self.assertTrue(np.array_equal(tensor.numpy(), np_dygraph)) self.assertTrue(np.array_equal(tensor.numpy(), t_dygraph.numpy())) paddle.enable_static() @@ -685,27 +687,34 @@ class TestSaveLoadAny(unittest.TestCase): np.array(v), np.array(load_tensor2['k2'][k]))) self.assertTrue(load_tensor2['epoch'] == 123) - self.assertTrue(isinstance(load_tensor3[0], fluid.core.VarBase)) + self.assertTrue( + isinstance(load_tensor3[0], (fluid.core.VarBase, + fluid.core.eager.Tensor))) self.assertTrue(np.array_equal(load_tensor3[0].numpy(), obj3[0])) - self.assertTrue(isinstance(load_tensor3[1], fluid.core.VarBase)) + self.assertTrue( + isinstance(load_tensor3[1], (fluid.core.VarBase, + fluid.core.eager.Tensor))) self.assertTrue(np.array_equal(load_tensor3[1].numpy(), obj3[1])) for k, v in state_dict.items(): self.assertTrue( - isinstance(load_tensor3[2]["state_dict"][k], - fluid.core.VarBase)) + isinstance(load_tensor3[2]["state_dict"][k], ( + fluid.core.VarBase, fluid.core.eager.Tensor))) self.assertTrue( np.array_equal(load_tensor3[2]["state_dict"][k].numpy(), np.array(v))) for k, v in state_dict.items(): self.assertTrue( - isinstance(load_tensor3[2]["opt"][k], fluid.core.VarBase)) + isinstance(load_tensor3[2]["opt"][k], ( + fluid.core.VarBase, fluid.core.eager.Tensor))) self.assertTrue( np.array_equal(load_tensor3[2]["opt"][k].numpy(), np.array(v))) - self.assertTrue(isinstance(load_tensor4[0], fluid.core.VarBase)) + self.assertTrue( + isinstance(load_tensor4[0], (fluid.core.VarBase, + fluid.core.eager.Tensor))) self.assertTrue(np.array_equal(load_tensor4[0].numpy(), obj4[0])) load_array1 = paddle.load(path1, return_numpy=True) diff --git a/python/paddle/fluid/tests/unittests/test_renorm_op.py b/python/paddle/fluid/tests/unittests/test_renorm_op.py index 3ea2002a9786fdd3f6c034e84176d0cae46ca591..e00a892cf7197bc94d85e9082651e26a4bb3bbb9 100644 --- a/python/paddle/fluid/tests/unittests/test_renorm_op.py +++ b/python/paddle/fluid/tests/unittests/test_renorm_op.py @@ -54,7 +54,7 @@ class TestRenormAPI(unittest.TestCase): def test_dygraph_api(self): self.input_data() # case axis none - with fluid.dygraph.guard(): + with fluid.dygraph.guard(fluid.CPUPlace()): input = [[[2.0, 2, -2], [3, 0.3, 3]], [[2, -8, 2], [3.1, 3.7, 3]]] x = paddle.to_tensor(input, stop_gradient=False) y = paddle.renorm(x, 1.0, 2, 2.05) @@ -94,4 +94,5 @@ class TestRenormAPI(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_retinanet_detection_output.py b/python/paddle/fluid/tests/unittests/test_retinanet_detection_output.py index ca324b4a8fd0581e7483c12321f54acaa1965f54..1bfc1b00aa8227e6ccaefcaf1044774ed1404f45 100644 --- a/python/paddle/fluid/tests/unittests/test_retinanet_detection_output.py +++ b/python/paddle/fluid/tests/unittests/test_retinanet_detection_output.py @@ -23,6 +23,7 @@ from test_multiclass_nms_op import iou from test_multiclass_nms_op import nms import paddle.fluid as fluid from paddle.fluid import Program, program_guard +import paddle def multiclass_nms(prediction, class_num, keep_top_k, nms_threshold): @@ -518,4 +519,5 @@ class 
TestRetinanetDetectionOutOpError(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py b/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py index 418155a865cb8b0d0fcd095e8a6d822b5c9672c0..ddbee33c35bb1d5b6d1c4ea2b5dec527f4093ce5 100644 --- a/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py +++ b/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py @@ -67,6 +67,7 @@ class TestScatterNdAddSimpleOp(OpTest): def setUp(self): self.op_type = "scatter_nd_add" + self.python_api = paddle.scatter_nd_add ref_np = np.random.random([100]).astype("float64") index_np = np.random.randint(0, 100, [100, 1]).astype("int32") updates_np = np.random.random([100]).astype("float64") @@ -76,10 +77,10 @@ class TestScatterNdAddSimpleOp(OpTest): self.outputs = {'Out': expect_np} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X', 'Updates'], 'Out') + self.check_grad(['X', 'Updates'], 'Out', check_eager=True) class TestScatterNdAddWithEmptyIndex(OpTest): @@ -89,6 +90,7 @@ class TestScatterNdAddWithEmptyIndex(OpTest): def setUp(self): self.op_type = "scatter_nd_add" + self.python_api = paddle.scatter_nd_add ref_np = np.random.random((10, 10)).astype("float64") index_np = np.array([[], []]).astype("int32") updates_np = np.random.random((2, 10, 10)).astype("float64") @@ -99,10 +101,10 @@ class TestScatterNdAddWithEmptyIndex(OpTest): self.outputs = {'Out': expect_np} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X', 'Updates'], 'Out') + self.check_grad(['X', 'Updates'], 'Out', check_eager=True) class TestScatterNdAddWithHighRankSame(OpTest): @@ -112,6 +114,7 @@ class TestScatterNdAddWithHighRankSame(OpTest): def setUp(self): self.op_type = "scatter_nd_add" + self.python_api = paddle.scatter_nd_add shape = (3, 2, 2, 1, 10) ref_np = np.random.rand(*shape).astype("float64") index_np = np.vstack( @@ -125,10 +128,10 @@ class TestScatterNdAddWithHighRankSame(OpTest): self.outputs = {'Out': expect_np} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X', 'Updates'], 'Out') + self.check_grad(['X', 'Updates'], 'Out', check_eager=True) class TestScatterNdAddWithHighRankDiff(OpTest): @@ -138,6 +141,7 @@ class TestScatterNdAddWithHighRankDiff(OpTest): def setUp(self): self.op_type = "scatter_nd_add" + self.python_api = paddle.scatter_nd_add shape = (8, 2, 2, 1, 10) ref_np = np.random.rand(*shape).astype("double") index = np.vstack([np.random.randint(0, s, size=500) for s in shape]).T @@ -150,10 +154,10 @@ class TestScatterNdAddWithHighRankDiff(OpTest): self.outputs = {'Out': expect_np} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X', 'Updates'], 'Out') + self.check_grad(['X', 'Updates'], 'Out', check_eager=True) #Test Python API diff --git a/python/paddle/fluid/tests/unittests/test_scatter_op.py b/python/paddle/fluid/tests/unittests/test_scatter_op.py index ad542da781670e1357cdb2f46b61a3b71d060ccf..5cb9b436b5a9251de71d9e698ab6e217f4f95b28 100644 --- a/python/paddle/fluid/tests/unittests/test_scatter_op.py +++ b/python/paddle/fluid/tests/unittests/test_scatter_op.py @@ -27,6 +27,7 @@ from paddle.fluid.dygraph.base import switch_to_static_graph class 
TestScatterOp(OpTest): def setUp(self): self.op_type = "scatter" + self.python_api = paddle.scatter ref_np = np.ones((3, 50)).astype("float32") index_np = np.array([1, 2]).astype("int32") updates_np = np.random.random((2, 50)).astype("float32") @@ -36,15 +37,16 @@ class TestScatterOp(OpTest): self.outputs = {'Out': output_np} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(["X", "Updates"], "Out") + self.check_grad(["X", "Updates"], "Out", check_eager=True) class TestScatterOp0(OpTest): def setUp(self): self.op_type = "scatter" + self.python_api = paddle.scatter ref_np = np.ones((3, 3)).astype("float32") index_np = np.array([1, 2]).astype("int32") updates_np = np.random.random((2, 3)).astype("float32") @@ -55,15 +57,16 @@ class TestScatterOp0(OpTest): self.outputs = {'Out': output_np} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(["X", "Updates"], "Out") + self.check_grad(["X", "Updates"], "Out", check_eager=True) class TestScatterOp1(OpTest): def setUp(self): self.op_type = "scatter" + self.python_api = paddle.scatter ref_np = np.ones((3, 3)).astype("float32") zeros_np = np.zeros([2, 3]).astype('float32') index_np = np.array([1, 1]).astype("int32") @@ -77,10 +80,10 @@ class TestScatterOp1(OpTest): self.outputs = {'Out': output_np} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(["X", "Updates"], "Out") + self.check_grad(["X", "Updates"], "Out", check_eager=True) @unittest.skipIf(not core.is_compiled_with_cuda(), @@ -88,6 +91,7 @@ class TestScatterOp1(OpTest): class TestScatterOp2(OpTest): def setUp(self): self.op_type = "scatter" + self.python_api = paddle.scatter ref_np = np.ones((3, 3)).astype("float32") index_np = np.array([1, 2]).astype("int32") updates_np = np.random.random((2, 3)).astype("float32") @@ -99,12 +103,13 @@ class TestScatterOp2(OpTest): def test_check_output(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-3) + self.check_output_with_place(place, atol=1e-3, check_eager=True) def test_check_grad(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X', 'Updates'], 'Out') + self.check_grad_with_place( + place, ['X', 'Updates'], 'Out', check_eager=True) @unittest.skipIf(not core.is_compiled_with_cuda(), @@ -112,6 +117,7 @@ class TestScatterOp2(OpTest): class TestScatterOp3(OpTest): def setUp(self): self.op_type = "scatter" + self.python_api = paddle.scatter ref_np = np.ones((3, 3)).astype("float32") zeros_np = np.zeros([2, 3]).astype('float32') index_np = np.array([1, 1]).astype("int32") @@ -127,17 +133,19 @@ class TestScatterOp3(OpTest): def test_check_output(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-3) + self.check_output_with_place(place, atol=1e-3, check_eager=True) def test_check_grad(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X', 'Updates'], 'Out') + self.check_grad_with_place( + place, ['X', 'Updates'], 'Out', check_eager=True) class TestScatterOp4(OpTest): def setUp(self): self.op_type = "scatter" + self.python_api = paddle.scatter ref_np = np.ones((3, 3)).astype("float32") index_np = np.array([1, 2]).astype("int64") updates_np = np.random.random((2, 3)).astype("float32") 
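# A minimal sketch of the pattern the scatter/gather hunks above apply: register
# the op's Python API on the test via `self.python_api`, then pass
# `check_eager=True` to check_output/check_grad so OpTest also exercises the new
# eager (final-state) kernel path. It assumes the in-tree op_test.OpTest that
# accepts the check_eager keyword, as used throughout this diff; the class name,
# shapes and dtypes below are illustrative only.
import unittest

import numpy as np
import paddle
from op_test import OpTest


class TestScatterOpEagerSketch(OpTest):
    def setUp(self):
        self.op_type = "scatter"
        self.python_api = paddle.scatter  # maps the C++ op to its Python API
        ref_np = np.ones((3, 3)).astype("float32")
        index_np = np.array([1, 2]).astype("int32")
        updates_np = np.random.random((2, 3)).astype("float32")
        output_np = np.copy(ref_np)
        output_np[index_np] = updates_np  # expected overwrite-mode result
        self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np}
        self.outputs = {'Out': output_np}

    def test_check_output(self):
        # checks both the legacy dygraph path and the eager final-state path
        self.check_output(check_eager=True)

    def test_check_grad(self):
        self.check_grad(["X", "Updates"], "Out", check_eager=True)


if __name__ == "__main__":
    paddle.enable_static()
    unittest.main()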
@@ -147,10 +155,10 @@ class TestScatterOp4(OpTest): self.outputs = {'Out': output_np} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X', 'Updates'], 'Out') + self.check_grad(['X', 'Updates'], 'Out', check_eager=True) @unittest.skipIf(not core.is_compiled_with_cuda(), @@ -158,6 +166,7 @@ class TestScatterOp4(OpTest): class TestScatterOp5(OpTest): def setUp(self): self.op_type = "scatter" + self.python_api = paddle.scatter ref_np = np.ones((3, 3)).astype("float32") index_np = np.array([1, 2]).astype("int64") updates_np = np.random.random((2, 3)).astype("float32") @@ -169,12 +178,13 @@ class TestScatterOp5(OpTest): def test_check_output(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-3) + self.check_output_with_place(place, atol=1e-3, check_eager=True) def test_check_grad(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X', 'Updates'], 'Out') + self.check_grad_with_place( + place, ['X', 'Updates'], 'Out', check_eager=True) class TestScatterAPI(unittest.TestCase): @@ -274,6 +284,7 @@ class TestScatterAPI(unittest.TestCase): class TestScatterOpFp16(OpTest): def setUp(self): self.__class__.op_type = "scatter" + self.python_api = paddle.scatter # compute grad in the following code handly. self.__class__.no_need_check_grad = True self.x_type = 'float16' diff --git a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py index 9a97f57aaae5f290b20e34242b1b43e5e352223d..74409c8671059673121d0a73ed85d2cad8e3d6f2 100644 --- a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py +++ b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py @@ -178,4 +178,5 @@ class SmoothL1Loss(unittest.TestCase): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_tile_op.py b/python/paddle/fluid/tests/unittests/test_tile_op.py index b0f065a26a006ee3553a84938fb5b6b2db7b3172..8359141f309f523d73ddb4375b7778828ab15490 100644 --- a/python/paddle/fluid/tests/unittests/test_tile_op.py +++ b/python/paddle/fluid/tests/unittests/test_tile_op.py @@ -22,7 +22,7 @@ import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard -# Situation 1: repeat_times is a list (without tensor) +#Situation 1: repeat_times is a list (without tensor) class TestTileOpRank1(OpTest): def setUp(self): self.op_type = "tile" @@ -248,4 +248,5 @@ class TestTileAPI(unittest.TestCase): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index 1e6b4354dd9c8d4f3c345067ead4d64fcad12aeb..c890c3c607cb027f99f55027469899f1a303145a 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -29,6 +29,7 @@ class TestTransposeOp(OpTest): def setUp(self): self.init_op_type() self.initTestCase() + self.python_api = paddle.transpose self.inputs = {'X': np.random.random(self.shape).astype("float64")} self.attrs = { 'axis': list(self.axis), @@ -44,10 +45,10 @@ class TestTransposeOp(OpTest): self.use_mkldnn = False def test_check_output(self): - self.check_output(no_check_set=['XShape']) + self.check_output(no_check_set=['XShape'], check_eager=True) def test_check_grad(self): - 
self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) def initTestCase(self): self.shape = (3, 40) diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index dbd40c349bbc81d39b8a929ee5b3e7b81a083406..57a7f94bedce9fb3cd9981e6ae21f6d902fd04d9 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -1361,4 +1361,5 @@ class TestVarBaseCopyGradientFrom(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_where_op.py b/python/paddle/fluid/tests/unittests/test_where_op.py index 7fb4d39cd7338fb3cd57c786bc811b901351eaf9..4cfd243ddb46a9c3607bf03d7129c6ee61b3b350 100644 --- a/python/paddle/fluid/tests/unittests/test_where_op.py +++ b/python/paddle/fluid/tests/unittests/test_where_op.py @@ -29,6 +29,7 @@ from paddle.fluid.framework import _test_eager_guard class TestWhereOp(OpTest): def setUp(self): self.op_type = 'where' + self.python_api = paddle.where self.init_config() self.inputs = {'Condition': self.cond, 'X': self.x, 'Y': self.y} self.outputs = {'Out': np.where(self.cond, self.x, self.y)} @@ -391,5 +392,6 @@ class TestWhereOpError(unittest.TestCase): self.test_value_error() -if (__name__ == '__main__'): +if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py index 78089d703891edac663cfd5a43c12c513cab7e92..5f954659c2d9a3ad7d5c2fbb69a0797afc6cc760 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py @@ -23,6 +23,7 @@ import paddle.fluid as fluid from op_test_xpu import XPUOpTest import paddle from paddle.fluid import Program, program_guard +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper def conv2d_forward_naive(input, @@ -159,320 +160,334 @@ def create_test_padding_VALID_class(parent): globals()[cls_name] = TestPaddingVALIDCase -class TestConv2DOp(XPUOpTest): - def setUp(self): - self.op_type = "conv2d" - self.use_cudnn = False - self.exhaustive_search = False - self.use_cuda = False - self.use_mkldnn = False - self.fuse_relu_before_depthwise_conv = False - self.data_format = "AnyLayout" - self.dtype = np.float32 - self.init_kernel_type() - self.init_group() - self.init_dilation() - self.init_test_case() - - conv2d_param = { - 'stride': self.stride, - 'pad': self.pad, - 'dilation': self.dilations - } - - input = np.random.random(self.input_size).astype(self.dtype) - if not self.has_cuda(): +class XPUTestConv2DOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'conv2d' + self.use_dynamic_create_class = False + + class TestConv2DOp(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = "conv2d" + self.use_cudnn = False + self.exhaustive_search = False + self.use_cuda = False + self.use_mkldnn = False self.fuse_relu_before_depthwise_conv = False - if self.fuse_relu_before_depthwise_conv: - input = input - 0.5 - input -= (input < 0) * 0.1 - input += (input >= 0) * 0.1 - input2 = np.maximum(input, 0.0) - else: - input2 = input - filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype) - - output, _, _, _, _ = conv2d_forward_naive(input2, filter, self.groups, - conv2d_param) - output = 
output.astype(self.dtype) - - self.inputs = { - 'Input': XPUOpTest.np_dtype_to_fluid_dtype(input), - 'Filter': XPUOpTest.np_dtype_to_fluid_dtype(filter) - } - self.attrs = { - 'strides': self.stride, - 'paddings': self.pad, - 'groups': self.groups, - 'dilations': self.dilations, - 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_mkldnn, - 'data_format': self.data_format, - 'fuse_relu_before_depthwise_conv': - self.fuse_relu_before_depthwise_conv, - 'exhaustive_search': self.exhaustive_search - } - self.outputs = {'Output': output} - - def has_cuda(self): - return core.is_compiled_with_cuda() and (self.use_cudnn or - self.use_cuda) - - def test_check_output(self): - if core.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - if self.dtype == np.float16 or (hasattr(self, "no_need_check_grad") and - self.no_need_check_grad == True): - return - if core.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, {'Input', 'Filter'}, 'Output') - - def test_check_grad_no_filter(self): - if self.dtype == np.float16 or (hasattr(self, "no_need_check_grad") and - self.no_need_check_grad == True): - return - if core.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, ['Input'], 'Output', no_grad_set=set(['Filter'])) - - def test_check_grad_no_input(self): - if self.dtype == np.float16 or (hasattr(self, "no_need_check_grad") and - self.no_need_check_grad == True): - return - if core.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, ['Filter'], 'Output', no_grad_set=set(['Input'])) - - def init_test_case(self): - self.pad = [0, 0] - self.stride = [1, 1] - self.input_size = [2, 3, 5, 5] # NCHW - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [6, f_c, 3, 3] - - def init_test_case_2(self): - pass - - def init_dilation(self): - self.dilations = [1, 1] - - def init_group(self): - self.groups = 1 - - def init_kernel_type(self): - pass - - -class TestWithPad(TestConv2DOp): - def init_test_case(self): - self.pad = [1, 1] - self.stride = [1, 1] - self.input_size = [2, 3, 5, 5] # NCHW - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [6, f_c, 3, 3] - - -class TestWithStride(TestConv2DOp): - def init_test_case(self): - self.pad = [1, 1] - self.stride = [2, 2] - self.input_size = [2, 3, 6, 6] # NCHW - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [6, f_c, 3, 3] - - -class TestWith1x1(TestConv2DOp): - def init_test_case(self): - self.pad = [0, 0] - self.stride = [1, 1] - self.input_size = [2, 3, 5, 5] # NCHW - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [120, f_c, 1, 1] - - def init_group(self): - self.groups = 1 - - -# Please Don't remove the following code. -# Currently, CI use cudnn V5.0 which not support dilation conv. 
-# class TestCUDNNWithDilation(TestWithDilation): -# def init_op_type(self): -# self.op_type = "conv_cudnn" + self.data_format = "AnyLayout" + self.init_kernel_type() + self.init_group() + self.init_dilation() + self.init_test_case() + + conv2d_param = { + 'stride': self.stride, + 'pad': self.pad, + 'dilation': self.dilations + } + + np.random.seed(100) + input = np.random.random(self.input_size).astype(self.dtype) + if not self.has_cuda(): + self.fuse_relu_before_depthwise_conv = False + if self.fuse_relu_before_depthwise_conv: + input = input - 0.5 + input -= (input < 0) * 0.1 + input += (input >= 0) * 0.1 + input2 = np.maximum(input, 0.0) + else: + input2 = input + np.random.seed(1) + filter = np.random.uniform(-1, 1, + self.filter_size).astype(self.dtype) + + output, _, _, _, _ = conv2d_forward_naive(input2, filter, + self.groups, conv2d_param) + output = output.astype(self.dtype) + + self.inputs = { + 'Input': XPUOpTest.np_dtype_to_fluid_dtype(input), + 'Filter': XPUOpTest.np_dtype_to_fluid_dtype(filter) + } + self.attrs = { + 'strides': self.stride, + 'paddings': self.pad, + 'groups': self.groups, + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format, + 'fuse_relu_before_depthwise_conv': + self.fuse_relu_before_depthwise_conv, + 'exhaustive_search': self.exhaustive_search + } + self.outputs = {'Output': output} + + def has_cuda(self): + return core.is_compiled_with_cuda() and (self.use_cudnn or + self.use_cuda) + + def test_check_output(self): + if core.is_compiled_with_xpu(): + paddle.enable_static() + self.check_output_with_place(self.place) + + def test_check_grad(self): + if (hasattr(self, "no_need_check_grad") and + self.no_need_check_grad == True): + return + if core.is_compiled_with_xpu(): + paddle.enable_static() + self.check_grad_with_place(self.place, {'Input', 'Filter'}, + 'Output') + + def test_check_grad_no_filter(self): + if (hasattr(self, "no_need_check_grad") and + self.no_need_check_grad == True): + return + if core.is_compiled_with_xpu(): + paddle.enable_static() + self.check_grad_with_place( + self.place, ['Input'], + 'Output', + no_grad_set=set(['Filter'])) + + def test_check_grad_no_input(self): + if (hasattr(self, "no_need_check_grad") and + self.no_need_check_grad == True): + return + if core.is_compiled_with_xpu(): + paddle.enable_static() + self.check_grad_with_place( + self.place, ['Filter'], + 'Output', + no_grad_set=set(['Input'])) + + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] -# ---- test asymmetric padding ---- + def init_test_case_2(self): + pass + + def init_dilation(self): + self.dilations = [1, 1] + + def init_group(self): + self.groups = 1 + + def init_kernel_type(self): + pass + + class TestWithPad(TestConv2DOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + class TestWithStride(TestConv2DOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 6, 6] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + class TestWith1x1(TestConv2DOp): + def 
init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [120, f_c, 1, 1] + def init_group(self): + self.groups = 1 -class TestConv2DOp_v2(XPUOpTest): - def setUp(self): - self.op_type = "conv2d" - self.use_cudnn = False - self.exhaustive_search = False - self.use_cuda = False - self.use_mkldnn = False - self.fuse_relu_before_depthwise_conv = False - self.dtype = np.float32 - self.init_kernel_type() - self.init_group() - self.init_dilation() - self.init_data_format() - self.init_test_case() - self.init_paddings() - self.init_test_case_2() - - conv2d_param = { - 'stride': self.stride, - 'pad': self.pad, - 'dilation': self.dilations - } - - input = np.random.random(self.input_size).astype(self.dtype) - if not self.has_cuda(): + +# ---- test asymmetric padding ---- +class XPUTestConv2DOp_v2(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'conv2d' + self.use_dynamic_create_class = False + + class TestConv2DOp_v2(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = "conv2d" + self.use_cudnn = False + self.exhaustive_search = False + self.use_cuda = False + self.use_mkldnn = False self.fuse_relu_before_depthwise_conv = False - if self.fuse_relu_before_depthwise_conv: - input = input - 0.5 - input -= (input < 0) * 0.1 - input += (input >= 0) * 0.1 - input2 = np.maximum(input, 0.0) - else: - input2 = input - filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype) - output, _, _, _, _ = conv2d_forward_naive( - input2, filter, self.groups, conv2d_param, self.padding_algorithm, - self.data_format) - output = output.astype(self.dtype) - - self.inputs = { - 'Input': XPUOpTest.np_dtype_to_fluid_dtype(input), - 'Filter': XPUOpTest.np_dtype_to_fluid_dtype(filter) - } - self.attrs = { - 'strides': self.stride, - 'paddings': self.pad, - 'padding_algorithm': self.padding_algorithm, - 'groups': self.groups, - 'dilations': self.dilations, - 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_mkldnn, - 'data_format': self.data_format, - 'fuse_relu_before_depthwise_conv': - self.fuse_relu_before_depthwise_conv, - 'exhaustive_search': self.exhaustive_search - } - self.outputs = {'Output': output} - - def has_cuda(self): - return core.is_compiled_with_cuda() and (self.use_cudnn or - self.use_cuda) - - def test_check_output(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - if core.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - if self.dtype == np.float16: - return - if core.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, {'Input', 'Filter'}, 'Output') - - def test_check_grad_no_filter(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - if self.dtype == np.float16: - return - if core.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, ['Input'], 'Output', no_grad_set=set(['Filter'])) - - def test_check_grad_no_input(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - if self.dtype == np.float16: - return - if core.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, ['Filter'], 
'Output', no_grad_set=set(['Input'])) - - def init_test_case(self): - self.pad = [0, 0] - self.stride = [1, 2] - self.input_size = [2, 3, 5, 5] # NCHW - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [6, f_c, 4, 3] - - def init_dilation(self): - self.dilations = [1, 1] - - def init_group(self): - self.groups = 1 - - def init_kernel_type(self): - pass - - def init_paddings(self): - self.pad = [0, 0] - self.padding_algorithm = "EXPLICIT" - - def init_data_format(self): - self.data_format = "NCHW" - - def init_test_case_2(self): - pass - - -class TestConv2DOp_AsyPadding(TestConv2DOp_v2): - def init_paddings(self): - self.pad = [0, 0, 0, 0] - self.padding_algorithm = "EXPLICIT" - - -class TestWithPad_AsyPadding(TestConv2DOp_v2): - def init_test_case(self): - self.stride = [1, 1] - self.input_size = [2, 3, 5, 5] # NCHW - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [6, f_c, 3, 3] - - def init_paddings(self): - self.pad = [1, 1, 1, 1] - self.padding_algorithm = "EXPLICIT" - - -class TestWithStride_AsyPadding(TestConv2DOp_v2): - def init_test_case(self): - self.stride = [2, 2] - self.input_size = [2, 3, 6, 6] # NCHW - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [6, f_c, 3, 3] - - def init_paddings(self): - self.pad = [1, 1, 1, 1] - self.padding_algorithm = "EXPLICIT" + self.init_kernel_type() + self.init_group() + self.init_dilation() + self.init_data_format() + self.init_test_case() + self.init_paddings() + self.init_test_case_2() + + conv2d_param = { + 'stride': self.stride, + 'pad': self.pad, + 'dilation': self.dilations + } + + np.random.seed(100) + input = np.random.random(self.input_size).astype(self.dtype) + if not self.has_cuda(): + self.fuse_relu_before_depthwise_conv = False + if self.fuse_relu_before_depthwise_conv: + input = input - 0.5 + input -= (input < 0) * 0.1 + input += (input >= 0) * 0.1 + input2 = np.maximum(input, 0.0) + else: + input2 = input + np.random.seed(8) + filter = np.random.uniform(-1, 1, + self.filter_size).astype(self.dtype) + output, _, _, _, _ = conv2d_forward_naive( + input2, filter, self.groups, conv2d_param, + self.padding_algorithm, self.data_format) + output = output.astype(self.dtype) + + self.inputs = { + 'Input': XPUOpTest.np_dtype_to_fluid_dtype(input), + 'Filter': XPUOpTest.np_dtype_to_fluid_dtype(filter) + } + self.attrs = { + 'strides': self.stride, + 'paddings': self.pad, + 'padding_algorithm': self.padding_algorithm, + 'groups': self.groups, + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format, + 'fuse_relu_before_depthwise_conv': + self.fuse_relu_before_depthwise_conv, + 'exhaustive_search': self.exhaustive_search + } + self.outputs = {'Output': output} + + def has_cuda(self): + return core.is_compiled_with_cuda() and (self.use_cudnn or + self.use_cuda) + + def test_check_output(self): + # TODO(wangzhongpu): support mkldnn op in dygraph mode + if core.is_compiled_with_xpu(): + paddle.enable_static() + self.check_output_with_place(place=self.place) + + def test_check_grad(self): + # TODO(wangzhongpu): support mkldnn op in dygraph mode + if (hasattr(self, "no_need_check_grad") and + self.no_need_check_grad == True): + return + if core.is_compiled_with_xpu(): + paddle.enable_static() + self.check_grad_with_place(self.place, {'Input', 'Filter'}, + 'Output') + + def 
test_check_grad_no_filter(self): + # TODO(wangzhongpu): support mkldnn op in dygraph mode + if (hasattr(self, "no_need_check_grad") and + self.no_need_check_grad == True): + return + if core.is_compiled_with_xpu(): + paddle.enable_static() + self.check_grad_with_place( + self.place, ['Input'], + 'Output', + no_grad_set=set(['Filter'])) + + def test_check_grad_no_input(self): + # TODO(wangzhongpu): support mkldnn op in dygraph mode + if (hasattr(self, "no_need_check_grad") and + self.no_need_check_grad == True): + return + if core.is_compiled_with_xpu(): + paddle.enable_static() + self.check_grad_with_place( + self.place, ['Filter'], + 'Output', + no_grad_set=set(['Input'])) + + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 2] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 4, 3] + + def init_dilation(self): + self.dilations = [1, 1] + + def init_group(self): + self.groups = 1 + + def init_kernel_type(self): + pass + + def init_paddings(self): + self.pad = [0, 0] + self.padding_algorithm = "EXPLICIT" + + def init_data_format(self): + self.data_format = "NCHW" + + def init_test_case_2(self): + pass + + class TestConv2DOp_AsyPadding(TestConv2DOp_v2): + def init_paddings(self): + self.pad = [0, 0, 0, 0] + self.padding_algorithm = "EXPLICIT" + + class TestWithPad_AsyPadding(TestConv2DOp_v2): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_paddings(self): + self.pad = [1, 1, 1, 1] + self.padding_algorithm = "EXPLICIT" + + class TestWithStride_AsyPadding(TestConv2DOp_v2): + def init_test_case(self): + self.stride = [2, 2] + self.input_size = [2, 3, 6, 6] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_paddings(self): + self.pad = [1, 1, 1, 1] + self.padding_algorithm = "EXPLICIT" + +support_types = get_xpu_op_support_types('conv2d') +for stype in support_types: + create_test_class(globals(), XPUTestConv2DOp, stype) + create_test_class(globals(), XPUTestConv2DOp_v2, stype) #---------- test SAME VALID ----------- #create_test_padding_SAME_class(TestConv2DOp_AsyPadding) diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 15d5640b11fe501e0d9f83168c434f9f02d7877c..59e285c1200b88cadd2016421b1a8de70c7dad34 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -68,8 +68,9 @@ def to_list(value): def to_numpy(var): - assert isinstance(var, (Variable, fluid.core.VarBase)), "not a variable" - if isinstance(var, fluid.core.VarBase): + assert isinstance(var, (Variable, fluid.core.VarBase, + fluid.core.eager.Tensor)), "not a variable" + if isinstance(var, (fluid.core.VarBase, fluid.core.eager.Tensor)): return var.numpy() t = global_scope().find_var(var.name).get_tensor() return np.array(t) diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py index d75c95b437201f73df8f39d048bd2eae7de5d998..ef62aa264fb26db0a26efabd3ea3aef2e9abcd46 100644 --- a/python/paddle/metric/metrics.py +++ b/python/paddle/metric/metrics.py @@ -282,7 +282,7 @@ class Accuracy(Metric): Return: Tensor: the accuracy of current step. 
""" - if isinstance(correct, paddle.Tensor): + if isinstance(correct, (paddle.Tensor, paddle.fluid.core.eager.Tensor)): correct = correct.numpy() num_samples = np.prod(np.array(correct.shape[:-1])) accs = [] @@ -410,12 +410,12 @@ class Precision(Metric): the shape should keep the same as preds. The data type is 'int32' or 'int64'. """ - if isinstance(preds, paddle.Tensor): + if isinstance(preds, (paddle.Tensor, paddle.fluid.core.eager.Tensor)): preds = preds.numpy() elif not _is_numpy_(preds): raise ValueError("The 'preds' must be a numpy ndarray or Tensor.") - if isinstance(labels, paddle.Tensor): + if isinstance(labels, (paddle.Tensor, paddle.fluid.core.eager.Tensor)): labels = labels.numpy() elif not _is_numpy_(labels): raise ValueError("The 'labels' must be a numpy ndarray or Tensor.") @@ -543,12 +543,12 @@ class Recall(Metric): the shape should keep the same as preds. Shape: [batch_size, 1], Dtype: 'int32' or 'int64'. """ - if isinstance(preds, paddle.Tensor): + if isinstance(preds, (paddle.Tensor, paddle.fluid.core.eager.Tensor)): preds = preds.numpy() elif not _is_numpy_(preds): raise ValueError("The 'preds' must be a numpy ndarray or Tensor.") - if isinstance(labels, paddle.Tensor): + if isinstance(labels, (paddle.Tensor, paddle.fluid.core.eager.Tensor)): labels = labels.numpy() elif not _is_numpy_(labels): raise ValueError("The 'labels' must be a numpy ndarray or Tensor.") @@ -698,12 +698,12 @@ class Auc(Metric): (batch_size, 1), labels[i] is either o or 1, representing the label of the instance i. """ - if isinstance(labels, paddle.Tensor): + if isinstance(labels, (paddle.Tensor, paddle.fluid.core.eager.Tensor)): labels = labels.numpy() elif not _is_numpy_(labels): raise ValueError("The 'labels' must be a numpy ndarray or Tensor.") - if isinstance(preds, paddle.Tensor): + if isinstance(preds, (paddle.Tensor, paddle.fluid.core.eager.Tensor)): preds = preds.numpy() elif not _is_numpy_(preds): raise ValueError("The 'preds' must be a numpy ndarray or Tensor.") diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index e6efde836284ac361f9781a0cb18b0df72afe354..10d4073b80c5998df7931fc8addc2507cb606ef2 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -36,7 +36,7 @@ from ...static import Variable from paddle.utils import deprecated from paddle import _C_ops from paddle import in_dynamic_mode -from paddle.framework import core +from paddle.framework import core, _in_eager_mode __all__ = [] @@ -114,7 +114,10 @@ def binary_cross_entropy(input, label, weight=None, reduction='mean', reduction) if in_dynamic_mode(): - out = _C_ops.bce_loss(input, label) + if _in_eager_mode(): + out = _C_ops.final_state_bce_loss(input, label) + else: + out = _C_ops.bce_loss(input, label) if weight is not None: out = _C_ops.elementwise_mul(out, weight, 'axis', -1) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 6555ba0812d08c0ca3a21641b5b28d5a3763f2c4..bdb0eabe2bbb2968a00d5baf3f9ada14e05a635e 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -974,6 +974,8 @@ def diag(x, offset=0, padding_value=0, name=None): # [4] """ if paddle.in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_diag(x, offset, padding_value) return _C_ops.diag_v2(x, "offset", offset, "padding_value", padding_value) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 
fef1652040835091b127324b1a5f6048f6a40bae..1a0e636124dbfc2e29271f03e739e1fc17b33afe 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -14,7 +14,7 @@ import numpy as np from ..fluid.layer_helper import LayerHelper -from ..framework import _varbase_creator, _dygraph_tracer +from ..framework import _varbase_creator, _dygraph_tracer, _in_eager_mode from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype from ..static import Variable @@ -1146,6 +1146,8 @@ def cross(x, y, axis=None, name=None): # [0. 0. 0.]] """ if paddle.in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_cross(x, y, axis) if axis is not None: return _C_ops.cross(x, y, 'dim', axis) else: @@ -1490,6 +1492,8 @@ def mv(x, vec, name=None): out = paddle.mv(x, vec) """ if paddle.in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_mv(x, vec) out = _C_ops.mv(x, vec) return out diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index 858f9139231e7c45ac35200a3fb9f3d28b21ccba..aa2d2e161181b93d24bae1c74d120143ebd0046c 100755 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -17,6 +17,7 @@ from ..fluid.data_feeder import check_type, check_variable_and_dtype from ..fluid.layers.layer_function_generator import templatedoc from ..static import Variable from ..framework import VarBase as Tensor +from ..framework import _in_eager_mode # TODO: define logic functions of a tensor from ..fluid.layers import is_empty # noqa: F401 @@ -181,6 +182,9 @@ def equal(x, y, name=None): y = full(shape=[1], dtype=x.dtype, fill_value=y) if paddle.in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_equal(x, y) + return _C_ops.equal(x, y) check_variable_and_dtype( @@ -223,6 +227,9 @@ def greater_equal(x, y, name=None): print(result1) # result1 = [True False True] """ if paddle.in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_greater_equal(x, y) + return _C_ops.greater_equal(x, y) check_variable_and_dtype(x, "x", @@ -269,6 +276,9 @@ def greater_than(x, y, name=None): print(result1) # result1 = [False False True] """ if paddle.in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_greater_than(x, y) + return _C_ops.greater_than(x, y) check_variable_and_dtype(x, "x", @@ -316,6 +326,9 @@ def less_equal(x, y, name=None): print(result1) # result1 = [True True False] """ if paddle.in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_less_equal(x, y) + return _C_ops.less_equal(x, y) check_variable_and_dtype( @@ -359,6 +372,9 @@ def less_than(x, y, name=None): print(result1) # result1 = [False True False] """ if paddle.in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_less_than(x, y) + return _C_ops.less_than(x, y) check_variable_and_dtype( @@ -402,6 +418,9 @@ def not_equal(x, y, name=None): print(result1) # result1 = [False True True] """ if paddle.in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_not_equal(x, y) + return _C_ops.not_equal(x, y) check_variable_and_dtype( @@ -443,7 +462,7 @@ def is_tensor(x): print(check) #False """ - return isinstance(x, Tensor) + return isinstance(x, (Tensor, paddle.fluid.core.eager.Tensor)) def _bitwise_op(op_name, x, y, out=None, name=None, binary_op=True): diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 32ccecbc6d9f0282b86f100e1b910667fab41cb2..e530bfd8536a42ef84bde0a9fb8c15fe47241fc5 100755 --- a/python/paddle/tensor/manipulation.py +++ 
b/python/paddle/tensor/manipulation.py @@ -16,7 +16,7 @@ from __future__ import print_function from collections import Counter from ..static import Variable, device_guard -from ..framework import core +from ..framework import core, _in_eager_mode from ..fluid.layer_helper import LayerHelper from ..framework import OpProtoHolder, convert_np_dtype_to_dtype_, dygraph_only from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype @@ -263,6 +263,9 @@ def fill_diagonal_tensor(x, y, offset=0, dim1=0, dim2=1, name=None): setattr(core.VarBase, 'fill_diagonal_tensor', fill_diagonal_tensor) +if core._in_eager_mode(): + setattr(core.eager.Tensor, 'fill_diagonal_tensor', fill_diagonal_tensor) + @dygraph_only def tolist(x): @@ -889,12 +892,20 @@ def stack(x, axis=0, name=None): x1 = paddle.to_tensor([[1.0, 2.0]]) x2 = paddle.to_tensor([[3.0, 4.0]]) x3 = paddle.to_tensor([[5.0, 6.0]]) + out = paddle.stack([x1, x2, x3], axis=0) print(out.shape) # [3, 1, 2] print(out) # [[[1., 2.]], # [[3., 4.]], # [[5., 6.]]] + + out = paddle.stack([x1, x2, x3], axis=-2) + print(out.shape) # [1, 3, 2] + print(out) + # [[[1., 2.], + # [3., 4.], + # [5., 6.]]] """ return layers.stack(x, axis, name) @@ -1567,6 +1578,8 @@ def scatter(x, index, updates, overwrite=True, name=None): # [1., 1.]] """ if paddle.in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_scatter(x, index, updates, overwrite) return _C_ops.scatter(x, index, updates, 'overwrite', overwrite) check_variable_and_dtype( diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 9a0139105651b53781f9c76189abb1b7d8ddefe9..ced2113733c02ad924d7a7e0be5b357a35447197 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1274,6 +1274,8 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): if paddle.in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_addmm( input, x, y, alpha, beta) out = _C_ops.addmm(input, x, y, "Alpha", alpha, "Beta", beta) return out @@ -1333,7 +1335,7 @@ def renorm(x, p, axis, max_norm): raise ValueError("the axis:{} should not be less than -1 * length of input_shape:{}".format(axis,-1 * len(input_shape))) axis = axis + len(input_shape) if paddle.in_dynamic_mode(): - out = core.ops.renorm(x, 'p',p, 'axis',axis, 'max_norm', max_norm) + out = _C_ops.renorm(x, 'p',p, 'axis',axis, 'max_norm', max_norm) return out inputs = {'X': x} @@ -3266,6 +3268,8 @@ def atan2(x, y, name=None): """ if paddle.in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_atan2( x, y) return _C_ops.atan2(x, y) else: check_variable_and_dtype(x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], 'atan2') diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 0ba47d79050ce2bc9ba4842681825f47f059c5df..fe2e979f9845c43ceb09f91bb7f3bc98059ad724 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -17,7 +17,7 @@ import paddle from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype from ..fluid import layers -from ..framework import core +from ..framework import core, _in_eager_mode from paddle.common_ops_import import convert_np_dtype_to_dtype_ from paddle.common_ops_import import Variable from paddle.common_ops_import import VarDesc @@ -621,6 +621,9 @@ def where(condition, x=None, y=None, name=None): broadcast_condition = paddle.cast(broadcast_condition, 'bool') if paddle.in_dynamic_mode(): + if _in_eager_mode(): 
+ return _C_ops.final_state_where(broadcast_condition, broadcast_x, + broadcast_y) return _C_ops.where(broadcast_condition, broadcast_x, broadcast_y) else: helper = LayerHelper("where", **locals()) @@ -712,6 +715,8 @@ def index_sample(x, index): """ if paddle.in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_index_sample(x, index) return _C_ops.index_sample(x, index) helper = LayerHelper("index_sample", **locals()) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 70dea65b7699b413f0dc5fc8d68599229beb3078..33740dccdfc04740a1858d9f47d7522ff916ba49 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -5,6 +5,7 @@ func : ElementwiseInferMeta kernel : func : add + # backward : add_grad - api : cast args : (Tensor x, DataType out_dtype) @@ -274,3 +275,265 @@ kernel : func : diagonal backward : diagonal_grad + + +- api : gumbel_softmax + args : (Tensor x, float temperature, bool hard, int axis) + output : Tensor + infer_meta : + func : GumbelSoftmaxInferMeta + kernel : + func : gumbel_softmax + # backward : gumbel_softmax_grad + +- api : diag + args : (Tensor x, int offset, float padding_value) + output : Tensor + infer_meta : + func : DiagInferMeta + kernel : + func : diag + +# - api : pixel_shuffle +# args : (Tensor x, int upscale_factor, const std::string& data_format) +# output : Tensor +# infer_meta : +# func : PixelShuffleInferMeta +# kernel : +# func : pixel_shuffle + +- api : transpose + args : (Tensor x, int[] axis) + output : Tensor + infer_meta : + func : TransposeInferMeta + kernel : + func : transpose + backward : transpose_grad + +- api : lerp + args : (Tensor x, Tensor y, Tensor weight) + output : Tensor + infer_meta : + func : LerpInferMeta + kernel : + func : lerp + # backward : lerp_grad + +- api : scatter + args : (Tensor x, Tensor index, Tensor updates, bool overwrite) + output : Tensor + infer_meta : + func : ScatterInferMeta + dtype : x + kernel : + func : scatter + backward : scatter_grad + + +- api : scatter_nd_add + args : (Tensor x, Tensor index, Tensor updates) + output : Tensor + infer_meta : + func : ScatterNdAddInferMeta + dtype : x + kernel : + func : scatter_nd_add + backward : scatter_nd_add_grad + + +- api : addmm + args : (Tensor input, Tensor x, Tensor y, float alpha, float beta) + output : Tensor + infer_meta : + func : AddmmInferMeta + kernel : + func : addmm + backward : addmm_grad + + +- api : adadelta + args : (Tensor param, Tensor grad, Tensor avg_squared_grad, Tensor avg_squared_update, float rho, float epsilon) + output : Tensor(param_out), Tensor(avg_squared_grad_out), Tensor(avg_squared_update_out) + infer_meta : + func : AdadeltaInferMeta + kernel : + func : adadelta + +- api : adamax + args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment, Tensor inf_norm, Tensor beta1_pow, float beta1, float beta2, float epsilon) + output : Tensor(param_out), Tensor(moment_out), Tensor(inf_norm_out) + infer_meta : + func : AdamaxInferMeta + kernel : + func : adamax + + + +- api : where + args : (Tensor condition, Tensor x, Tensor y) + output : Tensor + infer_meta : + func : WhereInferMeta + kernel : + func : where + backward : where_grad + + +# BilinearTensorProductInferMeta + +# BroadcastTensorsInferMeta + +- api : less_than + args : (Tensor x, Tensor y, int axis = -1) + output : Tensor + infer_meta : + func : CompareInferMeta + kernel : + func : less_than + +- api : less_equal + args : (Tensor x, Tensor y, int axis = -1) + output :
Tensor + infer_meta : + func : CompareInferMeta + kernel : + func : less_equal + +- api : greater_than + args : (Tensor x, Tensor y, int axis = -1) + output : Tensor + infer_meta : + func : CompareInferMeta + kernel : + func : greater_than + +- api : greater_equal + args : (Tensor x, Tensor y, int axis = -1) + output : Tensor + infer_meta : + func : CompareInferMeta + kernel : + func : greater_equal + +- api : equal + args : (Tensor x, Tensor y, int axis = -1) + output : Tensor + infer_meta : + func : CompareInferMeta + kernel : + func : equal + +- api : not_equal + args : (Tensor x, Tensor y, int axis = -1) + output : Tensor + infer_meta : + func : CompareInferMeta + kernel : + func : not_equal + +# - api : equal_all +# args : (Tensor x, Tensor y) +# output : Tensor +# infer_meta : +# func : CompareAllInferMeta +# kernel : +# func : equal_all + + +- api : huber_loss + args : (Tensor input, Tensor label, float delta) + output : Tensor(out), Tensor(residual) + infer_meta : + func : HuberLossInferMeta + kernel : + func : huber_loss + # backward : huber_loss_grad + +- api : triangular_solve + args : (Tensor x, Tensor y, bool upper, bool transpose, bool unitriangular) + output : Tensor + infer_meta : + func : TriangularSolveInferMeta + kernel : + func : triangular_solve + # backward : triangular_solve_grad + + +- api : index_sample + args : (Tensor x, Tensor index) + output : Tensor + infer_meta : + func : IndexSampleInferMeta + kernel : + func : index_sample + data_type : x + backward : index_sample_grad + + +- api : cross + args : (Tensor x, Tensor y, int axis = 9) + output : Tensor + infer_meta : + func : CrossInferMeta + kernel : + func : cross + backward : cross_grad + + +- api : atan2 + args : (Tensor x, Tensor y) + output : Tensor + infer_meta : + func : Atan2InferMeta + kernel : + func : atan2 + backward : atan2_grad + + +- api : bce_loss + args : (Tensor input, Tensor label) + output : Tensor + infer_meta : + func : BCELossInferMeta + kernel : + func : bce_loss + backward : bce_loss_grad + + +- api : dist + args : (Tensor x, Tensor y, float p) + output : Tensor + infer_meta : + func : DistInferMeta + kernel : + func : dist + # backward : dist_grad + + +- api : gather_nd + args : (Tensor x, Tensor index) + output : Tensor + infer_meta : + func : GatherNdInferMeta + kernel : + func : gather_nd + data_type : x + backward : gather_nd_grad + +- api : gather_tree + args : (Tensor ids, Tensor parents) + output : Tensor + infer_meta : + func : GatherTreeMeta + kernel : + func : gather_tree + +- api : mv + args : (Tensor x, Tensor vec) + output : Tensor + infer_meta : + func : MvInferMeta + kernel : + func : mv + backward : mv_grad diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index 98a3606952bbb13d3b20c55427b9747f1a4a5624..07baa9b51de391721bc5c33745c77f7f56c4f974 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -147,6 +147,7 @@ def source_include(header_file_path): #include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/infermeta/nullary.h" #include "paddle/phi/infermeta/unary.h" +#include "paddle/phi/infermeta/ternary.h" #include "paddle/phi/kernels/declarations.h" #include "paddle/fluid/platform/profiler/event_tracing.h" diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index c69bbf35b97263fb2c153839ac0105427a87e118..a0bf363ac9bdb99c21715ed4845bd7d4615cb23a 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++
b/python/paddle/utils/code_gen/backward.yaml @@ -25,6 +25,17 @@ output : Tensor(x_grad) invoke : scale(out_grad, scale, bias, bias_after_scale) + +- backward_api : add_grad + forward : add (Tensor x, Tensor y) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : add_grad + - backward_api : digamma_grad forward : digamma (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) @@ -90,3 +101,168 @@ # func : MatmulTripleGradInferMeta # kernel : # func : matmul_triple_grad + +# - backward_api : gumbel_softmax_grad +# forward : gumbel_softmax (Tensor x, float temperature, bool hard, int axis) -> Tensor(out) +# args : (Tensor out, Tensor out_grad, int axis) +# output : Tensor(x_grad) +# infer_meta : +# func : GumbelSoftmaxGradInferMeta +# param : [out, out_grad, axis] +# kernel : +# func : gumbel_softmax_grad + + +- backward_api : transpose_grad + forward : transpose (Tensor x, int[] axis) -> Tensor(out) + args : (Tensor out_grad, int[] axis) + output : Tensor(x_grad) + infer_meta : + func : TransposeGradInferMeta + param : [out_grad, axis] + kernel : + func : transpose_grad + +# - backward_api : lerp_grad +# forward : lerp (Tensor x, Tensor y, Tensor weight) -> Tensor(out) +# args : (Tensor x, Tensor y, Tensor weight, Tensor out, Tensor out_grad) +# output : Tensor(x_grad), Tensor(y_grad) +# infer_meta : +# func : GeneralBinaryGradInferMeta +# param : [x, y] +# kernel : +# func : lerp_grad + + +- backward_api : scatter_grad + forward : scatter (Tensor x, Tensor index, Tensor updates, bool overwrite) -> Tensor(out) + args : (Tensor index, Tensor updates, Tensor out_grad, bool overwrite) + output : Tensor(x_grad), Tensor(updates_grad) + infer_meta : + func : ScatterGradInferMeta + param : [index, updates, out_grad, overwrite] + kernel : + func : scatter_grad + +- backward_api : scatter_nd_add_grad + forward : scatter_nd_add (Tensor x, Tensor index, Tensor updates) -> Tensor(out) + args : (Tensor index, Tensor updates, Tensor out_grad) + output : Tensor(x_grad), Tensor(updates_grad) + infer_meta : + func : ScatterNdAddGradInferMeta + param : [index, updates, out_grad] + kernel : + func : scatter_nd_grad + +- backward_api : addmm_grad + forward : addmm (Tensor input, Tensor x, Tensor y, float alpha, float beta) -> Tensor(out) + args : (Tensor input, Tensor x, Tensor y, Tensor out_grad, float alpha, float beta) + output : Tensor(input_grad), Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [input, x, y] + kernel : + func : addmm_grad + +- backward_api : where_grad + forward : where (Tensor condition, Tensor x, Tensor y) -> Tensor(out) + args : (Tensor condition, Tensor x, Tensor y, Tensor out_grad) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : where_grad + +# - backward_api : huber_loss_grad +# forward : huber_loss (Tensor input, Tensor label, float delta) -> Tensor(out), Tensor(residual) +# args : (Tensor residual, Tensor out_grad, float delta) +# output : Tensor(input_grad), Tensor(label_grad) +# infer_meta : +# func : GeneralBinaryGradInferMeta +# param : [x, y] +# kernel : +# func : where_grad + +# - backward_api : triangular_solve_grad +# forward : triangular_solve (Tensor x, Tensor y, bool upper, bool transpose, bool unitriangular) -> Tensor(out) +# args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, bool upper, bool
transpose, bool unitriangular) +# output : Tensor(x_grad), Tensor(y_grad) +# infer_meta : +# func : GeneralBinaryGradInferMeta +# param : [x, y] +# kernel : +# func : triangular_solve_grad + +- backward_api : index_sample_grad + forward : index_sample (Tensor x, Tensor index) -> Tensor(out) + args : (Tensor x, Tensor index, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : index_sample_grad + +- backward_api : cross_grad + forward : cross (Tensor x, Tensor y, int axis = 9) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad, int axis) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : cross_grad + +- backward_api : atan2_grad + forward : atan2 (Tensor x, Tensor y) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : atan2_grad + +- backward_api : bce_loss_grad + forward : bce_loss (Tensor input, Tensor label) -> Tensor(out) + args : (Tensor input, Tensor label, Tensor out_grad) + output : Tensor(input_grad) + infer_meta : + func : UnchangedInferMeta + param : [input] + kernel : + func : bce_loss_grad + + +# - backward_api : dist_grad +# forward : dist (Tensor x, Tensor y, float p) -> Tensor(out) +# args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, float p) +# output : Tensor(x_grad), Tensor(y_grad) +# infer_meta : +# func : GeneralBinaryGradInferMeta +# param : [x, y] +# kernel : +# func : dist_grad + + + +- backward_api : gather_nd_grad + forward : gather_nd (Tensor x, Tensor index) -> Tensor(out) + args : (Tensor x, Tensor index, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : gather_nd_grad + +- backward_api : mv_grad + forward : mv (Tensor x, Tensor vec) -> Tensor(out) + args : (Tensor x, Tensor vec, Tensor out_grad) + output : Tensor(x_grad), Tensor(vec_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, vec] + kernel : + func : mv_grad diff --git a/python/paddle/utils/code_gen/wrapped_infermeta_gen.py b/python/paddle/utils/code_gen/wrapped_infermeta_gen.py index 0d018f8e3f64fc2f9a89e78d81d3a392e799b441..1cb3c33da721959148bf320d7e94f50fac1ff52a 100644 --- a/python/paddle/utils/code_gen/wrapped_infermeta_gen.py +++ b/python/paddle/utils/code_gen/wrapped_infermeta_gen.py @@ -98,6 +98,7 @@ def source_include(header_file_path): #include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/infermeta/nullary.h" #include "paddle/phi/infermeta/unary.h" +#include "paddle/phi/infermeta/ternary.h" """ diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index 1a3dbd68066a72384589ac24579e0540b5484a6e..9fd200bf0344d58d6a2705d768afffc7ce92dcc2 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -327,12 +327,17 @@ class ToTensor(BaseTransform): import paddle.vision.transforms as T import paddle.vision.transforms.functional as F - fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8)) + fake_img = Image.fromarray((np.random.rand(4, 5, 3) * 255.).astype(np.uint8)) transform = T.ToTensor() tensor = transform(fake_img) - + + print(tensor.shape) + # [3, 4, 5] + + print(tensor.dtype) + # paddle.float32 """ def __init__(self, data_format='CHW', keys=None): diff --git
a/tools/infrt/fake_models/multi_fc.py b/tools/infrt/fake_models/multi_fc.py index 0d633cfc60a9b6cddc669da0dbc87667f8211714..7149c8d022afd60217f0ba7b0e0d642acf2afa01 100644 --- a/tools/infrt/fake_models/multi_fc.py +++ b/tools/infrt/fake_models/multi_fc.py @@ -52,4 +52,7 @@ loss = exe = fluid.Executor(cpu) exe.run(fluid.default_startup_program()) fluid.io.save_inference_model("./multi_fc_model", [a.name], [fc_out], exe) +fluid.io.save_inference_model("./multi_fc_model", [a.name], [fc_out], exe, None, + "fc.pdmodel", "fc.pdiparams") + print('output name', fc_out.name) diff --git a/tools/infrt/get_phi_kernel_function.sh b/tools/infrt/get_phi_kernel_function.sh index 6b2586d40819b9e25eef823dff59687114664197..febfe5d04762a43da0710b34e21252ffdf4611ea 100644 --- a/tools/infrt/get_phi_kernel_function.sh +++ b/tools/infrt/get_phi_kernel_function.sh @@ -49,7 +49,7 @@ all_ir_name=`grep -Eo "PDTCPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_cpu_k for ir in $all_ir_name do attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | grep -Eo "Attr:.*)" \ - | awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BOOLAttr/,""); \ + | awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BoolAttr/,""); \ gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \ gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \ gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \ @@ -62,7 +62,7 @@ all_ir_name=`grep -Eo "PDTGPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_gpu_k for ir in $all_ir_name do attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | grep -Eo "Attr:.*)" \ - | awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BOOLAttr/,""); \ + | awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BoolAttr/,""); \ gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \ gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \ gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \ diff --git a/tools/infrt/get_phi_kernel_info.py b/tools/infrt/get_phi_kernel_info.py index 85ad585cdefa9cbb4ac8d029e699af4d5ffaeaf7..8b752f928719bcc7ebef4792c29af02261dbd551 100644 --- a/tools/infrt/get_phi_kernel_info.py +++ b/tools/infrt/get_phi_kernel_info.py @@ -133,11 +133,11 @@ namespace kernel { def gen_context(val): if val == "CPU": - return "phi::CPUContext", "phi_cpu" + return "::phi::CPUContext", "phi_cpu" elif val == "GPU": - return "phi::GPUContext", "phi_gpu" + return "::phi::GPUContext", "phi_gpu" # elif val == "XPU": - # return "phi::XPUContext", "phi_xpu" + # return "::phi::XPUContext", "phi_xpu" else: # raise Exception(f"Unknown context type {val}") return "", "" @@ -157,12 +157,12 @@ def gen_kernel_func(val, ctx_name, dtype_name): ed = val.index('>') func_name = val[:st] template_name = val[st + 1:ed] - if 'phi::' in template_name: - return "&phi::" + val + if '::phi::' in template_name: + return "&::phi::" + val else: - return "&phi::" + func_name + "" + return "&::phi::" + func_name + "<::phi::" + template_name + ">" else: - return "&phi::" + val + "<" + dtype_name + ", " + ctx_name + ">" + return "&::phi::" + val + "<" + dtype_name + ", " + ctx_name + ">" def gen_dtype(vals: List[str]): @@ -227,7 +227,7 @@ def gen_register_code_info(item: List[str], 
attr_data: Dict[str, List[str]]): return "" item[2] = gen_layout(item[2]) ir_dtypes, origin_dtypes = gen_dtype(item[4:-1]) - infer_shape_func = "&phi::" + item[-1] + infer_shape_func = "&::phi::" + item[-1] res = ""